diff --git a/tinygrad/device.py b/tinygrad/device.py index c4f35111..324ba9c2 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -267,8 +267,8 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method self.device._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next]) ctypes.memmove(self.b[self.b_next].va_addr, from_mv(src[i:]), lsize:=min(self.b[self.b_next].size, src.nbytes-i)) self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \ - .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \ - .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) + .copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \ + .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) self.b_timeline[self.b_next] = self.device.timeline_value self.device.timeline_value += 1 @@ -283,8 +283,8 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"DISK -> {self.device.dname}", enabled=PROFILE): for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size): self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \ - .copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \ - .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) + .copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \ + .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) self.b_timeline[batch_info[1]] = self.device.timeline_value self.device.timeline_value += 1 @@ -294,8 +294,8 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE): for i in range(0, dest.nbytes, self.b[0].size): self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \ - .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \ - .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) + .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \ + .signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value) self.device.timeline_value += 1 @@ -306,9 +306,9 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE): src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \ - .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \ - .copy(dest.va_addr, src.va_addr, sz) \ - .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev) + .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \ + .copy(dest.va_addr, src.va_addr, sz) \ + .signal(src_dev.timeline_signal, src_dev.timeline_value).submit(src_dev) dest_dev.hw_compute_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev) src_dev.timeline_value += 1 diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 89fc0941..02ea724f 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -165,8 +165,7 @@ class ProfileLogger: self.subactors[subactor_key] = (tid:=len(self.subactors)) self.mjson.append({"name": "thread_name", "ph": "M", "pid": self.actors[actor_name], "tid":tid, "args": {"name": subactor_name}}) - self.mjson.append({"name": name, "ph": "B", "pid": self.actors[actor_name], "tid": self.subactors.get(subactor_key, -1), "ts": st}) - self.mjson.append({"name": name, "ph": "E", "pid": self.actors[actor_name], "tid": self.subactors.get(subactor_key, -1), "ts": et}) + self.mjson.append({"name": name, "ph": "X", "pid": self.actors[actor_name], "tid": self.subactors.get(subactor_key, -1), "ts":st, "dur":et-st}) ProfileLogger.writers -= 1 if ProfileLogger.writers == 0: