From 42609300fff4aadd9d62aa9a005cff7096ee4bec Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Mon, 7 Oct 2024 23:36:19 +0300 Subject: [PATCH] hcq no timeline signals in init (#6944) --- tinygrad/runtime/ops_amd.py | 6 +++--- tinygrad/runtime/ops_nv.py | 4 ++-- tinygrad/runtime/ops_qcom.py | 4 ++-- tinygrad/runtime/support/hcq.py | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index dd6bcd0d..b878c36c 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -30,10 +30,10 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2 class AMDSignal(HCQSignal): - def __init__(self, value=0, alloc_event=False): + def __init__(self, value=0, is_timeline=False): self._signal = AMDDevice.signals_pool.pop() self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8 - if alloc_event: + if is_timeline: sync_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1) self._event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8 self._event_id = sync_event.event_id @@ -418,7 +418,7 @@ class AMDDevice(HCQCompiled): self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000) super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self), - AMDSignal, AMDComputeQueue, AMDCopyQueue, (AMDSignal(alloc_event=True), AMDSignal(alloc_event=True))) + AMDSignal, AMDComputeQueue, AMDCopyQueue) def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc: gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 678d5935..62d3fa6b 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -68,7 +68,7 @@ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4 def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2) class NVSignal(HCQSignal): - def __init__(self, value=0): + def __init__(self, value=0, is_timeline=False): self._signal = NVDevice.signals_pool.pop() self.signal_addr = mv_address(self._signal) super().__init__(value) @@ -480,7 +480,7 @@ class NVDevice(HCQCompiled): compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler) super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch), - functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue, timeline_signals=(NVSignal(), NVSignal())) + functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue) self._setup_gpfifos() diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index 80136664..37b7ae80 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -32,7 +32,7 @@ class QCOMCompiler(CLCompiler): def __init__(self, device:str=""): super().__init__(CLDevice(device), 'compile_qcom') class QCOMSignal(HCQSignal): - def __init__(self, value=0, **kwargs): + def __init__(self, value=0, is_timeline=False): self._signal = QCOMDevice.signals_pool.pop() super().__init__(value) def __del__(self): QCOMDevice.signals_pool.append(self._signal) @@ -351,7 +351,7 @@ class QCOMDevice(HCQCompiled): if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}") super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self), - QCOMSignal, QCOMComputeQueue, None, timeline_signals=(QCOMSignal(), QCOMSignal())) + QCOMSignal, QCOMComputeQueue, None) def _ctx_create(self): cr = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=(kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 5c0a214f..64366383 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -195,7 +195,7 @@ class HWCopyQueue(HWCommandQueue): def _update_copy(self, cmd_idx, dest, src): raise NotImplementedError("backend should overload this function") class HCQSignal: - def __init__(self, value:int=0): self._set_value(value) + def __init__(self, value:int=0, is_timeline:bool=False): self._set_value(value) @property def value(self) -> int: return self._get_value() @@ -346,10 +346,10 @@ class HCQCompiled(Compiled): gpu2cpu_compute_time_diff: decimal.Decimal = decimal.Decimal('nan') def __init__(self, device:str, allocator:Allocator, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[HCQSignal], - comp_queue_t:Type[HWComputeQueue], copy_queue_t:Optional[Type[HWCopyQueue]], timeline_signals:Tuple[HCQSignal, HCQSignal]): + comp_queue_t:Type[HWComputeQueue], copy_queue_t:Optional[Type[HWCopyQueue]]): self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t self.timeline_value:int = 1 - self.timeline_signal, self._shadow_timeline_signal = timeline_signals + self.timeline_signal, self._shadow_timeline_signal = self.signal_t(0, is_timeline=True), self.signal_t(0, is_timeline=True) self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = [] self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = [] self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []