mirror of https://github.com/commaai/tinygrad.git
hcq no timeline signals in init (#6944)
This commit is contained in:
parent
0ecc417dd2
commit
42609300ff
|
@ -30,10 +30,10 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
|
|||
def nbioreg(reg):
  """Return `reg` with the NBIO base offset added.

  The offset is the constant 0x00000d20, labelled NBIO_BASE__INST0_SEG2.
  """
  nbio_base = 0x00000d20  # NBIO_BASE__INST0_SEG2
  return reg + nbio_base
|
||||
|
||||
class AMDSignal(HCQSignal):
|
||||
def __init__(self, value=0, alloc_event=False):
|
||||
def __init__(self, value=0, is_timeline=False):
|
||||
self._signal = AMDDevice.signals_pool.pop()
|
||||
self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
|
||||
if alloc_event:
|
||||
if is_timeline:
|
||||
sync_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
|
||||
self._event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
|
||||
self._event_id = sync_event.event_id
|
||||
|
@ -418,7 +418,7 @@ class AMDDevice(HCQCompiled):
|
|||
self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
|
||||
|
||||
super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
|
||||
AMDSignal, AMDComputeQueue, AMDCopyQueue, (AMDSignal(alloc_event=True), AMDSignal(alloc_event=True)))
|
||||
AMDSignal, AMDComputeQueue, AMDCopyQueue)
|
||||
|
||||
def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc:
|
||||
gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
|
|
|
@ -68,7 +68,7 @@ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
|
|||
def nvmethod(subc, mthd, size, typ=2):
  """Pack the given fields into one integer word.

  As computed here: `typ` is shifted to bit 28, `size` to bit 16, `subc` to
  bit 13, and `mthd` is divided by four (`mthd >> 2`) into the low bits.
  The fields are OR-ed together without masking.
  """
  word = typ << 28
  word |= size << 16
  word |= subc << 13
  word |= mthd >> 2
  return word
|
||||
|
||||
class NVSignal(HCQSignal):
|
||||
def __init__(self, value=0):
|
||||
def __init__(self, value=0, is_timeline=False):
|
||||
self._signal = NVDevice.signals_pool.pop()
|
||||
self.signal_addr = mv_address(self._signal)
|
||||
super().__init__(value)
|
||||
|
@ -480,7 +480,7 @@ class NVDevice(HCQCompiled):
|
|||
|
||||
compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
|
||||
super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
|
||||
functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue, timeline_signals=(NVSignal(), NVSignal()))
|
||||
functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)
|
||||
|
||||
self._setup_gpfifos()
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ class QCOMCompiler(CLCompiler):
|
|||
def __init__(self, device:str=""): super().__init__(CLDevice(device), 'compile_qcom')
|
||||
|
||||
class QCOMSignal(HCQSignal):
|
||||
def __init__(self, value=0, **kwargs):
|
||||
def __init__(self, value=0, is_timeline=False):
|
||||
self._signal = QCOMDevice.signals_pool.pop()
|
||||
super().__init__(value)
|
||||
def __del__(self): QCOMDevice.signals_pool.append(self._signal)
|
||||
|
@ -351,7 +351,7 @@ class QCOMDevice(HCQCompiled):
|
|||
if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")
|
||||
|
||||
super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
|
||||
QCOMSignal, QCOMComputeQueue, None, timeline_signals=(QCOMSignal(), QCOMSignal()))
|
||||
QCOMSignal, QCOMComputeQueue, None)
|
||||
|
||||
def _ctx_create(self):
|
||||
cr = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=(kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT |
|
||||
|
|
|
@ -195,7 +195,7 @@ class HWCopyQueue(HWCommandQueue):
|
|||
def _update_copy(self, cmd_idx, dest, src): raise NotImplementedError("backend should overload this function")
|
||||
|
||||
class HCQSignal:
|
||||
def __init__(self, value:int=0): self._set_value(value)
|
||||
def __init__(self, value:int=0, is_timeline:bool=False): self._set_value(value)
|
||||
|
||||
@property
|
||||
def value(self) -> int: return self._get_value()
|
||||
|
@ -346,10 +346,10 @@ class HCQCompiled(Compiled):
|
|||
gpu2cpu_compute_time_diff: decimal.Decimal = decimal.Decimal('nan')
|
||||
|
||||
def __init__(self, device:str, allocator:Allocator, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[HCQSignal],
|
||||
comp_queue_t:Type[HWComputeQueue], copy_queue_t:Optional[Type[HWCopyQueue]], timeline_signals:Tuple[HCQSignal, HCQSignal]):
|
||||
comp_queue_t:Type[HWComputeQueue], copy_queue_t:Optional[Type[HWCopyQueue]]):
|
||||
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
|
||||
self.timeline_value:int = 1
|
||||
self.timeline_signal, self._shadow_timeline_signal = timeline_signals
|
||||
self.timeline_signal, self._shadow_timeline_signal = self.signal_t(0, is_timeline=True), self.signal_t(0, is_timeline=True)
|
||||
self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = []
|
||||
self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = []
|
||||
self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []
|
||||
|
|
Loading…
Reference in New Issue