mirror of https://github.com/commaai/tinygrad.git
nv tagged memory + resnet failed kernel (#7061)
* nv tagged memory * linter * metal fix?
This commit is contained in:
parent
0f0c3934b1
commit
9f00eacde5
|
@ -1239,5 +1239,27 @@ class TestLinearizerFailures(unittest.TestCase):
|
|||
opts = [Opt(op=OptOps.TC, axis=0, amt=2)]
|
||||
helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[])
|
||||
|
||||
@unittest.skipIf(CI and Device.DEFAULT in {"METAL"}, "hangs metal gpu CI")
|
||||
def test_failure_52(self):
|
||||
# resnet beam.
|
||||
# NV also fails with a pf.
|
||||
# CUDA Error 700, an illegal memory access was encountered
|
||||
ast = UOp(UOps.SINK, dtypes.void, arg=None, src=(
|
||||
UOp(UOps.STORE, dtypes.void, arg=None, src=(
|
||||
UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.half), arg=0, src=()),
|
||||
UOp(UOps.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 64, 112, 112, 1, 1, 1), strides=(802816, 0, 12544, 112, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),
|
||||
UOp(UOps.CAST, dtypes.half, arg=None, src=(
|
||||
UOp(UOps.REDUCE_AXIS, dtypes.float, arg=(BinaryOps.ADD, (5, 6, 7)), src=(
|
||||
UOp(UOps.CAST, dtypes.float, arg=None, src=(
|
||||
UOp(UOps.ALU, dtypes.half, arg=BinaryOps.MUL, src=(
|
||||
UOp(UOps.LOAD, dtypes.half, arg=None, src=(
|
||||
UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.half), arg=1, src=()),
|
||||
UOp(UOps.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 256, 1, 3, 8, 230, 8, 230), strides=(0, 150528, 0, 50176, 0, 224, 0, 1), offset=-675, mask=((0, 1), (0, 256), (0, 1), (0, 3), (0, 8), (3, 227), (0, 8), (3, 227)), contiguous=False), View(shape=(256, 1, 64, 112, 112, 3, 7, 7), strides=(10156800, 0, 0, 3680, 2, 3385600, 425040, 231), offset=0, mask=None, contiguous=False))), src=()),)),
|
||||
UOp(UOps.LOAD, dtypes.half, arg=None, src=(
|
||||
UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.half), arg=2, src=()),
|
||||
UOp(UOps.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 64, 112, 112, 3, 7, 7), strides=(0, 0, 147, 0, 0, 49, 7, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),))
|
||||
opts = [Opt(op=OptOps.TC, axis=0, amt=2), Opt(op=OptOps.UPCAST, axis=1, amt=4), Opt(op=OptOps.LOCAL, axis=0, amt=16)]
|
||||
helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=["CUDA", "NV"])
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from __future__ import annotations
|
||||
import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decimal
|
||||
from typing import Set, Tuple, List, Any, cast, Union, Dict, Type
|
||||
from typing import Tuple, List, Any, cast, Union, Dict, Type
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command
|
||||
from tinygrad.runtime.support.hcq import HCQArgsState, HCQProgram, HCQSignal
|
||||
|
@ -286,8 +286,8 @@ class NVProgram(HCQProgram):
|
|||
|
||||
class NVAllocator(HCQAllocator):
|
||||
def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
|
||||
if options.host: return self.device._gpu_host_alloc(size)
|
||||
return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
|
||||
if options.host: return self.device._gpu_host_alloc(size, tag="user host memory")
|
||||
return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)), tag=f"user memory ({options})")
|
||||
|
||||
def _free(self, opaque, options:BufferOptions):
|
||||
self.device.synchronize()
|
||||
|
@ -330,7 +330,7 @@ class NVDevice(HCQCompiled):
|
|||
os.close(fd_dev)
|
||||
return res
|
||||
|
||||
def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
|
||||
def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
|
||||
size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
|
||||
alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
|
||||
attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
|
||||
|
@ -343,9 +343,9 @@ class NVDevice(HCQCompiled):
|
|||
|
||||
if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align, force_low=map_to_cpu)
|
||||
if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
|
||||
return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu)
|
||||
return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)
|
||||
|
||||
def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
|
||||
def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
|
||||
alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
|
||||
attr=(nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25),
|
||||
attr2=(nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2),
|
||||
|
@ -356,9 +356,9 @@ class NVDevice(HCQCompiled):
|
|||
if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, force_low=True)
|
||||
if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)
|
||||
|
||||
return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu)
|
||||
return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)
|
||||
|
||||
def _gpu_host_alloc(self, size):
|
||||
def _gpu_host_alloc(self, size, tag=""):
|
||||
va_base = self._alloc_gpu_vaddr(aligned_sz:=round_up(size, 4 << 10))
|
||||
mapped_addr = libc.mmap(va_base, aligned_sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
|
||||
assert mapped_addr == va_base, f"Not mmaped at correct address {va_base=} != {mapped_addr=}"
|
||||
|
@ -371,30 +371,30 @@ class NVDevice(HCQCompiled):
|
|||
nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
|
||||
|
||||
if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {get_error_str(made.params.status)}")
|
||||
return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True)
|
||||
return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True, tag=tag)
|
||||
|
||||
def _gpu_free(self, mem):
|
||||
if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
|
||||
nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made:=nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory))
|
||||
if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")
|
||||
|
||||
self._debug_mappings.remove((mem.va_addr, mem.size))
|
||||
self._debug_mappings.pop((mem.va_addr, mem.size))
|
||||
uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
|
||||
if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
|
||||
|
||||
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
|
||||
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
|
||||
if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
|
||||
attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))
|
||||
|
||||
# NOTE: va_addr is set to make rawbufs compatable with HCQBuffer protocol.
|
||||
self._debug_mappings.add((va_base, size))
|
||||
self._debug_mappings[(va_base, size)] = tag
|
||||
return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
|
||||
gpuAttributesCount=1, perGpuAttributes=attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)
|
||||
|
||||
def _gpu_map(self, mem):
|
||||
if self.gpu_uuid in mem.mapped_gpu_ids: return
|
||||
mem.mapped_gpu_ids.append(self.gpu_uuid)
|
||||
self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False)
|
||||
self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
|
||||
|
||||
def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
|
||||
if force_low:
|
||||
|
@ -439,7 +439,7 @@ class NVDevice(HCQCompiled):
|
|||
self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")
|
||||
|
||||
self._setup_nvclasses()
|
||||
self._debug_mappings: Set[Tuple[int, int]] = set()
|
||||
self._debug_mappings: Dict[Tuple[int, int], str] = dict()
|
||||
|
||||
rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
|
||||
(nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
|
||||
|
@ -466,7 +466,7 @@ class NVDevice(HCQCompiled):
|
|||
channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
|
||||
channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew
|
||||
|
||||
gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)
|
||||
gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000, tag="gpfifo")
|
||||
|
||||
ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
|
||||
ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew
|
||||
|
@ -476,7 +476,7 @@ class NVDevice(HCQCompiled):
|
|||
|
||||
rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
|
||||
|
||||
self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
|
||||
self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True, tag="cmdq")
|
||||
self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
|
||||
self.cmdq_wptr: int = 0 # in bytes
|
||||
|
||||
|
@ -536,7 +536,7 @@ class NVDevice(HCQCompiled):
|
|||
self.slm_per_thread = round_up(required, 32)
|
||||
bytes_per_warp = round_up(self.slm_per_thread * 32, 0x200)
|
||||
bytes_per_tpc = round_up(bytes_per_warp * 48 * 2, 0x8000)
|
||||
self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True)
|
||||
self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True, tag="local_memory")
|
||||
|
||||
NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1) \
|
||||
.setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
|
||||
|
@ -560,7 +560,7 @@ class NVDevice(HCQCompiled):
|
|||
for i in range(mmu_info.count):
|
||||
pfinfo = mmu_info.mmuFaultInfoList[i]
|
||||
report += [f"MMU fault: 0x{pfinfo.faultAddress:X} | {NV_PFAULT_FAULT_TYPE[pfinfo.faultType]} | {NV_PFAULT_ACCESS_TYPE[pfinfo.accessType]}"]
|
||||
report += ["All GPU mappings:" + "\n".join([f"\t0x{x:X} | size: 0x{y:X}" for x,y in self._debug_mappings])]
|
||||
report += ["GPU mappings:\n"+"\n".join([f"\t0x{x:X} - 0x{x+y-1:X} | {self._debug_mappings[(x,y)]}" for x,y in sorted(self._debug_mappings)])]
|
||||
else:
|
||||
for i, e in enumerate(sm_errors.smErrorStateArray):
|
||||
if e.hwwGlobalEsr or e.hwwWarpEsr: report += [f"SM{i} fault: esr={e.hwwGlobalEsr} warp_esr={e.hwwWarpEsr} warp_pc={e.hwwWarpEsrPc64}"]
|
||||
|
|
Loading…
Reference in New Issue