nv tagged memory + resnet failed kernel (#7061)

* nv tagged memory

* linter

* metal fix?
nimlgen 2024-10-15 18:19:58 +03:00 committed by GitHub
parent 0f0c3934b1
commit 9f00eacde5
2 changed files with 40 additions and 18 deletions
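
The NV runtime change threads an optional tag string through the allocation paths (_gpu_alloc, _gpu_system_alloc, _gpu_host_alloc) down to _gpu_uvm_map, and turns _debug_mappings from a set into a dict keyed by (va_addr, size) with the tag as value, so an MMU-fault report can name the allocation that owns each mapped range. Below is a minimal, self-contained sketch of that idea; the class and method names are illustrative, not tinygrad's API.

from typing import Dict, Tuple

class TaggedMappings:
  # Track GPU virtual-address ranges with a human-readable tag, as the diff does.
  def __init__(self):
    self.mappings: Dict[Tuple[int, int], str] = {}  # (va_addr, size) -> tag

  def map(self, va_addr: int, size: int, tag: str = ""):
    self.mappings[(va_addr, size)] = tag

  def unmap(self, va_addr: int, size: int):
    self.mappings.pop((va_addr, size))

  def report(self) -> str:
    # Mirrors the report string built in the fault handler: one line per mapping.
    return "GPU mappings:\n" + "\n".join(
      f"\t0x{va:X} - 0x{va+sz-1:X} | {tag}" for (va, sz), tag in sorted(self.mappings.items()))

m = TaggedMappings()
m.map(0x200000, 0x200000, tag="gpfifo")
m.map(0x400000, 0x200000, tag="cmdq")
print(m.report())

Keeping the tag next to the range means nothing extra is needed at fault time: the handler only has to format the dict it already maintains.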

test/test_linearizer_failures.py

@@ -1239,5 +1239,27 @@ class TestLinearizerFailures(unittest.TestCase):
opts = [Opt(op=OptOps.TC, axis=0, amt=2)]
helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[])
@unittest.skipIf(CI and Device.DEFAULT in {"METAL"}, "hangs metal gpu CI")
def test_failure_52(self):
# resnet BEAM search kernel.
# NV also fails with a page fault.
# CUDA Error 700: an illegal memory access was encountered
ast = UOp(UOps.SINK, dtypes.void, arg=None, src=(
UOp(UOps.STORE, dtypes.void, arg=None, src=(
UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.half), arg=0, src=()),
UOp(UOps.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 64, 112, 112, 1, 1, 1), strides=(802816, 0, 12544, 112, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),
UOp(UOps.CAST, dtypes.half, arg=None, src=(
UOp(UOps.REDUCE_AXIS, dtypes.float, arg=(BinaryOps.ADD, (5, 6, 7)), src=(
UOp(UOps.CAST, dtypes.float, arg=None, src=(
UOp(UOps.ALU, dtypes.half, arg=BinaryOps.MUL, src=(
UOp(UOps.LOAD, dtypes.half, arg=None, src=(
UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.half), arg=1, src=()),
UOp(UOps.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 256, 1, 3, 8, 230, 8, 230), strides=(0, 150528, 0, 50176, 0, 224, 0, 1), offset=-675, mask=((0, 1), (0, 256), (0, 1), (0, 3), (0, 8), (3, 227), (0, 8), (3, 227)), contiguous=False), View(shape=(256, 1, 64, 112, 112, 3, 7, 7), strides=(10156800, 0, 0, 3680, 2, 3385600, 425040, 231), offset=0, mask=None, contiguous=False))), src=()),)),
UOp(UOps.LOAD, dtypes.half, arg=None, src=(
UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.half), arg=2, src=()),
UOp(UOps.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 64, 112, 112, 3, 7, 7), strides=(0, 0, 147, 0, 0, 49, 7, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),))
opts = [Opt(op=OptOps.TC, axis=0, amt=2), Opt(op=OptOps.UPCAST, axis=1, amt=4), Opt(op=OptOps.LOCAL, axis=0, amt=16)]
helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=["CUDA", "NV"])
if __name__ == '__main__':
unittest.main()
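
To reproduce the new failure in isolation, something along these lines should work from the repository root; the dotted module path below is inferred from the test file above, and an NV or CUDA device must be the default backend.

# Run only test_failure_52 via unittest; module path inferred, not stated in the diff.
import unittest
suite = unittest.defaultTestLoader.loadTestsFromName(
  "test.test_linearizer_failures.TestLinearizerFailures.test_failure_52")
unittest.TextTestRunner(verbosity=2).run(suite)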

tinygrad/runtime/ops_nv.py

@@ -1,6 +1,6 @@
from __future__ import annotations
import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decimal
from typing import Set, Tuple, List, Any, cast, Union, Dict, Type
from typing import Tuple, List, Any, cast, Union, Dict, Type
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command
from tinygrad.runtime.support.hcq import HCQArgsState, HCQProgram, HCQSignal
@@ -286,8 +286,8 @@ class NVProgram(HCQProgram):
class NVAllocator(HCQAllocator):
def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
if options.host: return self.device._gpu_host_alloc(size)
return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
if options.host: return self.device._gpu_host_alloc(size, tag="user host memory")
return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)), tag=f"user memory ({options})")
def _free(self, opaque, options:BufferOptions):
self.device.synchronize()
@@ -330,7 +330,7 @@ class NVDevice(HCQCompiled):
os.close(fd_dev)
return res
def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
@@ -343,9 +343,9 @@ class NVDevice(HCQCompiled):
if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align, force_low=map_to_cpu)
if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu)
return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)
def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0, tag=""):
alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
attr=(nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25),
attr2=(nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2),
@@ -356,9 +356,9 @@ class NVDevice(HCQCompiled):
if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, force_low=True)
if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)
return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu)
return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=map_to_cpu, tag=tag)
def _gpu_host_alloc(self, size):
def _gpu_host_alloc(self, size, tag=""):
va_base = self._alloc_gpu_vaddr(aligned_sz:=round_up(size, 4 << 10))
mapped_addr = libc.mmap(va_base, aligned_sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
assert mapped_addr == va_base, f"Not mmaped at correct address {va_base=} != {mapped_addr=}"
@@ -371,30 +371,30 @@ class NVDevice(HCQCompiled):
nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {get_error_str(made.params.status)}")
return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True)
return self._gpu_uvm_map(va_base, aligned_sz, made.params.hObjectNew, has_cpu_mapping=True, tag=tag)
def _gpu_free(self, mem):
if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made:=nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory))
if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")
self._debug_mappings.remove((mem.va_addr, mem.size))
self._debug_mappings.pop((mem.va_addr, mem.size))
uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))
# NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
self._debug_mappings.add((va_base, size))
self._debug_mappings[(va_base, size)] = tag
return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
gpuAttributesCount=1, perGpuAttributes=attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)
def _gpu_map(self, mem):
if self.gpu_uuid in mem.mapped_gpu_ids: return
mem.mapped_gpu_ids.append(self.gpu_uuid)
self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False)
self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
if force_low:
@@ -439,7 +439,7 @@ class NVDevice(HCQCompiled):
self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")
self._setup_nvclasses()
self._debug_mappings: Set[Tuple[int, int]] = set()
self._debug_mappings: Dict[Tuple[int, int], str] = dict()
rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
(nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
@@ -466,7 +466,7 @@ class NVDevice(HCQCompiled):
channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew
gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)
gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000, tag="gpfifo")
ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew
@@ -476,7 +476,7 @@ class NVDevice(HCQCompiled):
rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True, tag="cmdq")
self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
self.cmdq_wptr: int = 0 # in bytes
@@ -536,7 +536,7 @@ class NVDevice(HCQCompiled):
self.slm_per_thread = round_up(required, 32)
bytes_per_warp = round_up(self.slm_per_thread * 32, 0x200)
bytes_per_tpc = round_up(bytes_per_warp * 48 * 2, 0x8000)
self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True)
self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True, tag="local_memory")
NVComputeQueue().wait(self.timeline_signal, self.timeline_value - 1) \
.setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
@@ -560,7 +560,7 @@ class NVDevice(HCQCompiled):
for i in range(mmu_info.count):
pfinfo = mmu_info.mmuFaultInfoList[i]
report += [f"MMU fault: 0x{pfinfo.faultAddress:X} | {NV_PFAULT_FAULT_TYPE[pfinfo.faultType]} | {NV_PFAULT_ACCESS_TYPE[pfinfo.accessType]}"]
report += ["All GPU mappings:" + "\n".join([f"\t0x{x:X} | size: 0x{y:X}" for x,y in self._debug_mappings])]
report += ["GPU mappings:\n"+"\n".join([f"\t0x{x:X} - 0x{x+y-1:X} | {self._debug_mappings[(x,y)]}" for x,y in sorted(self._debug_mappings)])]
else:
for i, e in enumerate(sm_errors.smErrorStateArray):
if e.hwwGlobalEsr or e.hwwWarpEsr: report += [f"SM{i} fault: esr={e.hwwGlobalEsr} warp_esr={e.hwwWarpEsr} warp_pc={e.hwwWarpEsrPc64}"]
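
The new report line simply dumps every tagged mapping sorted by address; the diff stops there. A natural, purely illustrative extension is to match the faulting address against those ranges to name the offending buffer directly. A hedged sketch, not code from this commit:

from typing import Dict, Optional, Tuple

def find_mapping(mappings: Dict[Tuple[int, int], str], fault_addr: int) -> Optional[str]:
  # mappings is the (va_addr, size) -> tag dict introduced by this commit.
  for (va, sz), tag in mappings.items():
    if va <= fault_addr < va + sz:
      return f"0x{va:X} - 0x{va+sz-1:X} | {tag}"
  return None  # faulting address is outside every known mapping, e.g. a true out-of-bounds access

mappings = {(0x200000, 0x200000): "gpfifo", (0x400000, 0x200000): "cmdq"}
print(find_mapping(mappings, 0x400010) or "unmapped address")  # 0x400000 - 0x5FFFFF | cmdq
print(find_mapping(mappings, 0x10) or "unmapped address")      # unmapped address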