diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d9557176..2617ef50 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -332,7 +332,7 @@ jobs: strategy: fail-fast: false matrix: - backend: [llvm, clang, gpu, cuda, ptx, amd] #, triton] + backend: [llvm, clang, gpu, cuda, ptx, amd, nv] #, triton] name: Tests on (${{ matrix.backend }}) runs-on: ubuntu-latest @@ -356,7 +356,7 @@ jobs: path: ~/.cache/tinygrad/downloads/ key: downloads-cache-${{ matrix.backend }}-${{ env.DOWNLOAD_CACHE_VERSION }} - name: Set env - run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV + run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' || matrix.backend == 'nv' && 'NV=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV - name: Install OpenCL if: matrix.backend == 'gpu' run: | @@ -368,14 +368,14 @@ jobs: intel-oneapi-runtime-dpcpp-sycl-opencl-cpu=2023.2.1-16 intel-oneapi-runtime-tbb-common=2021.10.0-49541 \ intel-oneapi-runtime-tbb=2021.10.0-49541 intel-oneapi-runtime-opencl=2023.2.1-16 - name: Install packages (cuda) - if: matrix.backend == 'cuda' || matrix.backend == 'ptx' || matrix.backend == 'triton' + if: matrix.backend == 'cuda' || matrix.backend == 'ptx' || matrix.backend == 'triton' || matrix.backend == 'nv' run: | echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel sudo apt update -y || true sudo apt install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev zlib1g-dev libglew-dev \ flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc libzstd-dev - name: Cache gpuocelot - if: matrix.backend == 'cuda' || matrix.backend == 'ptx' || matrix.backend == 'triton' + if: matrix.backend == 'cuda' || matrix.backend == 'ptx' || matrix.backend == 'triton' || matrix.backend == 'nv' id: cache-build uses: actions/cache@v4 env: @@ -384,7 +384,7 @@ jobs: path: ${{ github.workspace }}/gpuocelot/ocelot key: ubuntu22.04-gpuocelot-18401f4245b27ca4b3af433196583cc81ef84480-rebuild-5 - name: Clone/compile gpuocelot - if: (matrix.backend == 'cuda' || matrix.backend == 'ptx' || matrix.backend == 'triton') && steps.cache-build.outputs.cache-hit != 'true' + if: (matrix.backend == 'cuda' || matrix.backend == 'ptx' || matrix.backend == 'triton' || matrix.backend == 'nv') && steps.cache-build.outputs.cache-hit != 'true' run: | git clone --recurse-submodules https://github.com/gpuocelot/gpuocelot.git ${{ github.workspace }}/gpuocelot cd ${{ github.workspace }}/gpuocelot/ocelot @@ -394,7 +394,7 @@ jobs: cmake .. 
-Wno-dev -G Ninja -DOCELOT_BUILD_TOOLS=OFF -DCMAKE_BUILD_ALWAYS=0 -DBUILD_TESTS_CUDA=OFF ninja - name: Install gpuocelot - if: matrix.backend == 'cuda' || matrix.backend == 'ptx' || matrix.backend == 'triton' + if: matrix.backend == 'cuda' || matrix.backend == 'ptx' || matrix.backend == 'triton' || matrix.backend == 'nv' run: | cd ${{ github.workspace }}/gpuocelot/ocelot/build sudo ninja install -d explain @@ -416,7 +416,7 @@ jobs: run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||matrix.backend=='ptx'&&',cuda'||matrix.backend=='triton'&&',triton'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ - name: Check Device.DEFAULT and print some source run: | - PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU','AMD'], Device.DEFAULT" + PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU','AMD','NV'], Device.DEFAULT" DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add - name: Verify OpenCL autogen if: matrix.backend == 'gpu' @@ -443,13 +443,13 @@ jobs: diff /tmp/hsa.py.bak tinygrad/runtime/autogen/hsa.py diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py - name: Run pytest (not cuda or amd) - if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'amd' + if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'amd' && matrix.backend != 'nv' run: python -m pytest -n=auto test/ --durations=20 - name: Run ONNX (only LLVM) if: matrix.backend == 'llvm' run: python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - name: Run pytest (cuda) - if: matrix.backend=='cuda'||matrix.backend=='ptx'||matrix.backend=='triton' + if: matrix.backend=='cuda'||matrix.backend=='ptx'||matrix.backend=='triton'||matrix.backend=='nv' run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --durations=20 - name: Run pytest (amd) if: matrix.backend=='amd' diff --git a/extra/mockgpu/amd/amdgpu.py b/extra/mockgpu/amd/amdgpu.py index eadb6854..50776ab4 100644 --- a/extra/mockgpu/amd/amdgpu.py +++ b/extra/mockgpu/amd/amdgpu.py @@ -19,9 +19,11 @@ WAIT_REG_MEM_FUNCTION_ALWAYS = 0 WAIT_REG_MEM_FUNCTION_EQ = 3 # == WAIT_REG_MEM_FUNCTION_GEQ = 5 # >= -remu = ctypes.CDLL("/usr/local/lib/libremu.so") -remu.run_asm.restype = ctypes.c_uint32 -remu.run_asm.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_void_p] +try: + remu = ctypes.CDLL("/usr/local/lib/libremu.so") + remu.run_asm.restype = ctypes.c_uint32 + remu.run_asm.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_void_p] +except Exception: pass def create_sdma_packets(): # TODO: clean up this, if we want to keep it diff --git a/extra/mockgpu/driver.py b/extra/mockgpu/driver.py index 8c1d2aa2..033bb48f 100644 --- a/extra/mockgpu/driver.py +++ b/extra/mockgpu/driver.py @@ -93,4 +93,4 @@ class VirtDriver: self.tracked_files = [] self.tracked_addresses = [] def track_address(self, staddr, enaddr, rcb, wcb): 
self.tracked_addresses.append((staddr, enaddr, rcb, wcb)) - def open(self, name, flags, mode): raise NotImplementedError() + def open(self, name, flags, mode, fdcls): raise NotImplementedError() diff --git a/extra/mockgpu/mockgpu.py b/extra/mockgpu/mockgpu.py index e32b2118..9c4985bc 100644 --- a/extra/mockgpu/mockgpu.py +++ b/extra/mockgpu/mockgpu.py @@ -1,4 +1,5 @@ import ctypes, ctypes.util, struct, platform, pathlib, re, time, os, builtins, atexit +from extra.mockgpu.nv.nvdriver import NVDriver from extra.mockgpu.amd.amddriver import AMDDriver from tinygrad.helpers import from_mv, to_mv start = time.perf_counter() @@ -51,7 +52,7 @@ def install_hook(c_function, python_function): def __restore(): libc.memcpy(ioctl_address.contents, original_bc, len(tramp)) atexit.register(__restore) -drivers = [AMDDriver()] +drivers = [AMDDriver(), NVDriver()] tracked_fds = {} @ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_ulong) diff --git a/extra/mockgpu/nv/nvdriver.py b/extra/mockgpu/nv/nvdriver.py new file mode 100644 index 00000000..a2877d2e --- /dev/null +++ b/extra/mockgpu/nv/nvdriver.py @@ -0,0 +1,217 @@ +import pathlib, re, ctypes, mmap, collections, struct, functools, os, copy +import tinygrad.runtime.autogen.nv_gpu as nv_gpu +from typing import Optional, Any +from tinygrad.helpers import from_mv +from extra.mockgpu.driver import VirtDriver, VirtFileDesc, TextFileDesc, DirFileDesc, VirtFile +from extra.mockgpu.nv.nvgpu import NVGPU + +MAP_FIXED = 0x10 +libc = ctypes.CDLL(ctypes.util.find_library("c")) +libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long] +libc.mmap.restype = ctypes.c_void_p +libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t] +libc.munmap.restype = ctypes.c_int + +NVSubDevice = collections.namedtuple('NVSubDevice', ['device']) +NVUserMode = collections.namedtuple('NVUserMode', ['subdevice']) +NVVASpace = collections.namedtuple('NVVASpace', ['device']) +NVAllocation = collections.namedtuple('NVAllocation', ['device', 'size']) +NVChannelGroup = collections.namedtuple('NVChannelGroup', ['device']) +NVContextShare = collections.namedtuple('NVContextShare', ['channel_group']) +NVGPFIFO = collections.namedtuple('NVGPFIFO', ['device', 'token']) + +class NVCtlFileDesc(VirtFileDesc): + def __init__(self, fd, driver): + super().__init__(fd) + self.driver = driver + + def ioctl(self, fd, request, argp): return self.driver.ctl_ioctl(request, argp) + def mmap(self, start, sz, prot, flags, fd, offset): return libc.mmap(start, sz, prot, flags|mmap.MAP_ANONYMOUS, -1, 0) + +class NVUVMFileDesc(VirtFileDesc): + def __init__(self, fd, driver): + super().__init__(fd) + self.driver = driver + + def ioctl(self, fd, request, argp): return self.driver.uvm_ioctl(request, argp) + def mmap(self, start, sz, prot, flags, fd, offset): return libc.mmap(start, sz, prot, flags|mmap.MAP_ANONYMOUS, -1, 0) + +class NVDevFileDesc(VirtFileDesc): + def __init__(self, fd, driver, gpu): + super().__init__(fd) + self.driver, self.gpu = driver, gpu + self._mapping_userland = False + + def ioctl(self, fd, request, argp): return self.driver.dev_ioctl(self.gpu, request, argp) + def mmap(self, start, sz, prot, flags, fd, offset): + start = libc.mmap(start, sz, prot, flags|mmap.MAP_ANONYMOUS, -1, 0) + if self._mapping_userland: self.driver.track_address(start, start+sz, lambda mv,off: None, lambda mv, off: self.driver._gpu_mmio_write(mv, off, self.gpu)) + return start + +class NVDriver(VirtDriver): + def __init__(self, gpus=6): + 
super().__init__() + + self.tracked_files += [VirtFile('/dev/nvidiactl', functools.partial(NVCtlFileDesc, driver=self)), + VirtFile('/dev/nvidia-uvm', functools.partial(NVUVMFileDesc, driver=self))] + + self.root_handle = None + + self.gpus = {} + self.next_fd = (1 << 30) + self.next_handle = 1 + + self.object_by_handle = {} + self.opened_fds = {} + self.next_doorbell = collections.defaultdict(int) + + for i in range(gpus): self._prepare_gpu(i) + + def _alloc_fd(self): + my_fd = self.next_fd + self.next_fd = self.next_fd + 1 + return my_fd + + def _alloc_handle(self): + handle = self.next_handle + self.next_handle += 1 + return handle + + def _prepare_gpu(self, gpu_id): + self.gpus[gpu_id] = NVGPU(gpu_id) + self.tracked_files += [VirtFile(f'/dev/nvidia{gpu_id}', functools.partial(NVDevFileDesc, driver=self, gpu=self.gpus[gpu_id]))] + + def open(self, name, flags, mode, virtfile): + cl = virtfile.fdcls(self._alloc_fd()) + self.opened_fds[cl.fd] = cl + return cl + + def rm_alloc(self, argp): + struct = nv_gpu.NVOS21_PARAMETERS.from_address(argp) + params_ptr = struct.pAllocParms if struct.pAllocParms else None + if struct.hClass == nv_gpu.NV01_ROOT_CLIENT: self.root_handle = struct.hObjectNew = self._alloc_handle() + elif struct.hClass == nv_gpu.NV01_DEVICE_0: + params:Any = nv_gpu.NV0080_ALLOC_PARAMETERS.from_address(params_ptr) + assert params.hClientShare == self.root_handle + struct.hObjectNew = self._alloc_handle() + self.object_by_handle[struct.hObjectNew] = self.gpus[params.deviceId] + elif struct.hClass == nv_gpu.NV20_SUBDEVICE_0: + assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPU) + struct.hObjectNew = self._alloc_handle() + self.object_by_handle[struct.hObjectNew] = NVSubDevice(self.object_by_handle[struct.hObjectParent]) + elif struct.hClass == nv_gpu.TURING_USERMODE_A: + assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVSubDevice) + struct.hObjectNew = self._alloc_handle() + self.object_by_handle[struct.hObjectNew] = NVUserMode(self.object_by_handle[struct.hObjectParent]) + elif struct.hClass == nv_gpu.FERMI_VASPACE_A: + assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPU) + struct.hObjectNew = self._alloc_handle() + self.object_by_handle[struct.hObjectNew] = NVVASpace(self.object_by_handle[struct.hObjectParent]) + elif struct.hClass == nv_gpu.NV1_MEMORY_SYSTEM or struct.hClass == nv_gpu.NV1_MEMORY_USER: + assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPU) + params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS.from_address(params_ptr) + struct.hObjectNew = self._alloc_handle() + self.object_by_handle[struct.hObjectNew] = NVAllocation(self.object_by_handle[struct.hObjectParent], params.size) + elif struct.hClass == nv_gpu.KEPLER_CHANNEL_GROUP_A: + assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPU) + struct.hObjectNew = self._alloc_handle() + self.object_by_handle[struct.hObjectNew] = NVChannelGroup(self.object_by_handle[struct.hObjectParent]) + elif struct.hClass == nv_gpu.FERMI_CONTEXT_SHARE_A: + assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVChannelGroup) + struct.hObjectNew = self._alloc_handle() + self.object_by_handle[struct.hObjectNew] = 
NVContextShare(self.object_by_handle[struct.hObjectParent]) + elif struct.hClass == nv_gpu.AMPERE_CHANNEL_GPFIFO_A: + assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVChannelGroup) + struct.hObjectNew = self._alloc_handle() + params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS.from_address(params_ptr) + gpu = self.object_by_handle[struct.hObjectParent].device + gpfifo_token = gpu.add_gpfifo(params.gpFifoOffset, params.gpFifoEntries) + self.object_by_handle[struct.hObjectNew] = NVGPFIFO(gpu, gpfifo_token) + elif struct.hClass == nv_gpu.AMPERE_DMA_COPY_B or struct.hClass == nv_gpu.ADA_COMPUTE_A: + assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPFIFO) + else: raise RuntimeError(f"Unknown {struct.hClass} to rm_alloc") + return 0 + + def rm_control(self, argp): + struct = nv_gpu.NVOS54_PARAMETERS.from_address(argp) + params_ptr = struct.params if struct.params else None + if struct.cmd == nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2: + params:Any = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS.from_address(params_ptr) + params.deviceInstance = params.gpuId # emulate them to be the same + elif struct.cmd == nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO: + assert struct.hObject in self.object_by_handle and isinstance(self.object_by_handle[struct.hObject], NVSubDevice) + gpu = self.object_by_handle[struct.hObject].device + params = nv_gpu.NV2080_CTRL_GPU_GET_GID_INFO_PARAMS.from_address(params_ptr) + if params.flags != nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY: raise RuntimeError(f"Unknown format") + bts = gpu.gpu_uuid(sz=params.length) + for i in range(params.length): params.data[i] = bts[i] + elif struct.cmd == nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN: + assert struct.hObject in self.object_by_handle and isinstance(self.object_by_handle[struct.hObject], NVGPFIFO) + params = nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS.from_address(params_ptr) + gpu_fifo = self.object_by_handle[struct.hObject] + params.workSubmitToken = gpu_fifo.token + elif struct.cmd == nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE: pass + elif struct.cmd == nv_gpu.NV2080_CTRL_CMD_PERF_BOOST: pass + else: raise RuntimeError(f"Unknown {struct.cmd} to rm_control") + return 0 + + def ctl_ioctl(self, req, argp): + nr = req & 0xff + if nr == nv_gpu.NV_ESC_RM_ALLOC: return self.rm_alloc(argp) + elif nr == nv_gpu.NV_ESC_RM_ALLOC_MEMORY: pass + elif nr == nv_gpu.NV_ESC_RM_CONTROL: return self.rm_control(argp) + elif nr == nv_gpu.NV_ESC_RM_MAP_MEMORY: + st:Any = nv_gpu.nv_ioctl_nvos33_parameters_with_fd.from_address(argp) + obj = self.object_by_handle[st.params.hMemory] + if isinstance(obj, NVUserMode): + file = self.opened_fds[st.fd] + assert isinstance(file, NVDevFileDesc) + file._mapping_userland = True + elif nr == nv_gpu.NV_ESC_RM_FREE: + st = nv_gpu.NVOS00_PARAMETERS.from_address(argp) + self.object_by_handle.pop(st.hObjectOld) + elif nr == nv_gpu.NV_ESC_CARD_INFO: + for i,gpu in enumerate(self.gpus.values()): + st = nv_gpu.nv_ioctl_card_info_t.from_address(argp + i * ctypes.sizeof(nv_gpu.nv_ioctl_card_info_t)) + st.gpu_id = gpu.gpuid + st.pci_info.device_id = 0x2684 + st.valid = True + else: raise RuntimeError(f"Unknown {nr} to nvidiactl") + return 0 + def uvm_ioctl(self, nr, argp): + if nr == nv_gpu.UVM_INITIALIZE: pass + elif nr == nv_gpu.UVM_MM_INITIALIZE: pass + elif nr == nv_gpu.UVM_REGISTER_GPU: + st:Any = nv_gpu.UVM_REGISTER_GPU_PARAMS.from_address(argp) + assert 
any(all(st.gpu_uuid.uuid[i] == gpu.gpu_uuid()[i] for i in range(16)) for gpu in self.gpus.values()) + elif nr == nv_gpu.UVM_REGISTER_GPU_VASPACE: pass + elif nr == nv_gpu.UVM_ENABLE_PEER_ACCESS: pass # uvm and shared spaced are setup already, no emulation for now + elif nr == nv_gpu.UVM_CREATE_EXTERNAL_RANGE: + st = nv_gpu.UVM_CREATE_EXTERNAL_RANGE_PARAMS.from_address(argp) + libc.mmap(st.base, st.length, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0) + elif nr == nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION: + st = nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS.from_address(argp) + for gpu_attr_id in range(st.gpuAttributesCount): + gpu = None + for _gpu in self.gpus.values(): + if all(st.perGpuAttributes[gpu_attr_id].gpuUuid.uuid[i] == _gpu.gpu_uuid()[i] for i in range(16)): + gpu = _gpu + break + if gpu is None: return -1 + gpu.map_range(st.base, st.length) + elif nr == nv_gpu.UVM_REGISTER_CHANNEL: pass + elif nr == nv_gpu.UVM_FREE: + st = nv_gpu.UVM_FREE_PARAMS.from_address(argp) + libc.munmap(st.base, st.length) + else: raise RuntimeError(f"Unknown {nr} to nvidia-uvm") + return 0 + + def dev_ioctl(self, dev, req, argp): return 0 + def _gpu_mmio_write(self, mv, off, gpu): + any_progress = True + while any_progress: + any_progress = False + for gpu in self.gpus.values(): + for q in gpu.queues: + if (prev_rptr:=q.ctrl.GPGet) != q.ctrl.GPPut: + any_progress |= q.execute() \ No newline at end of file diff --git a/extra/mockgpu/nv/nvgpu.py b/extra/mockgpu/nv/nvgpu.py new file mode 100644 index 00000000..eae3e8f6 --- /dev/null +++ b/extra/mockgpu/nv/nvgpu.py @@ -0,0 +1,165 @@ +import ctypes, ctypes.util, time +import tinygrad.runtime.autogen.nv_gpu as nv_gpu +from enum import Enum, auto +from extra.mockgpu.gpu import VirtGPU +from tinygrad.helpers import to_mv, init_c_struct_t + +def make_qmd_struct_type(): + fields = [] + bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)] + bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)] + bits = sorted(bits, key=lambda x: x[1][1]) + for i,(name, data) in enumerate(bits): + if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap)) + fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1)) + return init_c_struct_t(tuple(fields)) +qmd_struct_t = make_qmd_struct_type() +assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4 + +try: + gpuocelot_lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot")) + gpuocelot_lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] # noqa: E501 +except Exception: pass + +class SchedResult(Enum): CONT = auto(); YIELD = auto() # noqa: E702 + +class GPFIFO: + def __init__(self, token, base, entries_cnt): + self.token, self.base, self.entries_cnt = token, base, entries_cnt + self.gpfifo = to_mv(self.base, self.entries_cnt * 8).cast("Q") + self.ctrl = nv_gpu.AmpereAControlGPFifo.from_address(self.base + self.entries_cnt * 8) + self.state = {} + + # Buf exec state + self.buf = None + self.buf_sz = 0 + self.buf_ptr = 0 + + def _next_dword(self): + assert self.buf is not None + x = self.buf[self.buf_ptr] + self.buf_ptr += 1 + return x + + def _next_header(self): + header = self._next_dword() + typ = (header >> 28) & 0b111 + size = 
(header >> 16) & 0xFFF + subc = (header >> 13) & 0x7 + mthd = (header & 0x1FFF) << 2 + return typ, size, subc, mthd + + def _state(self, reg): return self.state[reg] + def _state64(self, reg): return (self.state[reg] << 32) + self.state[reg + 4] + def _state64_le(self, reg): return (self.state[reg + 4] << 32) + self.state[reg] + + def _reset_buf_state(self): self.buf, self.buf_ptr = None, 0 + def _set_buf_state(self, gpfifo_entry): + ptr = ((gpfifo_entry >> 2) & 0xfffffffff) << 2 + sz = ((gpfifo_entry >> 42) & 0x1fffff) << 2 + self.buf = to_mv(ptr, sz).cast("I") + self.buf_sz = sz // 4 + + def execute(self) -> bool: + initial_off = self.buf_ptr + while self.ctrl.GPGet != self.ctrl.GPPut: + self._set_buf_state(self.gpfifo[self.ctrl.GPGet]) + + if not self.execute_buf(): + # Buffer isn't executed fully, check if any progress and report. + # Do not move GPGet in this case, will continue from the same state next time. + return self.buf_ptr != initial_off + + self.ctrl.GPGet = (self.ctrl.GPGet + 1) % self.entries_cnt + self._reset_buf_state() + return True + + def execute_buf(self) -> bool: + while self.buf_ptr < self.buf_sz: + init_off = self.buf_ptr + typ, size, subc, mthd = self._next_header() + cmd_end_off = self.buf_ptr + size + + while self.buf_ptr < cmd_end_off: + res = self.execute_cmd(mthd) + if res == SchedResult.YIELD: + self.buf_ptr = init_off # just revert to the header + return False + mthd += 4 + return True + + def execute_qmd(self, qmd_addr): + qmd = qmd_struct_t.from_address(qmd_addr) + prg_addr = qmd.program_address_lower + (qmd.program_address_upper << 32) + const0 = to_mv(qmd.constant_buffer_addr_lower_0 + (qmd.constant_buffer_addr_upper_0 << 32), 0x160).cast('I') + args_cnt, vals_cnt = const0[0], const0[1] + args_addr = qmd.constant_buffer_addr_lower_0 + (qmd.constant_buffer_addr_upper_0 << 32) + 0x160 + args = to_mv(args_addr, args_cnt*8).cast('Q') + vals = to_mv(args_addr + args_cnt*8, vals_cnt*4).cast('I') + cargs = [ctypes.cast(args[i], ctypes.c_void_p) for i in range(args_cnt)] + [ctypes.cast(vals[i], ctypes.c_void_p) for i in range(vals_cnt)] + gx, gy, gz = qmd.cta_raster_width, qmd.cta_raster_height, qmd.cta_raster_depth + lx, ly, lz = qmd.cta_thread_dimension0, qmd.cta_thread_dimension1, qmd.cta_thread_dimension2 + gpuocelot_lib.ptx_run(ctypes.cast(prg_addr, ctypes.c_char_p), args_cnt+vals_cnt, (ctypes.c_void_p*len(cargs))(*cargs), lx, ly, lz, gx, gy, gz, 0) + + def execute_cmd(self, cmd) -> SchedResult: + if cmd == nv_gpu.NVC56F_SEM_EXECUTE: return self._exec_signal() + elif cmd == nv_gpu.NVC6C0_LAUNCH_DMA: return self._exec_nvc6c0_dma() + elif cmd == nv_gpu.NVC6B5_LAUNCH_DMA: return self._exec_nvc6b5_dma() + elif cmd == 0x0320: return self._exec_load_inline_qmd() # NVC6C0_LOAD_INLINE_QMD_DATA + else: self.state[cmd] = self._next_dword() # just state update + return SchedResult.CONT + + def _exec_signal(self) -> SchedResult: + signal = self._state64_le(nv_gpu.NVC56F_SEM_ADDR_LO) + val = self._state64_le(nv_gpu.NVC56F_SEM_PAYLOAD_LO) + flags = self._next_dword() + typ = (flags >> 0) & 0b111 + if typ == 1: to_mv(signal, 8).cast('Q')[0] = val + elif typ == 3: + mval = to_mv(signal, 8).cast('Q')[0] + return SchedResult.CONT if mval >= val else SchedResult.YIELD + else: raise RuntimeError(f"Unsupported type={typ} in exec wait/signal") + return SchedResult.CONT + + def _exec_load_inline_qmd(self): + qmd_addr = self._state64(nv_gpu.NVC6C0_SET_INLINE_QMD_ADDRESS_A) << 8 + assert qmd_addr != 0x0, f"invalid qmd address {qmd_addr}" + qmd_data = [self._next_dword() for _ in 
range(0x40)] + cdata = (ctypes.c_uint32 * len(qmd_data))(*qmd_data) + ctypes.memmove(qmd_addr, cdata, 0x40 * 4) + self.execute_qmd(qmd_addr) + + def _exec_nvc6c0_dma(self): + addr = self._state64(nv_gpu.NVC6C0_OFFSET_OUT_UPPER) + sz = self._state(nv_gpu.NVC6C0_LINE_LENGTH_IN) + lanes = self._state(nv_gpu.NVC6C0_LINE_COUNT) + assert lanes == 1, f"unsupported lanes > 1 in _exec_nvc6c0_dma: {lanes}" + flags = self._next_dword() + assert flags == 0x41, f"unsupported flags in _exec_nvc6c0_dma: {flags}" + typ, dsize, subc, mthd = self._next_header() + assert typ == 6 and mthd == nv_gpu.NVC6C0_LOAD_INLINE_DATA, f"Expected inline data not found after nvc6c0_dma, {typ=} {mthd=}" + copy_data = [self._next_dword() for _ in range(dsize)] + assert len(copy_data) * 4 == sz, f"different copy sizes in _exec_nvc6c0_dma: {len(copy_data) * 4} != {sz}" + cdata = (ctypes.c_uint32 * len(copy_data))(*copy_data) + ctypes.memmove(addr, cdata, sz) + + def _exec_nvc6b5_dma(self): + src = self._state64(nv_gpu.NVC6B5_OFFSET_IN_UPPER) + dst = self._state64(nv_gpu.NVC6B5_OFFSET_OUT_UPPER) + sz = self._state(nv_gpu.NVC6B5_LINE_LENGTH_IN) + flags = self._next_dword() + assert flags == 0x182, f"unsupported flags in _exec_nvc6b5_dma: {flags}" + ctypes.memmove(dst, src, sz) + +class NVGPU(VirtGPU): + def __init__(self, gpuid): + super().__init__(gpuid) + self.mapped_ranges = set() + self.queues = [] + + def map_range(self, vaddr, size): self.mapped_ranges.add((vaddr, size)) + def unmap_range(self, vaddr, size): self.mapped_ranges.remove((vaddr, size)) + def add_gpfifo(self, base, entries_count): + self.queues.append(GPFIFO(token:=len(self.queues), base, entries_count)) + return token + def gpu_uuid(self, sz=16): return self.gpuid.to_bytes(sz, byteorder='big', signed=False) diff --git a/test/helpers.py b/test/helpers.py index cf09bb6f..c7088213 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -26,7 +26,7 @@ def assert_jit_cache_len(fxn, expected_len): def is_dtype_supported(dtype: DType, device: str = Device.DEFAULT): if dtype == dtypes.bfloat16: # NOTE: this requires bf16 buffer support - return device in {"HSA", "AMD"} or (device == "CUDA" and not CI and not getenv("PTX")) + return device in {"HSA", "AMD"} or (device in {"CUDA", "NV"} and not CI and not getenv("PTX")) if device in ["WEBGPU", "WEBGL"]: return dtype in [dtypes.float, dtypes.int32, dtypes.uint32] if device == "CUDA" and getenv("PTX") and dtype in (dtypes.int8, dtypes.uint8): return False # for CI GPU and OSX, cl_khr_fp16 isn't supported @@ -35,7 +35,7 @@ def is_dtype_supported(dtype: DType, device: str = Device.DEFAULT): # PYTHON supports half memoryview in 3.12+ https://github.com/python/cpython/issues/90751 if dtype == dtypes.half: if device == "GPU": return not CI and not OSX - if device in ["LLVM", "CUDA"]: return not CI + if device in ["LLVM", "CUDA", "NV"]: return not CI if device == "PYTHON": return sys.version_info >= (3, 12) if dtype == dtypes.float64: return device != "METAL" and not (OSX and device == "GPU") return True diff --git a/test/test_dtype.py b/test/test_dtype.py index 0d972f0e..aeba03ad 100644 --- a/test/test_dtype.py +++ b/test/test_dtype.py @@ -200,7 +200,7 @@ class TestFloatDType(TestDType): class TestDoubleDtype(TestDType): DTYPE = dtypes.double - @unittest.skipIf(getenv("CUDACPU") or getenv("PTX"), "conversion not supported on CUDACPU and PTX") # TODO: why not? + @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or getenv("PTX"), "conversion not supported on CUDACPU and PTX") # TODO: why not? 
def test_float64_increased_precision(self): for func in [ lambda t: t.exp(), diff --git a/test/test_dtype_alu.py b/test/test_dtype_alu.py index c3f8e27c..013df95c 100644 --- a/test/test_dtype_alu.py +++ b/test/test_dtype_alu.py @@ -40,7 +40,7 @@ unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), operator.neg, (T # TODO: CUDACPU segfaults on sin # TODO: METAL sin is flaky for float16 -if getenv("CUDACPU") or Device.DEFAULT == "METAL": unary_operations.remove((Tensor.sin, np.sin)) +if getenv("CUDACPU") or (getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "METAL": unary_operations.remove((Tensor.sin, np.sin)) class ht: float64 = strat.floats(width=64, allow_subnormal=False) @@ -145,7 +145,7 @@ class TestDTypeALU(unittest.TestCase): def test_int32_midcast_float(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32) # Metal and CUDACPU and HIP behave differently than numpy in CI for overflows - skip_overflow = CI and (Device.DEFAULT in {"HSA", "AMD"} or getenv("CUDACPU")) + skip_overflow = CI and (Device.DEFAULT in {"HSA", "AMD", "NV"} or getenv("CUDACPU")) @given(strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32, strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32, ht.int32, strat.sampled_from(binary_operations), strat.sampled_from(integer_binary_operations)) diff --git a/test/test_jit.py b/test/test_jit.py index 53f21bb4..20a4aa12 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -287,7 +287,7 @@ class TestJit(unittest.TestCase): for i in range(5): np.testing.assert_equal(g(Tensor([i]*3), Tensor.ones(3), Tensor.zeros(3)).numpy(), np.array([i+1]*3)) - @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL", "HSA"}, "no GPU CI") + @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL", "HSA", "NV", "AMD"}, "no GPU CI") def test_jitted_transfers(self): d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1" diff --git a/test/test_linearizer.py b/test/test_linearizer.py index f51eaa5d..d4adbe83 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -852,7 +852,7 @@ class TestKernelOpts(unittest.TestCase): ], apply_tc=True, atol=atol, rtol=rtol) def test_padto_matmul(self): - if CI and Device.DEFAULT in ["CUDA", "AMD"]: self.skipTest("super slow on CUDA and AMD because of the big grid dims") + if CI and Device.DEFAULT in ["CUDA", "AMD", "NV"]: self.skipTest("super slow on CUDA and AMD because of the big grid dims") N = 17 * 17 Tensor.manual_seed(289) a = Tensor.rand(N, N) diff --git a/test/test_linearizer_failures.py b/test/test_linearizer_failures.py index f034c417..0c977f6f 100644 --- a/test/test_linearizer_failures.py +++ b/test/test_linearizer_failures.py @@ -32,7 +32,7 @@ def helper_test_lin(lin: Linearizer, opts, failed_platforms, rtol=1e-2, atol=1e- else: assert Device.DEFAULT in failed_platforms, f"failed on {Device.DEFAULT} with {compare_result[0]}" -@unittest.skipIf(CI and Device.DEFAULT=="CUDA", "failed on CUDA CI") +@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "failed on CUDA CI") class TestLinearizerFailures(unittest.TestCase): def setUp(self): random.seed(42) diff --git a/test/test_multitensor.py b/test/test_multitensor.py index cbfde105..8156cc6e 100644 --- a/test/test_multitensor.py +++ b/test/test_multitensor.py @@ -148,6 +148,7 @@ class TestMultiTensor(unittest.TestCase): a,b = _test_allreduce(Tensor.rand(256, 256)) np.testing.assert_almost_equal(a.numpy(), b.numpy(), decimal=5) + 
@unittest.skipIf(Device.DEFAULT in {"NV", "AMD"}, "not supported in HCQ") def test_copy_jit(self): @TinyJit def copy_tensor(x:Tensor): return (x.to(f"{x.device.split(':')[0]}:1") + 1) @@ -292,6 +293,7 @@ class TestMultiTensor(unittest.TestCase): y_shard = layer_norm_sharded(x_sharded).realize() np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6) + @unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow") def test_data_parallel_resnet(self): import sys, pathlib sys.path.append((pathlib.Path(__file__).parent.parent / "extra" / "models").as_posix()) @@ -310,6 +312,7 @@ class TestMultiTensor(unittest.TestCase): shard_output_np = shard_output.numpy() np.testing.assert_allclose(real_output, shard_output_np, atol=1e-6, rtol=1e-6) + @unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow") def test_data_parallel_resnet_train_step(self): import sys, pathlib sys.path.append((pathlib.Path(__file__).parent.parent / "extra" / "models").as_posix()) diff --git a/test/test_net_speed.py b/test/test_net_speed.py index 6a1e9353..1da097db 100644 --- a/test/test_net_speed.py +++ b/test/test_net_speed.py @@ -5,7 +5,7 @@ import torch from tinygrad import Tensor, Device from tinygrad.helpers import Profiling, CI -@unittest.skipIf(CI and Device.DEFAULT == "CUDA", "slow") +@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow") class TestConvSpeed(unittest.TestCase): def test_mnist(self): diff --git a/test/test_nn.py b/test/test_nn.py index ad72979d..561ef8e9 100755 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10,7 +10,7 @@ from tinygrad.nn.state import load_state_dict from tinygrad.engine.schedule import create_schedule from tinygrad.engine.realize import run_schedule -@unittest.skipIf(CI and Device.DEFAULT == "CUDA", "slow") +@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow") class TestNN(unittest.TestCase): @unittest.skipIf(Device.DEFAULT == "WEBGPU", "no int64 on WebGPU") def test_sparse_cat_cross_entropy(self): diff --git a/test/test_ops.py b/test/test_ops.py index c0293da5..c56a4dbf 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -74,7 +74,7 @@ def prepare_test_op(low, high, shps, vals, forward_only=False): class TestOps(unittest.TestCase): def helper_test_exception(self, shps, torch_fxn, tinygrad_fxn, expected, exact=False, vals=None, low=-1.5, high=1.5): - if getenv("CUDACPU"): self.skipTest('helper_test_exception fails in CUDACPU') + if getenv("CUDACPU") or (getenv("MOCKGPU") and Device.DEFAULT == "NV"): self.skipTest('helper_test_exception fails in CUDACPU') ts, tst = prepare_test_op(low, high, shps, vals) with self.assertRaises(expected) as torch_cm: torch_fxn(*ts) @@ -1493,7 +1493,7 @@ class TestOps(unittest.TestCase): lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation), lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation)) - @unittest.skipIf(Device.DEFAULT == "CUDA", "CUDA fails on this") + @unittest.skipIf( Device.DEFAULT in {"CUDA", "NV"}, "CUDA fails on this") def test_maxpool2d_unit_stride(self): helper_test_op([(8, 2, 17, 14)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=1), diff --git a/test/test_optim.py b/test/test_optim.py index 0d280d39..8501d20d 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -41,7 +41,7 @@ def step(tensor, optim, steps=1, teeny=False, **kwargs): optim.step() return net.x.detach().numpy(), net.W.detach().numpy() -@unittest.skipIf(CI and Device.DEFAULT == "CUDA", "slow") 
+@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow") class TestOptim(unittest.TestCase): def setUp(self): self.old_training = Tensor.training diff --git a/test/test_search.py b/test/test_search.py index 4ef969af..baac4913 100644 --- a/test/test_search.py +++ b/test/test_search.py @@ -11,6 +11,7 @@ from tinygrad.helpers import Context from tinygrad.engine.realize import capturing class TestTimeLinearizer(unittest.TestCase): + @unittest.skipIf(Device.DEFAULT in {"AMD", "NV"}, "Tries to open HSA/CUDA. #4607") def test_reasonable_time(self): si = [i for i in create_schedule([Tensor([1,2,3,4]).add(1).lazydata]) if i.ast[0].op not in LoadOps][0] out = Buffer(Device.DEFAULT, si.outputs[0].size, si.outputs[0].dtype).allocate() @@ -19,6 +20,7 @@ class TestTimeLinearizer(unittest.TestCase): tm = time_linearizer(Linearizer(*si.ast), rawbufs, allow_test_size=False, cnt=10) assert tm > 0 and tm != float('inf') + @unittest.skipIf(Device.DEFAULT in {"AMD", "NV"}, "Tries to open HSA/CUDA. #4607") def test_bufs_from_lin(self): si = [i for i in create_schedule([Tensor([1,2,3,4]).add(1).lazydata]) if i.ast[0].op not in LoadOps][0] rawbufs = bufs_from_lin(lin:=Linearizer(*si.ast)) @@ -28,6 +30,7 @@ class TestTimeLinearizer(unittest.TestCase): assert all(r.size > 0 for r in rawbufs) class TestBEAM(unittest.TestCase): + @unittest.skipIf(Device.DEFAULT in {"AMD", "NV"}, "Tries to open HSA/CUDA. #4607") def test_dynamic_beam(self): # TODO: make this infra globally usable class Capture: diff --git a/test/test_specific_conv.py b/test/test_specific_conv.py index cf794790..3ecff4e0 100644 --- a/test/test_specific_conv.py +++ b/test/test_specific_conv.py @@ -4,7 +4,7 @@ from tinygrad import Tensor, Device, dtypes from test.helpers import is_dtype_supported # similar to test/external/external_test_gpu_ast.py, but universal -@unittest.skipIf(Device.DEFAULT == "CUDA" and CI, "slow on CUDA CI") +@unittest.skipIf(Device.DEFAULT in {"CUDA", "NV"} and CI, "slow on CUDA CI") class TestSpecific(unittest.TestCase): # from openpilot diff --git a/test/test_speed_v_torch.py b/test/test_speed_v_torch.py index 36210d7f..1283b8e8 100644 --- a/test/test_speed_v_torch.py +++ b/test/test_speed_v_torch.py @@ -117,7 +117,7 @@ def helper_test_conv(bs, in_chans, out_chans, kernel_size, img_size_y, img_size_ helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:{kernel_size}", f1, (torch_dat,), TinyJit(f2), (tiny_dat,)) @unittest.skipIf(getenv("BIG") == 0, "no big tests") -@unittest.skipIf(getenv("CUDACPU"), "no CUDACPU") +@unittest.skipIf(getenv("CUDACPU") or getenv("MOCKGPU"), "no CUDACPU or MOCKGPUs") class TestBigSpeed(unittest.TestCase): def test_add(self): def f(a, b): return a+b @@ -138,7 +138,7 @@ class TestBigSpeed(unittest.TestCase): def test_matvec_16384_4096(self): helper_test_matvec('matvec_16384_4096', 16384, 4096) @unittest.skipIf(getenv("BIG") == 1, "only big tests") -@unittest.skipIf(getenv("CUDACPU"), "no CUDACPU") +@unittest.skipIf(getenv("CUDACPU") or getenv("MOCKGPU"), "no CUDACPU or MOCKGPUs") class TestSpeed(unittest.TestCase): def test_sub(self): def f(a, b): return a-b diff --git a/test/test_tensor.py b/test/test_tensor.py index 057cf8c3..f675ccaf 100644 --- a/test/test_tensor.py +++ b/test/test_tensor.py @@ -360,7 +360,7 @@ class TestTinygrad(unittest.TestCase): c = (a + b).mean().backward() print(c) -@unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") +@unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL", "NV", "AMD"}, "no GPU CI") 
class TestMoveTensor(unittest.TestCase): d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1" @given(strat.sampled_from([d0, d1]), strat.sampled_from([d0, d1]), diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 3be52c38..0ffe680b 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -4,18 +4,22 @@ from typing import Tuple, List, Any, cast from tinygrad.device import Compiled, Compiler, LRUAllocator, BufferOptions from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod from tinygrad.renderer.cstyle import CUDARenderer -from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes +from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler import tinygrad.runtime.autogen.cuda as cuda import tinygrad.runtime.autogen.nv_gpu as nv_gpu if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 -libc = ctypes.CDLL("libc.so.6") -libc.memset.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int] +libc = ctypes.CDLL(ctypes.util.find_library("c")) libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long] libc.mmap.restype = ctypes.c_void_p libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t] libc.munmap.restype = ctypes.c_int +if MOCKGPU:=getenv("MOCKGPU"): + import extra.mockgpu.mockgpu # noqa: F401 + libc.mmap = extra.mockgpu.mockgpu._mmap # type: ignore + libc.munmap = extra.mockgpu.mockgpu._munmap # type: ignore + def nv_iowr(fd, nr, args): ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args) if ret != 0: raise RuntimeError(f"ioctl returned {ret}") @@ -161,25 +165,29 @@ class NVProgram: print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8')) except Exception as e: print("failed to disasm cubin", str(e)) - _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("> 24 - if match := re.match(r'\.nv\.constant(\d+)', section_name): - constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I") - if section_name == ".nv.info": - section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I") - for i in range(sh_size // 12): - if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread: - raise RuntimeError("too high local memory") + + if MOCKGPU: + self.program, self.registers_usage = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), 0x10 + constant_buffers_data[0] = memoryview(bytearray(0x190)) + else: + _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("> 24 + if match := re.match(r'\.nv\.constant(\d+)', section_name): + constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I") + if section_name == ".nv.info": + section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I") + for i in range(sh_size // 12): + if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread: + raise RuntimeError("too high local memory") # Registers allocation granularity per warp is 256, warp allocaiton granularity is 4. Register file size is 65536. 
self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32 @@ -187,8 +195,8 @@ class NVProgram: # Load program and constant buffers (if any) self.lib_sz = round_up(round_up(self.program.nbytes, 128) + sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]), 0x1000) self.lib_gpu = self.device.allocator.alloc(self.lib_sz) - for st in range(0, len(self.program), 4096): - HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4096]).submit(self.device) + for st in range(0, len(self.program), 4095): + HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device) self.constbuffer_0 = [0] * 88 self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)] @@ -238,6 +246,8 @@ class NVProgram: if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - self.kernargs_segment_size): self.device.kernargs_ptr = self.device.kernargs_page.base + # HACK: Save counts of args and vars to "unused" constbuffer for later extraction in mockgpu to pass into gpuocelot. + if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)] kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + [val for val in vals] queue = HWComputeQueue() @@ -488,11 +498,11 @@ class NVDevice(Compiled): self.kernargs_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x4000000, map_to_cpu=True) self.kernargs_ptr: int = self.kernargs_page.base - self.arch: str = 'sm_89' # TODO: fix + self.arch: str = "sm_89" if not MOCKGPU else "sm_35" # TODO: fix from tinygrad.runtime.graph.hcq import HCQGraph - super().__init__(device, NVAllocator(self), CUDARenderer(self.arch), NVCompiler(self.arch), functools.partial(NVProgram, self), - functools.partial(HCQGraph, NVDevice, HWComputeQueue, HWCopyQueue)) + super().__init__(device, NVAllocator(self), CUDARenderer(self.arch), CUDACompiler(self.arch) if MOCKGPU else NVCompiler(self.arch), + functools.partial(NVProgram, self), functools.partial(HCQGraph, NVDevice, HWComputeQueue, HWCopyQueue)) self._cmdq_setup_compute_gpfifo() self._cmdq_setup_dma_gpfifo()
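Note: with this patch the mocked NV backend can be exercised without real hardware, the same way the new CI job does (NV=1, MOCKGPU=1, FORWARD_ONLY=1). Below is a minimal sketch of doing that from Python; it assumes the tinygrad repo root is on PYTHONPATH (so extra.mockgpu is importable) and that gpuocelot and the CUDA toolkit's nvrtc are installed as the workflow sets them up — it is an illustration of the intended usage, not part of the diff.

# Minimal sketch: drive the mocked NV backend locally, mirroring the CI job's env.
# Assumptions (not checked here): run from the repo root with it on PYTHONPATH,
# libgpuocelot and nvrtc available, as installed by the workflow above.
import os
os.environ["NV"] = "1"            # make NV the default device
os.environ["MOCKGPU"] = "1"       # ops_nv.py then imports extra.mockgpu.mockgpu, installing the mock driver hooks
os.environ["FORWARD_ONLY"] = "1"  # same restriction the CI job applies

from tinygrad import Tensor, Device
assert Device.DEFAULT == "NV", Device.DEFAULT
# Compiled via CUDACompiler (the MOCKGPU branch in NVDevice) and executed by
# gpuocelot through the mocked GPFIFO/QMD path in extra/mockgpu/nv/nvgpu.py.
print((Tensor([1, 2, 3, 4]) + 1).numpy())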