fix multigpu on tinybox (#2595)

* fix multigpu on tinybox

* fixed multigpu
George Hotz 2023-12-03 16:48:07 -08:00 committed by GitHub
parent 61c0113928
commit fcd0b2ee6c
3 changed files with 19 additions and 10 deletions

test/external/external_multi_gpu.py

@@ -3,7 +3,7 @@
 # LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
 import numpy as np
 from tinygrad.tensor import Tensor
-from tinygrad.helpers import colored, Timing
+from tinygrad.helpers import colored, Timing, getenv
 from tinygrad.device import Device
 d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
@@ -14,12 +14,15 @@ def sync():
 if __name__ == "__main__":
 print("GPU devices", d0, d1)
-sz = 1024*1024*256 # 1 GB
-#sz = 1024*64
+sz = getenv("N", 1024*1024*256) # 1 GB
 with Timing("GPU initial sync: "): sync()
 with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
-c0 = Tensor.ones(sz, device="cpu").realize()
-c1 = (Tensor.ones(sz, device="cpu")/2).realize()
+c0 = (Tensor.ones(sz, device="clang")/2).realize()
+c1 = (Tensor.ones(sz, device="clang")/4).realize()
+print(c0.lazydata.realized)
+print(c1.lazydata.realized)
 with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
 a0 = c0.to(d0).realize()
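
Condensed, the updated benchmark flow looks like the following. This is a hedged sketch that only uses the tinygrad APIs already visible in this diff (Tensor, Device, getenv, Timing); the default element count is kept small so it runs anywhere.

# Hedged sketch of the updated benchmark flow; N=... overrides the buffer size.
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.device import Device
from tinygrad.helpers import getenv, Timing

d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
sz = getenv("N", 1024*1024)                         # element count, overridable from the environment
c0 = (Tensor.ones(sz, device="clang")/2).realize()  # source data built on the clang (CPU) backend
with Timing("copy to device 0: "):
  a0 = c0.to(d0).realize()                          # move the realized buffer to the first device
np.testing.assert_allclose(a0.numpy(), 0.5)         # the copy preserved the values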

tinygrad/device.py

@@ -16,8 +16,9 @@ if TYPE_CHECKING:
 class _Device:
 def __init__(self) -> None: self._buffers: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
 def canonicalize(self, device:Optional[str]) -> str: return (device.split(":", 1)[0].upper() + ((":"+device.split(":", 1)[1]) if ':' in device else '')).replace(":0", "") if device is not None else self.DEFAULT
+def __getitem__(self, ix:str) -> Union[Interpreted, Compiled]: return self.__get_canonicalized_item(self.canonicalize(ix))
 @functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
-def __getitem__(self, ix:str) -> Union[Interpreted, Compiled]:
+def __get_canonicalized_item(self, ix:str) -> Union[Interpreted, Compiled]:
 x = ix.split(":")[0].upper()
 ret = [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "device") and x in self._buffers][0]
 if isinstance(ret, type): ret = ret(ix)
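
The device.py change splits lookup into two steps: __getitem__ canonicalizes the device string first, then delegates to an lru_cache'd helper, so "gpu", "GPU" and "GPU:0" all resolve to the same cached backend object instead of each spelling constructing its own. A small self-contained sketch of that canonicalize-then-memoize pattern follows; the class and names are illustrative, not tinygrad's real code.

# Illustrative sketch of canonicalize-then-memoize; DeviceRegistry is a stand-in.
import functools

class DeviceRegistry:
  def canonicalize(self, device: str) -> str:
    # "gpu", "GPU" and "gpu:0" all canonicalize to "GPU"; "gpu:1" stays "GPU:1"
    base, _, idx = device.partition(":")
    return base.upper() + (f":{idx}" if idx and idx != "0" else "")

  def __getitem__(self, device: str):
    # canonicalize *before* hitting the cache so every alias shares one instance
    return self._get_canonicalized(self.canonicalize(device))

  @functools.lru_cache(maxsize=None)  # fine here because the registry is a singleton
  def _get_canonicalized(self, device: str):
    return object()  # stand-in for constructing the real backend object

registry = DeviceRegistry()
assert registry["gpu"] is registry["GPU:0"]       # same cached instance
assert registry["gpu:1"] is not registry["gpu:0"]  # distinct device, distinct instance
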
@@ -83,7 +84,7 @@ class Buffer:
 self.allocator.free(self._real_buf, self.size * self.dtype.itemsize)
 else:
 self.allocator.free(self._buf, self.size * self.dtype.itemsize)
-def __repr__(self): return f"<buf device:{self.device} size:{self.size}>"
+def __repr__(self): return f"<buf device:{self.device} size:{self.size} dtype:{self.dtype}>"
 def copyin(self, mv:memoryview):
 mv = flat_mv(mv)
 assert len(mv) == self.size*self.dtype.itemsize, f"size mismatch, {len(mv)=} != {self.dtype=} {self.size=}"
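
The __repr__ change in this hunk is cosmetic (the dtype is now printed). The copyin assertion checks that the flattened memoryview's byte length equals size * dtype.itemsize; a tiny worked example of that arithmetic, with numpy used only as a convenient way to build a typed buffer:

# 16 float32 elements occupy 16 * 4 == 64 bytes once flattened to raw bytes.
import numpy as np

data = np.ones(16, dtype=np.float32)
mv = memoryview(data).cast("B")                    # flatten to a byte view, similar to flat_mv
assert len(mv) == data.size * data.dtype.itemsize  # 64 bytes
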
@@ -106,6 +107,7 @@ class _BufferCopy(JITRunner):
 if DEBUG >= 2: print(f"*** copy {dest.device} <- {src.device} size {dest.size:<16d} dtype {dest.dtype}")
 if hasattr(dest.allocator, 'transfer') and type(dest.allocator) is type(src.allocator):
 # fast path, used on HIP between GPUs
+# NOTE: it's important we use the dest device here to ensure the transfer is ready
 dest.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize)
 return
 if getenv("FROM_BUFFER") and hasattr(dest.allocator, 'from_buffer') and hasattr(dest.allocator, 'transfer') and hasattr(src.allocator, 'as_buffer'):

tinygrad/runtime/ops_gpu.py

@@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import Tuple, Optional, Union, List, cast
 import ctypes, functools
 import gpuctypes.opencl as cl
-from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, diskcache, OSX, ImageDType
+from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, diskcache, OSX, ImageDType, DEBUG
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import OpenCLRenderer
 from tinygrad.device import Compiled, LRUAllocator
@@ -81,8 +81,12 @@ class CLDevice(Compiled):
 if CLDevice.device_ids is None:
 num_platforms = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetPlatformIDs(0, None, ctypes.byref(x))))
 platform_ids = init_c_var((cl.cl_platform_id * num_platforms.value)(), lambda x: check(cl.clGetPlatformIDs(num_platforms.value, x, None)))
-num_devices = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], cl.CL_DEVICE_TYPE_DEFAULT, 0, None, ctypes.byref(x))))
-CLDevice.device_ids = init_c_var((cl.cl_device_id * num_devices.value)(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], cl.CL_DEVICE_TYPE_DEFAULT, num_devices, x, None)))
+for device_type in [cl.CL_DEVICE_TYPE_GPU, cl.CL_DEVICE_TYPE_DEFAULT]:
+num_devices = ctypes.c_uint32()
+err = cl.clGetDeviceIDs(platform_ids[0], device_type, 0, None, ctypes.byref(num_devices))
+if err == 0 and num_devices.value != 0: break
+if DEBUG >= 1: print(f"CLDevice: got {num_platforms.value} platforms and {num_devices.value} devices")
+CLDevice.device_ids = init_c_var((cl.cl_device_id * num_devices.value)(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], device_type, num_devices, x, None)))
 self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
 self.context = checked(cl.clCreateContext(None, 1, ctypes.byref(self.device_id), cl.clCreateContext.argtypes[3](), None, ctypes.byref(status := ctypes.c_int32())), status)
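
The ops_gpu.py hunk is the tinybox-specific part of the fix: enumeration previously asked platform 0 only for CL_DEVICE_TYPE_DEFAULT devices, which on some drivers reports just a single default device, hiding the other GPUs; the new loop asks for CL_DEVICE_TYPE_GPU first and only falls back to DEFAULT if no GPUs are found. Below is a standalone sketch of that enumeration order, built from the same gpuctypes calls used in the diff; it assumes a working OpenCL driver is installed.

# Enumerate OpenCL devices on platform 0, preferring GPU devices over DEFAULT.
import ctypes
import gpuctypes.opencl as cl

num_platforms = ctypes.c_uint32()
assert cl.clGetPlatformIDs(0, None, ctypes.byref(num_platforms)) == 0
platform_ids = (cl.cl_platform_id * num_platforms.value)()
assert cl.clGetPlatformIDs(num_platforms.value, platform_ids, None) == 0

num_devices = ctypes.c_uint32()
for device_type in [cl.CL_DEVICE_TYPE_GPU, cl.CL_DEVICE_TYPE_DEFAULT]:
  err = cl.clGetDeviceIDs(platform_ids[0], device_type, 0, None, ctypes.byref(num_devices))
  if err == 0 and num_devices.value != 0: break   # stop at the first type that reports devices

device_ids = (cl.cl_device_id * num_devices.value)()
assert cl.clGetDeviceIDs(platform_ids[0], device_type, num_devices, device_ids, None) == 0
print(f"found {num_devices.value} OpenCL device(s)")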