mirror of https://github.com/commaai/tinygrad.git
fix multigpu on tinybox (#2595)
* fix multigpu on tinybox
* fixed multigpu
parent 61c0113928
commit fcd0b2ee6c
@@ -3,7 +3,7 @@
 # LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
 import numpy as np
 from tinygrad.tensor import Tensor
-from tinygrad.helpers import colored, Timing
+from tinygrad.helpers import colored, Timing, getenv
 from tinygrad.device import Device

 d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
@@ -14,12 +14,15 @@ def sync():

 if __name__ == "__main__":
   print("GPU devices", d0, d1)
-  sz = 1024*1024*256  # 1 GB
-  #sz = 1024*64
+  sz = getenv("N", 1024*1024*256)  # 1 GB

   with Timing("GPU initial sync: "): sync()

   with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
-    c0 = Tensor.ones(sz, device="cpu").realize()
-    c1 = (Tensor.ones(sz, device="cpu")/2).realize()
+    c0 = (Tensor.ones(sz, device="clang")/2).realize()
+    c1 = (Tensor.ones(sz, device="clang")/4).realize()
+    print(c0.lazydata.realized)
+    print(c1.lazydata.realized)

   with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
     a0 = c0.to(d0).realize()
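For context, a minimal sketch of the pattern the test now uses: the element count comes from the N environment variable via getenv (defaulting to 256M elements, roughly 1 GB of float32), and Timing's on_exit callback converts elapsed nanoseconds into GB/sec. This is a sketch only, assuming tinygrad is installed and the clang backend is available.

from tinygrad.tensor import Tensor
from tinygrad.helpers import Timing, getenv

sz = getenv("N", 1024*1024*256)  # element count; override with e.g. N=65536 for a quick run
with Timing("creation: ", on_exit=lambda ns: f", {(sz*4)/ns:.2f} GB/sec"):  # sz*4 bytes / elapsed ns ~= GB/sec
  c0 = (Tensor.ones(sz, device="clang")/2).realize()
print(c0.lazydata.realized)  # the realized Buffer backing the tensor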
@@ -16,8 +16,9 @@ if TYPE_CHECKING:
 class _Device:
   def __init__(self) -> None: self._buffers: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
   def canonicalize(self, device:Optional[str]) -> str: return (device.split(":", 1)[0].upper() + ((":"+device.split(":", 1)[1]) if ':' in device else '')).replace(":0", "") if device is not None else self.DEFAULT
+  def __getitem__(self, ix:str) -> Union[Interpreted, Compiled]: return self.__get_canonicalized_item(self.canonicalize(ix))
   @functools.lru_cache(maxsize=None)  # this class is a singleton, pylint: disable=method-cache-max-size-none
-  def __getitem__(self, ix:str) -> Union[Interpreted, Compiled]:
+  def __get_canonicalized_item(self, ix:str) -> Union[Interpreted, Compiled]:
     x = ix.split(":")[0].upper()
     ret = [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "device") and x in self._buffers][0]
     if isinstance(ret, type): ret = ret(ix)
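The refactor splits __getitem__ from the cached lookup so the lru_cache is keyed on the canonicalized name rather than whatever spelling the caller used. A small sketch of the intended behavior, assuming a GPU backend is present (the device names here are illustrative):

from tinygrad.device import Device

assert Device.canonicalize("gpu:0") == "GPU"   # ":0" is stripped, the name is upper-cased
d_a, d_b = Device["gpu"], Device["GPU:0"]      # both canonicalize to "GPU"
assert d_a is d_b                              # one lru_cache entry, one device instance
assert Device["GPU:1"] is not d_a              # the second GPU is a distinct cached instance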
@@ -83,7 +84,7 @@ class Buffer:
       self.allocator.free(self._real_buf, self.size * self.dtype.itemsize)
     else:
       self.allocator.free(self._buf, self.size * self.dtype.itemsize)
-  def __repr__(self): return f"<buf device:{self.device} size:{self.size}>"
+  def __repr__(self): return f"<buf device:{self.device} size:{self.size} dtype:{self.dtype}>"
   def copyin(self, mv:memoryview):
     mv = flat_mv(mv)
     assert len(mv) == self.size*self.dtype.itemsize, f"size mismatch, {len(mv)=} != {self.dtype=} {self.size=}"
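With dtype added to Buffer.__repr__, the print(c0.lazydata.realized) calls in the test above show which dtype each realized buffer carries. A hedged sketch; the output shape follows the repr string above, though the exact dtype spelling may differ:

from tinygrad.tensor import Tensor

t = (Tensor.ones(16, device="clang")/2).realize()
print(t.lazydata.realized)  # e.g. <buf device:CLANG size:16 dtype:dtypes.float>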
@@ -106,6 +107,7 @@ class _BufferCopy(JITRunner):
     if DEBUG >= 2: print(f"*** copy {dest.device} <- {src.device} size {dest.size:<16d} dtype {dest.dtype}")
     if hasattr(dest.allocator, 'transfer') and type(dest.allocator) is type(src.allocator):
       # fast path, used on HIP between GPUs
+      # NOTE: it's important we use the dest device here to ensure the transfer is ready
       dest.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize)
       return
     if getenv("FROM_BUFFER") and hasattr(dest.allocator, 'from_buffer') and hasattr(dest.allocator, 'transfer') and hasattr(src.allocator, 'as_buffer'):
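For reference, the gate on the fast path boils down to the predicate below: the destination allocator must expose transfer and both buffers must be managed by the same concrete allocator type, which is what keeps it to same-backend (e.g. HIP-to-HIP) copies. The helper name is hypothetical, extracted here only to spell out the condition.

def can_fast_copy(dest, src) -> bool:
  # same condition as in _BufferCopy: transfer() exists on dest and the allocator types match
  return hasattr(dest.allocator, 'transfer') and type(dest.allocator) is type(src.allocator)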
@@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import Tuple, Optional, Union, List, cast
 import ctypes, functools
 import gpuctypes.opencl as cl
-from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, diskcache, OSX, ImageDType
+from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, diskcache, OSX, ImageDType, DEBUG
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import OpenCLRenderer
 from tinygrad.device import Compiled, LRUAllocator
@@ -81,8 +81,12 @@ class CLDevice(Compiled):
     if CLDevice.device_ids is None:
       num_platforms = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetPlatformIDs(0, None, ctypes.byref(x))))
       platform_ids = init_c_var((cl.cl_platform_id * num_platforms.value)(), lambda x: check(cl.clGetPlatformIDs(num_platforms.value, x, None)))
-      num_devices = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], cl.CL_DEVICE_TYPE_DEFAULT, 0, None, ctypes.byref(x))))
-      CLDevice.device_ids = init_c_var((cl.cl_device_id * num_devices.value)(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], cl.CL_DEVICE_TYPE_DEFAULT, num_devices, x, None)))
+      for device_type in [cl.CL_DEVICE_TYPE_GPU, cl.CL_DEVICE_TYPE_DEFAULT]:
+        num_devices = ctypes.c_uint32()
+        err = cl.clGetDeviceIDs(platform_ids[0], device_type, 0, None, ctypes.byref(num_devices))
+        if err == 0 and num_devices.value != 0: break
+      if DEBUG >= 1: print(f"CLDevice: got {num_platforms.value} platforms and {num_devices.value} devices")
+      CLDevice.device_ids = init_c_var((cl.cl_device_id * num_devices.value)(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], device_type, num_devices, x, None)))

     self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
     self.context = checked(cl.clCreateContext(None, 1, ctypes.byref(self.device_id), cl.clCreateContext.argtypes[3](), None, ctypes.byref(status := ctypes.c_int32())), status)
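After enumeration, the ":N" suffix in a device string indexes straight into CLDevice.device_ids, which is how the two tinybox GPUs are addressed as GPU:0 and GPU:1. A sketch of the intended usage, assuming at least two OpenCL GPUs were found by the loop above:

from tinygrad.device import Device

d0, d1 = Device["GPU:0"], Device["GPU:1"]  # first and second entries in CLDevice.device_ids
print(Device.canonicalize("GPU:0"), Device.canonicalize("GPU:1"))  # -> GPU GPU:1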