fix multigpu on tinybox (#2595)

* fix multigpu on tinybox

* fixed multigpu
George Hotz 2023-12-03 16:48:07 -08:00 committed by GitHub
parent 61c0113928
commit fcd0b2ee6c
3 changed files with 19 additions and 10 deletions

test/external/external_multi_gpu.py

@@ -3,7 +3,7 @@
 # LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
 import numpy as np
 from tinygrad.tensor import Tensor
-from tinygrad.helpers import colored, Timing
+from tinygrad.helpers import colored, Timing, getenv
 from tinygrad.device import Device
 d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
@@ -14,12 +14,15 @@ def sync():
 if __name__ == "__main__":
 print("GPU devices", d0, d1)
-sz = 1024*1024*256 # 1 GB
-#sz = 1024*64
+sz = getenv("N", 1024*1024*256) # 1 GB
 with Timing("GPU initial sync: "): sync()
 with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
-c0 = Tensor.ones(sz, device="cpu").realize()
-c1 = (Tensor.ones(sz, device="cpu")/2).realize()
+c0 = (Tensor.ones(sz, device="clang")/2).realize()
+c1 = (Tensor.ones(sz, device="clang")/4).realize()
+print(c0.lazydata.realized)
+print(c1.lazydata.realized)
 with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
 a0 = c0.to(d0).realize()
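
Condensed, the updated benchmark flow looks like the following. This is a hedged sketch that only uses the tinygrad APIs already visible in this diff (Tensor, Device, getenv, Timing); the default element count is kept small so it runs anywhere.

# Hedged sketch of the updated benchmark flow; N=... overrides the buffer size.
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.device import Device
from tinygrad.helpers import getenv, Timing

d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
sz = getenv("N", 1024*1024)                         # element count, overridable from the environment
c0 = (Tensor.ones(sz, device="clang")/2).realize()  # source data built on the clang (CPU) backend
with Timing("copy to device 0: "):
  a0 = c0.to(d0).realize()                          # move the realized buffer to the first device
np.testing.assert_allclose(a0.numpy(), 0.5)         # the copy preserved the values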

tinygrad/device.py

@@ -16,8 +16,9 @@ if TYPE_CHECKING:
 class _Device:
 def __init__(self) -> None: self._buffers: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
 def canonicalize(self, device:Optional[str]) -> str: return (device.split(":", 1)[0].upper() + ((":"+device.split(":", 1)[1]) if ':' in device else '')).replace(":0", "") if device is not None else self.DEFAULT
+def __getitem__(self, ix:str) -> Union[Interpreted, Compiled]: return self.__get_canonicalized_item(self.canonicalize(ix))
 @functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
-def __getitem__(self, ix:str) -> Union[Interpreted, Compiled]:
+def __get_canonicalized_item(self, ix:str) -> Union[Interpreted, Compiled]:
 x = ix.split(":")[0].upper()
 ret = [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "device") and x in self._buffers][0]
 if isinstance(ret, type): ret = ret(ix)
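
The device.py change splits lookup into two steps: __getitem__ canonicalizes the device string first, then delegates to an lru_cache'd helper, so "gpu", "GPU" and "GPU:0" all resolve to the same cached backend object instead of each spelling constructing its own. A small self-contained sketch of that canonicalize-then-memoize pattern follows; the class and names are illustrative, not tinygrad's real code.

# Illustrative sketch of canonicalize-then-memoize; DeviceRegistry is a stand-in.
import functools

class DeviceRegistry:
  def canonicalize(self, device: str) -> str:
    # "gpu", "GPU" and "gpu:0" all canonicalize to "GPU"; "gpu:1" stays "GPU:1"
    base, _, idx = device.partition(":")
    return base.upper() + (f":{idx}" if idx and idx != "0" else "")

  def __getitem__(self, device: str):
    # canonicalize *before* hitting the cache so every alias shares one instance
    return self._get_canonicalized(self.canonicalize(device))

  @functools.lru_cache(maxsize=None)  # fine here because the registry is a singleton
  def _get_canonicalized(self, device: str):
    return object()  # stand-in for constructing the real backend object

registry = DeviceRegistry()
assert registry["gpu"] is registry["GPU:0"]       # same cached instance
assert registry["gpu:1"] is not registry["gpu:0"]  # distinct device, distinct instance
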
@@ -83,7 +84,7 @@ class Buffer:
 self.allocator.free(self._real_buf, self.size * self.dtype.itemsize)
 else:
 self.allocator.free(self._buf, self.size * self.dtype.itemsize)
-def __repr__(self): return f"<buf device:{self.device} size:{self.size}>"
+def __repr__(self): return f"<buf device:{self.device} size:{self.size} dtype:{self.dtype}>"
 def copyin(self, mv:memoryview):
 mv = flat_mv(mv)
 assert len(mv) == self.size*self.dtype.itemsize, f"size mismatch, {len(mv)=} != {self.dtype=} {self.size=}"
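
The __repr__ change in this hunk is cosmetic (the dtype is now printed). The copyin assertion checks that the flattened memoryview's byte length equals size * dtype.itemsize; a tiny worked example of that arithmetic, with numpy used only as a convenient way to build a typed buffer:

# 16 float32 elements occupy 16 * 4 == 64 bytes once flattened to raw bytes.
import numpy as np

data = np.ones(16, dtype=np.float32)
mv = memoryview(data).cast("B")                    # flatten to a byte view, similar to flat_mv
assert len(mv) == data.size * data.dtype.itemsize  # 64 bytes
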
@@ -106,6 +107,7 @@ class _BufferCopy(JITRunner):
 if DEBUG >= 2: print(f"*** copy {dest.device} <- {src.device} size {dest.size:<16d} dtype {dest.dtype}")
 if hasattr(dest.allocator, 'transfer') and type(dest.allocator) is type(src.allocator):
 # fast path, used on HIP between GPUs
+# NOTE: it's important we use the dest device here to ensure the transfer is ready
 dest.allocator.transfer(dest._buf, src._buf, dest.size*dest.dtype.itemsize)
 return
 if getenv("FROM_BUFFER") and hasattr(dest.allocator, 'from_buffer') and hasattr(dest.allocator, 'transfer') and hasattr(src.allocator, 'as_buffer'):

tinygrad/runtime/ops_gpu.py

@@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import Tuple, Optional, Union, List, cast
 import ctypes, functools
 import gpuctypes.opencl as cl
-from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, diskcache, OSX, ImageDType
+from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, diskcache, OSX, ImageDType, DEBUG
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import OpenCLRenderer
 from tinygrad.device import Compiled, LRUAllocator
@@ -81,8 +81,12 @@ class CLDevice(Compiled):
 if CLDevice.device_ids is None:
 num_platforms = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetPlatformIDs(0, None, ctypes.byref(x))))
 platform_ids = init_c_var((cl.cl_platform_id * num_platforms.value)(), lambda x: check(cl.clGetPlatformIDs(num_platforms.value, x, None)))
-num_devices = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], cl.CL_DEVICE_TYPE_DEFAULT, 0, None, ctypes.byref(x))))
-CLDevice.device_ids = init_c_var((cl.cl_device_id * num_devices.value)(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], cl.CL_DEVICE_TYPE_DEFAULT, num_devices, x, None)))
+for device_type in [cl.CL_DEVICE_TYPE_GPU, cl.CL_DEVICE_TYPE_DEFAULT]:
+num_devices = ctypes.c_uint32()
+err = cl.clGetDeviceIDs(platform_ids[0], device_type, 0, None, ctypes.byref(num_devices))
+if err == 0 and num_devices.value != 0: break
+if DEBUG >= 1: print(f"CLDevice: got {num_platforms.value} platforms and {num_devices.value} devices")
+CLDevice.device_ids = init_c_var((cl.cl_device_id * num_devices.value)(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], device_type, num_devices, x, None)))
 self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
 self.context = checked(cl.clCreateContext(None, 1, ctypes.byref(self.device_id), cl.clCreateContext.argtypes[3](), None, ctypes.byref(status := ctypes.c_int32())), status)
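
The ops_gpu.py hunk is the tinybox-specific part of the fix: enumeration previously asked platform 0 only for CL_DEVICE_TYPE_DEFAULT devices, which on some drivers reports just a single default device, hiding the other GPUs; the new loop asks for CL_DEVICE_TYPE_GPU first and only falls back to DEFAULT if no GPUs are found. Below is a standalone sketch of that enumeration order, built from the same gpuctypes calls used in the diff; it assumes a working OpenCL driver is installed.

# Enumerate OpenCL devices on platform 0, preferring GPU devices over DEFAULT.
import ctypes
import gpuctypes.opencl as cl

num_platforms = ctypes.c_uint32()
assert cl.clGetPlatformIDs(0, None, ctypes.byref(num_platforms)) == 0
platform_ids = (cl.cl_platform_id * num_platforms.value)()
assert cl.clGetPlatformIDs(num_platforms.value, platform_ids, None) == 0

num_devices = ctypes.c_uint32()
for device_type in [cl.CL_DEVICE_TYPE_GPU, cl.CL_DEVICE_TYPE_DEFAULT]:
  err = cl.clGetDeviceIDs(platform_ids[0], device_type, 0, None, ctypes.byref(num_devices))
  if err == 0 and num_devices.value != 0: break   # stop at the first type that reports devices

device_ids = (cl.cl_device_id * num_devices.value)()
assert cl.clGetDeviceIDs(platform_ids[0], device_type, num_devices, device_ids, None) == 0
print(f"found {num_devices.value} OpenCL device(s)")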