Hip driver (#2992)

* start hip driver
* fix hip llama
* make HIP default if we can
* don't change those
2024-01-03 12:53:47 -08:00 · 2024-01-03 12:53:47 -08:00 · 753a7ecc05
parent f290ca3924
commit 753a7ecc05
5 changed files with 3140 additions and 15 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -122,11 +122,10 @@ jobs:
      run: HIP=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
    - name: Run Stable Diffusion
      run: python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
-    # TODO: rocm 6.0 broke this
-    # - name: Run LLaMA
-    #   run: |
-    #     JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
-    #     JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
+    - name: Run LLaMA (with HIP)
+      run: |
+        HIP=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
+        HIP=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
    - name: Run GPT2 (with HIP)
      run: |
        HIP=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
--- a/extra/hip_gpu_driver/hip_ioctl.py
+++ b/extra/hip_gpu_driver/hip_ioctl.py
@ -0,0 +1,90 @@
+import ctypes, ctypes.util, struct, platform, pathlib, re, time
+
+# *** ioctl lib ***
+libc = ctypes.CDLL(ctypes.util.find_library("c"))
+# platform.processor() returns "" when the CPU name cannot be determined; fall back to the machine type
+# so the syscall table lookup below does not fail with a bare KeyError on such systems
+processor = platform.processor() or platform.machine()
+# raw syscall number of ioctl for each supported architecture
+IOCTL_SYSCALL = {"aarch64": 0x1d, "x86_64":16}[processor]
+
+def get_struct(argp, stype):
+  """Interpret the memory at integer address `argp` as a ctypes `stype` and return that instance."""
+  typed_ptr = ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype))
+  return typed_ptr.contents
+
+def format_struct(s):
+  """Return one "name:value" string per field of the ctypes structure `s`, in declaration order.
+
+  Integer field values are rendered as 0x-prefixed upper-case hex; every other value uses its default formatting.
+  """
+  sdats = []
+  # _fields_ entries are (name, type) or, for bitfields, (name, type, bit_width): only the name is needed
+  for field_name, *_ in s._fields_:
+    dat = getattr(s, field_name)
+    if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
+    else: sdats.append(f"{field_name}:{dat}")
+  return sdats
+
+def install_hook(c_function, python_function):
+  """Overwrite the first bytes of `c_function` with a trampoline that jumps to `python_function`.
+
+  Both arguments are ctypes function objects: `c_function` is the native symbol to patch and `python_function`
+  is the callback that should run instead. Raises Exception when `processor` is neither "aarch64" nor "x86_64".
+  """
+  # read the first unsigned long stored in the callback object, i.e. the address the trampoline must jump to
+  python_function_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
+  # AARCH64 trampoline to ioctl
+  if processor == "aarch64":
+    # 0x0000000000000000:  70 00 00 10    adr x16, #0xc
+    # 0x0000000000000004:  10 02 40 F9    ldr x16, [x16]
+    # 0x0000000000000008:  00 02 1F D6    br  x16
+    tramp = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6"
+    # the 8-byte target address is stored right after the three instructions, where the adr/ldr pair loads it from
+    tramp += struct.pack("Q", python_function_addr)
+  elif processor == "x86_64":
+    # 0x0000000000000000:  49 B8 aa aa aa aa aa aa aa aa    movabs r8, <address>
+    # 0x000000000000000a:  41 FF E0                         jmp    r8
+    tramp = b"\x49\xB8" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE0"
+  else:
+    raise Exception(f"processor {processor} not supported")
+
+  # get real ioctl address
+  ioctl_address = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong))
+
+  # hook ioctl
+  # round the target down to a 0x1000 boundary and set protection 7 (read|write|exec) on 0x2000 bytes from there,
+  # so the patch can be written even if it straddles a boundary -- assumes 4 KiB pages, TODO confirm
+  ret = libc.mprotect(ctypes.c_ulong((ioctl_address.contents.value//0x1000)*0x1000), 0x2000, 7)
+  assert ret == 0
+  # ioctl_address.contents is a c_ulong holding the function's address, so it serves as the memcpy destination
+  libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), len(tramp))
+
+# *** ioctl lib end ***
+
+# clang2py kfd_ioctl.h -o kfd_ioctl.py
+from extra.hip_gpu_driver import kfd_ioctl
+def ioctls_from_header():
+  """Parse kfd_ioctl.h and return {ioctl number: (AMDKFD_IOC_* name, ctypes struct class from kfd_ioctl)}.
+
+  Backslash-newline continuations are joined first so each #define sits on a single line. The struct class is
+  looked up in the generated kfd_ioctl module as "struct_" + the struct name used in the macro.
+  """
+  hdr = (pathlib.Path(__file__).parent.parent.parent / "extra/hip_gpu_driver/kfd_ioctl.h").read_text().replace("\\\n", "")
+  # accept all three direction macros: AMDKFD_IOR, AMDKFD_IOW and AMDKFD_IOWR. The previous "AMDKFD_IOWR?" skipped
+  # AMDKFD_IOR, which the upstream header uses for read-only ioctls (e.g. AMDKFD_IOC_GET_VERSION)
+  pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_IO(?:R|WR?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
+  matches = re.findall(pattern, hdr)
+  return {int(nr, 0x10):(name, getattr(kfd_ioctl, "struct_"+sname)) for name, nr, sname in matches}
+nrs = ioctls_from_header()
+
+@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
+def ioctl(fd, request, argp):
+  """Tracing stand-in for ioctl: issue the real syscall, time it, print one trace line and return its result."""
+  start = time.perf_counter()
+  ret = libc.syscall(IOCTL_SYSCALL, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
+  elapsed = time.perf_counter() - start
+  # split the request word: direction (bits 30 and up), size (bits 16-29), type (bits 8-15), number (bits 0-7)
+  idir = request >> 30
+  size = (request >> 16) & 0x3FFF
+  itype = (request >> 8) & 0xFF
+  nr = request & 0xFF
+  # only type 75 (ord('K')) requests with a number found in the header table are decoded; the rest print raw
+  if itype != 75 or nr not in nrs:
+    print("ioctl", idir, size, itype, nr, f"fd={fd} ret={ret}")
+    return ret
+  name, stype = nrs[nr]
+  decoded = get_struct(argp, stype)
+  print(f"{elapsed*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(decoded)))
+  return ret
+
+# patch libc's ioctl at import time so calls made through it in this process reach the Python `ioctl` tracer
+install_hook(libc.ioctl, ioctl)
+
+# AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7
+if __name__ == "__main__":
+  # small tinygrad workload on the HIP device; each stage is announced so the ioctl trace can be read by phase
+  def banner(msg): print(f"***** {msg}")
+  banner("import tinygrad")
+  from tinygrad import Tensor, Device, TinyJit
+  banner("access HIP")
+  hip_dev = Device["HIP"]
+  banner("create tensor a")
+  lhs = Tensor([1.,2.]*200, device="HIP").realize()
+  banner("create tensor b")
+  rhs = Tensor([3.,4.]*200, device="HIP").realize()
+  @TinyJit
+  def jit_add(x, y): return (x+y).realize()
+  for step in range(4):
+    banner(f"add tensors {step}")
+    out = jit_add(lhs, rhs)
+    #dev.synchronize()
+    out = jit_add(rhs, lhs)
+    hip_dev.synchronize()
+  banner("delete")
+  # drop every reference before the final banner so any teardown they trigger is traced ahead of "done"
+  del jit_add, lhs, rhs, out, hip_dev
+  banner("done")
--- a/extra/hip_gpu_driver/kfd_ioctl.h
+++ b/extra/hip_gpu_driver/kfd_ioctl.h
--- a/extra/hip_gpu_driver/kfd_ioctl.py
+++ b/extra/hip_gpu_driver/kfd_ioctl.py
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@ -1,5 +1,6 @@
+from __future__ import annotations
 import ctypes, functools, subprocess
-from typing import Tuple, TypeVar
+from typing import Tuple, TypeVar, List
 import gpuctypes.hip as hip
 from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style
 from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
@ -40,22 +41,24 @@ class HIPProgram:

 T = TypeVar("T")
 class HIPAllocator(LRUAllocator):
-  def __init__(self, device):
+  def __init__(self, device:HIPDevice):
    self.device = device
    super().__init__()
  def _alloc(self, size:int):
-    check(hip.hipSetDevice(self.device))
+    check(hip.hipSetDevice(self.device.device))
    return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
  def _free(self, opaque:T): check(hip.hipFree(opaque))
  def copyin(self, dest:T, src: memoryview):
-    check(hip.hipSetDevice(self.device))
-    # TODO: have to make sure src isn't freed to make this async
-    check(hip.hipMemcpy(dest, from_mv(src), len(src), hip.hipMemcpyHostToDevice))
+    check(hip.hipSetDevice(self.device.device))
+    # stage src in a freshly allocated hipHostMalloc buffer so the async copy below no longer depends on src
+    # staying alive; the buffer is queued on the device's pending_copyin list, which HIPDevice.synchronize() frees
+    host_mem = init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), len(src), 0)))
+    self.device.pending_copyin.append(host_mem)
+    ctypes.memmove(host_mem, from_mv(src), len(src))
+    # stream argument None -- presumably the default stream; TODO confirm
+    check(hip.hipMemcpyAsync(dest, host_mem, len(src), hip.hipMemcpyHostToDevice, None))
  def copyout(self, dest:memoryview, src:T):
-    check(hip.hipSetDevice(self.device))
+    check(hip.hipSetDevice(self.device.device))
    check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
  def transfer(self, dest:T, src:T, sz:int):
-    check(hip.hipSetDevice(self.device))
+    check(hip.hipSetDevice(self.device.device))
    # TODO: hipMemcpyAsync, but you have to track the "src" buffer to not free it
    check(hip.hipMemcpy(dest, src, sz, hip.hipMemcpyDeviceToDevice))

@ -63,11 +66,14 @@ class HIPDevice(Compiled):
  default_arch_name = "gfx1100"
  def __init__(self, device:str=""):
    self.device = int(device.split(":")[1]) if ":" in device else 0
+    self.pending_copyin: List[hip.hipDeviceptr_t] = []
    if self.device == 0 and not MOCKHIP: HIPDevice.default_arch_name = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode()  # noqa: E501

    from tinygrad.runtime.graph.hip import HIPGraph
-    super().__init__(MallocAllocator if MOCKHIP else HIPAllocator(self.device), LinearizerOptions(device="HIP"), HIPRenderer,
+    super().__init__(MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions(device="HIP"), HIPRenderer,
                     compile_hip, functools.partial(HIPProgram, self.device), HIPGraph)
  def synchronize(self):
    check(hip.hipSetDevice(self.device))
-    check(hip.hipDeviceSynchronize())
+    check(hip.hipDeviceSynchronize())
+    # pending_copyin holds the hipHostMalloc staging buffers queued by HIPAllocator.copyin; they are released
+    # only here, after hipDeviceSynchronize has returned
+    # host allocations from hipHostMalloc are released with hipHostFree, not hipFree
+    for opaque in self.pending_copyin: check(hip.hipHostFree(opaque))
+    self.pending_copyin.clear()