Hip driver (#2992)

* start hip driver

* fix hip llama

* make HIP default if we can

* don't change those
This commit is contained in:
George Hotz 2024-01-03 12:53:47 -08:00 committed by GitHub
parent f290ca3924
commit 753a7ecc05
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 3140 additions and 15 deletions

View File

@ -122,11 +122,10 @@ jobs:
run: HIP=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
- name: Run Stable Diffusion
run: python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
# TODO: rocm 6.0 broke this
# - name: Run LLaMA
# run: |
# JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
# JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
- name: Run LLaMA (with HIP)
run: |
HIP=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
HIP=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
- name: Run GPT2 (with HIP)
run: |
HIP=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt

View File

@ -0,0 +1,90 @@
import ctypes, ctypes.util, struct, platform, pathlib, re, time
# *** ioctl lib ***
# handle to the C library; the code below calls syscall, mprotect and memcpy through it
libc = ctypes.CDLL(ctypes.util.find_library("c"))
# platform.machine() rather than platform.processor(): processor() may return an empty or
# uninformative string on some systems, which made the lookup below fail with a KeyError
# even on a supported architecture. machine() reports the architecture name directly.
processor = platform.machine()
# syscall number used to issue the real ioctl on each supported architecture
IOCTL_SYSCALL = {"aarch64": 0x1d, "x86_64":16}[processor]
def get_struct(argp, stype):
  """Return the ctypes object of type `stype` located at the integer address `argp`.

  The result refers to the memory at `argp` itself, so later writes to that memory
  are visible through the returned object.
  """
  typed_ptr = ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype))
  return typed_ptr.contents
def format_struct(s):
  """Render every field of the ctypes structure `s` as a list of "name:value" strings.

  Integer fields are printed in upper-case hex with a 0x prefix; any other value is
  printed with its default string formatting.
  """
  def _render(field_name):
    dat = getattr(s, field_name)
    return f"{field_name}:0x{dat:X}" if isinstance(dat, int) else f"{field_name}:{dat}"
  # only the name is needed from each entry; bit-field entries are 3-tuples (name, type, width),
  # so index the name instead of unpacking a fixed (name, type) pair
  return [_render(field[0]) for field in s._fields_]
def install_hook(c_function, python_function):
  """Patch `c_function` in place so that calling it jumps straight to `python_function`.

  Both arguments are ctypes function objects. The first bytes of the C function are
  overwritten with a small architecture-specific trampoline (aarch64 or x86_64) that
  loads the address of `python_function` and branches to it.

  Raises:
    Exception: if `processor` is neither "aarch64" nor "x86_64".
    AssertionError: if mprotect does not return 0.
  """
  def _address_cell(func):
    # view the storage of a ctypes function object as an unsigned long; it holds the address the object calls
    return ctypes.cast(ctypes.byref(func), ctypes.POINTER(ctypes.c_ulong))

  packed_target = struct.pack("Q", _address_cell(python_function).contents.value)
  if processor == "aarch64":
    # adr x16, #0xc ; ldr x16, [x16] ; br x16 ; followed by the 8-byte target address
    tramp = b"\x70\x00\x00\x10" + b"\x10\x02\x40\xf9" + b"\x00\x02\x1f\xd6" + packed_target
  elif processor == "x86_64":
    # movabs r8, <address> ; jmp r8
    tramp = b"\x49\xB8" + packed_target + b"\x41\xFF\xE0"
  else:
    raise Exception(f"processor {processor} not supported")

  # address of the real C function that is about to be overwritten
  hooked = _address_cell(c_function)
  page_start = hooked.contents.value & ~0xFFF
  # unprotect 0x2000 bytes from the start of the containing 0x1000-aligned page (prot 7 = read|write|exec)
  ret = libc.mprotect(ctypes.c_ulong(page_start), 0x2000, 7)
  assert ret == 0
  # copy the trampoline over the beginning of the C function
  libc.memcpy(hooked.contents, ctypes.create_string_buffer(tramp), len(tramp))
# *** ioctl lib end ***
# clang2py kfd_ioctl.h -o kfd_ioctl.py
from extra.hip_gpu_driver import kfd_ioctl
def ioctls_from_header():
  """Build a lookup table of KFD ioctls by parsing extra/hip_gpu_driver/kfd_ioctl.h.

  Returns:
    dict mapping the ioctl number (int) to a tuple of (macro name, ctypes struct class
    taken from the generated `kfd_ioctl` module as "struct_" + the C struct name).

  Raises:
    AttributeError: if a struct named in the header is missing from `kfd_ioctl`.
  """
  header_path = pathlib.Path(__file__).parent.parent.parent / "extra/hip_gpu_driver/kfd_ioctl.h"
  # join backslash-continued lines so each #define sits on a single line
  hdr = header_path.read_text().replace("\\\n", "")
  # accept all three direction macros: the previous pattern (AMDKFD_IOWR?) only matched
  # AMDKFD_IOW / AMDKFD_IOWR, so ioctls declared with AMDKFD_IOR were never decoded by name.
  # The group is non-capturing to keep the (name, nr, struct) shape of each match.
  pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_IO(?:WR|W|R)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
  matches = re.findall(pattern, hdr, re.MULTILINE)
  return {int(nr, 0x10):(name, getattr(kfd_ioctl, "struct_"+sname)) for name, nr, sname in matches}
nrs = ioctls_from_header()
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
def ioctl(fd, request, argp):
  """Logging stand-in for ioctl: perform the real call through syscall(), print it, return its result.

  Requests whose type byte is 75 and whose number is present in `nrs` are printed with
  the elapsed time, the macro name and the decoded argument struct; every other request
  is printed as its raw direction/size/type/number fields.
  """
  started = time.perf_counter()
  ret = libc.syscall(IOCTL_SYSCALL, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
  elapsed = time.perf_counter()-started
  # split the request word into its bit fields
  idir = request>>30
  size = (request>>16)&0x3FFF
  itype = (request>>8)&0xFF
  nr = request&0xFF
  # 75 == ord('K'); presumably the type byte used by the KFD ioctls -- confirm against kfd_ioctl.h
  if itype != 75 or nr not in nrs:
    print("ioctl", idir, size, itype, nr, f"fd={fd} ret={ret}")
    return ret
  name, stype = nrs[nr]
  decoded = get_struct(argp, stype)
  print(f"{elapsed*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(decoded)))
  return ret
install_hook(libc.ioctl, ioctl)
# AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7
if __name__ == "__main__":
  # Demo: the hook is already installed at import time, so each step below prints a
  # marker followed by whatever ioctls that step triggers.
  print("***** import tinygrad")
  from tinygrad import Tensor, Device, TinyJit

  print("***** access HIP")
  dev = Device["HIP"]

  print("***** create tensor a")
  a = Tensor([1.,2.]*200, device="HIP").realize()
  print("***** create tensor b")
  b = Tensor([3.,4.]*200, device="HIP").realize()

  @TinyJit
  def add(a, b):
    return (a+b).realize()

  # run the jitted add twice per round, synchronizing only once at the end of each round
  for step in range(4):
    print(f"***** add tensors {step}")
    c = add(a, b)
    #dev.synchronize()
    c = add(b, a)
    dev.synchronize()

  print("***** delete")
  # drop every reference so any teardown ioctls show up before the final marker
  del add, a, b, c, dev
  print("***** done")

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,6 @@
from __future__ import annotations
import ctypes, functools, subprocess
from typing import Tuple, TypeVar
from typing import Tuple, TypeVar, List
import gpuctypes.hip as hip
from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
@ -40,22 +41,24 @@ class HIPProgram:
T = TypeVar("T")
class HIPAllocator(LRUAllocator):
def __init__(self, device):
def __init__(self, device:HIPDevice):
self.device = device
super().__init__()
def _alloc(self, size:int):
check(hip.hipSetDevice(self.device))
check(hip.hipSetDevice(self.device.device))
return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
def _free(self, opaque:T): check(hip.hipFree(opaque))
def copyin(self, dest:T, src: memoryview):
check(hip.hipSetDevice(self.device))
# TODO: have to make sure src isn't freed to make this async
check(hip.hipMemcpy(dest, from_mv(src), len(src), hip.hipMemcpyHostToDevice))
check(hip.hipSetDevice(self.device.device))
host_mem = init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), len(src), 0)))
self.device.pending_copyin.append(host_mem)
ctypes.memmove(host_mem, from_mv(src), len(src))
check(hip.hipMemcpyAsync(dest, host_mem, len(src), hip.hipMemcpyHostToDevice, None))
def copyout(self, dest:memoryview, src:T):
check(hip.hipSetDevice(self.device))
check(hip.hipSetDevice(self.device.device))
check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
def transfer(self, dest:T, src:T, sz:int):
check(hip.hipSetDevice(self.device))
check(hip.hipSetDevice(self.device.device))
# TODO: hipMemcpyAsync, but you have to track the "src" buffer to not free it
check(hip.hipMemcpy(dest, src, sz, hip.hipMemcpyDeviceToDevice))
@ -63,11 +66,14 @@ class HIPDevice(Compiled):
default_arch_name = "gfx1100"
def __init__(self, device:str=""):
self.device = int(device.split(":")[1]) if ":" in device else 0
self.pending_copyin: List[hip.hipDeviceptr_t] = []
if self.device == 0 and not MOCKHIP: HIPDevice.default_arch_name = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode() # noqa: E501
from tinygrad.runtime.graph.hip import HIPGraph
super().__init__(MallocAllocator if MOCKHIP else HIPAllocator(self.device), LinearizerOptions(device="HIP"), HIPRenderer,
super().__init__(MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions(device="HIP"), HIPRenderer,
compile_hip, functools.partial(HIPProgram, self.device), HIPGraph)
def synchronize(self):
check(hip.hipSetDevice(self.device))
check(hip.hipDeviceSynchronize())
check(hip.hipDeviceSynchronize())
for opaque in self.pending_copyin: check(hip.hipFree(opaque))
self.pending_copyin.clear()