mirror of https://github.com/commaai/tinygrad.git
save a few lines in ops_gpu (#2564)
Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
parent
5e87083783
commit
065495e0c9
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
|||
from typing import Tuple, Optional, Union, List, cast
|
||||
import ctypes, functools
|
||||
import gpuctypes.opencl as cl
|
||||
from tinygrad.helpers import to_char_p_p, from_mv, diskcache, OSX, ImageDType
|
||||
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, diskcache, OSX, ImageDType
|
||||
from tinygrad.codegen.kernel import LinearizerOptions
|
||||
from tinygrad.renderer.opencl import OpenCLRenderer
|
||||
from tinygrad.device import Compiled, LRUAllocator
|
||||
|
@ -11,33 +11,26 @@ OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external/external_osx_pr
|
|||
|
||||
def check(status):
|
||||
if status != 0: raise RuntimeError(f"OpenCL Error {status}")
|
||||
def checked(ret, status):
|
||||
check(status.value)
|
||||
return ret
|
||||
def checked(ret, status): return (check(status.value), ret)[1]
|
||||
|
||||
@diskcache
|
||||
def compile_cl(prg:str) -> bytes:
|
||||
assert CLDevice.compiler_context is not None, 'OpenCL requires a "compiler_context" to compile, init a device before you call this'
|
||||
prg_bytes = prg.encode()
|
||||
program = checked(cl.clCreateProgramWithSource(CLDevice.compiler_context.context, 1, to_char_p_p([prg_bytes]), (ctypes.c_size_t * 1)(len(prg_bytes)), ctypes.byref(status := ctypes.c_int32())), status)
|
||||
program = checked(cl.clCreateProgramWithSource(CLDevice.compiler_context.context, 1, to_char_p_p([prg_bytes := prg.encode()]), ctypes.byref(ctypes.c_size_t(len(prg_bytes))), ctypes.byref(status := ctypes.c_int32())), status)
|
||||
status = cl.clBuildProgram(program, 1, ctypes.byref(CLDevice.compiler_context.device_id), None, cl.clBuildProgram.argtypes[4](), None)
|
||||
if status != 0:
|
||||
cl.clGetProgramBuildInfo(program, CLDevice.compiler_context.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t()))
|
||||
cl.clGetProgramBuildInfo(program, CLDevice.compiler_context.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None)
|
||||
raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(mstr, size=log_size.value).decode()}")
|
||||
binary_sizes = (ctypes.c_size_t * 1)()
|
||||
check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(binary_sizes), ctypes.byref(binary_sizes), None))
|
||||
binary = (ctypes.c_char * binary_sizes[0])()
|
||||
binary_pointers = (ctypes.c_char_p * 1)(ctypes.cast(ctypes.addressof(binary), ctypes.c_char_p))
|
||||
check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(binary_pointers), ctypes.byref(binary_pointers), None))
|
||||
binary_sizes = init_c_var((ctypes.c_size_t * 1)(), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(x), ctypes.byref(x), None)))
|
||||
binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), ctypes.byref((ctypes.c_void_p * 1)(ctypes.addressof(x))), None)))
|
||||
check(cl.clReleaseProgram(program))
|
||||
return bytes(binary)
|
||||
|
||||
class CLProgram:
|
||||
def __init__(self, device:CLDevice, name:str, lib:bytes):
|
||||
self.device, self.name, self.lib = device, name, lib
|
||||
self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, ctypes.byref(device.device_id), (ctypes.c_size_t * 1)(len(lib)),
|
||||
to_char_p_p([lib], ctypes.c_ubyte),
|
||||
self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, ctypes.byref(device.device_id), (ctypes.c_size_t * 1)(len(lib)), to_char_p_p([lib], ctypes.c_ubyte),
|
||||
ctypes.byref(binary_status := ctypes.c_int32()), ctypes.byref(errcode_ret := ctypes.c_int32())), errcode_ret)
|
||||
check(binary_status.value)
|
||||
check(cl.clBuildProgram(self.program, 1, ctypes.byref(device.device_id), None, cl.clBuildProgram.argtypes[4](), None)) # NOTE: OSX requires this
|
||||
|
@ -55,10 +48,9 @@ class CLProgram:
|
|||
event = cl.cl_event() if wait else None
|
||||
check(cl.clEnqueueNDRangeKernel(self.device.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event))
|
||||
if wait:
|
||||
start, end = ctypes.c_ulong(), ctypes.c_ulong()
|
||||
check(cl.clWaitForEvents(1, ctypes.byref(event)))
|
||||
check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_START, ctypes.sizeof(start), ctypes.byref(start), None))
|
||||
check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_END, ctypes.sizeof(end), ctypes.byref(end), None))
|
||||
start = init_c_var(ctypes.c_ulong(), lambda x: check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_START, ctypes.sizeof(x), ctypes.byref(x), None)))
|
||||
end = init_c_var(ctypes.c_ulong(), lambda x: check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_END, ctypes.sizeof(x), ctypes.byref(x), None)))
|
||||
return float(end.value-start.value) * OSX_TIMING_RATIO * 1e-9
|
||||
return None
|
||||
|
||||
|
@ -87,11 +79,11 @@ class CLDevice(Compiled):
|
|||
compiler_context = None # this is the first created context. we make an assumption they are all the same for the compiler
|
||||
def __init__(self, device:str=""):
|
||||
if CLDevice.device_ids is None:
|
||||
check(cl.clGetPlatformIDs(0, None, ctypes.byref(num_platforms := ctypes.c_uint32())))
|
||||
check(cl.clGetPlatformIDs(num_platforms.value, platform_ids := (cl.cl_platform_id * num_platforms.value)(), None))
|
||||
check(cl.clGetDeviceIDs(platform_ids[0], cl.CL_DEVICE_TYPE_DEFAULT, 0, None, ctypes.byref(num_devices := ctypes.c_uint32())))
|
||||
CLDevice.device_ids = (cl.cl_device_id * num_devices.value)()
|
||||
check(cl.clGetDeviceIDs(platform_ids[0], cl.CL_DEVICE_TYPE_DEFAULT, num_devices, CLDevice.device_ids, None))
|
||||
num_platforms = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetPlatformIDs(0, None, ctypes.byref(x))))
|
||||
platform_ids = init_c_var((cl.cl_platform_id * num_platforms.value)(), lambda x: check(cl.clGetPlatformIDs(num_platforms.value, x, None)))
|
||||
num_devices = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], cl.CL_DEVICE_TYPE_DEFAULT, 0, None, ctypes.byref(x))))
|
||||
CLDevice.device_ids = init_c_var((cl.cl_device_id * num_devices.value)(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], cl.CL_DEVICE_TYPE_DEFAULT, num_devices, x, None)))
|
||||
|
||||
self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
|
||||
self.context = checked(cl.clCreateContext(None, 1, ctypes.byref(self.device_id), cl.clCreateContext.argtypes[3](), None, ctypes.byref(status := ctypes.c_int32())), status)
|
||||
if CLDevice.compiler_context is None: CLDevice.compiler_context = self
|
||||
|
|
Loading…
Reference in New Issue