mirror of https://github.com/commaai/tinygrad.git
Minor improvements + cleanup to `ops_gpu.py` (#1006)
* Minor improvements + cleanup to `ops_gpu.py`
* Add some previously undocumented environment variables from `ops_gpu.py` to `env_vars.md`
* Update debug print for OpenCL to print the devices that will be used post-filtering with `CL_EXCLUDE`
* Remove a couple unused or superfluous variables and assignments
* Use `fromimport` shorthand to shave off a couple precious LOC
* Couple small whitespace changes to clean things up
* Revert change to ordering of OpenCL devices
* Small refactor for OpenCL context creation
This commit is contained in:
parent 5428b5d774
commit 651d6ea457
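For readers unfamiliar with it, the `fromimport` shorthand mentioned in the commit message is a tiny lazy-import helper from `tinygrad.helpers`; a minimal sketch of the idea (paraphrased, not the exact upstream source):

```python
# Minimal sketch of tinygrad's fromimport helper (see tinygrad/helpers.py for the real one):
# import a module and pull a single attribute out of it in one expression.
def fromimport(mod: str, frm: str):
  return getattr(__import__(mod, fromlist=[frm]), frm)

# So the two-line conditional import
#   from extra.helpers import enable_early_exec
#   early_exec = enable_early_exec()
# collapses to
#   early_exec = fromimport("extra.helpers", "enable_early_exec")()
```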
@ -38,6 +38,9 @@ ENABLE_METHOD_CACHE | [1] | enable method cache (this is the default)
 EARLY_STOPPING | [# > 0] | stop after this many kernels
 DISALLOW_ASSIGN | [1] | disallow assignment of tensors
 NATIVE_EXPLOG | [1] | enable using native exp and log
+CL_EXCLUDE | [name0,name1] | comma-separated list of device names to exclude when using OpenCL GPU backend (like `CL_EXCLUDE=gfx1036`)
+CL_PLATFORM | [# >= 0] | index of the OpenCL [platform](https://documen.tician.de/pyopencl/runtime_platform.html#pyopencl.Platform) to run on. Defaults to 0.
+RDNA | [1] | enable the specialized [RDNA 3](https://en.wikipedia.org/wiki/RDNA_3) assembler for AMD 7000-series GPUs. If not set, defaults to generic OpenCL codegen backend.
 
 ## File Specific Variables
 
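To illustrate how the backend combines `CL_PLATFORM` and `CL_EXCLUDE`, here is a hedged standalone sketch that mirrors the device-filtering logic in the `ops_gpu.py` hunk below (plain pyopencl, not tinygrad internals; the real code also falls back to CPU devices, which is omitted here):

```python
import os
import pyopencl as cl

# Pick the platform indexed by CL_PLATFORM, then drop any device whose name
# appears in the comma-separated CL_EXCLUDE list (e.g. CL_EXCLUDE=gfx1036).
exclude = os.getenv("CL_EXCLUDE", "").split(",")
platform = cl.get_platforms()[int(os.getenv("CL_PLATFORM", "0"))]
devices = [d for d in platform.get_devices(device_type=cl.device_type.GPU) if d.name not in exclude]
print([d.name for d in devices])
```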
@ -9,21 +9,21 @@ from tinygrad.runtime.lib import RawBufferCopyInOut
 from tinygrad.codegen.cstyle import CStyleCodegen, CStyleLanguage
 
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 FLOAT16 = getenv("FLOAT16", 0)
 
 # TODO: if you fork and exit the child process after creating anything with cl on AMD, it hangs on e.wait()
 ROCM_LLVM_PATH = pathlib.Path("/opt/rocm/llvm/bin")
 #ROCM_LLVM_PATH = pathlib.Path(__file__).parent.parent.parent.parent / "extra/rocm/build/llvm-project/bin"
 if DEBUG >= 5:
-  from extra.helpers import enable_early_exec
-  early_exec = enable_early_exec()
+  early_exec = fromimport("extra.helpers", "enable_early_exec")()
 
 class _CL:
   def __init__(self):
     platforms: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()]) if len(y)]
-    if DEBUG >= 1: print(f"using {platforms[getenv('CL_PLATFORM', 0)]}")
-    self.cl_ctx: cl.Context = cl.Context(devices=[x for x in platforms[getenv('CL_PLATFORM', 0)] if x.name not in getenv('CL_EXCLUDE', "").split(",")])
+    devices: List[cl.Device] = [x for x in platforms[getenv('CL_PLATFORM', 0)] if x.name not in getenv('CL_EXCLUDE', '').split(',')]
+    if DEBUG >= 1: print(f"using devices: {[d.hashable_model_and_version_identifier for d in devices]}")
+    self.cl_ctx: cl.Context = cl.Context(devices=devices)
     self.cl_queue: List[cl.CommandQueue] = [cl.CommandQueue(self.cl_ctx, device=device, properties=cl.command_queue_properties.PROFILING_ENABLE) for device in self.cl_ctx.devices]
 
   def synchronize(self):
     for q in self.cl_queue: q.finish()
 CL = _CL()
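The refactor keeps one profiling-enabled command queue per device inside a single shared context, and `synchronize` drains all of them; a self-contained pyopencl sketch of that pattern (illustrative only, not tinygrad code):

```python
import pyopencl as cl

# One context spanning the chosen devices, one profiling-enabled queue per
# device, and a synchronize() that finishes every queue.
devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
ctx = cl.Context(devices=devices)
queues = [cl.CommandQueue(ctx, device=d, properties=cl.command_queue_properties.PROFILING_ENABLE)
          for d in devices]

def synchronize():
  for q in queues: q.finish()
```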
@ -40,10 +40,12 @@ class CLBuffer(RawBufferCopyInOut):
       buf = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size * dtype.itemsize)
     setattr(buf, 'device', int(device)) # device is tracked on the underlying buffer
     super().__init__(size, dtype, buf)
-  def _copyin(self, x:np.ndarray):
+
+  def _copyin(self, x: np.ndarray):
     assert not self.dtype.name.startswith("image"), f"can't copyin images {self.dtype}"
     cl.enqueue_copy(CL.cl_queue[self._buf.device], self._buf, np.require(x, requirements='C'), is_blocking=False)
-  def _copyout(self, x:np.ndarray):
+
+  def _copyout(self, x: np.ndarray):
     assert not self.dtype.name.startswith("image"), f"can't copyout images {self.dtype}"
     cl.enqueue_copy(CL.cl_queue[self._buf.device], x, self._buf, is_blocking=True)
 
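The `device` index stashed on the raw buffer (the `setattr` above) is what routes each copy to the matching queue; a hedged pyopencl round-trip sketch of the same idea (illustrative, not the tinygrad API):

```python
import numpy as np
import pyopencl as cl

devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
ctx = cl.Context(devices=devices)
queues = [cl.CommandQueue(ctx, device=d) for d in devices]

x = np.arange(16, dtype=np.float32)
buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, x.nbytes)
setattr(buf, 'device', 0)  # remember which device/queue owns this buffer

# copyin: non-blocking host-to-device copy on the owning device's queue
cl.enqueue_copy(queues[buf.device], buf, np.require(x, requirements='C'), is_blocking=False)
# copyout: blocking device-to-host copy on the same queue
out = np.empty_like(x)
cl.enqueue_copy(queues[buf.device], out, buf, is_blocking=True)
assert np.array_equal(x, out)
```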
@ -58,8 +60,7 @@ class CLProgram:
     self.clprg = self._clprg.__getattr__(name)
     if DEBUG >= 5 and not OSX:
       if 'Adreno' in CL.cl_ctx.devices[0].name:
-        from disassemblers.adreno import disasm
-        disasm(self.binary())
+        fromimport('disassemblers.adreno', 'disasm')(self.binary())
       elif CL.cl_ctx.devices[0].name.startswith('gfx'):
         asm = early_exec(([ROCM_LLVM_PATH / "llvm-objdump", '-d', '-'], self.binary()))
         print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
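For AMD `gfx` devices, the `early_exec` call above effectively pipes the compiled binary through `llvm-objdump` (the helper runs the command in a process forked early, presumably to dodge the fork hang noted in the TODO near the top of the file). A rough plain-`subprocess` equivalent, assuming ROCm's LLVM lives at the usual path:

```python
import pathlib
import subprocess

ROCM_LLVM_PATH = pathlib.Path("/opt/rocm/llvm/bin")

def disasm_gfx(binary: bytes) -> str:
  # Feed the program binary to llvm-objdump on stdin and strip s_code_end padding,
  # like the DEBUG >= 5 branch above.
  asm = subprocess.check_output([str(ROCM_LLVM_PATH / "llvm-objdump"), "-d", "-"], input=binary)
  return "\n".join(x for x in asm.decode("utf-8").split("\n") if "s_code_end" not in x)
```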
@ -91,7 +92,5 @@ class CLCodegen(CStyleCodegen):
     half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable",
     barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
     gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)], uses_vload=True)
-  supports_float4_alu = True
-  supports_float4 = True
 
 GPUBuffer = Compiled(CLBuffer, fromimport("tinygrad.codegen.assembly_rdna", "RDNACodegen") if getenv("RDNA") else CLCodegen, CLProgram, CL.synchronize)
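Finally, the `Compiled(...)` wiring above picks the codegen from the `RDNA` variable at runtime; a hypothetical end-to-end invocation (whether the RDNA 3 assembler actually works depends on your GPU):

```python
import os

# Hypothetical run: use the OpenCL backend and opt into the RDNA 3 assembler.
# Leaving RDNA unset falls back to the generic CLCodegen path.
os.environ["GPU"] = "1"
os.environ["RDNA"] = "1"

from tinygrad.tensor import Tensor
a, b = Tensor.rand(64, 64), Tensor.rand(64, 64)
print((a @ b).sum().numpy())
```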