mirror of https://github.com/commaai/tinygrad.git
_alloc and _free with options (#3934)
* _alloc has options * linter * fix hsa
This commit is contained in:
parent
739f47eb0f
commit
e2d6f76723
|
@ -12,7 +12,7 @@ class FakeProgram:
|
|||
def __call__(self, *bufs, global_size, local_size, vals=(), wait=False): pass
|
||||
|
||||
class FakeAllocator(Allocator):
|
||||
def _alloc(self, sz): return None
|
||||
def _alloc(self, sz, options): return None
|
||||
def copyin(self, dest, src:memoryview): pass
|
||||
|
||||
class TestLLaMASpeed(unittest.TestCase):
|
||||
|
|
|
@ -148,11 +148,11 @@ class BufferXfer(BufferCopy):
|
|||
class Allocator:
|
||||
def alloc(self, size:int, options:Optional[BufferOptions]=None):
|
||||
assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}"
|
||||
return self._alloc_with_options(size, options) if options is not None else self._alloc(size)
|
||||
def _alloc(self, size:int): raise NotImplementedError("need alloc")
|
||||
def _alloc_with_options(self, size:int, options:BufferOptions): return self._alloc(size) # TODO: override this if you support options
|
||||
def free(self, opaque, size:int, options:Optional[BufferOptions]=None): self._free(opaque)
|
||||
def _free(self, opaque): pass # if opaque is a Python object, you don't need a free
|
||||
return self._alloc(size, options if options is not None else BufferOptions())
|
||||
def _alloc(self, size:int, options:BufferOptions): raise NotImplementedError("need alloc")
|
||||
def free(self, opaque, size:int, options:Optional[BufferOptions]=None):
|
||||
self._free(opaque, options if options is not None else BufferOptions())
|
||||
def _free(self, opaque, options:BufferOptions): pass # if opaque is a Python object, you don't need a free
|
||||
def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
|
||||
def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")
|
||||
|
||||
|
@ -165,15 +165,15 @@ class LRUAllocator(Allocator): # pylint: disable=abstract-method
|
|||
self.free_cache()
|
||||
return super().alloc(size, options)
|
||||
def free_cache(self):
|
||||
for opaques in self.cache.values():
|
||||
for opaque in opaques: self._free(opaque)
|
||||
for (sz,options),opaques in self.cache.items():
|
||||
for opaque in opaques: super().free(opaque, sz, options)
|
||||
opaques.clear()
|
||||
def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
|
||||
if getenv("LRU", 1) and (options is None or not options.signal): self.cache[(size, options)].append(opaque)
|
||||
else: self._free(opaque)
|
||||
else: super().free(opaque, size, options)
|
||||
|
||||
class _MallocAllocator(LRUAllocator):
|
||||
def _alloc(self, size:int): return (ctypes.c_uint8 * size)()
|
||||
def _alloc(self, size:int, options:BufferOptions): return (ctypes.c_uint8 * size)()
|
||||
def as_buffer(self, src) -> memoryview: return flat_mv(memoryview(src))
|
||||
def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src))
|
||||
def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest))
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import ctypes, collections, time, itertools
|
||||
from typing import List, Any, Dict, cast, Optional, Union, Tuple
|
||||
from tinygrad.helpers import GraphException, init_c_var, round_up
|
||||
from tinygrad.device import Compiled, Buffer, CompiledASTRunner, BufferXfer, MultiDeviceJITGraph, update_stats
|
||||
from tinygrad.device import Compiled, Buffer, BufferOptions, CompiledASTRunner, BufferXfer, MultiDeviceJITGraph, update_stats
|
||||
from tinygrad.shape.symbolic import Variable
|
||||
from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler
|
||||
from tinygrad.features.jit import JitItem, get_input_replace, get_jit_stats, \
|
||||
|
@ -47,7 +47,7 @@ class HSAGraph(MultiDeviceJITGraph):
|
|||
kernargs_size: Dict[Compiled, int] = collections.defaultdict(int)
|
||||
for ji in self.jit_cache:
|
||||
if isinstance(ji.prg, CompiledASTRunner): kernargs_size[ji.prg.device] += round_up(ctypes.sizeof(ji.prg.clprg.args_struct_t), 16)
|
||||
kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz) for dev,sz in kernargs_size.items()}
|
||||
kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz, BufferOptions()) for dev,sz in kernargs_size.items()}
|
||||
|
||||
# Fill initial arguments.
|
||||
self.ji_kargs_structs: Dict[int, ctypes.Structure] = {}
|
||||
|
|
|
@ -123,13 +123,13 @@ class CUDAAllocator(LRUAllocator):
|
|||
def __init__(self, device:CUDADevice):
|
||||
self.device = device
|
||||
super().__init__()
|
||||
def _alloc(self, size):
|
||||
def _alloc(self, size, options:BufferOptions):
|
||||
check(cuda.cuCtxSetCurrent(self.device.context))
|
||||
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
|
||||
def _alloc_with_options(self, size:int, options:BufferOptions):
|
||||
if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0)))
|
||||
else: raise ValueError("no options")
|
||||
def _free(self, opaque): check(cuda.cuMemFree_v2(opaque))
|
||||
else: return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
|
||||
def _free(self, opaque, options:BufferOptions):
|
||||
if options.host: return check(cuda.cuMemFreeHost(opaque))
|
||||
else: check(cuda.cuMemFree_v2(opaque))
|
||||
def copyin(self, dest, src:memoryview):
|
||||
check(cuda.cuCtxSetCurrent(self.device.context))
|
||||
host_mem = self.alloc(len(src), BufferOptions(host=True))
|
||||
|
|
|
@ -20,7 +20,7 @@ class DiskBuffer:
|
|||
MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
|
||||
class DiskAllocator(Allocator):
|
||||
def __init__(self, device:str): self.device = device
|
||||
def _alloc(self, size:int):
|
||||
def _alloc(self, size:int, options):
|
||||
if self.device.startswith("shm:"):
|
||||
fd = _posixshmem.shm_open("/"+self.device[4:].lstrip("/"), os.O_RDWR, 0o600)
|
||||
mem = mmap.mmap(fd, size, mmap.MAP_SHARED | MAP_POPULATE | MAP_LOCKED)
|
||||
|
|
|
@ -65,15 +65,13 @@ class CLAllocator(LRUAllocator):
|
|||
def __init__(self, device:CLDevice):
|
||||
self.device = device
|
||||
super().__init__()
|
||||
def _alloc(self, size:int) -> ctypes._CData:
|
||||
return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
|
||||
def _alloc_with_options(self, size:int, options:BufferOptions) -> ctypes._CData:
|
||||
def _alloc(self, size:int, options:BufferOptions) -> ctypes._CData:
|
||||
if options.image is not None:
|
||||
return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
|
||||
cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
|
||||
options.image.shape[1], options.image.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status)
|
||||
else: return self._alloc(size)
|
||||
def _free(self, buf:ctypes._CData): check(cl.clReleaseMemObject(buf))
|
||||
else: return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
|
||||
def _free(self, buf:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(buf))
|
||||
def copyin(self, dest:ctypes._CData, src:memoryview):
|
||||
check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
|
||||
self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
|
||||
|
|
|
@ -105,27 +105,25 @@ class HSAAllocator(LRUAllocator):
|
|||
self.device = device
|
||||
super().__init__()
|
||||
|
||||
def _alloc(self, size:int):
|
||||
c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU])
|
||||
check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p())))
|
||||
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf))
|
||||
return buf.value
|
||||
|
||||
def _alloc_with_options(self, size:int, options:BufferOptions):
|
||||
def _alloc(self, size:int, options:BufferOptions):
|
||||
if options.host:
|
||||
check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, size, 0, ctypes.byref(mem := ctypes.c_void_p())))
|
||||
check(hsa.hsa_amd_agents_allow_access(2, (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent), None, mem))
|
||||
return mem.value
|
||||
else: raise ValueError("no options")
|
||||
else:
|
||||
c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU])
|
||||
check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p())))
|
||||
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf))
|
||||
return buf.value
|
||||
|
||||
def _free(self, opaque:T):
|
||||
def _free(self, opaque:T, options:BufferOptions):
|
||||
HSADevice.synchronize_system()
|
||||
check(hsa.hsa_amd_memory_pool_free(opaque))
|
||||
|
||||
def copyin(self, dest:T, src: memoryview):
|
||||
# Async copyin sync model uses barriers on the main hw queue, since barriers are guaranteed to execute in order with all other packets.
|
||||
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
|
||||
mem = self._alloc_with_options(src.nbytes, BufferOptions(host=True))
|
||||
mem = self._alloc(src.nbytes, BufferOptions(host=True))
|
||||
ctypes.memmove(mem, from_mv(src), src.nbytes)
|
||||
check(hsa.hsa_amd_memory_async_copy_on_engine(dest, self.device.agent, mem, HSADevice.cpu_agent, src.nbytes, 1, ctypes.byref(sync_signal),
|
||||
copy_signal := self.device.alloc_signal(reusable=True), hsa.HSA_AMD_SDMA_ENGINE_0, True))
|
||||
|
@ -137,7 +135,7 @@ class HSAAllocator(LRUAllocator):
|
|||
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
|
||||
|
||||
if not hasattr(self, 'hb'):
|
||||
self.hb = [self._alloc_with_options(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)]
|
||||
self.hb = [self._alloc(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)]
|
||||
self.hb_signals = [self.device.alloc_signal(reusable=False) for _ in range(2)]
|
||||
self.hb_polarity = 0
|
||||
self.sdma = [hsa.HSA_AMD_SDMA_ENGINE_0, hsa.HSA_AMD_SDMA_ENGINE_1]
|
||||
|
@ -262,7 +260,7 @@ class HSADevice(Compiled):
|
|||
|
||||
def _new_kernargs_region(self, sz:int):
|
||||
if hasattr(self, 'kernarg_start_addr'): self.delayed_free.append(self.kernarg_start_addr)
|
||||
self.kernarg_start_addr: int = self.allocator._alloc(sz)
|
||||
self.kernarg_start_addr: int = self.allocator._alloc(sz, BufferOptions())
|
||||
self.kernarg_next_addr = self.kernarg_start_addr
|
||||
self.kernarg_pool_sz: int = sz
|
||||
|
||||
|
|
|
@ -67,7 +67,7 @@ class MetalAllocator(LRUAllocator):
|
|||
for x in self.track_cross_device: x.synchronize()
|
||||
self.track_cross_device.clear()
|
||||
return super().free_cache()
|
||||
def _alloc(self, size:int) -> Any:
|
||||
def _alloc(self, size:int, options) -> Any:
|
||||
ret = self.device.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
|
||||
if ret is None: raise MemoryError(f"Metal OOM while allocating {size=}")
|
||||
return ret
|
||||
|
@ -82,7 +82,7 @@ class MetalAllocator(LRUAllocator):
|
|||
ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, len(src), Metal.MTLResourceStorageModeShared, None)
|
||||
if ret: self.device.mv_in_metal.append(src)
|
||||
return ret
|
||||
def _free(self, opaque:Any): opaque.release()
|
||||
def _free(self, opaque:Any, options): opaque.release()
|
||||
def as_buffer(self, src:Any) -> memoryview:
|
||||
self.device.synchronize()
|
||||
return src.contents().as_buffer(src.length())
|
||||
|
|
|
@ -188,7 +188,7 @@ class PythonCompiler(Compiler):
|
|||
def compile(self, src:str) -> bytes: return base64.b64decode(src)
|
||||
|
||||
class PythonAllocator(Allocator):
|
||||
def _alloc(self, size): return memoryview(bytearray(size))
|
||||
def _alloc(self, size, options): return memoryview(bytearray(size))
|
||||
def copyin(self, dest, src:memoryview): dest[:] = src
|
||||
def copyout(self, dest:memoryview, src): dest[:] = src
|
||||
|
||||
|
|
Loading…
Reference in New Issue