_alloc and _free with options (#3934)

* _alloc has options

* linter

* fix hsa
nimlgen 2024-03-26 19:11:41 +03:00 committed by GitHub
parent 739f47eb0f
commit e2d6f76723
9 changed files with 34 additions and 38 deletions
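
In short: the separate _alloc_with_options hook is folded into _alloc, and _free now receives the same BufferOptions, so every backend implements one allocate/free pair and branches on the options itself. As a rough sketch of the new interface (illustrative only; MyAllocator is hypothetical, import paths assumed to be tinygrad.device consistent with the imports elsewhere in this diff):

import ctypes
from tinygrad.device import Allocator, BufferOptions

class MyAllocator(Allocator):  # hypothetical backend, not part of this commit
  def _alloc(self, size:int, options:BufferOptions):
    # a single hook now sees the options; backends branch on them instead of overriding _alloc_with_options
    return (ctypes.c_uint8 * size)()
  def _free(self, opaque, options:BufferOptions):
    pass  # plain ctypes objects need no explicit free; real backends pick a deallocator based on options
  def copyin(self, dest, src:memoryview): ctypes.memmove(dest, bytes(src), len(src))
  def copyout(self, dest:memoryview, src): dest[:] = bytes(src)[:len(dest)]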

View File

@@ -12,7 +12,7 @@ class FakeProgram:
def __call__(self, *bufs, global_size, local_size, vals=(), wait=False): pass
class FakeAllocator(Allocator):
def _alloc(self, sz): return None
def _alloc(self, sz, options): return None
def copyin(self, dest, src:memoryview): pass
class TestLLaMASpeed(unittest.TestCase):

View File

@@ -148,11 +148,11 @@ class BufferXfer(BufferCopy):
class Allocator:
def alloc(self, size:int, options:Optional[BufferOptions]=None):
assert not isinstance(size, int) or size > 0, f"alloc size must be positive, getting {size}"
return self._alloc_with_options(size, options) if options is not None else self._alloc(size)
def _alloc(self, size:int): raise NotImplementedError("need alloc")
def _alloc_with_options(self, size:int, options:BufferOptions): return self._alloc(size) # TODO: override this if you support options
def free(self, opaque, size:int, options:Optional[BufferOptions]=None): self._free(opaque)
def _free(self, opaque): pass # if opaque is a Python object, you don't need a free
return self._alloc(size, options if options is not None else BufferOptions())
def _alloc(self, size:int, options:BufferOptions): raise NotImplementedError("need alloc")
def free(self, opaque, size:int, options:Optional[BufferOptions]=None):
self._free(opaque, options if options is not None else BufferOptions())
def _free(self, opaque, options:BufferOptions): pass # if opaque is a Python object, you don't need a free
def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")
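
Call sites keep using alloc/free: when no options are passed, the base class substitutes a default BufferOptions() before invoking the backend hooks, so _alloc and _free always receive a concrete options object. Continuing the hypothetical MyAllocator sketch from above:

a = MyAllocator()
buf = a.alloc(256)                               # -> a._alloc(256, BufferOptions())
pinned = a.alloc(256, BufferOptions(host=True))  # options are forwarded unchanged
a.free(buf, 256)                                 # -> a._free(buf, BufferOptions())
a.free(pinned, 256, BufferOptions(host=True))    # free carries the same options, so the backend
                                                 # can pick the matching deallocator
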
@@ -165,15 +165,15 @@ class LRUAllocator(Allocator): # pylint: disable=abstract-method
self.free_cache()
return super().alloc(size, options)
def free_cache(self):
for opaques in self.cache.values():
for opaque in opaques: self._free(opaque)
for (sz,options),opaques in self.cache.items():
for opaque in opaques: super().free(opaque, sz, options)
opaques.clear()
def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
if getenv("LRU", 1) and (options is None or not options.signal): self.cache[(size, options)].append(opaque)
else: self._free(opaque)
else: super().free(opaque, size, options)
class _MallocAllocator(LRUAllocator):
def _alloc(self, size:int): return (ctypes.c_uint8 * size)()
def _alloc(self, size:int, options:BufferOptions): return (ctypes.c_uint8 * size)()
def as_buffer(self, src) -> memoryview: return flat_mv(memoryview(src))
def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src))
def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest))
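
Because the cache is keyed by (size, options), buffers allocated with different options are never mixed, and free_cache now goes through super().free() so the options-aware _free runs when entries are evicted. A rough round trip with the _MallocAllocator above (constructed directly just for illustration):

lru = _MallocAllocator()
x = lru.alloc(1024)            # miss: falls through to _alloc(1024, BufferOptions())
lru.free(x, 1024)              # with LRU=1 this parks x under cache[(1024, None)] instead of freeing it
assert lru.alloc(1024) is x    # hit: the same ctypes buffer comes back
lru.free(x, 1024)              # park it again
lru.free_cache()               # drains the cache through super().free(opaque, sz, options)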

View File

@@ -1,7 +1,7 @@
import ctypes, collections, time, itertools
from typing import List, Any, Dict, cast, Optional, Union, Tuple
from tinygrad.helpers import GraphException, init_c_var, round_up
from tinygrad.device import Compiled, Buffer, CompiledASTRunner, BufferXfer, MultiDeviceJITGraph, update_stats
from tinygrad.device import Compiled, Buffer, BufferOptions, CompiledASTRunner, BufferXfer, MultiDeviceJITGraph, update_stats
from tinygrad.shape.symbolic import Variable
from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler
from tinygrad.features.jit import JitItem, get_input_replace, get_jit_stats, \
@@ -47,7 +47,7 @@ class HSAGraph(MultiDeviceJITGraph):
kernargs_size: Dict[Compiled, int] = collections.defaultdict(int)
for ji in self.jit_cache:
if isinstance(ji.prg, CompiledASTRunner): kernargs_size[ji.prg.device] += round_up(ctypes.sizeof(ji.prg.clprg.args_struct_t), 16)
kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz) for dev,sz in kernargs_size.items()}
kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz, BufferOptions()) for dev,sz in kernargs_size.items()}
# Fill initial arguments.
self.ji_kargs_structs: Dict[int, ctypes.Structure] = {}

View File

@@ -123,13 +123,13 @@ class CUDAAllocator(LRUAllocator):
def __init__(self, device:CUDADevice):
self.device = device
super().__init__()
def _alloc(self, size):
def _alloc(self, size, options:BufferOptions):
check(cuda.cuCtxSetCurrent(self.device.context))
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
def _alloc_with_options(self, size:int, options:BufferOptions):
if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0)))
else: raise ValueError("no options")
def _free(self, opaque): check(cuda.cuMemFree_v2(opaque))
else: return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
def _free(self, opaque, options:BufferOptions):
if options.host: return check(cuda.cuMemFreeHost(opaque))
else: check(cuda.cuMemFree_v2(opaque))
def copyin(self, dest, src:memoryview):
check(cuda.cuCtxSetCurrent(self.device.context))
host_mem = self.alloc(len(src), BufferOptions(host=True))
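
The practical effect for CUDA: pinned host staging memory is requested through the same path as device memory, and because free receives the same options, the backend can choose between cuMemFreeHost and cuMemFree_v2. Roughly (dev stands in for a CUDADevice instance):

opts = BufferOptions(host=True)
staging = dev.allocator.alloc(1 << 20, opts)   # host=True routes to cuMemHostAlloc instead of cuMemAlloc_v2
# ... stage async copies through it ...
dev.allocator.free(staging, 1 << 20, opts)     # the same opts reach _free, so the memory eventually goes back
                                               # through cuMemFreeHost rather than cuMemFree_v2 (immediately
                                               # with LRU=0, otherwise when the LRU cache is drained)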

View File

@@ -20,7 +20,7 @@ class DiskBuffer:
MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
class DiskAllocator(Allocator):
def __init__(self, device:str): self.device = device
def _alloc(self, size:int):
def _alloc(self, size:int, options):
if self.device.startswith("shm:"):
fd = _posixshmem.shm_open("/"+self.device[4:].lstrip("/"), os.O_RDWR, 0o600)
mem = mmap.mmap(fd, size, mmap.MAP_SHARED | MAP_POPULATE | MAP_LOCKED)

View File

@@ -65,15 +65,13 @@ class CLAllocator(LRUAllocator):
def __init__(self, device:CLDevice):
self.device = device
super().__init__()
def _alloc(self, size:int) -> ctypes._CData:
return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
def _alloc_with_options(self, size:int, options:BufferOptions) -> ctypes._CData:
def _alloc(self, size:int, options:BufferOptions) -> ctypes._CData:
if options.image is not None:
return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
options.image.shape[1], options.image.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status)
else: return self._alloc(size)
def _free(self, buf:ctypes._CData): check(cl.clReleaseMemObject(buf))
else: return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
def _free(self, buf:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(buf))
def copyin(self, dest:ctypes._CData, src:memoryview):
check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
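
With the merge, the image path is just another branch of _alloc: an ImageDType in the options yields a clCreateImage2D allocation, anything else falls back to clCreateBuffer. A hedged sketch (assuming dtypes.imageh is importable from tinygrad.dtype; device stands in for a CLDevice):

from tinygrad.dtype import dtypes
img = BufferOptions(image=dtypes.imageh((64, 64, 4)))   # itemsize 2 selects CL_HALF_FLOAT
tex = device.allocator.alloc(64*64*4*2, img)            # 64x64 RGBA half image via clCreateImage2D
buf = device.allocator.alloc(4096)                      # no image option: plain clCreateBuffer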

View File

@@ -105,27 +105,25 @@ class HSAAllocator(LRUAllocator):
self.device = device
super().__init__()
def _alloc(self, size:int):
c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU])
check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p())))
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf))
return buf.value
def _alloc_with_options(self, size:int, options:BufferOptions):
def _alloc(self, size:int, options:BufferOptions):
if options.host:
check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, size, 0, ctypes.byref(mem := ctypes.c_void_p())))
check(hsa.hsa_amd_agents_allow_access(2, (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent), None, mem))
return mem.value
else: raise ValueError("no options")
else:
c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU])
check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p())))
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf))
return buf.value
def _free(self, opaque:T):
def _free(self, opaque:T, options:BufferOptions):
HSADevice.synchronize_system()
check(hsa.hsa_amd_memory_pool_free(opaque))
def copyin(self, dest:T, src: memoryview):
# Async copyin sync model uses barriers on the main hw queue, since barriers are guaranteed to execute in order with all other packets.
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
mem = self._alloc_with_options(src.nbytes, BufferOptions(host=True))
mem = self._alloc(src.nbytes, BufferOptions(host=True))
ctypes.memmove(mem, from_mv(src), src.nbytes)
check(hsa.hsa_amd_memory_async_copy_on_engine(dest, self.device.agent, mem, HSADevice.cpu_agent, src.nbytes, 1, ctypes.byref(sync_signal),
copy_signal := self.device.alloc_signal(reusable=True), hsa.HSA_AMD_SDMA_ENGINE_0, True))
@@ -137,7 +135,7 @@ class HSAAllocator(LRUAllocator):
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
if not hasattr(self, 'hb'):
self.hb = [self._alloc_with_options(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)]
self.hb = [self._alloc(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)]
self.hb_signals = [self.device.alloc_signal(reusable=False) for _ in range(2)]
self.hb_polarity = 0
self.sdma = [hsa.HSA_AMD_SDMA_ENGINE_0, hsa.HSA_AMD_SDMA_ENGINE_1]
@@ -262,7 +260,7 @@ class HSADevice(Compiled):
def _new_kernargs_region(self, sz:int):
if hasattr(self, 'kernarg_start_addr'): self.delayed_free.append(self.kernarg_start_addr)
self.kernarg_start_addr: int = self.allocator._alloc(sz)
self.kernarg_start_addr: int = self.allocator._alloc(sz, BufferOptions())
self.kernarg_next_addr = self.kernarg_start_addr
self.kernarg_pool_sz: int = sz
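
Note the pattern for internal callers: code that bypasses alloc() and calls _alloc directly, like this kernargs region and the HSAGraph kernargs pool above, now has to pass an explicit BufferOptions() even when only the defaults are wanted.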

View File

@@ -67,7 +67,7 @@ class MetalAllocator(LRUAllocator):
for x in self.track_cross_device: x.synchronize()
self.track_cross_device.clear()
return super().free_cache()
def _alloc(self, size:int) -> Any:
def _alloc(self, size:int, options) -> Any:
ret = self.device.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
if ret is None: raise MemoryError(f"Metal OOM while allocating {size=}")
return ret
@@ -82,7 +82,7 @@ class MetalAllocator(LRUAllocator):
ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, len(src), Metal.MTLResourceStorageModeShared, None)
if ret: self.device.mv_in_metal.append(src)
return ret
def _free(self, opaque:Any): opaque.release()
def _free(self, opaque:Any, options): opaque.release()
def as_buffer(self, src:Any) -> memoryview:
self.device.synchronize()
return src.contents().as_buffer(src.length())

View File

@@ -188,7 +188,7 @@ class PythonCompiler(Compiler):
def compile(self, src:str) -> bytes: return base64.b64decode(src)
class PythonAllocator(Allocator):
def _alloc(self, size): return memoryview(bytearray(size))
def _alloc(self, size, options): return memoryview(bytearray(size))
def copyin(self, dest, src:memoryview): dest[:] = src
def copyout(self, dest:memoryview, src): dest[:] = src
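
Since PythonAllocator is pure Python, it doubles as a small end-to-end check of the new interface (rough sketch; constructing the allocator directly is just for illustration):

alloc = PythonAllocator()
buf = alloc.alloc(16)                          # -> _alloc(16, BufferOptions()), a fresh memoryview
alloc.copyin(buf, memoryview(bytearray(range(16))))
out = memoryview(bytearray(16))
alloc.copyout(out, buf)                        # round-trips the 16 bytes
alloc.free(buf, 16)                            # -> _free(buf, BufferOptions()), a no-op for Python objects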