diff --git a/setup.py b/setup.py index 005926b1..b2f3c541 100644 --- a/setup.py +++ b/setup.py @@ -21,9 +21,7 @@ setup(name='tinygrad', "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License" ], - install_requires=["numpy", - "pyobjc-framework-Metal; platform_system=='Darwin'", - "pyobjc-framework-libdispatch; platform_system=='Darwin'"], + install_requires=["numpy"], python_requires='>=3.8', extras_require={ 'llvm': ["llvmlite"], diff --git a/test/test_metal.py b/test/test_metal.py new file mode 100644 index 00000000..7999428e --- /dev/null +++ b/test/test_metal.py @@ -0,0 +1,54 @@ +import unittest +from tinygrad.device import CompileError, Device +if Device.DEFAULT=="METAL": + from tinygrad.runtime.ops_metal import MetalDevice, MetalCompiler, MetalProgram +@unittest.skipIf(Device.DEFAULT!="METAL", "Metal support required") +class TestMetal(unittest.TestCase): + def test_alloc_oom(self): + device = MetalDevice("metal") + with self.assertRaises(MemoryError): + device.allocator.alloc(10000000000000000000) + + def test_compile_error(self): + device = MetalDevice("metal") + compiler = MetalCompiler(device) + with self.assertRaises(CompileError): + compiler.compile("this is not valid metal") + + def test_compile_success(self): + device = MetalDevice("metal") + compiler = MetalCompiler(device) + ret = compiler.compile(""" +#include <metal_stdlib> + using namespace metal; + kernel void E_4n1(device int* data0, const device int* data1, const device int* data2, + uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) { + int val0 = *(data1+0); + int val1 = *(data1+1); + int val2 = *(data1+2); + int val3 = *(data1+3); + int val4 = *(data2+0); + int val5 = *(data2+1); + int val6 = *(data2+2); + int val7 = *(data2+3); + *(data0+0) = (val0+val4); + *(data0+1) = (val1+val5); + *(data0+2) = (val2+val6); + *(data0+3) = (val3+val7); + } +""") + assert ret is not None + + def test_failed_newLibraryWithData(self): + device = 
MetalDevice("metal") + compiler = MetalCompiler(device) + compiled = compiler.compile(""" +#include <metal_stdlib> +kernel void r_5(device int* data0, const device int* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]){ + data0[0] = 0; +} +""") + with self.assertRaises(RuntimeError): + compiled = compiled[:40] # corrupt the compiled program + MetalProgram(device, "r_5", compiled) + diff --git a/tinygrad/runtime/graph/metal.py b/tinygrad/runtime/graph/metal.py index 02435fb5..ff8df6d1 100644 --- a/tinygrad/runtime/graph/metal.py +++ b/tinygrad/runtime/graph/metal.py @@ -1,12 +1,20 @@ from typing import List, Any, Dict, cast, Optional -import Metal +import ctypes from tinygrad.dtype import dtypes from tinygrad.helpers import dedup, getenv from tinygrad.device import Buffer from tinygrad.engine.realize import ExecItem, CompiledRunner from tinygrad.engine.jit import GraphRunner, GraphException from tinygrad.shape.symbolic import Variable -from tinygrad.runtime.ops_metal import wait_check +from tinygrad.runtime.ops_metal import wait_check, msg, libobjc, to_struct, objc_instance,\ + MTLResourceOptions, elapsed_time, objc_id + +class MTLIndirectCommandType: + MTLIndirectCommandTypeConcurrentDispatch = (1 << 5) + +class MTLResourceUsage: + MTLResourceUsageRead = 0b01 + MTLResourceUsageWrite = 0b10 class MetalGraph(GraphRunner): def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]): @@ -14,58 +22,64 @@ class MetalGraph(GraphRunner): if not all(isinstance(ji.prg, CompiledRunner) for ji in jit_cache): raise GraphException # create metal batch exec - icb_descriptor = Metal.MTLIndirectCommandBufferDescriptor.new() - icb_descriptor.setCommandTypes_(Metal.MTLIndirectCommandType(Metal.MTLIndirectCommandTypeConcurrentDispatch)) - icb_descriptor.setInheritBuffers_(False) - icb_descriptor.setInheritPipelineState_(False) - icb_descriptor.setMaxKernelBufferBindCount_(31) - self.icb = 
self.device.device.newIndirectCommandBufferWithDescriptor_maxCommandCount_options_(icb_descriptor, len(self.jit_cache), - Metal.MTLResourceOptions(0)) - if self.icb is None: raise GraphException("create indirect command buffer failed, does your system support this?") - self.needs_icb_fix = int(type(self.icb).__name__ != "AGXG15XFamilyIndirectCommandBuffer") # not required on M3 + icb_descriptor = msg(libobjc.objc_getClass(b"MTLIndirectCommandBufferDescriptor"), "new", restype=objc_instance) + msg(icb_descriptor, "setCommandTypes:", MTLIndirectCommandType.MTLIndirectCommandTypeConcurrentDispatch) + msg(icb_descriptor, "setInheritBuffers:", False) + msg(icb_descriptor, "setInheritPipelineState:", False) + msg(icb_descriptor, "setMaxKernelBufferBindCount:", 31) + + self.icb = msg(self.device.device, "newIndirectCommandBufferWithDescriptor:maxCommandCount:options:", + icb_descriptor, len(self.jit_cache), MTLResourceOptions.MTLResourceCPUCacheModeDefaultCache, restype=objc_instance) + if self.icb.value is None: raise GraphException("create indirect command buffer failed, does your system support this?") + icb_label = bytes(msg(msg(self.icb, "description", restype=objc_instance), "UTF8String", restype=ctypes.c_char_p)).decode() + self.needs_icb_fix = int("AGXG15XFamilyIndirectCommandBuffer" not in icb_label) # not required on M3 if len(self.vars): self.int_buf = self.device.allocator.alloc(len(self.vars)*dtypes.int32.itemsize) all_resources = [self.int_buf.buf] if len(self.vars) else [] all_pipelines = [] for j,ji in enumerate(self.jit_cache): prg: CompiledRunner = cast(CompiledRunner, ji.prg) - icb_command = self.icb.indirectComputeCommandAtIndex_(j) + icb_command = msg(self.icb, "indirectComputeCommandAtIndex:", j, restype=objc_instance) all_pipelines.append(prg.clprg.pipeline_state) - icb_command.setComputePipelineState_(prg.clprg.pipeline_state) + msg(icb_command, "setComputePipelineState:", prg.clprg.pipeline_state) for i,b in enumerate(ji.bufs): if b is not None 
and b not in input_rawbuffers: - icb_command.setKernelBuffer_offset_atIndex_(b._buf.buf, b._buf.offset, i) + msg(icb_command, "setKernelBuffer:offset:atIndex:", b._buf.buf, b._buf.offset, i) all_resources.append(b._buf.buf) - for i,v in enumerate(prg.p.vars): icb_command.setKernelBuffer_offset_atIndex_(self.int_buf.buf, self.vars.index(v)*4, len(ji.bufs)+i) + for i,v in enumerate(prg.p.vars): msg(icb_command, "setKernelBuffer:offset:atIndex:", self.int_buf.buf, self.vars.index(v)*4, len(ji.bufs)+i) global_size, local_size = prg.p.launch_dims(var_vals) - icb_command.concurrentDispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size)) - icb_command.setBarrier() + msg(icb_command, "concurrentDispatchThreadgroups:threadsPerThreadgroup:", to_struct(*global_size), to_struct(*local_size)) + msg(icb_command, "setBarrier") self.all_resources = dedup(all_resources) self.all_pipelines = dedup(all_pipelines) self.command_buffer: Any = None - if len(self.vars): self.int_buf_view = self.int_buf.buf.contents().as_buffer(self.int_buf.buf.length()).cast('i') - self.range = Metal.MTLIndirectCommandBufferExecutionRangeMake(0, len(self.jit_cache)) + if len(self.vars): self.int_buf_view = self.device.allocator.as_buffer(self.int_buf).cast('i') + self.range = to_struct(0, len(self.jit_cache)) def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False) -> Optional[float]: + if self.command_buffer is not None and self.command_buffer in self.device.mtl_buffers_in_flight: wait_check(self.command_buffer) all_resources = dedup(self.all_resources + [x._buf.buf for x in input_rawbuffers]) for (j,i),input_idx in self.input_replace.items(): - self.icb.indirectComputeCommandAtIndex_(j).setKernelBuffer_offset_atIndex_(input_rawbuffers[input_idx]._buf.buf, + computeCommand = msg(self.icb, "indirectComputeCommandAtIndex:", j, restype=objc_id) + msg(computeCommand, "setKernelBuffer:offset:atIndex:", 
input_rawbuffers[input_idx]._buf.buf, input_rawbuffers[input_idx]._buf.offset, i) for j, global_dims, local_dims in self.updated_launch_dims(var_vals): prg = cast(CompiledRunner, self.jit_cache[j].prg) global_size, local_size = global_dims or prg.p.global_size, local_dims or prg.p.local_size - self.icb.indirectComputeCommandAtIndex_(j).concurrentDispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), - Metal.MTLSize(*local_size)) + computeCommand = msg(self.icb, "indirectComputeCommandAtIndex:", j) + msg(computeCommand, "concurrentDispatchThreadgroups:threadsPerThreadgroup:", + to_struct(*cast(tuple, global_size)), to_struct(*cast(tuple, local_size))) for j, var in enumerate(self.vars): self.int_buf_view[j] = var_vals[var] - command_buffer = self.device.mtl_queue.commandBuffer() - encoder = command_buffer.computeCommandEncoder() - encoder.useResources_count_usage_(all_resources, len(all_resources), Metal.MTLResourceUsageRead | Metal.MTLResourceUsageWrite) + command_buffer = msg(self.device.mtl_queue, "commandBuffer", restype=objc_instance) + encoder = msg(command_buffer, "computeCommandEncoder", restype=objc_instance) + msg(encoder, "useResources:count:usage:", (objc_id * len(all_resources))(*all_resources), len(all_resources), + MTLResourceUsage.MTLResourceUsageRead | MTLResourceUsage.MTLResourceUsageWrite) # NOTE: the pipelines likely need to be added to the used resources to fix the crash on M1/M2, but I haven't figured out how # this is a O(n) hack to get them used. 
what should work is: @@ -74,16 +88,16 @@ class MetalGraph(GraphRunner): # to repro the crash (which can also crash other running GPU apps), run with FIX_METAL_ICB=0 if getenv("FIX_METAL_ICB", self.needs_icb_fix): for ps in self.all_pipelines: - encoder.setComputePipelineState_(ps) - encoder.dispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(0,0,0), Metal.MTLSize(0,0,0)) + msg(encoder, "setComputePipelineState:", ps) + msg(encoder, "dispatchThreadgroups:threadsPerThreadgroup:", to_struct(0,0,0), to_struct(0,0,0)) - encoder.executeCommandsInBuffer_withRange_(self.icb, self.range) - encoder.endEncoding() - command_buffer.commit() + msg(encoder, "executeCommandsInBuffer:withRange:", self.icb, self.range) + msg(encoder, "endEncoding") + msg(command_buffer, "commit") self.command_buffer = command_buffer if wait: wait_check(command_buffer) - return command_buffer.GPUEndTime() - command_buffer.GPUStartTime() + return elapsed_time(command_buffer) self.device.mtl_buffers_in_flight.append(command_buffer) return None diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index faf467cc..1b996420 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -1,15 +1,61 @@ from __future__ import annotations import os, subprocess, pathlib, ctypes, tempfile, functools -import Metal, libdispatch -from typing import List, Any, Tuple, Optional -from tinygrad.helpers import prod, getenv, DEBUG, unwrap2 +from typing import List, Any, Tuple, Optional, cast, TypeVar +from tinygrad.helpers import prod, getenv, DEBUG from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator from tinygrad.renderer.cstyle import MetalRenderer +class objc_id(ctypes.c_void_p): # This prevents ctypes from converting response to plain int, and dict.fromkeys() can use it to dedup + def __hash__(self): return hash(self.value) + def __eq__(self, other): return self.value == other.value + +class objc_instance(objc_id): # method with name "new", "alloc" 
should be freed after use + def __del__(self): msg(self, "release") + +@functools.lru_cache(None) +def sel(name: str): return libobjc.sel_registerName(name.encode()) + +class MTLResourceOptions: + MTLResourceCPUCacheModeDefaultCache = 0 + MTLResourceStorageModeShared = 0 << 4 + +class MTLPipelineOption: + MTLPipelineOptionNone = 0 + +libobjc = ctypes.CDLL("/usr/lib/libobjc.dylib") +libmetal = ctypes.CDLL("/System/Library/Frameworks/Metal.framework/Metal") +# Must be loaded for default Metal Device: https://developer.apple.com/documentation/metal/1433401-mtlcreatesystemdefaultdevice?language=objc +ctypes.CDLL("/System/Library/Frameworks/CoreGraphics.framework/CoreGraphics") +libdispatch = ctypes.CDLL("/usr/lib/libSystem.dylib") # libdispatch is part of libSystem on mac +libobjc.objc_getClass.restype = objc_id +libobjc.sel_registerName.restype = objc_id +libmetal.MTLCreateSystemDefaultDevice.restype = objc_instance +libdispatch.dispatch_data_create.restype = objc_instance + +T = TypeVar("T") +# Ignore mypy error reporting incompatible default, because typevar default only works on python 3.12 +def msg(ptr: objc_id, selector: str, /, *args: Any, restype: type[T] = objc_id) -> T: # type: ignore [assignment] + sender = libobjc["objc_msgSend"] # Using attribute access returns a new reference so setting restype is safe + sender.restype = restype + return sender(ptr, sel(selector), *args) + +def to_ns_str(s: str): return msg(libobjc.objc_getClass(b"NSString"), "stringWithUTF8String:", s.encode(), restype=objc_instance) + +def to_struct(*t: int, _type: type = ctypes.c_ulong): + class Struct(ctypes.Structure): pass + Struct._fields_ = [(f"field{i}", _type) for i in range(len(t))] + return Struct(*t) + def wait_check(cbuf: Any): - cbuf.waitUntilCompleted() - if (error := cbuf.error()) is not None: - raise RuntimeError(error) + msg(cbuf, "waitUntilCompleted") + if (error := cast(int, msg(cbuf, "error", restype=ctypes.c_ulong))) != 0: raise RuntimeError(error) + +def 
elapsed_time(cbuf: objc_id): + return cast(float, msg(cbuf, "GPUEndTime", restype=ctypes.c_double)) - cast(float, msg(cbuf, "GPUStartTime", restype=ctypes.c_double)) + +def error_check(error: objc_instance, error_constructor: type[Exception] = RuntimeError): + if error.value is None: return None + raise error_constructor(bytes(msg(msg(error, "localizedDescription", restype=objc_instance), "UTF8String", restype=ctypes.c_char_p)).decode()) class MetalCompiler(Compiler): def __init__(self, device:Optional[MetalDevice]): @@ -20,11 +66,14 @@ class MetalCompiler(Compiler): # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=src.encode('utf-8')) return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air) - options = Metal.MTLCompileOptions.new() - options.setFastMathEnabled_(getenv("METAL_FAST_MATH")) - try: library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, options, None)) - except AssertionError as e: raise CompileError(e) from e - return library.libraryDataContents().bytes().tobytes() + options = msg(libobjc.objc_getClass(b"MTLCompileOptions"), "new", restype=objc_instance) + msg(options, "setFastMathEnabled:", getenv("METAL_FAST_MATH")) + compileError = objc_instance() + library = msg(self.device.device, "newLibraryWithSource:options:error:", to_ns_str(src), + options, ctypes.byref(compileError), restype=objc_instance) + error_check(compileError, CompileError) + library_contents = msg(library, "libraryDataContents", restype=objc_instance) + return ctypes.string_at(msg(library_contents, "bytes"), cast(int, msg(library_contents, "length", restype=ctypes.c_ulong))) class MetalProgram: def __init__(self, device:MetalDevice, name:str, lib:bytes): @@ -38,27 +87,35 @@ class MetalProgram: print("Error running disassembler: Make sure you have https://github.com/dougallj/applegpu cloned to 
tinygrad/extra/disassemblers/applegpu") assert lib[:4] == b"MTLB", "Invalid Metal library. Could be due to using conda. Try system python or METAL_XCODE=1 DISABLE_COMPILER_CACHE=1." data = libdispatch.dispatch_data_create(lib, len(lib), None, None) - self.library = unwrap2(self.device.device.newLibraryWithData_error_(data, None)) - self.fxn = self.library.newFunctionWithName_(name) - descriptor = Metal.MTLComputePipelineDescriptor.new() - descriptor.setComputeFunction_(self.fxn) - descriptor.setSupportIndirectCommandBuffers_(True) - self.pipeline_state = unwrap2(self.device.device.newComputePipelineStateWithDescriptor_options_reflection_error_( - descriptor, Metal.MTLPipelineOption(0), None, None)) + error_library_creation = objc_instance() + self.library = msg(self.device.device, "newLibraryWithData:error:", data, ctypes.byref(error_library_creation), restype=objc_instance) + error_check(error_library_creation) + self.fxn = msg(self.library, "newFunctionWithName:", to_ns_str(name), restype=objc_instance) + descriptor = msg(libobjc.objc_getClass(b"MTLComputePipelineDescriptor"), "new", restype=objc_instance) + msg(descriptor, "setComputeFunction:", self.fxn) + msg(descriptor, "setSupportIndirectCommandBuffers:", True) + error_pipeline_creation = objc_instance() + self.pipeline_state = msg(self.device.device, "newComputePipelineStateWithDescriptor:options:reflection:error:", + descriptor, MTLPipelineOption.MTLPipelineOptionNone, None, ctypes.byref(error_pipeline_creation), restype=objc_instance) + error_check(error_pipeline_creation) def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False): - if prod(local_size) > self.pipeline_state.maxTotalThreadsPerThreadgroup(): raise RuntimeError(f"local size {local_size} bigger than {self.pipeline_state.maxTotalThreadsPerThreadgroup()} with exec width {self.pipeline_state.threadExecutionWidth()} memory length 
{self.pipeline_state.staticThreadgroupMemoryLength()}") # noqa: E501 - command_buffer = self.device.mtl_queue.commandBuffer() - encoder = command_buffer.computeCommandEncoder() - encoder.setComputePipelineState_(self.pipeline_state) - for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a.buf, a.offset, i) - for i,a in enumerate(vals,start=len(bufs)): encoder.setBytes_length_atIndex_(ctypes.c_int32(a), 4, i) - encoder.dispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size)) - encoder.endEncoding() - command_buffer.commit() + max_total_threads = msg(self.pipeline_state, "maxTotalThreadsPerThreadgroup", restype=ctypes.c_ulong) + if prod(local_size) > cast(int, max_total_threads): + exec_width = msg(self.pipeline_state, "threadExecutionWidth", restype=ctypes.c_ulong) + memory_length = msg(self.pipeline_state, "staticThreadgroupMemoryLength", restype=ctypes.c_ulong) + raise RuntimeError(f"local size {local_size} bigger than {max_total_threads} with exec width {exec_width} memory length {memory_length}") + command_buffer = msg(self.device.mtl_queue, "commandBuffer", restype=objc_instance) + encoder = msg(command_buffer, "computeCommandEncoder", restype=objc_instance) + msg(encoder, "setComputePipelineState:", self.pipeline_state) + for i,a in enumerate(bufs): msg(encoder, "setBuffer:offset:atIndex:", a.buf, a.offset, i) + for i,a in enumerate(vals,start=len(bufs)): msg(encoder, "setBytes:length:atIndex:", bytes(ctypes.c_int(a)), 4, i) + msg(encoder, "dispatchThreadgroups:threadsPerThreadgroup:", to_struct(*global_size), to_struct(*local_size)) + msg(encoder, "endEncoding") + msg(command_buffer, "commit") if wait: wait_check(command_buffer) - return command_buffer.GPUEndTime() - command_buffer.GPUStartTime() + return elapsed_time(command_buffer) self.device.mtl_buffers_in_flight.append(command_buffer) class MetalBuffer: @@ -69,46 +126,48 @@ class MetalAllocator(LRUAllocator): self.device:MetalDevice = device 
super().__init__() def _alloc(self, size:int, options) -> MetalBuffer: - ret = self.device.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared) - if ret is None: raise MemoryError(f"Metal OOM while allocating {size=}") + # Buffer is explicitly released in _free() rather than garbage collected via reference count + ret = msg(self.device.device, "newBufferWithLength:options:", size, MTLResourceOptions.MTLResourceStorageModeShared, restype=objc_id) + if ret.value is None: raise MemoryError(f"Metal OOM while allocating {size=}") return MetalBuffer(ret, size) - def _free(self, opaque:MetalBuffer, options): opaque.buf.release() + def _free(self, opaque:MetalBuffer, options): msg(opaque.buf, "release") def transfer(self, dest:MetalBuffer, src:MetalBuffer, sz:int, src_dev:MetalDevice, dest_dev:MetalDevice): dest_dev.synchronize() - src_command_buffer = src_dev.mtl_queue.commandBuffer() - encoder = src_command_buffer.blitCommandEncoder() - encoder.copyFromBuffer_sourceOffset_toBuffer_destinationOffset_size_(src.buf, src.offset, dest.buf, dest.offset, sz) - encoder.endEncoding() + src_command_buffer = msg(src_dev.mtl_queue, "commandBuffer", restype=objc_instance) + encoder = msg(src_command_buffer, "blitCommandEncoder", restype=objc_instance) + msg(encoder, "copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size:", src.buf, src.offset, dest.buf, dest.offset, sz) + msg(encoder, "endEncoding") if src_dev != dest_dev: - src_command_buffer.encodeSignalEvent_value_(src_dev.timeline_signal, src_dev.timeline_value) - dest_command_buffer = dest_dev.mtl_queue.commandBuffer() - dest_command_buffer.encodeWaitForEvent_value_(src_dev.timeline_signal, src_dev.timeline_value) - dest_command_buffer.commit() + msg(src_command_buffer, "encodeSignalEvent:value:", src_dev.timeline_signal, src_dev.timeline_value) + dest_command_buffer = msg(dest_dev.mtl_queue, "commandBuffer", restype=objc_instance) + msg(dest_command_buffer, "encodeWaitForEvent:value:", 
src_dev.timeline_signal, src_dev.timeline_value) + msg(dest_command_buffer, "commit") dest_dev.mtl_buffers_in_flight.append(dest_command_buffer) src_dev.timeline_value += 1 - src_command_buffer.commit() + msg(src_command_buffer, "commit") src_dev.mtl_buffers_in_flight.append(src_command_buffer) def from_buffer(self, src:memoryview) -> Optional[Any]: - ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, src.nbytes, Metal.MTLResourceStorageModeShared, None) + ptr = (ctypes.c_char * src.nbytes).from_buffer(src) + ret = msg(self.device.device, "newBufferWithBytesNoCopy:length:options:deallocator:", ptr, src.nbytes, 0, None, restype=objc_instance) if ret: self.device.mv_in_metal.append(src) return MetalBuffer(ret, src.nbytes) def as_buffer(self, src:MetalBuffer) -> memoryview: self.device.synchronize() - return src.buf.contents().as_buffer(src.offset+src.size)[src.offset:] + ptr = msg(src.buf, "contents", restype=objc_id) # Shared memory, do not release here + array = (ctypes.c_char * (src.offset + src.size)).from_address(ptr.value) + return memoryview(array).cast("B")[src.offset:] def copyin(self, dest:MetalBuffer, src:memoryview): self.as_buffer(dest)[:] = src def copyout(self, dest:memoryview, src:MetalBuffer): dest[:] = self.as_buffer(src) def offset(self, buf:MetalBuffer, size:int, offset:int): return MetalBuffer(buf.buf, size, offset) class MetalDevice(Compiled): def __init__(self, device:str): - self.device = Metal.MTLCreateSystemDefaultDevice() - self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024) + self.device = libmetal.MTLCreateSystemDefaultDevice() + self.mtl_queue = msg(self.device, "newCommandQueueWithMaxCommandBufferCount:", 1024, restype=objc_instance) if self.mtl_queue is None: raise RuntimeError("Cannot allocate a new command queue") - self.mtl_buffers_in_flight: List[Any] = [] self.mv_in_metal: List[memoryview] = [] - - self.timeline_signal = self.device.newSharedEvent() + self.timeline_signal = 
msg(self.device, "newSharedEvent", restype=objc_instance) self.timeline_value = 0 from tinygrad.runtime.graph.metal import MetalGraph