diff --git a/extra/gemm/metal_matmul.py b/extra/gemm/metal_matmul.py
index 6f189989..ca0e952a 100644
--- a/extra/gemm/metal_matmul.py
+++ b/extra/gemm/metal_matmul.py
@@ -2,23 +2,31 @@ import os
 os.environ["METAL"] = "1"
 import time
 import numpy as np
-from tinygrad.helpers import dtypes, getenv
-from tinygrad.runtime.ops_metal import RawMetalBuffer, MetalProgram, compile_metal
+from tinygrad.helpers import dtypes, getenv, flat_mv
+from tinygrad import Device
+from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal
 
 N = getenv("N", 2048)
 LID = 2
 
-a = RawMetalBuffer(N*N, dtypes.float32)
+device = MetalDevice("METAL")
+metalalloc = MetalAllocator(device)
 
-nb = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
+a = metalalloc.alloc(N*N*4)
+b = metalalloc.alloc(N*N*4)
+c = metalalloc.alloc(N*N*4)
+
+na = np.zeros((N,N),dtype=np.float32)
+nb = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
 nc = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
-b = RawMetalBuffer.fromCPU(nb)
-c = RawMetalBuffer.fromCPU(nc)
+
+metalalloc.copyin(b,nb.tobytes())
+metalalloc.copyin(c,nc.tobytes())
 
 FLOPS = N*N*N*2
 BW = N*N*3*4
 
-prog = MetalProgram("test", compile_metal(f"""
+prog = MetalProgram(device,"test", compile_metal(f"""
 #include <metal_stdlib>
 #include <metal_simdgroup_matrix>  // Available from Metal version 2.3 released with OS X 11.0+
 using namespace metal;
@@ -87,8 +95,8 @@ def timeit(fxn):
   # NOTE: et doesn't contain the launch overhead
   return time.perf_counter() - st
 tm = min([timeit(lambda: prog(a, b, c, global_size=[N//(8*4), N//(8*4*LID), 1], local_size=[32, LID, 1], wait=True)) for _ in range(20)])
-na = a.toCPU().reshape(N,N)
 comp = nb@nc
+metalalloc.copyout(flat_mv(na.data), a)
 if N <= 32:
   print(na)
   print(comp)
@@ -109,7 +117,6 @@ print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matmul
 
 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
-from tinygrad.runtime.ops_metal import METAL
 b = Tensor(nb)
 c = Tensor(nc)
 # TODO: slowness without the JIT I suspect comes from a lack of a caching allocator
@@ -119,7 +126,7 @@ def tiny_jit(b, c):
 def tiny_prog(b, c):
   st = time.perf_counter()
   a = tiny_jit(b, c)
-  METAL.synchronize()
+  Device["METAL"].synchronize()
   return time.perf_counter() - st
 tm = min([tiny_prog(b, c) for _ in range(20)])
 print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matmul in tinygrad")
diff --git a/extra/gemm/metal_matvec.py b/extra/gemm/metal_matvec.py
index 60df010d..e0979634 100644
--- a/extra/gemm/metal_matvec.py
+++ b/extra/gemm/metal_matvec.py
@@ -7,14 +7,14 @@ from tinygrad.helpers import GlobalCounters
 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
 from tinygrad import Device
-from tinygrad.helpers import colored, getenv, CI
+from tinygrad.helpers import colored, getenv, CI, flat_mv
 
 import os
 os.environ["METAL"] = "1"
 import time
 import numpy as np
 from tinygrad.helpers import dtypes, getenv
-from tinygrad.runtime.ops_metal import RawMetalBuffer, MetalProgram, compile_metal
+from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal
 
 N = 16384
 M = 4096
@@ -36,6 +36,8 @@ tm = min([torch_prog(b, c) for _ in range(200)])
 print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in torch")
 torch_a = (b@c).cpu()
 
+device = MetalDevice("METAL")
+metalalloc = MetalAllocator(device)
 WORKSIZE_ROW = 16
 WORKSIZE_COL = 1
 LOCAL_SIZE = [32, WORKSIZE_COL, WORKSIZE_ROW]
@@ -87,13 +89,13 @@ kernel void test(device float* data0, const device float* data1, const device fl
 }}
 }}
 """)
-prog = MetalProgram("test", prog)
-# print(prog_string)
-na = np.zeros(M, dtype=np.float32)
-b = RawMetalBuffer.fromCPU(nb)
-c = RawMetalBuffer.fromCPU(nc)
+prog = MetalProgram(device,"test", prog)
+a = metalalloc.alloc(M*4)
+b = metalalloc.alloc(N*4)
+c = metalalloc.alloc(N*M*4)
+metalalloc.copyin(b,nb.tobytes())
+metalalloc.copyin(c,nc.tobytes())
 def metalrun():
-  a = RawMetalBuffer.fromCPU(na)
   prog(a, b, c, global_size=GLOBAL_SIZE, local_size=LOCAL_SIZE, wait=True)
   return a
 def timeit(fxn):
@@ -103,12 +105,12 @@ def timeit(fxn):
   # NOTE: et doesn't contain the launch overhead
   return time.perf_counter() - st
 tm = min([timeit(metalrun) for _ in range(200)])
 print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in metal")
-metal_a = metalrun().toCPU().reshape(M)
+metal_a = np.zeros(M, dtype=np.float32)
+metalalloc.copyout(flat_mv(metal_a.data), a)
 np.testing.assert_allclose(metal_a, torch_a, atol=5e-3)
 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
-from tinygrad.runtime.ops_metal import METAL
 b = Tensor(nb)
 c = Tensor(nc)
 # TODO: slowness without the JIT I suspect comes from a lack of a caching allocator
@@ -118,7 +120,7 @@ def tiny_jit(b, c):
 def tiny_prog(b, c):
   st = time.perf_counter()
   a = tiny_jit(b, c)
-  METAL.synchronize()
+  Device["METAL"].synchronize()
   return time.perf_counter() - st
 tm = min([tiny_prog(b, c) for _ in range(200)])
 print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in tinygrad")
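
For reference, the buffer-management pattern both files migrate to (MetalDevice -> MetalAllocator -> alloc/copyin -> MetalProgram(device, ...) -> copyout via flat_mv) can be exercised standalone. A minimal sketch, assuming the tinygrad revision this diff targets; the "add" kernel, its sizes, and all variable names are illustrative, not taken from the benchmarks:

# Minimal sketch of the allocator flow this diff migrates to; assumes the
# tinygrad revision the diff targets. The vector-add kernel is illustrative
# only -- it is not one of the benchmark kernels above.
import numpy as np
from tinygrad.helpers import flat_mv
from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal

N = 1024
device = MetalDevice("METAL")        # open the Metal device directly
metalalloc = MetalAllocator(device)  # raw allocator bound to that device

# buffers are now allocated by size in bytes (float32 -> 4 bytes each)
out = metalalloc.alloc(N*4)
x = metalalloc.alloc(N*4)
y = metalalloc.alloc(N*4)

nx = np.random.default_rng().standard_normal(N, dtype=np.float32)
ny = np.random.default_rng().standard_normal(N, dtype=np.float32)
metalalloc.copyin(x, nx.tobytes())   # host -> device, replaces RawMetalBuffer.fromCPU
metalalloc.copyin(y, ny.tobytes())

# MetalProgram now takes the device as its first argument; the name must
# match the kernel function in the compiled source
prog = MetalProgram(device, "add", compile_metal("""
#include <metal_stdlib>
using namespace metal;
kernel void add(device float* out, const device float* x, const device float* y,
                uint gid [[thread_position_in_grid]]) {
  out[gid] = x[gid] + y[gid];
}
"""))
# global_size counts threadgroups here (as in the benchmarks), so N//32
# groups of 32 threads covers exactly N elements
prog(out, x, y, global_size=[N//32, 1, 1], local_size=[32, 1, 1], wait=True)

# device -> host goes through copyout into a flat memoryview, replacing .toCPU()
nout = np.zeros(N, dtype=np.float32)
metalalloc.copyout(flat_mv(nout.data), out)
np.testing.assert_allclose(nout, nx + ny)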