update metal matmul and matvec for new device style (#2732)

* update for new device style

* create device before compile

---------

Co-authored-by: chenyu <chenyu@fastmail.com>
Commit authored by Rory Clear on 2023-12-17 23:15:07 +02:00; committed by GitHub.
parent 91adb119b8
commit f409b57854
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 21 deletions

View File

@@ -2,23 +2,31 @@ import os
os.environ["METAL"] = "1"
import time
import numpy as np
from tinygrad.helpers import dtypes, getenv
from tinygrad.runtime.ops_metal import RawMetalBuffer, MetalProgram, compile_metal
from tinygrad.helpers import dtypes, getenv, flat_mv
from tinygrad import Device
from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal
N = getenv("N", 2048)
LID = 2
a = RawMetalBuffer(N*N, dtypes.float32)
device = MetalDevice("METAL")
metalalloc = MetalAllocator(device)
nb = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
a = metalalloc.alloc(N*N*4)
b = metalalloc.alloc(N*N*4)
c = metalalloc.alloc(N*N*4)
na = np.zeros((N,N),dtype=np.float32)
nb = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
nc = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
b = RawMetalBuffer.fromCPU(nb)
c = RawMetalBuffer.fromCPU(nc)
metalalloc.copyin(b,nb.tobytes())
metalalloc.copyin(c,nc.tobytes())
FLOPS = N*N*N*2
BW = N*N*3*4
prog = MetalProgram("test", compile_metal(f"""
prog = MetalProgram(device,"test", compile_metal(f"""
#include <metal_stdlib>
#include <metal_simdgroup_matrix> // Available from Metal version 2.3 released with OS X 11.0+
using namespace metal;
@@ -87,8 +95,8 @@ def timeit(fxn):
# NOTE: et doesn't contain the launch overhead
return time.perf_counter() - st
tm = min([timeit(lambda: prog(a, b, c, global_size=[N//(8*4), N//(8*4*LID), 1], local_size=[32, LID, 1], wait=True)) for _ in range(20)])
na = a.toCPU().reshape(N,N)
comp = nb@nc
metalalloc.copyout(flat_mv(na.data), a)
if N <= 32:
print(na)
print(comp)
@@ -109,7 +117,6 @@ print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matmul
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit
from tinygrad.runtime.ops_metal import METAL
b = Tensor(nb)
c = Tensor(nc)
# TODO: slowness without the JIT I suspect comes from a lack of a caching allocator
@@ -119,7 +126,7 @@ def tiny_jit(b, c):
def tiny_prog(b, c):
st = time.perf_counter()
a = tiny_jit(b, c)
METAL.synchronize()
Device["METAL"].synchronize()
return time.perf_counter() - st
tm = min([tiny_prog(b, c) for _ in range(20)])
print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matmul in tinygrad")

View File

@@ -7,14 +7,14 @@ from tinygrad.helpers import GlobalCounters
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit
from tinygrad import Device
from tinygrad.helpers import colored, getenv, CI
from tinygrad.helpers import colored, getenv, CI, flat_mv
import os
os.environ["METAL"] = "1"
import time
import numpy as np
from tinygrad.helpers import dtypes, getenv
from tinygrad.runtime.ops_metal import RawMetalBuffer, MetalProgram, compile_metal
from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal
N = 16384
M = 4096
@@ -36,6 +36,8 @@ tm = min([torch_prog(b, c) for _ in range(200)])
print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in torch")
torch_a = (b@c).cpu()
device = MetalDevice("METAL")
metalalloc = MetalAllocator(device)
WORKSIZE_ROW = 16
WORKSIZE_COL = 1
LOCAL_SIZE = [32, WORKSIZE_COL, WORKSIZE_ROW]
@@ -87,13 +89,13 @@ kernel void test(device float* data0, const device float* data1, const device fl
}}
}}
""")
prog = MetalProgram("test", prog)
# print(prog_string)
na = np.zeros(M, dtype=np.float32)
b = RawMetalBuffer.fromCPU(nb)
c = RawMetalBuffer.fromCPU(nc)
prog = MetalProgram(device,"test", prog)
a = metalalloc.alloc(M*4)
b = metalalloc.alloc(N*4)
c = metalalloc.alloc(N*M*4)
metalalloc.copyin(b,nb.tobytes())
metalalloc.copyin(c,nc.tobytes())
def metalrun():
a = RawMetalBuffer.fromCPU(na)
prog(a, b, c, global_size=GLOBAL_SIZE, local_size=LOCAL_SIZE, wait=True)
return a
def timeit(fxn):
@@ -103,12 +105,12 @@ def timeit(fxn):
return time.perf_counter() - st
tm = min([timeit(metalrun) for _ in range(200)])
print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in metal")
metal_a = metalrun().toCPU().reshape(M)
metal_a = np.zeros(M, dtype=np.float32)
metalalloc.copyout(flat_mv(metal_a.data), a)
np.testing.assert_allclose(metal_a, torch_a, atol=5e-3)
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit
from tinygrad.runtime.ops_metal import METAL
b = Tensor(nb)
c = Tensor(nc)
# TODO: slowness without the JIT I suspect comes from a lack of a caching allocator
@@ -118,7 +120,7 @@ def tiny_jit(b, c):
def tiny_prog(b, c):
st = time.perf_counter()
a = tiny_jit(b, c)
METAL.synchronize()
Device["METAL"].synchronize()
return time.perf_counter() - st
tm = min([tiny_prog(b, c) for _ in range(200)])
print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in tinygrad")