update metal matmul and matvec for new device style (#2732)

* update for new device style

* create device before compile

---------

Co-authored-by: chenyu <chenyu@fastmail.com>
Commit authored by Rory Clear on 2023-12-17 23:15:07 +02:00; committed by GitHub.
parent 91adb119b8
commit f409b57854
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 21 deletions

View File

@@ -2,23 +2,31 @@ import os
os.environ["METAL"] = "1"
import time
import numpy as np
from tinygrad.helpers import dtypes, getenv
from tinygrad.runtime.ops_metal import RawMetalBuffer, MetalProgram, compile_metal
from tinygrad.helpers import dtypes, getenv, flat_mv
from tinygrad import Device
from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal
N = getenv("N", 2048)
LID = 2
a = RawMetalBuffer(N*N, dtypes.float32)
device = MetalDevice("METAL")
metalalloc = MetalAllocator(device)
nb = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
a = metalalloc.alloc(N*N*4)
b = metalalloc.alloc(N*N*4)
c = metalalloc.alloc(N*N*4)
na = np.zeros((N,N),dtype=np.float32)
nb = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
nc = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
b = RawMetalBuffer.fromCPU(nb)
c = RawMetalBuffer.fromCPU(nc)
metalalloc.copyin(b,nb.tobytes())
metalalloc.copyin(c,nc.tobytes())
FLOPS = N*N*N*2
BW = N*N*3*4
prog = MetalProgram("test", compile_metal(f"""
prog = MetalProgram(device,"test", compile_metal(f"""
#include <metal_stdlib>
#include <metal_simdgroup_matrix> // Available from Metal version 2.3 released with OS X 11.0+
using namespace metal;
@@ -87,8 +95,8 @@ def timeit(fxn):
# NOTE: et doesn't contain the launch overhead
return time.perf_counter() - st
tm = min([timeit(lambda: prog(a, b, c, global_size=[N//(8*4), N//(8*4*LID), 1], local_size=[32, LID, 1], wait=True)) for _ in range(20)])
na = a.toCPU().reshape(N,N)
comp = nb@nc
metalalloc.copyout(flat_mv(na.data), a)
if N <= 32:
print(na)
print(comp)
@@ -109,7 +117,6 @@ print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matmul
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit
from tinygrad.runtime.ops_metal import METAL
b = Tensor(nb)
c = Tensor(nc)
# TODO: slowness without the JIT I suspect comes from a lack of a caching allocator
@@ -119,7 +126,7 @@ def tiny_jit(b, c):
def tiny_prog(b, c):
st = time.perf_counter()
a = tiny_jit(b, c)
METAL.synchronize()
Device["METAL"].synchronize()
return time.perf_counter() - st
tm = min([tiny_prog(b, c) for _ in range(20)])
print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matmul in tinygrad")

View File

@@ -7,14 +7,14 @@ from tinygrad.helpers import GlobalCounters
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit
from tinygrad import Device
from tinygrad.helpers import colored, getenv, CI
from tinygrad.helpers import colored, getenv, CI, flat_mv
import os
os.environ["METAL"] = "1"
import time
import numpy as np
from tinygrad.helpers import dtypes, getenv
from tinygrad.runtime.ops_metal import RawMetalBuffer, MetalProgram, compile_metal
from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal
N = 16384
M = 4096
@@ -36,6 +36,8 @@ tm = min([torch_prog(b, c) for _ in range(200)])
print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in torch")
torch_a = (b@c).cpu()
device = MetalDevice("METAL")
metalalloc = MetalAllocator(device)
WORKSIZE_ROW = 16
WORKSIZE_COL = 1
LOCAL_SIZE = [32, WORKSIZE_COL, WORKSIZE_ROW]
@@ -87,13 +89,13 @@ kernel void test(device float* data0, const device float* data1, const device fl
}}
}}
""")
prog = MetalProgram("test", prog)
# print(prog_string)
na = np.zeros(M, dtype=np.float32)
b = RawMetalBuffer.fromCPU(nb)
c = RawMetalBuffer.fromCPU(nc)
prog = MetalProgram(device,"test", prog)
a = metalalloc.alloc(M*4)
b = metalalloc.alloc(N*4)
c = metalalloc.alloc(N*M*4)
metalalloc.copyin(b,nb.tobytes())
metalalloc.copyin(c,nc.tobytes())
def metalrun():
a = RawMetalBuffer.fromCPU(na)
prog(a, b, c, global_size=GLOBAL_SIZE, local_size=LOCAL_SIZE, wait=True)
return a
def timeit(fxn):
@@ -103,12 +105,12 @@ def timeit(fxn):
return time.perf_counter() - st
tm = min([timeit(metalrun) for _ in range(200)])
print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in metal")
metal_a = metalrun().toCPU().reshape(M)
metal_a = np.zeros(M, dtype=np.float32)
metalalloc.copyout(flat_mv(metal_a.data), a)
np.testing.assert_allclose(metal_a, torch_a, atol=5e-3)
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit
from tinygrad.runtime.ops_metal import METAL
b = Tensor(nb)
c = Tensor(nc)
# TODO: slowness without the JIT I suspect comes from a lack of a caching allocator
@@ -118,7 +120,7 @@ def tiny_jit(b, c):
def tiny_prog(b, c):
st = time.perf_counter()
a = tiny_jit(b, c)
METAL.synchronize()
Device["METAL"].synchronize()
return time.perf_counter() - st
tm = min([tiny_prog(b, c) for _ in range(200)])
print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in tinygrad")