mirror of https://github.com/commaai/tinygrad.git
update metal matmul and matvec for new device style (#2732)
* update for new device style
* create device before compile
---------
Co-authored-by: chenyu <chenyu@fastmail.com>
This commit is contained in:
parent 91adb119b8
commit f409b57854
@@ -2,23 +2,31 @@ import os
 os.environ["METAL"] = "1"
 import time
 import numpy as np
-from tinygrad.helpers import dtypes, getenv
-from tinygrad.runtime.ops_metal import RawMetalBuffer, MetalProgram, compile_metal
+from tinygrad.helpers import dtypes, getenv, flat_mv
+from tinygrad import Device
+from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal
 
 N = getenv("N", 2048)
 LID = 2
 
-a = RawMetalBuffer(N*N, dtypes.float32)
+device = MetalDevice("METAL")
+metalalloc = MetalAllocator(device)
+
+a = metalalloc.alloc(N*N*4)
+b = metalalloc.alloc(N*N*4)
+c = metalalloc.alloc(N*N*4)
 
+na = np.zeros((N,N),dtype=np.float32)
 nb = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
 nc = np.random.default_rng().standard_normal(size=(N,N), dtype=np.float32) #.astype(np.int32).astype(np.float32)
-b = RawMetalBuffer.fromCPU(nb)
-c = RawMetalBuffer.fromCPU(nc)
+
+metalalloc.copyin(b,nb.tobytes())
+metalalloc.copyin(c,nc.tobytes())
 
 FLOPS = N*N*N*2
 BW = N*N*3*4
 
-prog = MetalProgram("test", compile_metal(f"""
+prog = MetalProgram(device,"test", compile_metal(f"""
 #include <metal_stdlib>
 #include <metal_simdgroup_matrix> // Available from Metal version 2.3 released with OS X 11.0+
 using namespace metal;
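Note: taken together, the changes above replace the dtype-aware RawMetalBuffer with an explicit device plus a byte-level allocator. A minimal sketch of the new round trip, using only the calls that appear in this hunk:

import numpy as np
from tinygrad.helpers import flat_mv
from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice

device = MetalDevice("METAL")          # per the commit message, the device is created first
metalalloc = MetalAllocator(device)
src = np.arange(4, dtype=np.float32)
buf = metalalloc.alloc(src.nbytes)     # sizes are raw byte counts now, not (count, dtype)
metalalloc.copyin(buf, src.tobytes())  # host -> device takes bytes
out = np.zeros(4, dtype=np.float32)
metalalloc.copyout(flat_mv(out.data), buf)  # device -> host fills a preallocated array
assert (out == src).all()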
@@ -87,8 +95,8 @@ def timeit(fxn):
   # NOTE: et doesn't contain the launch overhead
   return time.perf_counter() - st
 tm = min([timeit(lambda: prog(a, b, c, global_size=[N//(8*4), N//(8*4*LID), 1], local_size=[32, LID, 1], wait=True)) for _ in range(20)])
-na = a.toCPU().reshape(N,N)
 comp = nb@nc
+metalalloc.copyout(flat_mv(na.data), a)
 if N <= 32:
   print(na)
   print(comp)
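Note: the readback direction changes shape here. RawMetalBuffer.toCPU() returned a fresh array; the allocator instead writes into an array that already exists, which is why na = np.zeros((N,N),dtype=np.float32) was added in the first hunk. Sketched in isolation:

import numpy as np
from tinygrad.helpers import flat_mv
from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice

N = 32                                   # small stand-in for the benchmark's N
metalalloc = MetalAllocator(MetalDevice("METAL"))
a = metalalloc.alloc(N*N*4)
na = np.zeros((N, N), dtype=np.float32)  # destination must pre-exist with the final shape and dtype
metalalloc.copyout(flat_mv(na.data), a)  # was: na = a.toCPU().reshape(N, N)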
@@ -109,7 +117,6 @@ print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matmul
 
 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
-from tinygrad.runtime.ops_metal import METAL
 b = Tensor(nb)
 c = Tensor(nc)
 # TODO: slowness without the JIT I suspect comes from a lack of a caching allocator
@@ -119,7 +126,7 @@ def tiny_jit(b, c):
 def tiny_prog(b, c):
   st = time.perf_counter()
   a = tiny_jit(b, c)
-  METAL.synchronize()
+  Device["METAL"].synchronize()
   return time.perf_counter() - st
 tm = min([tiny_prog(b, c) for _ in range(20)])
 print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matmul in tinygrad")
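Note: with the module-level METAL object gone from the import list two hunks up, synchronization goes through the Device registry instead; Device is indexed by device name. A self-contained sketch:

import os
os.environ["METAL"] = "1"   # same env pin the benchmark sets at the top
from tinygrad import Device
from tinygrad.tensor import Tensor

t = (Tensor([1.0, 2.0]) + 1).realize()
Device["METAL"].synchronize()  # block until all queued GPU work has finished

The remaining hunks apply the same migration to the matvec benchmark.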
@@ -7,14 +7,14 @@ from tinygrad.helpers import GlobalCounters
 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
 from tinygrad import Device
-from tinygrad.helpers import colored, getenv, CI
+from tinygrad.helpers import colored, getenv, CI, flat_mv
 
 import os
 os.environ["METAL"] = "1"
 import time
 import numpy as np
 from tinygrad.helpers import dtypes, getenv
-from tinygrad.runtime.ops_metal import RawMetalBuffer, MetalProgram, compile_metal
+from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal
 
 N = 16384
 M = 4096
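Note: flat_mv is the one new helper in these imports. It reinterprets a writable buffer as a flat byte memoryview so the allocator can copy into it in place; a quick illustration that needs no GPU:

import numpy as np
from tinygrad.helpers import flat_mv

x = np.zeros(3, dtype=np.float32)
mv = flat_mv(x.data)                # flat byte view over the same memory
assert len(mv) == x.nbytes          # 12 bytes for 3 float32 values
mv[0:4] = np.float32(1.0).tobytes()
assert x[0] == 1.0                  # writes through the view land in the array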
@@ -36,6 +36,8 @@ tm = min([torch_prog(b, c) for _ in range(200)])
 print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in torch")
 torch_a = (b@c).cpu()
 
+device = MetalDevice("METAL")
+metalalloc = MetalAllocator(device)
 WORKSIZE_ROW = 16
 WORKSIZE_COL = 1
 LOCAL_SIZE = [32, WORKSIZE_COL, WORKSIZE_ROW]
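Note: this placement is the second commit-message bullet ("create device before compile"): both the allocator and the MetalProgram built after the kernel string need the device, so it is instantiated before compile_metal runs. A self-contained sketch of the whole new-style flow with a toy kernel (add1 is illustrative, not from this diff; the reading of global_size as threadgroup count is my assumption, matching the shapes used in these files):

import os
os.environ["METAL"] = "1"
import numpy as np
from tinygrad.helpers import flat_mv
from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal

device = MetalDevice("METAL")              # exists before any compile call
metalalloc = MetalAllocator(device)
lib = compile_metal("""
#include <metal_stdlib>
using namespace metal;
kernel void add1(device float* out, const device float* in, uint gid [[thread_position_in_grid]]) {
  out[gid] = in[gid] + 1.0f;
}
""")
prog = MetalProgram(device, "add1", lib)   # programs now bind to an explicit device

x = np.arange(4, dtype=np.float32)
a, b = metalalloc.alloc(x.nbytes), metalalloc.alloc(x.nbytes)
metalalloc.copyin(b, x.tobytes())
# assumed: global_size = threadgroups, local_size = threads per group
prog(a, b, global_size=[1, 1, 1], local_size=[4, 1, 1], wait=True)
out = np.zeros(4, dtype=np.float32)
metalalloc.copyout(flat_mv(out.data), a)
assert (out == x + 1).all()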
@@ -87,13 +89,13 @@ kernel void test(device float* data0, const device float* data1, const device fl
 }}
 }}
 """)
-prog = MetalProgram("test", prog)
-# print(prog_string)
-na = np.zeros(M, dtype=np.float32)
-b = RawMetalBuffer.fromCPU(nb)
-c = RawMetalBuffer.fromCPU(nc)
+prog = MetalProgram(device,"test", prog)
+a = metalalloc.alloc(M*4)
+b = metalalloc.alloc(N*4)
+c = metalalloc.alloc(N*M*4)
+metalalloc.copyin(b,nb.tobytes())
+metalalloc.copyin(c,nc.tobytes())
 def metalrun():
-  a = RawMetalBuffer.fromCPU(na)
   prog(a, b, c, global_size=GLOBAL_SIZE, local_size=LOCAL_SIZE, wait=True)
   return a
 def timeit(fxn):
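Note: the quiet benchmarking fix in this hunk is that metalrun no longer builds a buffer per call. The old body created a RawMetalBuffer from na on every one of the 200 timed runs, paying an allocation plus an upload inside the timed region; now the buffers are allocated and filled once, above, so timeit brackets only the kernel launch. The harness itself, reduced to a runnable skeleton with a stand-in workload:

import time

def timeit(fxn):
  st = time.perf_counter()
  fxn()                        # in the benchmark this is the blocking prog(...) launch
  return time.perf_counter() - st

def metalrun():                # stand-in workload; setup stays outside the timed body
  return sum(i * i for i in range(10**4))

tm = min(timeit(metalrun) for _ in range(200))   # best-of-200 filters scheduler jitter
print(f"best of 200: {tm*1e6:9.2f} us")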
@@ -103,12 +105,12 @@ def timeit(fxn):
   return time.perf_counter() - st
 tm = min([timeit(metalrun) for _ in range(200)])
 print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in metal")
-metal_a = metalrun().toCPU().reshape(M)
+metal_a = np.zeros(M, dtype=np.float32)
+metalalloc.copyout(flat_mv(metal_a.data), a)
 np.testing.assert_allclose(metal_a, torch_a, atol=5e-3)
 
 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
-from tinygrad.runtime.ops_metal import METAL
 b = Tensor(nb)
 c = Tensor(nc)
 # TODO: slowness without the JIT I suspect comes from a lack of a caching allocator
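Note: the check against torch is unchanged in spirit: read the result back into a preallocated array, then compare with atol=5e-3, a loose absolute tolerance that (presumably) absorbs float32 accumulation-order differences between the two implementations. The comparison idiom by itself:

import numpy as np

ref = np.random.default_rng(0).standard_normal(8).astype(np.float32)
got = ref + np.float32(1e-4)                      # small drift, as two float32 pipelines might show
np.testing.assert_allclose(got, ref, atol=5e-3)   # passes: drift is well under the tolerance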
@@ -118,7 +120,7 @@ def tiny_jit(b, c):
 def tiny_prog(b, c):
   st = time.perf_counter()
   a = tiny_jit(b, c)
-  METAL.synchronize()
+  Device["METAL"].synchronize()
   return time.perf_counter() - st
 tm = min([tiny_prog(b, c) for _ in range(200)])
 print(f"{N:d}x{M:d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS matvec in tinygrad")
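Note: the tiny_jit body sits outside this hunk; assuming it is the usual (b @ c).realize() under @TinyJit, the tinygrad-side benchmark now looks roughly like this (sizes shrunk for the sketch):

import os, time
os.environ["METAL"] = "1"
import numpy as np
from tinygrad import Device
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit

nb = np.random.default_rng().standard_normal(size=(128, 128), dtype=np.float32)
nc = np.random.default_rng().standard_normal(size=(128, 128), dtype=np.float32)
b, c = Tensor(nb), Tensor(nc)

@TinyJit
def tiny_jit(b, c):
  return (b @ c).realize()

def tiny_prog(b, c):
  st = time.perf_counter()
  tiny_jit(b, c)
  Device["METAL"].synchronize()  # count GPU completion, not just kernel enqueue
  return time.perf_counter() - st

tm = min(tiny_prog(b, c) for _ in range(20))
print(f"best of 20: {tm*1e6:9.2f} us")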