tinygrad/test/test_speed_v_torch.py

272 lines
10 KiB
Python
Raw Normal View History

2022-10-29 02:22:15 +08:00
import os
2023-01-24 08:24:46 +08:00
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
2023-03-02 13:54:51 +08:00
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"
2022-10-11 07:06:00 +08:00
import unittest
import torch
2022-10-30 04:42:33 +08:00
torch.set_num_threads(1)
2022-10-11 07:06:00 +08:00
import time
import numpy as np
Simple chonker (#431) * chonker will make llvm fast * work * better speed tests, we will make them fast * with the cache add is the same speed * relu and neg are fast * fix sum speed * maximum maxnum? * hack for gemm opt * gemm very slow * zeros like * test_permute * shapetracker returns self * fix shapetracker factorization * err, int strides * permutes are faster now in tinygrad than pytorch * support -1 in expand * gemm unrolled * improve final test case * WIP GEMM * why isn't GEMM fast? * revert cache dim * ffp contract works on clang, not llvm? * ignore llvm ir * this makes fma work at least, but no faster * USE_4x4 * 63 GFLOPS * 87 GFLOPS * that wasn't matmul, 44 GFLOPS now * 82 GFLOPS permuted * this permute too * a little speed for the convs * 45 GFLOPS * speed tests pass again * clean up prints * fix FMA WHAT A WASTE OF TIME * colors * moar fair * GPU * useless on chonker * cleanups * improve factorized shapetracker * better threshold * label conv * work * ops test pass again * hot load the index * run the last view, no need to create * ZeroView needs a repr for the key to work * fix segfault on out of bounds * one more test * start amx, and llvm.initialize_native_asmparser * amx works * nice AMX class * nicer AMX class * refactor get_idxs * amx working * is slower... * useless flip * cache * SZ_X * AMX_SZ_X/Y work alone * Contiguous mlop * test gemm packed * PREPARE in packed * use_amx factor * prefetch isn't faster * loop * same 3ms * 2.24 ms * allow double on store in TG * amx reduce is the same speed as non amx reduce * include memory bandwidth * clean up shapetracker * flip returns stride * prepare for upstream * Update ops_llvm.py (#426) * permutes are yellow and green now * faster conv * llvm cleanups * Show optimised IR under debug 4 (#428) * ASTKernel class * Make tinygrad work with older python version (#427) * Make tinygrad work with older python version * Use partialmethod instead of partial * smiple chonker is chonking * remove junk from test speed vs torch * fix linker and types * AMX is only here now * add LLVM tests, it's a valid backend now * oops, run llvm test * contiguous_op * fix loadops compare * dedup reduceops Co-authored-by: calledit <1573053+calledit@users.noreply.github.com>
2022-11-11 15:17:09 +08:00
np.set_printoptions(linewidth=160)
from functools import partial
2023-03-25 01:24:27 +08:00
from tinygrad.lazy import Device
2023-02-22 22:58:27 +08:00
from tinygrad.ops import GlobalCounters
2022-10-11 07:06:00 +08:00
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
from tinygrad.helpers import colored, getenv, CI
from tinygrad.jit import TinyJit
CI < 5 minutes (#1252) * models matrix * fix typo and install gpu deps * install llvm deps if needed * fix * testops with cuda * remove pip cache since not work * cuda env * install cuda deps * maybe it will work now * i can't read * all tests in matrix * trim down more * opencl stuff in matrix * opencl pip cache * test split * change cuda test exclusion * test * fix cuda maybe * add models * add more n=auto * third thing * fix bug * cache pip more * change name * update tests * try again cause why not * balance * try again... * try apt cache for cuda * try on gpu: * try cuda again * update packages step * replace libz-dev with zlib1g-dev * only cache cuda * why error * fix gpuocelot bug * apt cache err * apt cache to slow? * opt and image in single runner * add a couple n=autos * remove test matrix * try cuda apt cache again * libz-dev -> zlib1g-dev * remove -s since not supported by xdist * the cache takes too long and doesn't work * combine webgpu and metal tests * combine imagenet to c and cpu tests * torch tests with linters * torch back by itself * small windows clang test with torch tests * fix a goofy windows bug * im dumb * bro * clang with linters * fix pylint error * linter not work on windows * try with clang again * clang and imagenet? * install deps * fix * fix quote * clang by itself (windows too slow) * env vars for imagenet * cache pip for metal and webgpu tests * try torch with metal and webgpu * doesn't work, too long * remove -v * try -n=logical * don't use logical * revert accidental thing * remove some prints unless CI * fix print unless CI * ignore speed tests for slow tests * clang windows in matrix (ubuntu being tested in imagenet->c test) * try manual pip cache * fix windows pip cache path * all manual pip cache * fix pip cache dir for macos * print_ci function in helpers * CI as variable, no print_ci * missed one * cuda tests with docker image * remove setup-python action for cuda * python->python3? * remove -s -v * try fix pip cache * maybe fix * try to fix pip cache * is this the path? * maybe cache pip * try again * create wheels dir * ? * cuda pip deps in dockerfile * disable pip cache for clang * image from ghcr instead of docker hub * why is clang like this * fast deps * try use different caches * remove the fast thing * try with lighter image * remove setup python for cuda * small docker and cuda fast deps * ignore a few more tests * cool docker thing (maybe) * oops * quotes * fix docker command * fix bug * ignore train efficientnet test * remove dockerfile (docker stuff takes too long) * remove docker stuff and normal cuda * oops * ignore the tests for cuda * does this work * ignore test_train on slow backends * add space * llvm ignore same tests as cuda * nvm * ignore lr scheduler tests * get some stats * fix ignore bug * remove extra ' * remove and * ignore test for llvm * change ignored tests and durationon all backends * fix * and -> or * ignore some more cuda tests * finally? * does this fix it * remove durations=0 * add some more tests to llvm * make last pytest more readable * fix * don't train efficientnet on cpu * try w/out pip cache * pip cache seems to be generally better * pytest file markers * try apt fast for cuda * use quick install for apt-fast * apt-fast not worth * apt-get to apt * fix typo * suppress warnings * register markers * disable debug on fuzz tests * change marker names * apt update and apt install in one command * update marker names in test.yml * webgpu pytest marker
2023-07-24 04:00:56 +08:00
import pytest
pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
2022-10-11 07:06:00 +08:00
IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
2022-11-08 13:12:08 +08:00
torch_device = torch.device('mps' if getenv("MPS", 0) else ('cuda' if getenv("TORCHCUDA", 0) else 'cpu'))
2023-03-25 01:24:27 +08:00
if str(torch_device) == "mps":
import torch.mps
sync = lambda: torch.mps.synchronize()
elif str(torch_device) == "cuda":
import torch.cuda
sync = lambda: torch.cuda.synchronize()
else:
sync = lambda: None
2022-11-08 13:12:08 +08:00
def colorize_float(x):
ret = f"{x:7.2f}x"
if x < 0.75:
return colored(ret, 'green')
elif x > 1.15:
return colored(ret, 'red')
2022-11-08 13:12:08 +08:00
else:
return colored(ret, 'yellow')
2022-11-08 13:12:08 +08:00
save_ops, save_mem = 0, 0
2022-11-08 13:12:08 +08:00
CNT = 8
2022-11-08 13:27:56 +08:00
def helper_test_speed(f1, *args):
global save_ops, save_mem
2022-11-08 13:12:08 +08:00
ets = []
ret = None
cache_defeat = np.zeros((2048,2048))
for i in range(CNT):
2022-11-08 13:12:08 +08:00
del ret
# operation cache defeats
args = [(x+1).realize() if isinstance(x, Tensor) else (None if x is None else (x+1)) for x in args]
2023-02-18 04:31:05 +08:00
# force syncing
2023-03-02 13:34:45 +08:00
[x.numpy() if isinstance(x, Tensor) or str(torch_device) == "cpu" else x.cpu().numpy() for x in args if x is not None]
2023-02-18 04:31:05 +08:00
# clear 32MB global memory cache (CPU and global memory only)
cache_defeat += 1
# manual pre sync
if isinstance(args[0], Tensor): Device[args[0].device].synchronize()
else: sync()
GlobalCounters.global_ops = 0
GlobalCounters.global_mem = 0
st = time.perf_counter()
2022-11-08 13:12:08 +08:00
ret = f1(*args)
if isinstance(ret, Tensor): Device[ret.device].synchronize()
else: sync()
et = (time.perf_counter() - st) * 1000
if i >= 1: ets.append(et)
if GlobalCounters.global_ops:
save_ops, save_mem = GlobalCounters.global_ops, GlobalCounters.global_mem
2023-08-22 07:57:59 +08:00
return ret.numpy() if isinstance(ret, Tensor) else ret.cpu().numpy(), np.min(ets)
2022-11-08 13:12:08 +08:00
2023-02-12 23:43:17 +08:00
def helper_test_generic_square(name, N, f1, f2, onearg=False):
2022-11-08 13:12:08 +08:00
torch.manual_seed(0)
torch_a = (torch.rand(N, N) - 0.5).to(torch_device)
2023-02-12 23:43:17 +08:00
torch_b = (torch.rand(N, N) - 0.5).to(torch_device) if not onearg else None
Simple chonker (#431) * chonker will make llvm fast * work * better speed tests, we will make them fast * with the cache add is the same speed * relu and neg are fast * fix sum speed * maximum maxnum? * hack for gemm opt * gemm very slow * zeros like * test_permute * shapetracker returns self * fix shapetracker factorization * err, int strides * permutes are faster now in tinygrad than pytorch * support -1 in expand * gemm unrolled * improve final test case * WIP GEMM * why isn't GEMM fast? * revert cache dim * ffp contract works on clang, not llvm? * ignore llvm ir * this makes fma work at least, but no faster * USE_4x4 * 63 GFLOPS * 87 GFLOPS * that wasn't matmul, 44 GFLOPS now * 82 GFLOPS permuted * this permute too * a little speed for the convs * 45 GFLOPS * speed tests pass again * clean up prints * fix FMA WHAT A WASTE OF TIME * colors * moar fair * GPU * useless on chonker * cleanups * improve factorized shapetracker * better threshold * label conv * work * ops test pass again * hot load the index * run the last view, no need to create * ZeroView needs a repr for the key to work * fix segfault on out of bounds * one more test * start amx, and llvm.initialize_native_asmparser * amx works * nice AMX class * nicer AMX class * refactor get_idxs * amx working * is slower... * useless flip * cache * SZ_X * AMX_SZ_X/Y work alone * Contiguous mlop * test gemm packed * PREPARE in packed * use_amx factor * prefetch isn't faster * loop * same 3ms * 2.24 ms * allow double on store in TG * amx reduce is the same speed as non amx reduce * include memory bandwidth * clean up shapetracker * flip returns stride * prepare for upstream * Update ops_llvm.py (#426) * permutes are yellow and green now * faster conv * llvm cleanups * Show optimised IR under debug 4 (#428) * ASTKernel class * Make tinygrad work with older python version (#427) * Make tinygrad work with older python version * Use partialmethod instead of partial * smiple chonker is chonking * remove junk from test speed vs torch * fix linker and types * AMX is only here now * add LLVM tests, it's a valid backend now * oops, run llvm test * contiguous_op * fix loadops compare * dedup reduceops Co-authored-by: calledit <1573053+calledit@users.noreply.github.com>
2022-11-11 15:17:09 +08:00
2022-11-08 13:12:08 +08:00
tiny_a = Tensor(torch_a.cpu().numpy())
2023-02-12 23:43:17 +08:00
tiny_b = Tensor(torch_b.cpu().numpy()) if not onearg else None
2022-11-08 13:12:08 +08:00
helper_test_generic(f"{name:30s} {N:5d}x{N:5d}", f1, (torch_a, torch_b), TinyJit(lambda a,b:f2(a,b).realize()), (tiny_a, tiny_b))
prefix = None
2023-02-12 23:43:17 +08:00
def helper_test_generic(name, f1, f1_args, f2, f2_args):
global prefix
2022-11-08 13:12:08 +08:00
with torch.no_grad():
2023-02-12 23:43:17 +08:00
val_torch, et_torch = helper_test_speed(f1, *f1_args)
val_tinygrad, et_tinygrad = helper_test_speed(f2, *f2_args)
2022-11-08 13:12:08 +08:00
desc = "faster" if et_torch > et_tinygrad else "slower"
flops = save_ops*1e-6
2023-03-11 01:44:12 +08:00
mem = save_mem*1e-6
print(("\r" if not CI else "")+f"{name:42s} {et_torch:7.2f} ms ({flops/et_torch:8.2f} GFLOPS {mem/et_torch:8.2f} GB/s) in torch, {et_tinygrad:7.2f} ms ({flops/et_tinygrad:8.2f} GFLOPS {mem/et_tinygrad:8.2f} GB/s) in tinygrad, {colorize_float(et_tinygrad/et_torch)} {desc} {flops:10.2f} MOPS {mem:8.2f} MB")
2022-11-08 13:12:08 +08:00
np.testing.assert_allclose(val_tinygrad, val_torch, atol=1e-4, rtol=1e-3)
2022-10-29 02:22:15 +08:00
def helper_test_conv(bs, in_chans, out_chans, kernel_size, img_size_y, img_size_x):
torch.manual_seed(0)
torch_dat = torch.rand(bs, in_chans, img_size_y, img_size_x).to(torch_device)
torch_conv = torch.nn.Conv2d(in_chans, out_chans, kernel_size, bias=None).to(torch_device)
tiny_dat = Tensor(torch_dat.cpu().numpy())
tiny_conv = Conv2d(in_chans, out_chans, kernel_size, bias=None)
tiny_conv.weight = Tensor(torch_conv.weight.detach().cpu().numpy())
def f1(torch_dat): return torch_conv(torch_dat)
def f2(tiny_dat): return tiny_conv(tiny_dat).realize()
2023-06-27 09:55:27 +08:00
helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:{kernel_size}", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))
@unittest.skipIf(getenv("BIG") == 0, "no big tests")
class TestBigSpeed(unittest.TestCase):
2023-06-27 09:55:27 +08:00
def test_add(self):
def f(a, b): return a+b
helper_test_generic_square('add', 8192, f, f)
def test_exp(self):
def f(a, b): return a.exp()
helper_test_generic_square('exp', 8192, f, f, onearg=True)
def test_gemm_2048(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 2048, f, f)
def test_gemm_4096(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 4096, f, f)
def test_large_conv_1x1(self): helper_test_conv(bs=32, in_chans=128, out_chans=128, kernel_size=1, img_size_y=128, img_size_x=128)
def test_large_conv_3x3(self): helper_test_conv(bs=4, in_chans=128, out_chans=128, kernel_size=3, img_size_y=130, img_size_x=130)
def test_large_conv_5x5(self): helper_test_conv(bs=4, in_chans=128, out_chans=128, kernel_size=5, img_size_y=130, img_size_x=130)
@unittest.skipIf(getenv("BIG") == 1, "only big tests")
2022-10-30 04:42:33 +08:00
class TestSpeed(unittest.TestCase):
def test_sub(self):
def f(a, b): return a-b
helper_test_generic_square('sub', 4096, f, f)
@unittest.skipIf(getenv("CI","")!="" and Device.DEFAULT == "WEBGPU", "breaking on webgpu CI")
def test_pow(self):
def f(a, b): return a.pow(b)
helper_test_generic_square('pow', 2048, f, f)
2022-11-08 13:12:08 +08:00
def test_sum(self):
def f(a, b): return a.sum()
2023-02-18 04:31:05 +08:00
helper_test_generic_square('sum', 2048, f, f, onearg=True)
2023-02-12 23:43:17 +08:00
helper_test_generic_square('sum', 4096, f, f, onearg=True)
2022-11-08 13:12:08 +08:00
2023-01-23 13:28:40 +08:00
def test_partial_sum(self):
R = 256
def f(a, b): return a.reshape(int(4096//R), int(4096*R)).sum(axis=1)
2023-02-12 23:43:17 +08:00
helper_test_generic_square('partial_sum', 4096, f, f, onearg=True)
2023-07-20 00:37:23 +08:00
@unittest.skip("not really used in models")
def test_cumsum(self):
def f0(a, b): return a.cumsum(axis=0)
def f1(a, b): return a.cumsum(axis=1)
helper_test_generic_square('cumsum_0', 256, f0, f0, onearg=True)
helper_test_generic_square('cumsum_1', 256, f1, f1, onearg=True)
2023-07-20 00:37:23 +08:00
def test_cat(self):
helper_test_generic_square('cat_0', 256, lambda x,y: torch.cat((x,y),dim=0), lambda x,y: x.cat(y,dim=0))
helper_test_generic_square('cat_1', 256, lambda x,y: torch.cat((x,y),dim=1), lambda x,y: x.cat(y,dim=1))
Simple chonker (#431) * chonker will make llvm fast * work * better speed tests, we will make them fast * with the cache add is the same speed * relu and neg are fast * fix sum speed * maximum maxnum? * hack for gemm opt * gemm very slow * zeros like * test_permute * shapetracker returns self * fix shapetracker factorization * err, int strides * permutes are faster now in tinygrad than pytorch * support -1 in expand * gemm unrolled * improve final test case * WIP GEMM * why isn't GEMM fast? * revert cache dim * ffp contract works on clang, not llvm? * ignore llvm ir * this makes fma work at least, but no faster * USE_4x4 * 63 GFLOPS * 87 GFLOPS * that wasn't matmul, 44 GFLOPS now * 82 GFLOPS permuted * this permute too * a little speed for the convs * 45 GFLOPS * speed tests pass again * clean up prints * fix FMA WHAT A WASTE OF TIME * colors * moar fair * GPU * useless on chonker * cleanups * improve factorized shapetracker * better threshold * label conv * work * ops test pass again * hot load the index * run the last view, no need to create * ZeroView needs a repr for the key to work * fix segfault on out of bounds * one more test * start amx, and llvm.initialize_native_asmparser * amx works * nice AMX class * nicer AMX class * refactor get_idxs * amx working * is slower... * useless flip * cache * SZ_X * AMX_SZ_X/Y work alone * Contiguous mlop * test gemm packed * PREPARE in packed * use_amx factor * prefetch isn't faster * loop * same 3ms * 2.24 ms * allow double on store in TG * amx reduce is the same speed as non amx reduce * include memory bandwidth * clean up shapetracker * flip returns stride * prepare for upstream * Update ops_llvm.py (#426) * permutes are yellow and green now * faster conv * llvm cleanups * Show optimised IR under debug 4 (#428) * ASTKernel class * Make tinygrad work with older python version (#427) * Make tinygrad work with older python version * Use partialmethod instead of partial * smiple chonker is chonking * remove junk from test speed vs torch * fix linker and types * AMX is only here now * add LLVM tests, it's a valid backend now * oops, run llvm test * contiguous_op * fix loadops compare * dedup reduceops Co-authored-by: calledit <1573053+calledit@users.noreply.github.com>
2022-11-11 15:17:09 +08:00
def test_array_packing(self):
N = 2048
Simple chonker (#431) * chonker will make llvm fast * work * better speed tests, we will make them fast * with the cache add is the same speed * relu and neg are fast * fix sum speed * maximum maxnum? * hack for gemm opt * gemm very slow * zeros like * test_permute * shapetracker returns self * fix shapetracker factorization * err, int strides * permutes are faster now in tinygrad than pytorch * support -1 in expand * gemm unrolled * improve final test case * WIP GEMM * why isn't GEMM fast? * revert cache dim * ffp contract works on clang, not llvm? * ignore llvm ir * this makes fma work at least, but no faster * USE_4x4 * 63 GFLOPS * 87 GFLOPS * that wasn't matmul, 44 GFLOPS now * 82 GFLOPS permuted * this permute too * a little speed for the convs * 45 GFLOPS * speed tests pass again * clean up prints * fix FMA WHAT A WASTE OF TIME * colors * moar fair * GPU * useless on chonker * cleanups * improve factorized shapetracker * better threshold * label conv * work * ops test pass again * hot load the index * run the last view, no need to create * ZeroView needs a repr for the key to work * fix segfault on out of bounds * one more test * start amx, and llvm.initialize_native_asmparser * amx works * nice AMX class * nicer AMX class * refactor get_idxs * amx working * is slower... * useless flip * cache * SZ_X * AMX_SZ_X/Y work alone * Contiguous mlop * test gemm packed * PREPARE in packed * use_amx factor * prefetch isn't faster * loop * same 3ms * 2.24 ms * allow double on store in TG * amx reduce is the same speed as non amx reduce * include memory bandwidth * clean up shapetracker * flip returns stride * prepare for upstream * Update ops_llvm.py (#426) * permutes are yellow and green now * faster conv * llvm cleanups * Show optimised IR under debug 4 (#428) * ASTKernel class * Make tinygrad work with older python version (#427) * Make tinygrad work with older python version * Use partialmethod instead of partial * smiple chonker is chonking * remove junk from test speed vs torch * fix linker and types * AMX is only here now * add LLVM tests, it's a valid backend now * oops, run llvm test * contiguous_op * fix loadops compare * dedup reduceops Co-authored-by: calledit <1573053+calledit@users.noreply.github.com>
2022-11-11 15:17:09 +08:00
def f(a, b): return a.reshape(N, N // 32, 32).permute(1,0,2).contiguous()
2023-02-12 23:43:17 +08:00
helper_test_generic_square('array_packing', N, f, f, onearg=True)
Simple chonker (#431) * chonker will make llvm fast * work * better speed tests, we will make them fast * with the cache add is the same speed * relu and neg are fast * fix sum speed * maximum maxnum? * hack for gemm opt * gemm very slow * zeros like * test_permute * shapetracker returns self * fix shapetracker factorization * err, int strides * permutes are faster now in tinygrad than pytorch * support -1 in expand * gemm unrolled * improve final test case * WIP GEMM * why isn't GEMM fast? * revert cache dim * ffp contract works on clang, not llvm? * ignore llvm ir * this makes fma work at least, but no faster * USE_4x4 * 63 GFLOPS * 87 GFLOPS * that wasn't matmul, 44 GFLOPS now * 82 GFLOPS permuted * this permute too * a little speed for the convs * 45 GFLOPS * speed tests pass again * clean up prints * fix FMA WHAT A WASTE OF TIME * colors * moar fair * GPU * useless on chonker * cleanups * improve factorized shapetracker * better threshold * label conv * work * ops test pass again * hot load the index * run the last view, no need to create * ZeroView needs a repr for the key to work * fix segfault on out of bounds * one more test * start amx, and llvm.initialize_native_asmparser * amx works * nice AMX class * nicer AMX class * refactor get_idxs * amx working * is slower... * useless flip * cache * SZ_X * AMX_SZ_X/Y work alone * Contiguous mlop * test gemm packed * PREPARE in packed * use_amx factor * prefetch isn't faster * loop * same 3ms * 2.24 ms * allow double on store in TG * amx reduce is the same speed as non amx reduce * include memory bandwidth * clean up shapetracker * flip returns stride * prepare for upstream * Update ops_llvm.py (#426) * permutes are yellow and green now * faster conv * llvm cleanups * Show optimised IR under debug 4 (#428) * ASTKernel class * Make tinygrad work with older python version (#427) * Make tinygrad work with older python version * Use partialmethod instead of partial * smiple chonker is chonking * remove junk from test speed vs torch * fix linker and types * AMX is only here now * add LLVM tests, it's a valid backend now * oops, run llvm test * contiguous_op * fix loadops compare * dedup reduceops Co-authored-by: calledit <1573053+calledit@users.noreply.github.com>
2022-11-11 15:17:09 +08:00
2022-11-08 13:12:08 +08:00
def test_permute(self):
Simple chonker (#431) * chonker will make llvm fast * work * better speed tests, we will make them fast * with the cache add is the same speed * relu and neg are fast * fix sum speed * maximum maxnum? * hack for gemm opt * gemm very slow * zeros like * test_permute * shapetracker returns self * fix shapetracker factorization * err, int strides * permutes are faster now in tinygrad than pytorch * support -1 in expand * gemm unrolled * improve final test case * WIP GEMM * why isn't GEMM fast? * revert cache dim * ffp contract works on clang, not llvm? * ignore llvm ir * this makes fma work at least, but no faster * USE_4x4 * 63 GFLOPS * 87 GFLOPS * that wasn't matmul, 44 GFLOPS now * 82 GFLOPS permuted * this permute too * a little speed for the convs * 45 GFLOPS * speed tests pass again * clean up prints * fix FMA WHAT A WASTE OF TIME * colors * moar fair * GPU * useless on chonker * cleanups * improve factorized shapetracker * better threshold * label conv * work * ops test pass again * hot load the index * run the last view, no need to create * ZeroView needs a repr for the key to work * fix segfault on out of bounds * one more test * start amx, and llvm.initialize_native_asmparser * amx works * nice AMX class * nicer AMX class * refactor get_idxs * amx working * is slower... * useless flip * cache * SZ_X * AMX_SZ_X/Y work alone * Contiguous mlop * test gemm packed * PREPARE in packed * use_amx factor * prefetch isn't faster * loop * same 3ms * 2.24 ms * allow double on store in TG * amx reduce is the same speed as non amx reduce * include memory bandwidth * clean up shapetracker * flip returns stride * prepare for upstream * Update ops_llvm.py (#426) * permutes are yellow and green now * faster conv * llvm cleanups * Show optimised IR under debug 4 (#428) * ASTKernel class * Make tinygrad work with older python version (#427) * Make tinygrad work with older python version * Use partialmethod instead of partial * smiple chonker is chonking * remove junk from test speed vs torch * fix linker and types * AMX is only here now * add LLVM tests, it's a valid backend now * oops, run llvm test * contiguous_op * fix loadops compare * dedup reduceops Co-authored-by: calledit <1573053+calledit@users.noreply.github.com>
2022-11-11 15:17:09 +08:00
for N in [1024, 4096]:
# this is a 64MB tensor, M1 L1 cache is 128kB
# to fit easily in L1, rotations should be 128x128 chunks. 128x128 is also the AMX size
def f(a, b): return a.permute(1,0).contiguous()
2023-02-12 23:43:17 +08:00
helper_test_generic_square('permute', N, f, f, onearg=True)
2023-03-11 01:44:12 +08:00
def test_double_permute(self):
N = 64
torch.manual_seed(0)
torch_a = (torch.rand(N, N, N, N) - 0.5).to(torch_device)
tiny_a = Tensor(torch_a.cpu().numpy())
def f(a): return a.permute(1,0,3,2).contiguous()
2023-02-12 23:43:17 +08:00
helper_test_generic(f"double_permute {tiny_a.shape}", f, (torch_a,), TinyJit(lambda a: f(a).realize()), (tiny_a,))
2022-11-08 13:12:08 +08:00
def test_neg(self):
def f(a, b): return -a
2023-02-12 23:43:17 +08:00
helper_test_generic_square('neg', 4096, f, f, onearg=True)
2022-11-08 13:12:08 +08:00
def test_exp(self):
def f(a, b): return a.exp()
2023-02-12 23:43:17 +08:00
helper_test_generic_square('exp', 2048, f, f, onearg=True)
2022-11-08 13:12:08 +08:00
def test_relu(self):
def f(a, b): return a.relu()
2023-02-12 23:43:17 +08:00
helper_test_generic_square('relu', 4096, f, f, onearg=True)
2022-11-08 13:12:08 +08:00
def test_max(self):
def f(a, b): return a.max()
2023-02-12 23:43:17 +08:00
helper_test_generic_square('max', 4096, f, f, onearg=True)
2022-11-08 13:12:08 +08:00
def test_mul_sum(self):
def f(a, b): return (a*b).sum()
2022-11-08 13:27:56 +08:00
helper_test_generic_square('mul_sum', 4096, f, f)
2022-11-08 13:12:08 +08:00
def test_add(self):
2023-01-29 16:23:06 +08:00
for N in [1, 1024, 4096]:
2022-11-08 13:12:08 +08:00
def f(a, b): return a + b
2022-11-08 13:27:56 +08:00
helper_test_generic_square('add', N, f, f)
2022-11-08 13:12:08 +08:00
2023-01-26 10:11:26 +08:00
def test_add_constant(self):
def f(a, b): return a+2.0
2023-02-12 23:43:17 +08:00
helper_test_generic_square('add_constant', 4096, f, f, onearg=True)
2023-01-26 10:11:26 +08:00
2022-11-08 13:12:08 +08:00
def test_add_sq(self):
def f(a, b): return a*a + b*b
2022-11-08 13:27:56 +08:00
helper_test_generic_square('add_sq', 4096, f, f)
2022-11-08 13:12:08 +08:00
2022-10-30 04:42:33 +08:00
def test_gemm(self):
2022-11-08 13:12:08 +08:00
def f(a, b): return a @ b
2023-03-25 01:24:27 +08:00
helper_test_generic_square('gemm', 1024, f, f)
2022-11-08 13:12:08 +08:00
def test_gemm_small(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 256, f, f)
2022-11-08 13:12:08 +08:00
def test_gemm_unrolled(self):
N = 512
def f1(a, b): return a@b.T
def f2(a, b): return (a.reshape(N, 1, N).expand(N, N, N) * b.reshape(1, N, N).expand(N, N, N)).sum(axis=2)
2022-11-08 13:27:56 +08:00
helper_test_generic_square('gemm_unrolled', N, f1, f2)
2023-03-11 01:44:12 +08:00
Simple chonker (#431) * chonker will make llvm fast * work * better speed tests, we will make them fast * with the cache add is the same speed * relu and neg are fast * fix sum speed * maximum maxnum? * hack for gemm opt * gemm very slow * zeros like * test_permute * shapetracker returns self * fix shapetracker factorization * err, int strides * permutes are faster now in tinygrad than pytorch * support -1 in expand * gemm unrolled * improve final test case * WIP GEMM * why isn't GEMM fast? * revert cache dim * ffp contract works on clang, not llvm? * ignore llvm ir * this makes fma work at least, but no faster * USE_4x4 * 63 GFLOPS * 87 GFLOPS * that wasn't matmul, 44 GFLOPS now * 82 GFLOPS permuted * this permute too * a little speed for the convs * 45 GFLOPS * speed tests pass again * clean up prints * fix FMA WHAT A WASTE OF TIME * colors * moar fair * GPU * useless on chonker * cleanups * improve factorized shapetracker * better threshold * label conv * work * ops test pass again * hot load the index * run the last view, no need to create * ZeroView needs a repr for the key to work * fix segfault on out of bounds * one more test * start amx, and llvm.initialize_native_asmparser * amx works * nice AMX class * nicer AMX class * refactor get_idxs * amx working * is slower... * useless flip * cache * SZ_X * AMX_SZ_X/Y work alone * Contiguous mlop * test gemm packed * PREPARE in packed * use_amx factor * prefetch isn't faster * loop * same 3ms * 2.24 ms * allow double on store in TG * amx reduce is the same speed as non amx reduce * include memory bandwidth * clean up shapetracker * flip returns stride * prepare for upstream * Update ops_llvm.py (#426) * permutes are yellow and green now * faster conv * llvm cleanups * Show optimised IR under debug 4 (#428) * ASTKernel class * Make tinygrad work with older python version (#427) * Make tinygrad work with older python version * Use partialmethod instead of partial * smiple chonker is chonking * remove junk from test speed vs torch * fix linker and types * AMX is only here now * add LLVM tests, it's a valid backend now * oops, run llvm test * contiguous_op * fix loadops compare * dedup reduceops Co-authored-by: calledit <1573053+calledit@users.noreply.github.com>
2022-11-11 15:17:09 +08:00
def test_gemm_unrolled_permute_l(self):
N = 512
def f1(a, b): return a.T@b.T
def f2(a, b): return (a.permute(1,0).reshape(N, 1, N).expand(N, N, N) * b.reshape(1, N, N).expand(N, N, N)).sum(axis=2)
helper_test_generic_square('gemm_unrolled_permute_l', N, f1, f2)
2022-11-08 13:12:08 +08:00
def test_gemm_unrolled_permute_r(self):
N = 512
def f1(a, b): return a@b
def f2(a, b): return (a.reshape(N, 1, N).expand(N, N, N) * b.permute(1,0).reshape(1, N, N).expand(N, N, N)).sum(axis=2)
2022-11-08 13:27:56 +08:00
helper_test_generic_square('gemm_unrolled_permute_r', N, f1, f2)
2022-11-08 13:12:08 +08:00
def test_gemm_unrolled_permute_lr(self):
N = 512
def f1(a, b): return a.T@b
def f2(a, b): return (a.permute(1,0).reshape(N, 1, N).expand(N, N, N) * b.permute(1,0).reshape(1, N, N).expand(N, N, N)).sum(axis=2)
2022-11-08 13:27:56 +08:00
helper_test_generic_square('gemm_unrolled_permute_lr', N, f1, f2)
2022-10-30 04:42:33 +08:00
def test_openpilot_conv2d(self):
bs, in_chans, out_chans = 1,12,32
torch.manual_seed(0)
torch_dat = torch.rand(bs, 64, 128, 12).to(torch_device)
torch_conv = torch.nn.Conv2d(in_chans, out_chans, 3, bias=None, padding=1).to(torch_device)
tiny_dat = Tensor(torch_dat.cpu().numpy())
tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
tiny_conv.weight = Tensor(torch_conv.weight.detach().cpu().numpy())
2023-02-12 23:43:17 +08:00
def f1(torch_dat): return torch_conv(torch_dat.permute(0,3,1,2))
def f2(tiny_dat): return tiny_conv(tiny_dat.permute(0,3,1,2)).realize()
2023-07-20 00:37:23 +08:00
helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:3", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))
2022-10-11 07:06:00 +08:00
def test_conv2d(self):
for bs in [32]:
2022-10-29 02:22:15 +08:00
for in_chans in IN_CHANS:
2022-11-08 13:12:08 +08:00
for out_chans in [32]:
helper_test_conv(bs, in_chans, out_chans, 3, 34, 34)
2022-10-11 07:06:00 +08:00
if __name__ == '__main__':
unittest.main()