2023-09-29 19:40:35 +08:00
|
|
|
import unittest
|
2024-01-29 01:21:26 +08:00
|
|
|
from tinygrad import Tensor, GlobalCounters
|
2024-08-17 06:17:57 +08:00
|
|
|
from tinygrad.ops import UOps
|
Linearizer -> Lowerer (#4957)
* st to uops function
* lowerer
* uops reduce
* uops reduce
* acc_number correct
* reduce unroll
* complete unroll
* do upcasts
* handle multioutput
* define_accs
* fix valid
* get grouped dims
* revert lin
* minor
* fixup_ast
* group for reduce
* group works now
* all forwards pass
* all ops tests pass
* fix clang
* mypy
* lil cleanups, no image yet
* ugh, variables everywhere
* bugfix
* counters and name fix
* use symbolic, not uops
* cleanups
* Fix tests
* linearizer tests
* expands
* float4 expand load
* tests pass
* woooo, float4 test
* test ops works again
* one more lin test
* more lin tests
* bypass
* fix tests
* something like this
* const in defineacc
* uops get_reduce_acc
* move around
* allow consts in the LOAD/STORE
* each axis should only appear once, 21 failures
* 16 failures
* fix some image
* optional float4
* onnx tests
* gate the stores
* add reorder
* fix terrible skip function
* tc work
* opt add/mul merge
* fix float4 tests
* tiny tweak, 9 failing
* 7 test failures
* start tc, but i don't think this will work
* progress on tensorcores
* note
* fix ops tests
* closer on tc
* weeee...one tensor core works
* still works, more generic
* large WMMA works
* tc test passes
* use WMMA as accumulator
* basic tc tests passing
* small gemm padded works
* 4 failures
* 3 tests failing
* super barrier
* now two tests failing
* one test failing
* cleanpus, add reduce to UopGraph
* remove the linearizer
* remove unused
* lil cleanups
* Lowerer everywhere
* remove test that doesn't exist now
* image indexing
* llvm fix
* fix metal
* fix image
* fix images
* might fix ptx
* fix image type mismatch
* more tests pass
* CAST -> VECTORIZE
* forgot that one
* fix TestOps.test_flip_eye_crash
* locals shouldn't be image dtype
* change less files
* test fix
* fix recursive expands
* touches
* MULACC support in python
* delete unneeded
* alu before contract
* bug fixes
* tests
* no var multireduce
* simpler tc
* metal works in new style
* working on AMD and METAL
* fix amd
* shot in the dark, fix amd
* something for CUDA
* CUDA WORKS from the docs
* comment
* correct merge
* cleanups + ptx fix + get_reduce_acc
* local alias isn't used anymore
* add store sanity check
* fix for AMD
* cleanups and single expand pass
* more correct with acc_cache
* tests should pass
* block on WMMA
* tests pass
* merge contract and reduce
* contractor fixes issue
* multicontract
* pre expand wmma (same as a reduce)
* expand wmma and only take one
* all expands
* comments and whitespace
2024-07-11 06:07:42 +08:00
|
|
|
from tinygrad.helpers import Timing, CI, Profiling, WINO, DEBUG, getenv
|
2024-07-13 09:50:55 +08:00
|
|
|
from tinygrad.codegen.kernel import Kernel
|
2024-03-27 12:02:46 +08:00
|
|
|
from tinygrad.engine.schedule import create_schedule
|
2023-09-29 19:40:35 +08:00
|
|
|
|
|
|
|
class TestWinograd(unittest.TestCase):
  """Tests for ``Tensor.conv2d`` run with the ``WINO`` context variable set to 1.

  ``setUp`` forces ``WINO.value`` to 1 and ``tearDown`` restores the previous
  value, so an individual test may change ``WINO.value`` without leaking the
  change into other tests.
  """

  def setUp(self):
    """Remember the current ``WINO`` value, then enable it for the test."""
    self.old = WINO.value
    WINO.value = 1

  def tearDown(self):
    """Restore the ``WINO`` value saved by ``setUp``."""
    WINO.value = self.old

  def test_speed(self):
    """Time conv construction, scheduling and kernel linearization.

    Also guards against regressions in the number of shapetrackers per kernel
    and in the number of views per shapetracker.
    """
    x = Tensor.empty(1,4,9,9)
    w = Tensor.empty(4,4,3,3)

    with Timing("running conv: "):
      out = Tensor.conv2d(x, w)

    with Timing("scheduling: "):
      sched = create_schedule([out.lazydata])

    for i,s in enumerate(sched):
      # only schedule items whose AST root is a SINK are turned into kernels here
      if s.ast.op is not UOps.SINK: continue
      ops = s.ast.parents
      with Timing(f"linearize {i} with {len(ops):4d} ops: "):
        kernel = Kernel(s.ast)
        kernel.hand_coded_optimizations()
        kernel.linearize()
      # unittest assertions (not bare asserts) so the checks survive `python -O`
      self.assertLessEqual(len(kernel.sts), 256)  # just the current value to prevent regression
      if DEBUG >= 2:
        print(f"{len(kernel.sts):4d} shapetrackers with max {max(len(t.views) for t in kernel.sts)} views")
      for st in kernel.sts:
        self.assertLessEqual(len(st.views), 2, "too many views in winograd")
        if DEBUG >= 3:
          print(f"{len(st.views):3d} views")
          for v in st.views: print(v)

  def test_profile(self):
    """Run a realized conv2d under ``Profiling`` (profiling is disabled when ``CI`` is set)."""
    x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
    with Profiling(enabled=not CI, sort='time'):
      out = Tensor.conv2d(x,w).realize()
    out.numpy()

  def test_four_kernels(self):
    """Realizing the conv must launch exactly four kernels."""
    x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
    # reset after the inputs are realized so only the conv's kernels are counted
    GlobalCounters.reset()
    out = Tensor.conv2d(x,w).realize()
    self.assertEqual(GlobalCounters.kernel_count, 4)
    out.numpy()

  @unittest.skipIf(getenv("PTX"), "winograd uses too much in PTX")
  def test_counters(self):
    """Compare ``GlobalCounters`` ops/mem of the same conv with ``WINO`` set to 1 and to 0."""
    IC, OC, X, Y = 4,4,9,9
    # alternative (larger) sizes:
    #OC, IC, X, Y = 512, 256, 8, 8
    x,w = Tensor.rand(1,IC,Y,X).realize(), Tensor.rand(OC,IC,3,3).realize()

    # first measurement: WINO is 1 (set by setUp)
    GlobalCounters.reset()
    Tensor.conv2d(x,w).realize()
    ops_wino, mem_wino = GlobalCounters.global_ops, GlobalCounters.global_mem

    # second measurement: WINO switched to 0; tearDown restores the saved value afterwards
    WINO.value = 0
    GlobalCounters.reset()
    Tensor.conv2d(x,w).realize()
    ops_normal, mem_normal = GlobalCounters.global_ops, GlobalCounters.global_mem

    ops_ratio, mem_ratio = ops_wino/ops_normal, mem_wino/mem_normal
    print(f"ops: normal {ops_normal:9d} wino {ops_wino:9d} ratio {ops_ratio:.2f}")
    print(f"mem: normal {mem_normal:9d} wino {mem_wino:9d} ratio {mem_ratio:.2f}")
    self.assertLess(ops_ratio, 2.6)  # TODO: there's issues with factorization now
    self.assertLess(mem_ratio, 10)
|
2024-02-12 23:26:50 +08:00
|
|
|
|
2023-09-29 19:40:35 +08:00
|
|
|
# Script entry point: run this module's tests with verbose per-test output.
if __name__ == "__main__":
  unittest.main(verbosity=2)
|