From ebdec2b72f369d9c09536132d6b9f6ea74fae839 Mon Sep 17 00:00:00 2001
From: George Hotz <geohot@gmail.com>
Date: Sun, 29 Jan 2023 00:23:06 -0800
Subject: [PATCH] fix optimizer

---
 extra/kernel_search.py     | 16 +++++++++++-----
 test/test_speed_v_torch.py |  2 +-
 tinygrad/llops/ops_gpu.py  |  2 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/extra/kernel_search.py b/extra/kernel_search.py
index 1a5c4f6b..a9404abd 100644
--- a/extra/kernel_search.py
+++ b/extra/kernel_search.py
@@ -20,7 +20,7 @@ def get_random_intervention(k):
     while 1:
       up_axis = random.randint(0, k.shape_len-1)
       amount = random.choice([4, 8])
-      if not all(x[up_axis] == 1 or x[up_axis]%amount == 0 for x in k.shapes): continue
+      if not all(st.shape[up_axis] == 1 or st.shape[up_axis]%amount == 0 for st in k.sts): continue
       return 1, up_axis, amount
 
 def apply_intervention(k, typ, *dat):
@@ -63,16 +63,16 @@ def search(ast):
     # TODO: support upcasting, splitting, and local grouping for reduce
     CL.time_sum = 0
     k.codegen()(*k.bufs)
-    if CL.time_sum < best_time:
-      print(f"accepting {inter} with time {best_time} -> {CL.time_sum}")
+    if CL.time_sum < best_time * 0.95:
+      print(f"accepting {inter} with time {best_time:.2f} -> {CL.time_sum:.2f} ratio {best_time/CL.time_sum:.2f}x")
       best_time = CL.time_sum
       winning_interventions.append(inter)
 
-  for i in range(200):
+  for i in range(100):
     try:
       test()
     except Exception as e:
-      #traceback.print_exc()
+      traceback.print_exc()
       pass
 
   # run best
@@ -138,6 +138,12 @@ if __name__ == "__main__":
     op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
     op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 512, 512, 1, 1, 1, 1))
     ast = LazyOp(MovementOps.RESHAPE, (op1,), (512, 512))
+  elif int(os.getenv("FASTCONV", "0")):
+    buf0 = GPUBuffer(shape=ShapeTracker(shape=(32, 1, 32, 32, 32, 64, 3, 3), views=[View((32, 1, 32, 32, 32, 64, 3, 3), (73984, 73984, 0, 34, 1, 1156, 34, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 64, 34, 34), force_create=True))
+    buf1 = GPUBuffer(shape=ShapeTracker(shape=(32, 1, 32, 32, 32, 64, 3, 3), views=[View((32, 1, 32, 32, 32, 64, 3, 3), (0, 0, 576, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 64, 3, 3), force_create=True))
+    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
+    op1 = LazyOp(ReduceOps.SUM, (op0,), (32, 1, 32, 32, 32, 1, 1, 1))
+    ast = LazyOp(MovementOps.RESHAPE, (op1,), (32, 32, 32, 32))
   else:
     # reduce
     buf0 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 32, 225, 225), (50176, 150528, 224, 1), 0), ZeroView((3, 32, 224, 224), ((0, 3), (0, 32), (0, 225), (0, 225))), View((3, 1, 32, 3, 3, 32, 112, 112), (1620000, 1620000, 0, 225, 1, 50625, 450, 2), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 224, 224), force_create=True))
diff --git a/test/test_speed_v_torch.py b/test/test_speed_v_torch.py
index d0ed90a6..61e2e3ed 100644
--- a/test/test_speed_v_torch.py
+++ b/test/test_speed_v_torch.py
@@ -139,7 +139,7 @@ class TestSpeed(unittest.TestCase):
     helper_test_generic_square('mul_sum', 4096, f, f)
 
   def test_add(self):
-    for N in [1024, 4096]:
+    for N in [1, 1024, 4096]:
       def f(a, b): return a + b
       helper_test_generic_square('add', N, f, f)
 
diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py
index a62911be..1ecae944 100644
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -356,7 +356,7 @@ class GPUBuffer(ExplicitExecAST):
   def exec_ast(cls, ast:LazyOp):
     k = CLASTKernel(ast)
     k.codegen()(*k.bufs)
-    if PRINT_AST == "1" or PRINT_AST == k.fxn.name:
+    if PRINT_AST == "1" or (hasattr(k, "fxn") and PRINT_AST == k.fxn.name):
       print(k.fxn.name)
       k.print()
     if TEST_AST: