From 2fb10f6a1970ba15db7bb4cc86b9586ebd5cf6ad Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Mon, 16 Oct 2023 14:26:32 -0700
Subject: [PATCH] limit metal buffers and revert the 207 fix (#2087)

* limit metal buffers

* Revert "Revert "openpilot kernel fix from 209 to 207 (#2006)" (#2065)"

This reverts commit 924ecc4d6a05327814d9b373beafdef6383ddaf1.
---
 .github/workflows/test.yml         |  2 +-
 test/external/external_test_opt.py |  6 +++---
 test/test_schedule.py              |  6 ++++++
 tinygrad/lazy.py                   | 22 +++++++++++++++-------
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 184faf6e..4fe9cd7b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -154,7 +154,7 @@ jobs:
       - if: ${{ matrix.task == 'openpilot' }}
         name: Test openpilot model compile and size
         run: |
-          DEBUG=2 ALLOWED_KERNEL_COUNT=209 VALIDTEST=1 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile.py
+          DEBUG=2 ALLOWED_KERNEL_COUNT=207 VALIDTEST=1 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile.py
           python -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000'
       - if: ${{ matrix.task == 'openpilot' }}
         name: Test openpilot model correctness (float32)
diff --git a/test/external/external_test_opt.py b/test/external/external_test_opt.py
index 48dc84e4..25a0b0cc 100644
--- a/test/external/external_test_opt.py
+++ b/test/external/external_test_opt.py
@@ -64,7 +64,7 @@ class TestInferenceMinKernels(unittest.TestCase):
     for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
     img = Tensor.randn(1, 3, 224, 224)
     # TODO: this seems very high
-    with CLCache(116):
+    with CLCache(115):
       model.forward(img).realize()
 
   def test_resnet(self):
@@ -78,7 +78,7 @@ class TestInferenceMinKernels(unittest.TestCase):
     model = ViT(embed_dim=192, num_heads=3)
     for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
     img = Tensor.randn(1, 3, 224, 224)
-    with CLCache(223): # NOTE: this is way too high
+    with CLCache(222): # NOTE: this is way too high
       out = model.forward(img)
       assert len(CacheCollector.cache) == 0, "ViT prerealized?"
       out.realize()
@@ -88,7 +88,7 @@ class TestInferenceMinKernels(unittest.TestCase):
     args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
     model = Transformer(**args_tiny)
     for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
-    with CLCache(94):
+    with CLCache(85):
       model(Tensor([[1,2,3,4]]), 0).realize()
 
 @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
diff --git a/test/test_schedule.py b/test/test_schedule.py
index d5dc2b21..dfd2c6c8 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -326,5 +326,11 @@ class TestSchedule(unittest.TestCase):
     out = x.to('cpu')
     check_schedule(out, 0, filter_loadops=False)
 
+  @unittest.skipUnless(Device.DEFAULT == "METAL", "only for metal")
+  def test_metal_limit_buffers(self):
+    t = sum([Tensor([1,2,3,4]) for _ in range(40)])
+    for si in t.lazydata.schedule():
+      assert len(si.inputs) <= 30
+
 if __name__ == '__main__':
   unittest.main(verbosity=2)
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index c073f13a..f3b8e9c4 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -202,9 +202,9 @@ class LazyBuffer:
 
   # *** elementwise ops ***
 
-  def e(self:LazyBuffer, op:Union[UnaryOps, BinaryOps, TernaryOps], *srcs:LazyBuffer, arg:Optional[Any]=None) -> LazyBuffer:
+  def e(self:LazyBuffer, op:Union[UnaryOps, BinaryOps, TernaryOps], *_srcs:LazyBuffer, arg:Optional[Any]=None) -> LazyBuffer:
     # srcs includes self
-    srcs = (self,)+srcs
+    srcs:Tuple[LazyBuffer, ...] = (self,)+_srcs
 
     # if we are separated from other binary ops by movement ops, we push those movement ops above those binaryops
     if SHUFFLE_MOVEMENT_OPS: srcs = _push_movement_ops(srcs)
@@ -225,9 +225,13 @@ class LazyBuffer:
 
     if MERGE_ELEMENTWISE_OPS:
       # remove the buffers from any (childless) BinaryOps that feed into this
-      srcs = tuple([x.op if x.optype == BinaryOps and not x.children and not x.realized else x for x in srcs])  # type: ignore
+      merged_srcs:Tuple[Union[LazyOp, LazyBuffer], ...] = tuple([x.op if x.optype == BinaryOps and not x.children and not x.realized else x for x in srcs])  # type: ignore
+      # NOTE: this is incomplete, you can still fuse with reduce ops and exceed the limit
+      merged_srcs = merged_srcs if self.device != "METAL" or sum(len(x.buffers) for x in merged_srcs) < 30 else srcs
+    else:
+      merged_srcs = srcs
 
-    return create_lazybuffer(out_device, ShapeTracker.from_shape(out_shape), BinaryOps, LazyOp(op, srcs, arg), out_dtype)
+    return create_lazybuffer(out_device, ShapeTracker.from_shape(out_shape), BinaryOps, LazyOp(op, merged_srcs, arg), out_dtype)
 
   # *** reduce ops ***
 
@@ -247,8 +251,12 @@ class LazyBuffer:
   # *** movement ops ***
 
   def _movement_op(self, st: ShapeTracker, op: MovementOps, arg: Union[Tuple[sint, ...], Tuple[Tuple[sint, sint], ...]]) -> LazyBuffer:
-    if SHUFFLE_MOVEMENT_OPS and self.optype == BinaryOps and not self.realized and (op in {MovementOps.SHRINK, MovementOps.STRIDE, MovementOps.PERMUTE} or (op == MovementOps.RESHAPE and self.op.op in UnaryOps)) and not self.children:
-      return self.op.replace_with_movement_ops([(op, arg)])
+    if SHUFFLE_MOVEMENT_OPS and not self.realized and self.optype == BinaryOps and not self.children:
+      base_bufs = (x.base for x in self.op.buffers)
+      # don't push if all ast buffers (.base) are realized or sourceless
+      push_reshape_safe = (self.op.op in UnaryOps) or (any(isinstance(x, LazyOp) or not x.children for x in self.op.src) and not all(x.realized or len(x.op.src) == 0 for x in base_bufs))
+      if op not in {MovementOps.EXPAND, MovementOps.PAD} and (op is not MovementOps.RESHAPE or push_reshape_safe):
+        return self.op.replace_with_movement_ops([(op, arg)])
     if REMOVE_MOVEMENT_NOPS and not self.realized and st.contiguous:
       # MovementOps aren't stacked any more, they each have one parent, find the root
       root = get_movementroot(self)
@@ -324,7 +332,7 @@ def _push_movement_ops(srcs:Tuple[LazyBuffer, ...]) -> Tuple[LazyBuffer, ...]:
       assert isinstance(bx.op.src[0], LazyBuffer)
       bx = bx.op.src[0]
     # NOTE: can't push pads past anything where f(0, 0) != 0 or f(0) != 0
-    if mops and not bx.realized and bx.optype is BinaryOps and len(bx.children) <= 1 and (all(x[0] is not MovementOps.PAD for x in mops) or all(x.op not in UNSAFE_PAD_OPS for x in bx.op.get_lazyops())):
+    if mops and not bx.realized and bx.optype is BinaryOps and len(bx.children) <= 1 and (all(y[0] is not MovementOps.PAD for y in mops) or all(y.op not in UNSAFE_PAD_OPS for y in bx.op.get_lazyops())):
       new_srcs.append(bx.op.replace_with_movement_ops(mops[::-1]))
     else:
       new_srcs.append(x)