Revert "openpilot kernel fix from 209 to 207 (#2006)" (#2065)

This reverts commit 63869c62fc.
George Hotz 2023-10-13 12:01:55 -07:00 committed by GitHub
parent 63869c62fc
commit 924ecc4d6a
3 changed files with 7 additions and 11 deletions


@@ -154,7 +154,7 @@ jobs:
- if: ${{ matrix.task == 'openpilot' }}
name: Test openpilot model compile and size
run: |
-DEBUG=2 ALLOWED_KERNEL_COUNT=207 VALIDTEST=1 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile.py
+DEBUG=2 ALLOWED_KERNEL_COUNT=209 VALIDTEST=1 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile.py
python -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000'
- if: ${{ matrix.task == 'openpilot' }}
name: Test openpilot model correctness (float32)
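For context on the gate above: ALLOWED_KERNEL_COUNT pins the number of kernels openpilot/compile.py may emit, so an unintended scheduling change fails CI. A minimal sketch of that kind of gate, assuming a list of compiled kernels is available after scheduling; check_kernel_count and the exact comparison are illustrative, not tinygrad's actual code:

import os

def check_kernel_count(kernels: list) -> None:
    # assumption: unset or negative ALLOWED_KERNEL_COUNT disables the gate
    allowed = int(os.getenv("ALLOWED_KERNEL_COUNT", "-1"))
    if allowed >= 0:
        assert len(kernels) == allowed, f"compiled {len(kernels)} kernels, expected {allowed}"

Reverting #2006 undoes the two-kernel saving, so the pinned count moves back from 207 to 209.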


@@ -64,7 +64,7 @@ class TestInferenceMinKernels(unittest.TestCase):
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
# TODO: this seems very high
-with CLCache(115):
+with CLCache(116):
model.forward(img).realize()
def test_resnet(self):
@@ -78,7 +78,7 @@ class TestInferenceMinKernels(unittest.TestCase):
model = ViT(embed_dim=192, num_heads=3)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
-with CLCache(222): # NOTE: this is way too high
+with CLCache(223): # NOTE: this is way too high
out = model.forward(img)
assert len(CacheCollector.cache) == 0, "ViT prerealized?"
out.realize()
@@ -88,7 +88,7 @@ class TestInferenceMinKernels(unittest.TestCase):
args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
model = Transformer(**args_tiny)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
-with CLCache(85):
+with CLCache(94):
model(Tensor([[1,2,3,4]]), 0).realize()
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
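The CLCache(n) context managers above budget how many kernels the wrapped code may compile; the ViT test also reads the recorded kernels back through CacheCollector.cache. A minimal sketch of the pattern, assuming a hook that is invoked once per compiled kernel; KernelBudget and record are illustrative names, not tinygrad's API:

class KernelBudget:
    def __init__(self, allowed: int):
        self.allowed, self.cache = allowed, []

    def record(self, kernel) -> None:
        # assumed hook: called once for every kernel compiled inside the block
        self.cache.append(kernel)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        # enforce the budget only if the block body did not already raise
        # (the real CLCache may compare for exact equality instead of <=)
        if exc_type is None:
            assert len(self.cache) <= self.allowed, f"{len(self.cache)} kernels exceed the budget of {self.allowed}"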


@@ -247,12 +247,8 @@ class LazyBuffer:
# *** movement ops ***
def _movement_op(self, st: ShapeTracker, op: MovementOps, arg: Union[Tuple[sint, ...], Tuple[Tuple[sint, sint], ...]]) -> LazyBuffer:
-if SHUFFLE_MOVEMENT_OPS and not self.realized and self.optype == BinaryOps and not self.children:
-  base_bufs = (x.base for x in self.op.buffers)
-  # don't push if all ast buffers (.base) are realized or sourceless
-  push_reshape_safe = (self.op.op in UnaryOps) or (any(isinstance(x, LazyOp) or not x.children for x in self.op.src) and not all(x.realized or len(x.op.src) == 0 for x in base_bufs))
-  if op not in {MovementOps.EXPAND, MovementOps.PAD} and (op is not MovementOps.RESHAPE or push_reshape_safe):
-    return self.op.replace_with_movement_ops([(op, arg)])
+if SHUFFLE_MOVEMENT_OPS and self.optype == BinaryOps and not self.realized and (op in {MovementOps.SHRINK, MovementOps.STRIDE, MovementOps.PERMUTE} or (op == MovementOps.RESHAPE and self.op.op in UnaryOps)) and not self.children:
+  return self.op.replace_with_movement_ops([(op, arg)])
if REMOVE_MOVEMENT_NOPS and not self.realized and st.contiguous:
# MovementOps aren't stacked any more, they each have one parent, find the root
root = get_movementroot(self)
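The restored branch above pushes SHRINK, STRIDE, and PERMUTE (and a RESHAPE over a unary op) up through an unrealized BinaryOps node. The rewrite is value-preserving because elementwise ops commute with these movement ops; a small numpy illustration of the PERMUTE case, with numpy arrays standing in for LazyBuffers:

import numpy as np

a, b = np.random.randn(3, 4), np.random.randn(3, 4)
below = np.transpose(a + b)                # movement op applied after the add
above = np.transpose(a) + np.transpose(b)  # movement op pushed above the add
assert np.allclose(below, above)

Keeping the binary op last in the chain leaves it free to fuse with whatever consumes it.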
@@ -328,7 +324,7 @@ def _push_movement_ops(srcs:Tuple[LazyBuffer, ...]) -> Tuple[LazyBuffer, ...]:
assert isinstance(bx.op.src[0], LazyBuffer)
bx = bx.op.src[0]
# NOTE: can't push pads past anything where f(0, 0) != 0 or f(0) != 0
-if mops and not bx.realized and bx.optype is BinaryOps and len(bx.children) <= 1 and (all(y[0] is not MovementOps.PAD for y in mops) or all(y.op not in UNSAFE_PAD_OPS for y in bx.op.get_lazyops())):
+if mops and not bx.realized and bx.optype is BinaryOps and len(bx.children) <= 1 and (all(x[0] is not MovementOps.PAD for x in mops) or all(x.op not in UNSAFE_PAD_OPS for x in bx.op.get_lazyops())):
new_srcs.append(bx.op.replace_with_movement_ops(mops[::-1]))
else:
new_srcs.append(x)
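On the NOTE above: PAD fills the new region with zeros, so a function f can only be hoisted past it when f(0) == 0 (and f(0, 0) == 0 for binary ops), which is what the UNSAFE_PAD_OPS check guards. A numpy illustration of one safe and one unsafe case:

import numpy as np

x = np.array([1.0, 2.0])
pad = lambda v: np.pad(v, (1, 1))  # zero-pads one element on each side

assert np.allclose(pad(x * 2), pad(x) * 2)      # f(0) == 0: reordering is safe
assert not np.allclose(pad(x + 1), pad(x) + 1)  # f(0) != 0: the padded zeros become ones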