From ba8602612bff7eabeff77b416f84f45f0d3930a4 Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Wed, 17 Apr 2024 04:03:21 +0300 Subject: [PATCH] Fuzz all permutations of schedule (#4136) * simple toposort * fuzzer * init in_degree * move to tests * same seed * configure paths * internal graph * compare LazyBuffers * simpler * simple graph * assign works * simpler * fix JIT * upstream ci * move ci * fix the path * DEBUG=1 * limit max paths * launch a cmp kernel * Revert "launch a cmp kernel" This reverts commit 791c6089922fa7d800456f28fc167842f188ac7e. * exec ground truth * better perf * copy ground truth once * gpu allclose ast try1 * Revert "gpu allclose ast try1" This reverts commit 1f82103af3a7bfedb9f858b6c58b0b94f1c7e6b0. * prerealized bufs freezing * teeny cleanups * reuse Buffers * Revert "reuse Buffers" This reverts commit a71de94b035bd5ceb1ec257f6b2529b166bcd30b. --------- Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com> --- .github/workflows/test.yml | 2 + test/external/fuzz_schedule.py | 84 ++++++++++++++++++++++++++++++++++ tinygrad/buffer.py | 2 +- tinygrad/engine/schedule.py | 11 +++-- tinygrad/tensor.py | 4 ++ 5 files changed, 99 insertions(+), 4 deletions(-) create mode 100644 test/external/fuzz_schedule.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index decbc85a..9aaf0895 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -317,6 +317,8 @@ jobs: run: PYTHONPATH="." METAL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py - name: Fuzz Test linearizer run: PYTHONPATH="." METAL=1 CACHELEVEL=0 FUZZ_ALL_ACTIONS=1 DEPTH=2 FUZZ_N=48 FUZZ_MAX_SIZE=10000000 python test/external/fuzz_linearizer.py + - name: Fuzz Test models schedule + run: FUZZ_SCHEDULE=1 FUZZ_SCHEDULE_MAX_PATHS=5 python -m pytest test/models/test_train.py test/models/test_end2end.py # testwebgl: diff --git a/test/external/fuzz_schedule.py b/test/external/fuzz_schedule.py new file mode 100644 index 00000000..304a3fba --- /dev/null +++ b/test/external/fuzz_schedule.py @@ -0,0 +1,84 @@ +import numpy as np +from typing import DefaultDict, Dict, List, Set, TypeVar +from tinygrad.buffer import Buffer +from tinygrad.engine.realize import CustomOp, ExecItem, capturing, lower_schedule_item +from tinygrad.helpers import DEBUG, colored, getenv +from tinygrad.lazy import LazyBuffer +from tinygrad.engine.schedule import _graph_schedule +from tinygrad.ops import LoadOps, ScheduleItem +from tinygrad.tensor import Tensor + +def fuzz_schedule(outs: List[LazyBuffer]): + graph, in_degree, prescheduled = _graph_schedule(outs, seen:=set()) + toposorts = find_all_toposorts(graph, in_degree) + if DEBUG >= 1: print(colored(f"fuzzing {len(toposorts)} schedule permutations", "yellow")) + + # setup ground truth + ground_truth: Dict[LazyBuffer, memoryview] = {} + # IMPORTANT: freeze prerealized bufs before ScheduleItem exec + prerealized: Dict[LazyBuffer, memoryview] = {} + seed = Tensor._seed + for key in toposorts[0]: + for out in (ps:=prescheduled[key]).outputs: + seen.add(out) + # freeze assign state before exec + if out.op is LoadOps.ASSIGN: prerealized[out] = out.buffer.as_buffer() + for x in ps.inputs: + if x not in ground_truth and x.device != "NPY": prerealized[x] = x.buffer.as_buffer() + si = ScheduleItem(ps.ast, tuple(x.buffer for x in ps.outputs if x.size != 0), tuple(x.buffer for x in ps.inputs if x.size != 0)) + _exec_si(si, seed) + for out in ps.outputs: ground_truth[out] = out.buffer.as_buffer() 
+ + # exec and validate each permutation with new Buffers + for i, ts in enumerate(toposorts[1:]): + if DEBUG >= 1: print(colored(f"testing permutation {i}", "yellow")) + rawbufs: Dict[LazyBuffer, Buffer] = {} + for key in ts: + for out in (ps:=prescheduled[key]).outputs: + rawbufs[out] = Buffer(out.buffer.device, out.buffer.size, out.buffer.dtype) + if out.op is LoadOps.ASSIGN: rawbufs[out].ensure_allocated().copyin(prerealized[out]) + for x in ps.inputs: + if x not in rawbufs: + if x.device == "NPY": rawbufs[x] = x.buffer + # copy the pre realized input + else: rawbufs[x] = Buffer(x.buffer.device, x.buffer.size, x.buffer.dtype, initial_value=prerealized[x]) + si = ScheduleItem(ps.ast, tuple(rawbufs[x] for x in ps.outputs if x.size != 0), tuple(rawbufs[x] for x in ps.inputs if x.size != 0)) + _exec_si(si, seed) + for out in ps.outputs: + outbuf = np.frombuffer(rawbufs[out].as_buffer(), out.dtype.np) + try: np.testing.assert_allclose(outbuf, np.frombuffer(ground_truth[out], out.dtype.np), atol=1e-2, rtol=1e-2) + except Exception as e: + print(f"FAILED FOR {out}") + raise e + +def _exec_si(si: ScheduleItem, seed:int): + ei = ExecItem(lower_schedule_item(si), list(si.outputs+si.inputs)) + if len(capturing): capturing[0].add(ei) + if isinstance(ei.prg, CustomOp): Tensor._seed = seed + ei.run() + +T = TypeVar("T") +def find_all_toposorts(graph:DefaultDict[T, List[T]], in_degree:DefaultDict[T, int]) -> List[List[T]]: + visited: Set[T] = set() + ret: List[List[T]] = [] + path: List[T] = [] + + def recurse_paths(path:List[T]): + for v, d in in_degree.items(): + if d != 0 or v in visited: continue + for u in graph[v]: in_degree[u] -= 1 + path.append(v) + visited.add(v) + recurse_paths(path) + if len(ret) >= getenv("FUZZ_SCHEDULE_MAX_PATHS", 10): return + # backtrack + for u in graph[v]: in_degree[u] += 1 + path.pop() + visited.remove(v) + if len(path) == len(in_degree): ret.append([*path]) + recurse_paths(path) + + if len(ret) == 0: raise RuntimeError("detected cycle in the graph") + # verify all paths are unique + assert len(ret) == len(set(map(tuple, ret))) + return ret diff --git a/tinygrad/buffer.py b/tinygrad/buffer.py index 882df8f2..e4daac74 100644 --- a/tinygrad/buffer.py +++ b/tinygrad/buffer.py @@ -58,4 +58,4 @@ class Buffer: mv = flat_mv(mv) assert len(mv) == self.nbytes, f"size mismatch, {len(mv)=} != {self.dtype=} {self.size=}" self.allocator.copyout(mv, self._buf) - return mv \ No newline at end of file + return mv diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py index 56345d20..087ca3d5 100644 --- a/tinygrad/engine/schedule.py +++ b/tinygrad/engine/schedule.py @@ -128,9 +128,8 @@ def _is_padding_okay(buf:LazyBuffer, realizes:Dict[LazyBuffer, None]) -> bool: if buf.op in UNSAFE_PAD_OPS: return False return all(_is_padding_okay(x.base, realizes) for x in buf.srcs) -def create_schedule_with_vars(outs:List[LazyBuffer], seen:Optional[Set[LazyBuffer]]=None) -> Tuple[List[ScheduleItem], Dict[Variable, int]]: - if seen is None: seen = set() - +def _graph_schedule(outs:List[LazyBuffer], seen:Set[LazyBuffer]) -> Tuple[DefaultDict[LazyBuffer, List[LazyBuffer]], DefaultDict[LazyBuffer, int], + Dict[LazyBuffer, _LBScheduleItem]]: # start by just realizing the buffers passed in realizes: Dict[LazyBuffer, None] = {x.base: None for x in outs if not x.base.realized} allbufs: Dict[LazyBuffer, None] = {} @@ -209,6 +208,7 @@ def create_schedule_with_vars(outs:List[LazyBuffer], seen:Optional[Set[LazyBuffe graph: DefaultDict[LazyBuffer, List[LazyBuffer]] = defaultdict(list) 
in_degree: DefaultDict[LazyBuffer, int] = defaultdict(int) for key, lsi in prescheduled.items(): + if key not in in_degree: in_degree[key] = 0 # realize outputs after all parents are realized scheduled_parents = set(schedule_targets[x].outputs[0] for x in lsi.inputs if x in schedule_targets) for x in scheduled_parents: @@ -221,6 +221,11 @@ def create_schedule_with_vars(outs:List[LazyBuffer], seen:Optional[Set[LazyBuffe in_degree[assign] += 1 for out in lsi.outputs: del out.srcs # can only schedule once + return graph, in_degree, prescheduled + +def create_schedule_with_vars(outs:List[LazyBuffer], seen:Optional[Set[LazyBuffer]]=None) -> Tuple[List[ScheduleItem], Dict[Variable, int]]: + if seen is None: seen = set() + graph, in_degree, prescheduled = _graph_schedule(outs, seen) queue = deque(si for key, si in prescheduled.items() if in_degree[key] == 0) schedule: List[ScheduleItem] = [] var_vals: Dict[Variable, int] = {} diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index d690b123..03c700a3 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -8,6 +8,7 @@ import numpy as np from tinygrad.dtype import DType, dtypes, ImageDType, ConstType, least_upper_float, least_upper_dtype from tinygrad.helpers import argfix, make_pair, flatten, prod, all_int, round_up, merge_dicts, fully_flatten, argsort, IMAGE, DEBUG, WINO, THREEFRY +from tinygrad.helpers import getenv from tinygrad.lazy import LazyBuffer from tinygrad.features.multi import MultiLazyBuffer from tinygrad.ops import LoadOps @@ -141,6 +142,9 @@ class Tensor: @staticmethod def corealize(lst:Iterable[Tensor]): + if getenv("FUZZ_SCHEDULE"): + from test.external.fuzz_schedule import fuzz_schedule + fuzz_schedule(flatten([x.lazydata.lbs for x in lst])) run_schedule(*create_schedule_with_vars(flatten([x.lazydata.lbs for x in lst]))) def realize(self) -> Tensor:
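
The core of the patch is `find_all_toposorts`: a backtracking enumeration of every valid topological ordering of the schedule graph, capped by FUZZ_SCHEDULE_MAX_PATHS, so each permutation can be executed and compared against the ground truth. Below is a minimal standalone sketch of that same backtracking idea on a toy diamond DAG; it does not import tinygrad, and the names `all_toposorts`, `toy_graph`, `toy_in_degree` and `max_paths` are made up for illustration only.

    from collections import defaultdict
    from typing import DefaultDict, List, Set, TypeVar

    T = TypeVar("T")

    def all_toposorts(graph:DefaultDict[T, List[T]], in_degree:DefaultDict[T, int], max_paths:int=10) -> List[List[T]]:
      visited: Set[T] = set()
      ret: List[List[T]] = []

      def recurse(path:List[T]):
        if len(ret) >= max_paths: return
        # a full-length path is one complete topological ordering
        if len(path) == len(in_degree): ret.append([*path]); return
        for v, d in in_degree.items():
          if d != 0 or v in visited: continue
          # pick v next: lower the in-degree of its children, recurse, then undo (backtrack)
          for u in graph[v]: in_degree[u] -= 1
          visited.add(v)
          recurse(path + [v])
          for u in graph[v]: in_degree[u] += 1
          visited.remove(v)

      recurse([])
      return ret

    # toy diamond DAG: a -> b, a -> c, b -> d, c -> d
    toy_graph: DefaultDict[str, List[str]] = defaultdict(list, {"a": ["b", "c"], "b": ["d"], "c": ["d"]})
    toy_in_degree: DefaultDict[str, int] = defaultdict(int, {"a": 0, "b": 1, "c": 1, "d": 2})
    print(all_toposorts(toy_graph, toy_in_degree))  # [['a', 'b', 'c', 'd'], ['a', 'c', 'b', 'd']]

With the patch applied, the fuzzer is opted into through the environment, as in the CI step added above: FUZZ_SCHEDULE=1 FUZZ_SCHEDULE_MAX_PATHS=5 python -m pytest test/models/test_train.py test/models/test_end2end.py.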