# tinygrad/test/test_arange.py

import unittest
import numpy as np
from tinygrad import Tensor, GlobalCounters, dtypes
from tinygrad.helpers import Context, getenv
from tinygrad.engine.realize import run_schedule
from tinygrad.codegen.kernel import Opt, OptOps, Kernel
from tinygrad.engine.realize import CompiledRunner, ExecItem

class TestArange(unittest.TestCase):
  """Checks that the kernel scheduled for `Tensor.arange` is correct and that its op estimate does not blow up with N."""

  def _get_flops(self, N, opts=None):
    """Build, run and verify the single kernel scheduled for `Tensor.arange(N)`.

    Args:
      N: length passed to `Tensor.arange`.
      opts: optional iterable of `Opt`s applied to the kernel, in order, before it is turned into a program.

    Returns:
      The `op_estimate` of the program returned by `Kernel.to_program()`.
    """
    GlobalCounters.reset()
    tt = Tensor.arange(N)
    sched = tt.schedule()
    # the whole arange is expected to be scheduled as exactly one kernel
    self.assertEqual(len(sched), 1)
    k = Kernel(sched[-1].ast)
    if opts is not None:
      for o in opts: k.apply_opt(o)
    p = k.to_program()
    # dump the kernel name and source so a failing run shows what was generated
    print(p.name)
    print(p.src)
    # execute the hand-built program against the tensor's buffer, then check the values against numpy
    ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
    np.testing.assert_equal(tt.numpy(), np.arange(N))
    return p.op_estimate

  # compares the op estimate at N=256 and N=2560: a 10x larger arange must cost less than 15x the ops.
  # `opts` lets the *_w_* variants below reuse this check with kernel optimizations applied.
  def test_complexity(self, opts=None):
    # add 1 to avoid divide by 0. arange is 0 flops now!
    f1 = self._get_flops(256, opts) + 1
    f2 = self._get_flops(2560, opts) + 1
    print(f"{f1=}, {f2=}")
    # unittest assertion (like the assertEqual in _get_flops): not stripped under `python -O`
    self.assertLess(f2 / f1, 15, f"bad complexity, flops {f2/f1:.1f}X while inputs 10X")

  def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)])
  def test_complexity_w_unroll(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)])
  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)])

class TestIndexing(unittest.TestCase):
  # Tests for tensor indexing: each test checks the indexed values against numpy and, where enabled,
  # the number of scheduled kernels and the GlobalCounters.global_ops total.

  # Multiplies a shifted arange by a one-hot "needle" vector and sums it: the result is the index of
  # the single 1 (1337). Also asserts the whole expression schedules as one kernel.
  def test_arange_2_reduce(self):
    needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
    needle[1337] = 1
    needle.realize()
    with Context(NOOPT=1, FUSE_ARANGE=1):
      GlobalCounters.reset()
      # TODO: it should work without these reshapes
      out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
      sched = out.schedule()
      assert len(sched) == 1
      run_schedule(sched)
    assert out.item() == 1337, f"expected 1337, got {out.item()}"

  # Spells out row indexing by hand: compare a running index against the wanted indices, select the
  # dataset value where they match (zero elsewhere) and sum the rest away.
  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
  def test_manual_index(self):
    dataset = Tensor.rand(16384, 256).realize()
    idxs = Tensor([0,3,5,6]).realize()
    # reference result computed with numpy fancy indexing
    real_index = dataset.numpy()[idxs.numpy()]
    print("*** indexing ***")
    with Context(NOOPT=1, FUSE_ARANGE=1):
      GlobalCounters.reset()
      # cumsum of ones along the last axis with _first_zero=True; presumably yields 0..16383 per row -- TODO confirm
      rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
      # note: idxs is rebound here to its expanded view; real_index above was taken from the original
      idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
      reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
      full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
      X = full.sum(axis=(2,3))
      sched = X.schedule()
      assert len(sched) == 1
      run_schedule(sched)
      # op budget: counted since the reset() above, must stay below 4*16384
      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
    np.testing.assert_allclose(real_index, X.numpy())

  # Plain `dataset[idxs]` with NOOPT=1 and no FUSE_ARANGE: only shape and values are checked,
  # the kernel-count and op-count asserts are still commented out.
  def test_index(self):
    dataset = Tensor.rand(16384, 256).realize()
    idxs = Tensor([0,3,5,6]).realize()
    real_index = dataset.numpy()[idxs.numpy()]
    print("*** indexing ***")
    with Context(NOOPT=1):
      GlobalCounters.reset()
      X = dataset[idxs]
      assert X.shape == (4,256)
      sched = X.schedule()
      # TODO: enable these asserts when the scheduler can handle this
      #assert len(sched) == 1, f"{len(sched)} != 1"
      run_schedule(sched)
      #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
    np.testing.assert_allclose(real_index, X.numpy())

  # Same indexing with FUSE_ARANGE=1: expects exactly two schedule items and fewer than 4*16384 ops.
  # `noopt` is forwarded to Context(NOOPT=...) so test_index_fused_opt can rerun it with NOOPT=0.
  def test_index_fused(self, noopt=1):
    dataset = Tensor.rand(16384, 256).realize()
    idxs = Tensor([0,3,5,6]).realize()
    real_index = dataset.numpy()[idxs.numpy()]
    print("*** indexing ***")
    with Context(NOOPT=noopt, FUSE_ARANGE=1):
      GlobalCounters.reset()
      X = dataset[idxs]
      assert X.shape == (4,256)
      sched = X.schedule()
      assert len(sched) == 2
      run_schedule(sched)
      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
    np.testing.assert_allclose(real_index, X.numpy())
  @unittest.skip("not ready")
  def test_index_fused_opt(self): self.test_index_fused(0)

  # Indexes the MNIST training tensors with random sample indices (count from env var BS, default 512)
  # and checks both the op budget and the gathered values against numpy indexing.
  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
  def test_index_mnist(self, noopt=1):
    # imported locally, so the dataset module is only loaded when this test runs
    from tinygrad.nn.datasets import mnist
    X_train, Y_train, _, _ = mnist()
    with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
      GlobalCounters.reset()
      samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0])
      x = X_train[samples].numpy()
      y = Y_train[samples].numpy()
      # the bound covers everything executed since reset(), i.e. both index operations above
      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
    np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
    np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
  @unittest.skip("not ready")
  def test_index_mnist_opt(self): self.test_index_mnist(0)

# run every test in this file when it is executed directly as a script
if __name__ == "__main__":
  unittest.main()
use old cumsum optimization for arange (#3813) revert to old cumsum opt while phi simplification is disabled. added a flops complexity test for this 2024-03-19 08:01:03 +08:00			`import unittest`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`import numpy as np`
			`from tinygrad import Tensor, GlobalCounters, dtypes`
improve single kernel indexing (#5398) * improve single kernel indexing * metadata in graph (#5399) * indexing is O(1) * add failing test * ugh, that all needs to be replaced with symbolic * broken on ptx, it's fine --------- Co-authored-by: wozeparrot <wozeparrot@gmail.com> 2024-07-12 10:00:57 +08:00			`from tinygrad.helpers import Context, getenv`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`from tinygrad.engine.realize import run_schedule`
unroll arange is broken (#5918) * unroll arange is broken * fix unrolled arange * one more test 2024-08-06 03:15:07 +08:00			`from tinygrad.codegen.kernel import Opt, OptOps, Kernel`
make the arange test check correctness [run_process_replay] (#5920) 2024-08-06 04:41:06 +08:00			`from tinygrad.engine.realize import CompiledRunner, ExecItem`
use old cumsum optimization for arange (#3813) revert to old cumsum opt while phi simplification is disabled. added a flops complexity test for this 2024-03-19 08:01:03 +08:00
			`class TestArange(unittest.TestCase):`
unroll arange is broken (#5918) * unroll arange is broken * fix unrolled arange * one more test 2024-08-06 03:15:07 +08:00			`def _get_flops(self, N, opts=None):`
use old cumsum optimization for arange (#3813) revert to old cumsum opt while phi simplification is disabled. added a flops complexity test for this 2024-03-19 08:01:03 +08:00			`GlobalCounters.reset()`
make the arange test check correctness [run_process_replay] (#5920) 2024-08-06 04:41:06 +08:00			`tt = Tensor.arange(N)`
			`sched = tt.schedule()`
unroll arange is broken (#5918) * unroll arange is broken * fix unrolled arange * one more test 2024-08-06 03:15:07 +08:00			`self.assertEqual(len(sched), 1)`
			`k = Kernel(sched[-1].ast)`
			`if opts is not None:`
			`for o in opts: k.apply_opt(o)`
			`p = k.to_program()`
			`print(p.name)`
			`print(p.src)`
make the arange test check correctness [run_process_replay] (#5920) 2024-08-06 04:41:06 +08:00			`ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()`
			`np.testing.assert_equal(tt.numpy(), np.arange(N))`
unroll arange is broken (#5918) * unroll arange is broken * fix unrolled arange * one more test 2024-08-06 03:15:07 +08:00			`return p.op_estimate`
use old cumsum optimization for arange (#3813) revert to old cumsum opt while phi simplification is disabled. added a flops complexity test for this 2024-03-19 08:01:03 +08:00
unroll arange is broken (#5918) * unroll arange is broken * fix unrolled arange * one more test 2024-08-06 03:15:07 +08:00			`def test_complexity(self, opts=None):`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`# add 1 to avoid divide by 0. arange is 0 flops now!`
unroll arange is broken (#5918) * unroll arange is broken * fix unrolled arange * one more test 2024-08-06 03:15:07 +08:00			`f1 = self._get_flops(256, opts) + 1`
			`f2 = self._get_flops(2560, opts) + 1`
use old cumsum optimization for arange (#3813) revert to old cumsum opt while phi simplification is disabled. added a flops complexity test for this 2024-03-19 08:01:03 +08:00			`print(f"{f1=}, {f2=}")`
Added missing unittest execution code (#4400) same code as on every other test file, just missing from this one for some reason. 2024-05-03 10:34:30 +08:00			`assert f2 / f1 < 15, f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"`

unroll arange is broken (#5918) * unroll arange is broken * fix unrolled arange * one more test 2024-08-06 03:15:07 +08:00			`def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)])`
			`def test_complexity_w_unroll(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)])`
			`def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)])`

indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`class TestIndexing(unittest.TestCase):`
			`def test_arange_2_reduce(self):`
			`needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()`
			`needle[1337] = 1`
			`needle.realize()`
indexing fusion 2 (#5888) * arange fusion * kernels that fuse * tests 2024-08-03 18:13:39 +08:00			`with Context(NOOPT=1, FUSE_ARANGE=1):`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`GlobalCounters.reset()`
			`# TODO: it should work without these reshapes`
			`out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()`
			`sched = out.schedule()`
			`assert len(sched) == 1`
			`run_schedule(sched)`
			`assert out.item() == 1337, f"expected 1337, got {out.item()}"`

improve single kernel indexing (#5398) * improve single kernel indexing * metadata in graph (#5399) * indexing is O(1) * add failing test * ugh, that all needs to be replaced with symbolic * broken on ptx, it's fine --------- Co-authored-by: wozeparrot <wozeparrot@gmail.com> 2024-07-12 10:00:57 +08:00			`@unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`def test_manual_index(self):`
			`dataset = Tensor.rand(16384, 256).realize()`
			`idxs = Tensor([0,3,5,6]).realize()`
			`real_index = dataset.numpy()[idxs.numpy()]`
			`print("*** indexing ***")`
indexing fusion 2 (#5888) * arange fusion * kernels that fuse * tests 2024-08-03 18:13:39 +08:00			`with Context(NOOPT=1, FUSE_ARANGE=1):`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`GlobalCounters.reset()`
			`rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)`
			`idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)`
improve single kernel indexing (#5398) * improve single kernel indexing * metadata in graph (#5399) * indexing is O(1) * add failing test * ugh, that all needs to be replaced with symbolic * broken on ptx, it's fine --------- Co-authored-by: wozeparrot <wozeparrot@gmail.com> 2024-07-12 10:00:57 +08:00			`reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)`
			`full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))`
			`X = full.sum(axis=(2,3))`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`sched = X.schedule()`
			`assert len(sched) == 1`
			`run_schedule(sched)`
improve single kernel indexing (#5398) * improve single kernel indexing * metadata in graph (#5399) * indexing is O(1) * add failing test * ugh, that all needs to be replaced with symbolic * broken on ptx, it's fine --------- Co-authored-by: wozeparrot <wozeparrot@gmail.com> 2024-07-12 10:00:57 +08:00			`assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`np.testing.assert_allclose(real_index, X.numpy())`

			`def test_index(self):`
			`dataset = Tensor.rand(16384, 256).realize()`
			`idxs = Tensor([0,3,5,6]).realize()`
			`real_index = dataset.numpy()[idxs.numpy()]`
			`print("*** indexing ***")`
			`with Context(NOOPT=1):`
			`GlobalCounters.reset()`
			`X = dataset[idxs]`
			`assert X.shape == (4,256)`
			`sched = X.schedule()`
improve single kernel indexing (#5398) * improve single kernel indexing * metadata in graph (#5399) * indexing is O(1) * add failing test * ugh, that all needs to be replaced with symbolic * broken on ptx, it's fine --------- Co-authored-by: wozeparrot <wozeparrot@gmail.com> 2024-07-12 10:00:57 +08:00			`# TODO: enable these asserts when the scheduler can handle this`
generalize FUSE_AS_ONE_KERNEL in the scheduler (#5397) * test: use const * hotfix: base * asserts * dont push through reshape * cleanup * dont need the cache * test_reduceop_reshape_dont_push and test_index_fused are next 2024-07-12 15:23:16 +08:00			`#assert len(sched) == 1, f"{len(sched)} != 1"`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`run_schedule(sched)`
improve single kernel indexing (#5398) * improve single kernel indexing * metadata in graph (#5399) * indexing is O(1) * add failing test * ugh, that all needs to be replaced with symbolic * broken on ptx, it's fine --------- Co-authored-by: wozeparrot <wozeparrot@gmail.com> 2024-07-12 10:00:57 +08:00			`#assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"`
			`np.testing.assert_allclose(real_index, X.numpy())`

fast mnist indexing (#5921) * fast mnist indexing * more tests * remove those tests, new indexing rule 2024-08-06 04:55:15 +08:00			`def test_index_fused(self, noopt=1):`
improve single kernel indexing (#5398) * improve single kernel indexing * metadata in graph (#5399) * indexing is O(1) * add failing test * ugh, that all needs to be replaced with symbolic * broken on ptx, it's fine --------- Co-authored-by: wozeparrot <wozeparrot@gmail.com> 2024-07-12 10:00:57 +08:00			`dataset = Tensor.rand(16384, 256).realize()`
			`idxs = Tensor([0,3,5,6]).realize()`
			`real_index = dataset.numpy()[idxs.numpy()]`
			`print("*** indexing ***")`
fast mnist indexing (#5921) * fast mnist indexing * more tests * remove those tests, new indexing rule 2024-08-06 04:55:15 +08:00			`with Context(NOOPT=noopt, FUSE_ARANGE=1):`
improve single kernel indexing (#5398) * improve single kernel indexing * metadata in graph (#5399) * indexing is O(1) * add failing test * ugh, that all needs to be replaced with symbolic * broken on ptx, it's fine --------- Co-authored-by: wozeparrot <wozeparrot@gmail.com> 2024-07-12 10:00:57 +08:00			`GlobalCounters.reset()`
			`X = dataset[idxs]`
			`assert X.shape == (4,256)`
			`sched = X.schedule()`
indexing fusion 2 (#5888) * arange fusion * kernels that fuse * tests 2024-08-03 18:13:39 +08:00			`assert len(sched) == 2`
improve single kernel indexing (#5398) * improve single kernel indexing * metadata in graph (#5399) * indexing is O(1) * add failing test * ugh, that all needs to be replaced with symbolic * broken on ptx, it's fine --------- Co-authored-by: wozeparrot <wozeparrot@gmail.com> 2024-07-12 10:00:57 +08:00			`run_schedule(sched)`
indexing fold with casted bool (#5551) * cast bool is where * universal transform is wrong 2024-07-19 01:02:29 +08:00			`assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00			`np.testing.assert_allclose(real_index, X.numpy())`
fast mnist indexing (#5921) * fast mnist indexing * more tests * remove those tests, new indexing rule 2024-08-06 04:55:15 +08:00			`@unittest.skip("not ready")`
			`def test_index_fused_opt(self): self.test_index_fused(0)`

			`@unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")`
			`def test_index_mnist(self, noopt=1):`
			`from tinygrad.nn.datasets import mnist`
			`X_train, Y_train, _, _ = mnist()`
			`with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):`
			`GlobalCounters.reset()`
			`samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0])`
			`x = X_train[samples].numpy()`
			`y = Y_train[samples].numpy()`
			`assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"`
			`np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)`
			`np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)`
			`@unittest.skip("not ready")`
			`def test_index_mnist_opt(self): self.test_index_mnist(0)`
indexing getting better (#5389) * indexing getting better [run_process_replay] [no_assert] * fix test * test_arange_2_reduce is a simpler test * put that print back, NOOPT * don't merge reduces (they could be different reduces) * FUSE_AS_ONE_KERNEL * fix tests * fix test_var_multireduce * w/e put that there * fails on others too * fix test, revert UNMUL change * in case order matters * one kernel indexing works * one kernel indexing works (test other) 2024-07-12 07:41:51 +08:00
Added missing unittest execution code (#4400) same code as on every other test file, just missing from this one for some reason. 2024-05-03 10:34:30 +08:00			`if __name__ == "__main__":`
generalize FUSE_AS_ONE_KERNEL in the scheduler (#5397) * test: use const * hotfix: base * asserts * dont push through reshape * cleanup * dont need the cache * test_reduceop_reshape_dont_push and test_index_fused are next 2024-07-12 15:23:16 +08:00			`unittest.main()`