# tinygrad/test/test_uops.py

from typing import Optional, Tuple, Any, List
import unittest, math
import numpy as np
from tinygrad.helpers import dtypes, getenv, DType
from tinygrad.tensor import Device
from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps, ASTRunner, Compiled
from tinygrad.codegen.linearizer import UOps, MemOp, UOp
from tinygrad.shape.symbolic import Variable
def _uops_to_prg(uops):
  """Render a uop list with the default device's renderer and build a runnable program.

  Some renderers return (src, global_size, local_size) while others also return a
  `binary` flag; the 3-tuple case is padded with binary=False so both are handled
  uniformly. Returns the built ASTRunner.
  """
  ret = Device[Device.DEFAULT].renderer("test", uops)
  # normalize the renderer's return to a 4-tuple
  src, global_size, local_size, binary = ret if len(ret) == 4 else ret + (False,)
  return ASTRunner("test", src, global_size, local_size, runtime_args={"binary": binary}).build(Device[Device.DEFAULT].runtime)
def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
  """Create a UOp numbered by its position, append it to `uops`, and return it."""
  new_op = UOp(uop, dtype, tuple(vin), arg, len(uops))
  uops.append(new_op)
  return new_op
def _test_single_value(vals, op, dtype):
  """Build a tiny kernel that loads each scalar in `vals` from its own input buffer,
  applies `op`, stores to an output buffer, runs it, and returns the scalar result."""
  uops = []
  # data0 is the output buffer; data1..dataN are one input buffer per value
  uop(uops, UOps.DEFINE_GLOBAL, None, (), ('data0', dtype))
  for i in range(len(vals)): uop(uops, UOps.DEFINE_GLOBAL, None, (), (f'data{i+1}', dtype))
  # NOTE: loads is a generator on purpose — the LOAD uops are only appended when
  # uop() materializes it via tuple(vin), so they are numbered just before the ALU uop
  loads = (uop(uops, UOps.LOAD, dtype, [], MemOp(f'data{i+1}', Variable.num(0), False, dtype, Variable.ands([]))) for i in range(len(vals)))
  alu = uop(uops, UOps.ALU, dtype, loads, op)
  uop(uops, UOps.STORE, None, (alu, ), MemOp('data0', Variable.num(0), False, dtype, Variable.ands([])))
  # 1-element output buffer, plus one 1-element input buffer per value
  buf = Device[Device.DEFAULT].buffer(1, dtype)
  buf2 = [Device[Device.DEFAULT].buffer.fromCPU(np.array([a], dtype=dtype.np)) for a in vals]
  prg = _uops_to_prg(uops)
  prg([buf]+buf2)
  return buf.toCPU()[0]
def _test_single_value_const(vals, op, dtype):
  """Like _test_single_value, but feeds `vals` as inline CONST uops instead of
  loading them from buffers — exercises the renderer's constant folding path."""
  uops = []
  # data0 is the only buffer: the output
  uop(uops, UOps.DEFINE_GLOBAL, None, (), ('data0', dtype))
  # NOTE: loads is a generator — the CONST uops are appended inside uop() when
  # tuple(vin) consumes it, so they are numbered just before the ALU uop
  loads = (uop(uops, UOps.CONST, dtype, [], a) for a in vals)
  alu = uop(uops, UOps.ALU, dtype, loads, op)
  uop(uops, UOps.STORE, None, (alu, ), MemOp('data0', Variable.num(0), False, dtype, Variable.ands([])))
  buf = Device[Device.DEFAULT].buffer(1, dtype)
  prg = _uops_to_prg(uops)
  prg([buf])
  return buf.toCPU()[0]
class TestUOps(unittest.TestCase):
  """Shared helpers: run an op on the device (both buffer-load and const paths)
  and compare against a reference python function."""
  def _equal(self, v1, v2):
    # two NaNs count as equal; anything else must match to ~5 decimal places
    if math.isnan(v1) and math.isnan(v2): return
    self.assertAlmostEqual(v1, v2, places=5)
  def _test_uop_fxn(self, bop, fxn, dt=dtypes.float32):
    for runner in (_test_single_value, _test_single_value_const):
      for a in (-2.0, 0.0, 1.0, 2.0):
        self._equal(runner([a], bop, dt), fxn(a))
  def _test_bop_fxn(self, bop, fxn, dt=dtypes.float32, no_b_zero=False):
    # b=0.0 is appended last unless the op can't handle it (div/mod)
    b_vals = [-3.0, 1.0, 3.0] if no_b_zero else [-3.0, 1.0, 3.0, 0.0]
    for runner in (_test_single_value, _test_single_value_const):
      for a in (-2.0, 0.0, 1.0, 2.0):
        for b in b_vals:
          self._equal(runner([a,b], bop, dt), fxn(a,b))
  def _test_top_fxn(self, bop, fxn, dt=dtypes.float32):
    for runner in (_test_single_value, _test_single_value_const):
      for a in (-2.0, 0, 1, 2.0):
        for b in (-3.0, 3.0):
          for c in (-4.0, 4.0):
            self._equal(runner([a,b,c], bop, dt), fxn(a,b,c))
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
class TestFloatUOps(TestUOps):
  """float32 coverage for the unary/binary/ternary ALU ops."""
  def test_exp2(self): self._test_uop_fxn(UnaryOps.EXP2, np.exp2)
  def test_log2(self): self._test_uop_fxn(UnaryOps.LOG2, lambda a: math.log2(a) if a > 0 else (float('-inf') if a == 0 else float('nan')))
  def test_sin(self): self._test_uop_fxn(UnaryOps.SIN, math.sin)
  def test_sqrt(self): self._test_uop_fxn(UnaryOps.SQRT, lambda a: float('nan') if a < 0 else math.sqrt(a))
  # this is not on most backends
  #def test_recip(self): self._test_uop_fxn(UnaryOps.RECIP, lambda a: 1.0/a if a != 0 else float('inf'))
  def test_add(self): self._test_bop_fxn(BinaryOps.ADD, lambda a,b: a+b)
  def test_sub(self): self._test_bop_fxn(BinaryOps.SUB, lambda a,b: a-b)
  def test_mul(self): self._test_bop_fxn(BinaryOps.MUL, lambda a,b: a*b)
  def test_div(self): self._test_bop_fxn(BinaryOps.DIV, lambda a,b: a*float('inf') if b == 0 else a/b)
  def test_max(self): self._test_bop_fxn(BinaryOps.MAX, max)
  def test_cmplt(self): self._test_bop_fxn(BinaryOps.CMPLT, lambda a,b: float(a<b))
  # MOD isn't tested on floats
  def test_mulacc(self): self._test_top_fxn(TernaryOps.MULACC, lambda a,b,c: a*b+c)
  def test_where(self): self._test_top_fxn(TernaryOps.WHERE, lambda a,b,c: c if a == 0 else b)
# TODO: fix this on all the backends
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled) or getenv('ARM64', False), "only test for compiled backends, broken on some")
class TestNonFloatUOps(TestUOps):
  """int32/bool coverage; references cast through int() to mimic integer ALU semantics."""
  def test_add_int32(self):
    self._test_bop_fxn(BinaryOps.ADD, lambda a,b: int(a) + int(b), dtypes.int32)
  def test_sub_int32(self):
    self._test_bop_fxn(BinaryOps.SUB, lambda a,b: int(a) - int(b), dtypes.int32)
  def test_mul_int32(self):
    self._test_bop_fxn(BinaryOps.MUL, lambda a,b: int(a) * int(b), dtypes.int32)
  def test_div_int32(self):
    # int(a/b) truncates toward zero, matching C integer division
    self._test_bop_fxn(BinaryOps.DIV, lambda a,b: int(a/b), dtypes.int32, no_b_zero=True)
  def test_mod_int32(self):
    # C-style remainder: magnitude of a%b with the sign of a
    self._test_bop_fxn(BinaryOps.MOD, lambda a,b: (-1 if a < 0 else 1) * (abs(int(a)) % abs(int(b))), dtypes.int32, no_b_zero=True)
  def test_cmplt_int32(self):
    self._test_bop_fxn(BinaryOps.CMPLT, lambda a,b: float(a<b), dtypes.int32)
  def test_mul_bool(self):
    self._test_bop_fxn(BinaryOps.MUL, lambda a,b: bool(a) and bool(b), dtypes.bool)
# allow running this file directly; verbosity=2 prints each test name as it runs
if __name__ == '__main__':
  unittest.main(verbosity=2)