From 613bcd945d13cfc2b3f92334e72793ec369f434d Mon Sep 17 00:00:00 2001
From: Yosef Frost <92753960+FrostyTheSouthernSnowman@users.noreply.github.com>
Date: Wed, 12 Jul 2023 13:29:15 -0400
Subject: [PATCH] Added Test Coverage to Int32 and Make Sure Tests Succeed
 (#1174)

* Added test coverage for int32 in `test/test_dtype.py`

Tests for int32 include:
- testing that int32 can be converted into a numpy array
- testing that float and int64 can be cast into int32
- testing that int32 can be cast into float and int64
- testing addition, multiplication, and matrix multiplication with int32
- testing that addition, multiplication, and matrix multiplication with int32 and either float or int64 gets successfully cast into float and int64, respectively

Additional changes include testing that int8 casts into int32 and testing that float16 casts into int32

* Added type casting to the add, subtract, and divide binary operations

* Added automatic type casting when types differ to FusedOps.MULACC

I moved the match_types function back so that I could call it in einsum_mulacc where it would cast the types of the MULACC to be the same

* Added unit test for match_types and added type hints to the parameters

* Added tests for ops_cpu.match_types

* Changed ops_cpu.einsum logic to play nicely with PyTorch

Changed `tinygrad.runtime.ops_cpu.einsum_mulacc` logic to not perform type matching. Type matching was instead moved to the numpy_fxn_for_op dictionary in the ops_cpu file. Since ops_torch uses the same einsum_mulacc function, this should fix all the broken pytorch tests.

* empty commit to rerun ci

* reverting PR#1213 in attempt to fix broken test

* Removed all tests I added to see if they are causing CI issues

* Added back type matching tests

* removed type matching tests and added back int tests

* added back part of the type matching tests

* removed braking type matching tests

* empty commit for testing

* added test back but inside comment

* removed a test from the comment to see if it breaks CI

* removed another function

* more testing

* emptied test comment

* cleaned up comments

* Added optimize=True flag to einsum_mullac in cpu_ops.py

* Removed unnecessary imports from tests

* optimized match_types by removing unnecessary array copying
---
 test/test_dtype.py          | 23 +++++++++++++++++++++++
 tinygrad/runtime/ops_cpu.py | 16 +++++++++-------
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/test/test_dtype.py b/test/test_dtype.py
index ab2627e8..61fb6aa7 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -35,6 +35,7 @@ class TestHalfDtype(unittest.TestCase):
   def test_half_to_float(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float32, [1,2,3,4])
   def test_half_to_int8(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.int8, [1,2,3,4])
   def test_half_to_uint8(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.uint8, [1,2,3,4])
+  def test_half_to_int32(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.int32, [1,2,3,4])
   def test_half_to_int64(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.int64, [1,2,3,4])
 
   def test_float_to_half(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.float16, [1,2,3,4])
@@ -63,6 +64,7 @@ class TestInt8Dtype(unittest.TestCase):
 
   def test_int8_to_float(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.int8), dtypes.float32, [1,2,3,4])
   def test_int8_to_uint8(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.int8), dtypes.uint8, [1,2,3,4])
+  def test_int8_to_int32(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.int8), dtypes.int32, [1,2,3,4])
   def test_int8_to_int64(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.int8), dtypes.int64, [1,2,3,4])
 
   def test_uint8_to_float(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.uint8), dtypes.float32, [1,2,3,4])
@@ -91,5 +93,26 @@ class TestInt8Dtype(unittest.TestCase):
 
   def test_uint8_to_int8_overflow(self): _test_op(lambda: Tensor([255, 254, 253, 252], dtype=dtypes.uint8).cast(dtypes.int8), dtypes.int8, [-1, -2, -3, -4])
 
+class TestInt32Dtype(unittest.TestCase):
+  def test_int32_to_np(self): _test_to_np(Tensor([1,2,3,4], dtype=dtypes.int32), np.int32, [1,2,3,4])
+
+  def test_float_to_int32(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.int32, [1,2,3,4])
+  def test_int64_to_int32(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.int64), dtypes.int32, [1,2,3,4])
+
+  def test_int32_to_float(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.int32), dtypes.float32, [1,2,3,4])
+  def test_int32_to_int64(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.int32), dtypes.int64, [1,2,3,4])
+
+  def test_int32_add(self): _test_add(Tensor([1,2,3,4], dtype=dtypes.int32), Tensor([1,2,3,4], dtype=dtypes.int32), dtypes.int32, [2,4,6,8])
+  def test_int32_mul(self): _test_mul(Tensor([1,2,3,4], dtype=dtypes.int32), Tensor([1,2,3,4], dtype=dtypes.int32), dtypes.int32, [1,4,9,16])
+  def test_int32_matmul(self): _test_matmul(Tensor([[1,2],[3,4]], dtype=dtypes.int32), Tensor.eye(2, dtype=dtypes.int32), dtypes.int32, [[1,2],[3,4]])
+
+  def test_int32_add_upcast_float(self): _test_add_upcast(Tensor([1,2,3,4], dtype=dtypes.int32), Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.float32, [2,4,6,8])
+  def test_int32_mul_upcast_float(self): _test_mul_upcast(Tensor([1,2,3,4], dtype=dtypes.int32), Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.float32, [1,4,9,16])
+  def test_int32_matmul_upcast_float(self): _test_matmul_upcast(Tensor([[1,2],[3,4]], dtype=dtypes.int32), Tensor.eye(2, dtype=dtypes.float32), dtypes.float32, [[1,2],[3,4]])
+
+  def test_int32_add_upcast_int64(self): _test_add_upcast(Tensor([1,2,3,4], dtype=dtypes.int32), Tensor([1,2,3,4], dtype=dtypes.int64), dtypes.int64, [2,4,6,8])
+  def test_int32_mul_upcast_int64(self): _test_mul_upcast(Tensor([1,2,3,4], dtype=dtypes.int32), Tensor([1,2,3,4], dtype=dtypes.int64), dtypes.int64, [1,4,9,16])
+  def test_int32_matmul_upcast_int64(self): _test_matmul_upcast(Tensor([[1,2],[3,4]], dtype=dtypes.int32), Tensor.eye(2, dtype=dtypes.int64), dtypes.int64, [[1,2],[3,4]])
+
 if __name__ == '__main__':
   unittest.main()
diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index a8d94996..a8f896c1 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -16,6 +16,10 @@ base_fxn_for_op: Dict[Op, Callable] = {
   MovementOps.RESHAPE: lambda x, arg: x.reshape(arg), MovementOps.SHRINK: lambda x, arg: x[tuple(slice(p[0], p[1], None) for p in arg)],
 }
 
+def match_types(x, y):
+  up = x.dtype if dtypes.from_np(x.dtype).priority > dtypes.from_np(y.dtype).priority else y.dtype
+  return x.astype(up, copy=False), y.astype(up, copy=False)
+
 def einsum_mulacc(einsum, get_strides, expand):
   def einscripts(x): return ''.join(["abcdefghijklmnopqrstuvwxyz"[i] for i in x])
   def axes_slice(strides): return [i for i in range(len(strides)) if strides[i] != 0], tuple(slice(None) if strides[i] != 0 else 0 for i in range(len(strides)))
@@ -26,16 +30,14 @@ def einsum_mulacc(einsum, get_strides, expand):
     return expand(ret.reshape([(1 if i not in a_axes and i not in b_axes else s) for i,s in enumerate(new_shape)]), new_shape)
   return mulacc
 
-def match_types(x, y):
-  up = x.dtype if dtypes.from_np(x.dtype).priority > dtypes.from_np(y.dtype).priority else y.dtype
-  return x.astype(up), y.astype(up)
-
 numpy_fxn_for_op: Dict[Op, Callable] = {**base_fxn_for_op, **{
   UnaryOps.NOOP: lambda x: np.require(x, requirements='C'), UnaryOps.EXP2: np.exp2, UnaryOps.LOG2: np.log2, UnaryOps.CAST: lambda x,y: x.astype(y.np), UnaryOps.SIN: np.sin,
-  BinaryOps.MAX: np.maximum, BinaryOps.CMPEQ: lambda x,y: (x==y).astype(np.promote_types(x.dtype,y.dtype)), BinaryOps.MUL: lambda x, y: np.multiply(*match_types(x, y)), UnaryOps.SQRT: np.sqrt,
+  BinaryOps.MAX: np.maximum, BinaryOps.CMPEQ: lambda x,y: (x==y).astype(np.promote_types(x.dtype,y.dtype)), BinaryOps.ADD: lambda x, y: np.add(*match_types(x, y)), 
+  BinaryOps.SUB: lambda x, y: np.subtract(*match_types(x, y)), BinaryOps.MUL: lambda x, y: np.multiply(*match_types(x, y)), 
+  BinaryOps.DIV: lambda x, y: np.divide(*match_types(x, y)), UnaryOps.SQRT: np.sqrt,
   MovementOps.PERMUTE: lambda x, order: x.transpose(order), MovementOps.PAD: np.pad, MovementOps.EXPAND: np.broadcast_to,
-  MovementOps.STRIDE: lambda x, arg: x[tuple(slice(None, None, i) for i in arg)],
-  FusedOps.MULACC: einsum_mulacc(lambda s,a,b: np.einsum(s, a.copy(), b.copy(), optimize=True), lambda x: x.strides, np.broadcast_to),
+  MovementOps.STRIDE: lambda x, arg: x[tuple(slice(None, None, i) for i in arg)], 
+  FusedOps.MULACC: einsum_mulacc(lambda s,a,b: np.einsum(s, *match_types(a.copy(), b.copy()), optimize=True), lambda x: x.strides, np.broadcast_to),
 }}
 
 class RawNumpyBuffer(RawBuffer):