tinygrad/test/test_custom_function.py

# this is an example of how you can write terrible DSP compute breaking ops like warpPerspective
# here we use a CUSTOM op to write atan2

import unittest
import numpy as np
from typing import Optional, Tuple
from tinygrad.helpers import prod
from tinygrad.dtype import dtypes

# *** first, we implement the atan2 op at the lowest level ***
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
from tinygrad.lazy import Buffer, create_lazybuffer
from tinygrad.device import CompiledASTRunner, Device
from tinygrad.shape.shapetracker import ShapeTracker

# we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer):
  assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
  src = """
  __kernel void atan2_gpu(global float *c, global float *a, global float *b) {
    int idx = get_global_id(0);
    c[idx] = atan2(a[idx], b[idx]);
  }"""
  CompiledASTRunner(None, "atan2_gpu", src, Device[ret.device], global_size=[ret.size]).exec([ret, a, b])

def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(a._buf, b._buf), requirements='C').data)

# *** second, we write the ATan2 mlop ***
# NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
# In general, it is also optional to write a backward function, just your backward pass won't work without it

from tinygrad.ops import LoadOps, BinaryOps
from tinygrad.lazy import LazyBuffer
from tinygrad.tensor import Function

class ATan2(Function):
  def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
    assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
    self.a, self.b = a, b
    return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), LoadOps.CUSTOM,
                             arg={"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device], srcs=(a.contiguous(), b.contiguous()))
  def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
    denom = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b))
    return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \
           grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.SUB, self.a).e(BinaryOps.DIV, denom)) if self.needs_input_grad[1] else None

# *** third, we use our lovely new mlop in some tests ***

from tinygrad.tensor import Tensor

@unittest.skipUnless(Device.DEFAULT in ["CPU", "GPU"], "atan2 is only implemented for CPU and GPU")
class TestCustomFunction(unittest.TestCase):
  def test_atan2_forward(self):
    # create some random Tensors, permute them just because we can
    a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
    b = Tensor.randn(4,4,requires_grad=True).permute(1,0)

    # run the forward pass. note: up until the .numpy(), it's all lazy
    c = ATan2.apply(a, b)
    print(c.numpy())

    # check the forward pass (in numpy)
    np.testing.assert_allclose(c.numpy(), np.arctan2(a.numpy(), b.numpy()), atol=1e-5)

  # fun fact, this never actually calls forward, so it works in all the backends
  def test_atan2_backward(self):
    # have to go forward before we can go backward
    a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
    b = Tensor.randn(4,4,requires_grad=True).permute(1,0)
    c = ATan2.apply(a, b)

    # run the backward pass
    c.mean().backward()
    assert a.grad is not None and b.grad is not None, "tinygrad didn't compute gradients"
    print(a.grad.numpy())
    print(b.grad.numpy())

    # check the backward pass (in torch)
    import torch
    ta, tb = torch.tensor(a.numpy(), requires_grad=True), torch.tensor(b.numpy(), requires_grad=True)
    tc = torch.atan2(ta, tb)
    tc.mean().backward()
    assert ta.grad is not None and tb.grad is not None, "torch didn't compute gradients"
    np.testing.assert_allclose(a.grad.numpy(), ta.grad.numpy(), atol=1e-5)
    np.testing.assert_allclose(b.grad.numpy(), tb.grad.numpy(), atol=1e-5)

  @unittest.skipIf(Device.DEFAULT in ["CPU"], "atan2_cpu not jittable")
  def test_atan2_jit(self):
    # custom ops even work in the JIT!
    from tinygrad.features.jit import TinyJit

    @TinyJit
    def jitted_atan2(a:Tensor, b:Tensor) -> Tensor:
      return ATan2.apply(a, b).realize()

    for _ in range(5):
      a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
      b = Tensor.randn(4,4,requires_grad=True).permute(1,0)
      c = jitted_atan2(a, b)
      np.testing.assert_allclose(c.numpy(), np.arctan2(a.numpy(), b.numpy()), atol=1e-5)

if __name__ == "__main__":
  unittest.main()
move the custom function example to a test 2023-03-09 02:05:04 +08:00			`# this is an example of how you can write terrible DSP compute breaking ops like warpPerspective`
			`# here we use a CUSTOM op to write atan2`

			`import unittest`
			`import numpy as np`
fix GPU import error and old python Tuple 2023-03-09 04:22:11 +08:00			`from typing import Optional, Tuple`
move dtypes to dtype.py (#2964) * move dtypes to dtype.py * fix urllib 2024-01-02 06:58:48 +08:00			`from tinygrad.helpers import prod`
			`from tinygrad.dtype import dtypes`
move the custom function example to a test 2023-03-09 02:05:04 +08:00
			`# * first, we implement the atan2 op at the lowest level *`
fix GPU import error and old python Tuple 2023-03-09 04:22:11 +08:00			# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
new style device (#2530) * cpu tests pass * torch works * works * metal works * fix ops_disk * metal jit works * fix openpilot * llvm and clang work * fix webgpu * docs are rly broken * LRU works on metal * delete comment * revert name to ._buf. LRU only on Compiled * changes * allocator * allocator, getting closer * lru alloc * LRUAllocator * all pass * metal * cuda * test examples * linearizer * test fixes * fix custom + clean realize * fix hip * skip tests * fix tests * fix size=0 * fix MOCKHIP * fix thneed * copy better * simple * old style metal copy * fix thneed * np reshape * give cuda a device 2023-12-01 09:07:16 +08:00			`from tinygrad.lazy import Buffer, create_lazybuffer`
move device to device.py (#2466) * move device to device.py * pylint test --disable R,C,W,E --enable E0611 * fix tests 2023-11-28 03:34:37 +08:00			`from tinygrad.device import CompiledASTRunner, Device`
Optimizations in lazy.py (#987) * optimizations in lazy.py * make mypy happy with stubs and fix the graph import hack * merge conflict in helpers.py 2023-06-27 04:55:42 +08:00			`from tinygrad.shape.shapetracker import ShapeTracker`
move the custom function example to a test 2023-03-09 02:05:04 +08:00
fix GPU import error and old python Tuple 2023-03-09 04:22:11 +08:00			`# we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer`
new style device (#2530) * cpu tests pass * torch works * works * metal works * fix ops_disk * metal jit works * fix openpilot * llvm and clang work * fix webgpu * docs are rly broken * LRU works on metal * delete comment * revert name to ._buf. LRU only on Compiled * changes * allocator * allocator, getting closer * lru alloc * LRUAllocator * all pass * metal * cuda * test examples * linearizer * test fixes * fix custom + clean realize * fix hip * skip tests * fix tests * fix size=0 * fix MOCKHIP * fix thneed * copy better * simple * old style metal copy * fix thneed * np reshape * give cuda a device 2023-12-01 09:07:16 +08:00			`def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer):`
dtypes nice and clean (#673) * add dtype class * dtypes * buffers are lazy * dtype is tracked by lazybuffer and GenericShape * fix types in llvm * llvm store * dtype tests * fix tests maybe * fix flop counter * fix CI * CI fix and check format * fix dtype and dtype check * fix custom test * fix test graph 2023-03-11 08:56:07 +08:00			`assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"`
search refactor (#2969) * minor search cleanup * now that saves lines * fix 2024-01-02 09:39:26 +08:00			`src = """`
			`__kernel void atan2_gpu(global float c, global float a, global float *b) {`
			`int idx = get_global_id(0);`
			`c[idx] = atan2(a[idx], b[idx]);`
			`}"""`
the runner does the build (#3220) 2024-01-24 10:45:43 +08:00			`CompiledASTRunner(None, "atan2_gpu", src, Device[ret.device], global_size=[ret.size]).exec([ret, a, b])`
move the custom function example to a test 2023-03-09 02:05:04 +08:00
new style device (#2530) * cpu tests pass * torch works * works * metal works * fix ops_disk * metal jit works * fix openpilot * llvm and clang work * fix webgpu * docs are rly broken * LRU works on metal * delete comment * revert name to ._buf. LRU only on Compiled * changes * allocator * allocator, getting closer * lru alloc * LRUAllocator * all pass * metal * cuda * test examples * linearizer * test fixes * fix custom + clean realize * fix hip * skip tests * fix tests * fix size=0 * fix MOCKHIP * fix thneed * copy better * simple * old style metal copy * fix thneed * np reshape * give cuda a device 2023-12-01 09:07:16 +08:00			`def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(a._buf, b._buf), requirements='C').data)`
move the custom function example to a test 2023-03-09 02:05:04 +08:00
			`# * second, we write the ATan2 mlop *`
			`# NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative`
			`# In general, it is also optional to write a backward function, just your backward pass won't work without it`

new lazy, benchmark (#2878) * lazy rewrite, try 2 * min fix tests * pass contig test * put broken pads back * move that to realize * no contig child fixes array packing * so wrong * now that's correct * base children * fix bind issues * disable to_image_idx * fix tests * that failure shouldn't break other tests * more fixes * fix torch * skip failing tests in CI * 1e-7 * half is broken * 1e-6 margin of error 2023-12-21 06:33:21 +08:00			`from tinygrad.ops import LoadOps, BinaryOps`
move the custom function example to a test 2023-03-09 02:05:04 +08:00			`from tinygrad.lazy import LazyBuffer`
			`from tinygrad.tensor import Function`

			`class ATan2(Function):`
			`def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:`
fix GPU import error and old python Tuple 2023-03-09 04:22:11 +08:00			`assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"`
move the custom function example to a test 2023-03-09 02:05:04 +08:00			`self.a, self.b = a, b`
new lazy, benchmark (#2878) * lazy rewrite, try 2 * min fix tests * pass contig test * put broken pads back * move that to realize * no contig child fixes array packing * so wrong * now that's correct * base children * fix bind issues * disable to_image_idx * fix tests * that failure shouldn't break other tests * more fixes * fix torch * skip failing tests in CI * 1e-7 * half is broken * 1e-6 margin of error 2023-12-21 06:33:21 +08:00			`return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), LoadOps.CUSTOM,`
			`arg={"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device], srcs=(a.contiguous(), b.contiguous()))`
fix GPU import error and old python Tuple 2023-03-09 04:22:11 +08:00			`def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:`
Optional: Reduce line count and simplify the LazyBuffer interface (#1642) * less lines in lazybuffer, def e * custom function * cast * reorder functions * lb type 2023-08-23 12:01:10 +08:00			`denom = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b))`
			`return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \`
			`grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.SUB, self.a).e(BinaryOps.DIV, denom)) if self.needs_input_grad[1] else None`
move the custom function example to a test 2023-03-09 02:05:04 +08:00
			`# * third, we use our lovely new mlop in some tests *`

flake8 (#1323) * flake8: Ignore frequent violations, correct infrequent ones * Ignore some rules in test * Reorder test ignores * Lint test + main * EOF indent * Include all E71,E72 errors * Test the failing case in CI * Revert "Test the failing case in CI" This reverts commit 110add0a70f5a619d07631269104e84f908af6b9. * Push to test! This reverts commit f317532779a0e1ac8401e2474fd5c6c8695c08e9. * ok back to passing This reverts commit ba5052685f93f83e06152cdc696b9e26131d8ab7. * Prove that CI fails when formatting is incorrect. * Fix formatting * Remove duplicitous E117 rule * Use flake8 config for precommit --------- Co-authored-by: waifairer <waifairer@gmail.com> 2023-07-24 23:19:58 +08:00			`from tinygrad.tensor import Tensor`
move the custom function example to a test 2023-03-09 02:05:04 +08:00
			`@unittest.skipUnless(Device.DEFAULT in ["CPU", "GPU"], "atan2 is only implemented for CPU and GPU")`
			`class TestCustomFunction(unittest.TestCase):`
			`def test_atan2_forward(self):`
			`# create some random Tensors, permute them just because we can`
			`a = Tensor.randn(4,4,requires_grad=True).permute(1,0)`
			`b = Tensor.randn(4,4,requires_grad=True).permute(1,0)`

			`# run the forward pass. note: up until the .numpy(), it's all lazy`
			`c = ATan2.apply(a, b)`
			`print(c.numpy())`

			`# check the forward pass (in numpy)`
			`np.testing.assert_allclose(c.numpy(), np.arctan2(a.numpy(), b.numpy()), atol=1e-5)`

			`# fun fact, this never actually calls forward, so it works in all the backends`
			`def test_atan2_backward(self):`
			`# have to go forward before we can go backward`
			`a = Tensor.randn(4,4,requires_grad=True).permute(1,0)`
			`b = Tensor.randn(4,4,requires_grad=True).permute(1,0)`
			`c = ATan2.apply(a, b)`

			`# run the backward pass`
			`c.mean().backward()`
			`assert a.grad is not None and b.grad is not None, "tinygrad didn't compute gradients"`
			`print(a.grad.numpy())`
			`print(b.grad.numpy())`

			`# check the backward pass (in torch)`
			`import torch`
			`ta, tb = torch.tensor(a.numpy(), requires_grad=True), torch.tensor(b.numpy(), requires_grad=True)`
			`tc = torch.atan2(ta, tb)`
			`tc.mean().backward()`
			`assert ta.grad is not None and tb.grad is not None, "torch didn't compute gradients"`
			`np.testing.assert_allclose(a.grad.numpy(), ta.grad.numpy(), atol=1e-5)`
			`np.testing.assert_allclose(b.grad.numpy(), tb.grad.numpy(), atol=1e-5)`

JIT support in Interpreted (#2314) * factor that out * jit is supported everywhere * fix some tests * there's no jit supported device, the jit is everywhere * fix test uops 2023-11-16 03:13:38 +08:00			`@unittest.skipIf(Device.DEFAULT in ["CPU"], "atan2_cpu not jittable")`
move the custom function example to a test 2023-03-09 02:05:04 +08:00			`def test_atan2_jit(self):`
			`# custom ops even work in the JIT!`
move graph.py and jit.py into features (#3376) * move graph.py into features * move jit into features * fix quickstart 2024-02-13 00:34:34 +08:00			`from tinygrad.features.jit import TinyJit`
move the custom function example to a test 2023-03-09 02:05:04 +08:00
			`@TinyJit`
			`def jitted_atan2(a:Tensor, b:Tensor) -> Tensor:`
			`return ATan2.apply(a, b).realize()`

			`for _ in range(5):`
			`a = Tensor.randn(4,4,requires_grad=True).permute(1,0)`
			`b = Tensor.randn(4,4,requires_grad=True).permute(1,0)`
			`c = jitted_atan2(a, b)`
			`np.testing.assert_allclose(c.numpy(), np.arctan2(a.numpy(), b.numpy()), atol=1e-5)`

			`if __name__ == "__main__":`
			`unittest.main()`