diff --git a/tinygrad/gradcheck.py b/extra/gradcheck.py
similarity index 87%
rename from tinygrad/gradcheck.py
rename to extra/gradcheck.py
index 6cb38bc7..4ebf04db 100644
--- a/tinygrad/gradcheck.py
+++ b/extra/gradcheck.py
@@ -1,7 +1,10 @@
 import numpy as np
+from tinygrad.tensor import Tensor
 
-from .utils import mask_like
-from .tensor import Tensor
+def mask_like(like, mask_inx, mask_value = 1.0):
+  mask = np.zeros_like(like).reshape(-1)
+  mask[mask_inx] = mask_value
+  return mask.reshape(like.shape)
 
 def jacobian(func, input):
   output = func(input)
diff --git a/test/test_tensor.py b/test/test_tensor.py
index d83d69ad..94e343ff 100644
--- a/test/test_tensor.py
+++ b/test/test_tensor.py
@@ -2,7 +2,7 @@ import numpy as np
 import torch
 import unittest
 from tinygrad.tensor import Tensor
-from tinygrad.gradcheck import numerical_jacobian, jacobian, gradcheck
+from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
 
 x_init = np.random.randn(1,3).astype(np.float32)
 W_init = np.random.randn(3,3).astype(np.float32)
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index f8254418..fbfec5cc 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -4,8 +4,10 @@ import numpy as np
 from .tensor import Function, register
 
 # ************* basic ops *************
-def adBC(out, in_sh): #adjoint operation to broadcast is sum. Need to sum all axis with 1 = in_sh[i] < out.shape[i]
-  return out.sum(axis=tuple([i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1])).reshape(in_sh)
+def unbroadcast(out, in_sh):
+  # adjoint operation to broadcast is sum. Need to sum all axis with 1 = in_sh[i] < out.shape[i]
+  sum_axis = [i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1]
+  return out.sum(axis=tuple(sum_axis)).reshape(in_sh)
 
 class Add(Function):
   @staticmethod
@@ -16,7 +18,7 @@ class Add(Function):
   @staticmethod
   def backward(ctx, grad_output):
     shape_x, shape_y = ctx.saved_tensors
-    return adBC(grad_output, shape_x), adBC(grad_output, shape_y)
+    return unbroadcast(grad_output, shape_x), unbroadcast(grad_output, shape_y)
 register('add', Add)
 
 class Sub(Function):
@@ -28,7 +30,7 @@ class Sub(Function):
   @staticmethod
   def backward(ctx, grad_output):
     shape_x, shape_y = ctx.saved_tensors
-    return adBC(grad_output, shape_x), adBC( -grad_output, shape_y)
+    return unbroadcast(grad_output, shape_x), unbroadcast(-grad_output, shape_y)
 register('sub', Sub)
 
 class Mul(Function):
@@ -40,7 +42,7 @@ class Mul(Function):
   @staticmethod
   def backward(ctx, grad_output):
     x,y = ctx.saved_tensors
-    return adBC(y*grad_output, x.shape), adBC(x*grad_output, y.shape)
+    return unbroadcast(y*grad_output, x.shape), unbroadcast(x*grad_output, y.shape)
 register('mul', Mul)
 
 class Div(Function):
@@ -52,7 +54,7 @@ class Div(Function):
   @staticmethod
   def backward(ctx, grad_output):
     x,y = ctx.saved_tensors
-    return adBC(grad_output / y, x.shape), adBC(-x * grad_output / y**2, y.shape)
+    return unbroadcast(grad_output / y, x.shape), unbroadcast(-x * grad_output / y**2, y.shape)
 # TODO: registering this breaks the default div on the GPU
 #register('div', Div)
 
@@ -65,7 +67,8 @@ class Pow(Function):
   @staticmethod
   def backward(ctx, grad_output):
     x,y = ctx.saved_tensors
-    return adBC(y * (x**(y-1.0)) * grad_output,x.shape), adBC((x**y) * np.log(x) * grad_output,y.shape)
+    return unbroadcast(y * (x**(y-1.0)) * grad_output, x.shape), \
+           unbroadcast((x**y) * np.log(x) * grad_output, y.shape)
 register('pow', Pow)
 
 class Sum(Function):
diff --git a/tinygrad/utils.py b/tinygrad/utils.py
index 97b5089a..282800e6 100644
--- a/tinygrad/utils.py
+++ b/tinygrad/utils.py
@@ -1,10 +1,5 @@
 import numpy as np
 
-def mask_like(like, mask_inx, mask_value = 1.0):
-  mask = np.zeros_like(like).reshape(-1)
-  mask[mask_inx] = mask_value
-  return mask.reshape(like.shape)
-
 def layer_init_uniform(*x):
   ret = np.random.uniform(-1., 1., size=x)/np.sqrt(np.prod(x))
   return ret.astype(np.float32)
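Quick illustration (not part of the patch): the renamed `unbroadcast` helper in `tinygrad/ops.py` is the adjoint of NumPy broadcasting, i.e. the upstream gradient is summed over every axis that was broadcast from size 1 and then reshaped back to the input's shape. A minimal standalone sketch, using a copy of the helper as it reads after this diff and arbitrary example shapes:

```python
import numpy as np

# Copy of the helper as it appears in tinygrad/ops.py after this diff.
def unbroadcast(out, in_sh):
  # adjoint of broadcast is sum: collapse every axis that was broadcast
  # from size 1 up to a larger size, then restore the input shape
  sum_axis = [i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1]
  return out.sum(axis=tuple(sum_axis)).reshape(in_sh)

# z = x + y broadcasts y from (1,3) to (4,3); the upstream gradient dz has
# z's shape, so the gradient w.r.t. y must be collapsed back to (1,3).
x = np.random.randn(4, 3).astype(np.float32)
y = np.random.randn(1, 3).astype(np.float32)
dz = np.ones_like(x + y)

dy = unbroadcast(dz, y.shape)
assert dy.shape == y.shape
assert np.allclose(dy, dz.sum(axis=0, keepdims=True))
```

This is the collapse the `backward` methods of `Add`, `Sub`, `Mul`, `Div`, and `Pow` now apply to each operand's gradient.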
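Similarly, `mask_like`, which this patch inlines into `extra/gradcheck.py` instead of importing it from `tinygrad/utils.py`, builds a one-hot perturbation over a flattened index, the kind of basis vector a numerical Jacobian routine uses to nudge one input entry at a time. A small sanity-check sketch (the flat index 4 and step size `eps` are arbitrary, chosen for illustration):

```python
import numpy as np

# Copy of mask_like as this diff inlines it into extra/gradcheck.py.
def mask_like(like, mask_inx, mask_value = 1.0):
  mask = np.zeros_like(like).reshape(-1)
  mask[mask_inx] = mask_value
  return mask.reshape(like.shape)

W = np.zeros((2, 3), dtype=np.float32)
eps = 1e-4
# One-hot perturbation at flat index 4, i.e. entry (1, 1) of the 2x3 array.
step = mask_like(W, 4, mask_value=eps)
assert step.shape == W.shape
assert np.count_nonzero(step) == 1 and step[1, 1] == np.float32(eps)
```

With the move, callers import the checking helpers from the new location, as the `test/test_tensor.py` hunk shows: `from extra.gradcheck import numerical_jacobian, jacobian, gradcheck`.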