diff --git a/README.md b/README.md
index 0edad74f..d9974c1e 100644
--- a/README.md
+++ b/README.md
@@ -145,7 +145,7 @@
 The autodiff stuff is all in mlops now so you can focus on the raw operations
 ```
 Buffer # class of memory on this device
-unary_op (RELU, EXP, LOG, NEG, SIGN) # A -> A
+unary_op (RELU, EXP, LOG, NEG, GT0) # A -> A
 reduce_op (SUM, MAX) # A -> B (smaller size, B has 1 in shape)
 binary_op (ADD, SUB, MUL, DIV, POW, CMPEQ) # A + B -> C (all the same size)
 movement_op (RESHAPE, PERMUTE, PAD, SHRINK, EXPAND, FLIP) # A -> B (different size)
diff --git a/accel/llvm/ops_llvm.py b/accel/llvm/ops_llvm.py
index ff460686..527174d2 100644
--- a/accel/llvm/ops_llvm.py
+++ b/accel/llvm/ops_llvm.py
@@ -150,8 +150,7 @@ class LLVMBuffer(ExplicitExecAST):
     UnaryOps.RELU: lambda builder,x: builder.select(builder.fcmp_ordered("<=", ir.Constant(ir.FloatType(), 0), x, flags=('fast',)), x, ir.Constant(ir.FloatType(), 0)),
     UnaryOps.EXP: lambda builder,x: builder.call(builder._block.module.declare_intrinsic('llvm.exp', [ir.FloatType()]), [x], fastmath=('fast',)),
     UnaryOps.LOG: lambda builder,x: builder.call(builder._block.module.declare_intrinsic('llvm.log', [ir.FloatType()]), [x], fastmath=('fast',)),
-    UnaryOps.SIGN: lambda builder,x: builder.select(builder.fcmp_ordered("==", x, ir.Constant(ir.FloatType(), 0), flags=('fast',)), ir.Constant(ir.FloatType(), 0),
-      builder.select(builder.fcmp_ordered("<=", ir.Constant(ir.FloatType(), 0), x, flags=('fast',)), ir.Constant(ir.FloatType(), 1), ir.Constant(ir.FloatType(), -1))),
+    UnaryOps.GT0: lambda builder,x: builder.select(builder.fcmp_ordered(">", x, ir.Constant(ir.FloatType(), 0), flags=('fast',)), ir.Constant(ir.FloatType(), 1), ir.Constant(ir.FloatType(), 0)),
     UnaryOps.RECIPROCAL: lambda builder,x: builder.fdiv(ir.Constant(ir.FloatType(), 1), x, flags=('fast',)),
     BinaryOps.ADD: lambda builder,x,y: builder.fadd(x,y, flags=('fast',)),
     BinaryOps.SUB: lambda builder,x,y: builder.fsub(x,y, flags=('fast',)),
diff --git a/tinygrad/llops/ops_cpu.py b/tinygrad/llops/ops_cpu.py
index a90faf33..6dde86e2 100644
--- a/tinygrad/llops/ops_cpu.py
+++ b/tinygrad/llops/ops_cpu.py
@@ -7,7 +7,7 @@ from tinygrad.helpers import shape_to_axis
 class CPUBuffer(np.ndarray, GenericExecAST):
   fxn_for_op = {
     UnaryOps.NOOP: lambda x: x[:], UnaryOps.NEG: lambda x: -x, UnaryOps.RELU: lambda x: x.relu(),
-    UnaryOps.EXP: lambda x: x.exp(), UnaryOps.LOG: lambda x: x.log(), UnaryOps.SIGN: lambda x: x.sign(), UnaryOps.RECIPROCAL: lambda x: 1.0/x,
+    UnaryOps.EXP: lambda x: x.exp(), UnaryOps.LOG: lambda x: x.log(), UnaryOps.GT0: lambda x: operator.gt(x, 0.0), UnaryOps.RECIPROCAL: lambda x: 1.0/x,
     BinaryOps.ADD: operator.add, BinaryOps.SUB: operator.sub, BinaryOps.MUL: operator.mul, BinaryOps.DIV: operator.truediv, BinaryOps.POW: operator.pow, BinaryOps.CMPEQ: lambda x,y: (x==y).float(),
     ReduceOps.SUM: lambda x, new_shape: x.sum(shape_to_axis(x.shape, new_shape), keepdims=True) if tuple(x.shape) != tuple(new_shape) else x[:],
@@ -18,7 +18,6 @@ class CPUBuffer(np.ndarray, GenericExecAST):
   def relu(x): return np.maximum(x, 0)
   def exp(x): return np.exp(x)
   def log(x): return np.log(x)
-  def sign(x): return np.sign(x)
   def float(x): return x.astype(np.float32)
   def flip(x, axis): return np.flip(x, axis)
   def amax(x, *args, **kwargs): return np.amax(x, *args, **kwargs)
diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py
index 5fd55b54..37f4d8f9 100644
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -29,7 +29,7 @@ def split_float4(x):
 
 class CLASTKernel(ASTKernel):
   code_for_op : Dict[Op, str] = {
-    UnaryOps.NOOP: "(A)", UnaryOps.NEG: "(-(A))", UnaryOps.RELU: "max(A, (float)0.)", UnaryOps.SIGN: "sign(A)",
+    UnaryOps.NOOP: "(A)", UnaryOps.NEG: "(-(A))", UnaryOps.RELU: "max(A, (float)0.)", UnaryOps.GT0: "((float)1.-step((float)0.,(-A)))",
     UnaryOps.EXP: "native_exp(A)" if NATIVE_EXPLOG else "exp(A)",
     UnaryOps.LOG: "native_log(A)" if NATIVE_EXPLOG else "log(A)",
     UnaryOps.RECIPROCAL: "native_recip(A)" if NATIVE_EXPLOG else "((float)1.0/A)",
@@ -92,7 +92,6 @@ class CLASTKernel(ASTKernel):
     if isinstance(x.op, ReduceOps) and not do_reduce: return acc
     values = ([acc] if isinstance(x.op, ReduceOps) else []) + [self.ast_parse(v, acc, do_reduce) for v in x.src]
     code = CLASTKernel.code_for_op[x.op] # TODO: replace this with a function
-    if CUDA and x.op == UnaryOps.SIGN: self.prekernel.add("inline __device__ float sign(float x) { float val = (signbit(x) == 0.0f) ? 1.0f : -1.0f; return (x == 0.0f) ? 0.0f : val; }")
     if len(values) == 2:
       # TODO: sometimes this is split, sometimes it's multiply
       if isinstance(x.op, ReduceOps) and values[0][0].typ == Types.FLOAT4 and len(values[0])*4 == len(values[1]): values[0] = split_float4(values[0])
diff --git a/tinygrad/mlops.py b/tinygrad/mlops.py
index 7b72123e..b77452f9 100644
--- a/tinygrad/mlops.py
+++ b/tinygrad/mlops.py
@@ -15,7 +15,7 @@ class ReLU(Function):
     return ret
 
   def backward(self, grad_output):
-    return self.saved_tensors[0].unary_op(UnaryOps.SIGN).binary_op(BinaryOps.MUL, grad_output)
+    return self.saved_tensors[0].unary_op(UnaryOps.GT0).binary_op(BinaryOps.MUL, grad_output)
 
 class Log(Function):
   def forward(self, x):
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 251fcdfd..09e0d046 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -10,7 +10,7 @@ from tinygrad.helpers import getenv
 DEBUG = getenv("DEBUG", 0)
 
 # these are the llops your accelerator must implement, along with toCpu
-UnaryOps = Enum("UnaryOps", ["NOOP", "NEG", "RELU", "EXP", "LOG", "SIGN", "RECIPROCAL"])
+UnaryOps = Enum("UnaryOps", ["NOOP", "NEG", "RELU", "EXP", "LOG", "GT0", "RECIPROCAL"])
 BinaryOps = Enum("BinaryOps", ["ADD", "SUB", "MUL", "DIV", "POW", "CMPEQ"])
 ReduceOps = Enum("ReduceOps", ["SUM", "MAX"])
 MovementOps = Enum("MovementOps", ["RESHAPE", "PERMUTE", "EXPAND", "FLIP", "STRIDED", "PAD", "SHRINK"])
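Note on the semantics: `GT0` returns 1.0 where the input is strictly positive and 0.0 elsewhere, while `SIGN` was three-valued (-1/0/+1). The consumer touched in this diff is `ReLU.backward`, which applies the op to the saved ReLU result; since that result is never negative, the two ops produce the same gradient mask there, so the cheaper two-valued op suffices and the CUDA `sign` prekernel helper can be dropped. Below is a minimal sanity-check sketch, written in plain NumPy rather than against tinygrad's Buffer API; the helper names `sign_op`/`gt0_op` and the test values are illustrative, not part of the codebase.

```python
import numpy as np

def sign_op(x):  # old SIGN llop, stand-in: -1.0 / 0.0 / +1.0
  return np.sign(x)

def gt0_op(x):   # new GT0 llop, stand-in: 1.0 where x > 0, else 0.0
  return (x > 0).astype(np.float32)

x = np.array([-2.0, 0.0, 3.0], dtype=np.float32)
relu_out = np.maximum(x, 0)        # stand-in for the saved tensor ReLU.backward reads
grad_output = np.ones_like(x)      # incoming gradient, all ones for the check

# on a non-negative tensor the two ops agree, so the ReLU gradient is unchanged
old_grad = sign_op(relu_out) * grad_output
new_grad = gt0_op(relu_out) * grad_output
assert np.array_equal(old_grad, new_grad)
print(new_grad)                    # [0. 0. 1.]
```

The GPU string `((float)1.-step((float)0.,(-A)))` encodes the same predicate: OpenCL's `step(0, -A)` is 1.0 when `-A >= 0` (i.e. `A <= 0`) and 0.0 otherwise, so subtracting it from 1.0 yields the `A > 0` indicator, matching the LLVM `select` on `fcmp ">"` and the CPU `operator.gt(x, 0.0)`.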