contiguous_view (#336)

* contiguous_view

* non contig reduce too

* conv fast

* maybe faster valid

* improve test_onnx

* improve params

* elementwise_op

* draw non contig
George Hotz 2022-06-19 20:37:28 -07:00 committed by GitHub
parent fb72ea3fbd
commit d05e7c291a
3 changed files with 152 additions and 138 deletions

View File

@ -12,7 +12,7 @@ from tinygrad.nn import batch_normalize
MAX_CONVS = int(os.getenv("MAX_CONVS", -1))
def run_onnx(onnx_model, inputs={}, debug=False):
def get_run_onnx(onnx_model):
def shape_to_tuple(s): return tuple(x.dim_value for x in s.dim)
def buffer_parse(inp):
if inp.data_type == 1:
@ -46,6 +46,9 @@ def run_onnx(onnx_model, inputs={}, debug=False):
print(inp)
raise Exception("no data")
def run_onnx(inputs={}, debug=False):
input_tensors = {}
# get inputs
for inp in onnx_model.graph.input:
if inp.name in tensors: continue
@ -54,14 +57,14 @@ def run_onnx(onnx_model, inputs={}, debug=False):
if inp.name in inputs:
input_shape = inputs[inp.name].shape
assert input_shape == shape, f"wrong shape for input {inp.name}, {input_shape} isn't {shape}"
tensors[inp.name] = Tensor(inputs[inp.name])
input_tensors[inp.name] = Tensor(inputs[inp.name])
else:
raise Exception(f"no data for {inp.name} with shape {shape}")
conv_count = 0
for num,n in enumerate(onnx_model.graph.node):
if debug: print(f"{num}: op {n.op_type}")
inp = [tensors[x] for x in n.input]
inp = [tensors[x] if x in tensors else input_tensors[x] for x in n.input]
opt = attribute_to_dict(n.attribute)
# free ones
@ -132,6 +135,7 @@ def run_onnx(onnx_model, inputs={}, debug=False):
#print(ret.numpy().mean())
return {outp.name:tensors[outp.name] for outp in onnx_model.graph.output}
return run_onnx
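The refactor above turns run_onnx(onnx_model, inputs) into get_run_onnx(onnx_model): graph parsing and weight loading happen once, and the returned run_onnx(inputs) closure only pays for execution on each call, which is what the new benchmark test below relies on. A minimal sketch of that factory/closure pattern, with illustrative names and a toy model standing in for the ONNX graph (not code from this diff):

import numpy as np

def get_run_model(weights):                     # analogue of get_run_onnx: one-time setup
    w = np.asarray(weights, dtype=np.float32)   # stand-in for parsing initializers into Tensors
    def run_model(x):                           # returned closure only runs the graph
        return np.asarray(x, dtype=np.float32) @ w
    return run_model

run_model = get_run_model(np.eye(3))
for _ in range(5):                              # mirrors the loop in test_benchmark_openpilot_model
    out = run_model(np.random.randn(1, 3))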
def run_onnx_torch(onnx_model, inputs):
import torch
@ -142,9 +146,11 @@ def run_onnx_torch(onnx_model, inputs):
return torch_out
class TestOnnxModel(unittest.TestCase):
def test_openpilot_model(self):
def test_benchmark_openpilot_model(self):
dat = fetch("https://github.com/commaai/openpilot/raw/7da48ebdba5e3cf4c0b8078c934bee9a199f0280/selfdrive/modeld/models/supercombo.onnx")
onnx_model = onnx.load(io.BytesIO(dat))
run_onnx = get_run_onnx(onnx_model)
for _ in range(5):
inputs = {
"input_imgs": np.random.randn(*(1, 12, 128, 256)),
"big_input_imgs": np.random.randn(*(1, 12, 128, 256)),
@ -154,7 +160,24 @@ class TestOnnxModel(unittest.TestCase):
}
inputs = {k:v.astype(np.float32) for k,v in inputs.items()}
st = time.monotonic()
tinygrad_out = run_onnx(onnx_model, inputs)['outputs'].numpy()
tinygrad_out = run_onnx(inputs)['outputs'].numpy()
et = time.monotonic() - st
print(f"ran openpilot model in {et*1000.0:.2f} ms")
def test_openpilot_model(self):
dat = fetch("https://github.com/commaai/openpilot/raw/7da48ebdba5e3cf4c0b8078c934bee9a199f0280/selfdrive/modeld/models/supercombo.onnx")
onnx_model = onnx.load(io.BytesIO(dat))
run_onnx = get_run_onnx(onnx_model)
inputs = {
"input_imgs": np.random.randn(*(1, 12, 128, 256)),
"big_input_imgs": np.random.randn(*(1, 12, 128, 256)),
"desire": np.zeros((1, 8)),
"traffic_convention": np.array([[1., 0.]]),
"initial_state": np.zeros((1, 512))
}
inputs = {k:v.astype(np.float32) for k,v in inputs.items()}
st = time.monotonic()
tinygrad_out = run_onnx(inputs)['outputs'].numpy()
et = time.monotonic() - st
print(f"ran openpilot model in {et*1000.0:.2f} ms")
@ -167,10 +190,11 @@ class TestOnnxModel(unittest.TestCase):
dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx")
onnx_model = onnx.load(io.BytesIO(dat))
from test.test_efficientnet import chicken_img, car_img, preprocess, _LABELS
run_onnx = get_run_onnx(onnx_model)
def run(img):
inputs = {"images:0": preprocess(img, new=True)}
tinygrad_out = list(run_onnx(onnx_model, inputs, False).values())[0].numpy()
tinygrad_out = list(run_onnx(inputs, False).values())[0].numpy()
return tinygrad_out.argmax()
cls = run(chicken_img)

View File

@ -1,7 +1,9 @@
import functools
import numpy as np
import pyopencl as cl
from typing import List, Tuple
from tinygrad.helpers import prod
from tinygrad.llops.ops_cpu import unary_op
from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, MovementOps, ProcessingOps
from tinygrad.shapetracker import ShapeTracker, View, strides_for_shape
@ -42,7 +44,7 @@ class GPUBuffer:
def toCPU(self):
data = np.empty(self.shape, dtype=np.float32)
cl.enqueue_copy(cl_queue, data, self.cl, is_blocking=True)
cl.enqueue_copy(cl_queue, data, contiguous(self).cl, is_blocking=True)
return data
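toCPU now copies out of contiguous(self) rather than self.cl directly: once movement ops stop forcing copies (see movement_op below), the raw cl buffer's layout can differ from self.shape, so it has to be realized before readback. A rough NumPy analogue of the same idea (illustrative, not tinygrad code):

import numpy as np
x = np.zeros((4, 8), np.float32).T     # non-contiguous view: storage layout != logical shape
data = np.ascontiguousarray(x)         # realize a contiguous copy before exporting the raw bytes
assert data.flags["C_CONTIGUOUS"] and data.shape == x.shape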
class CLProgram:
@ -59,33 +61,27 @@ def clbuild(name, prg, options=tuple(), argdtypes=None):
return CLProgram(name, prg, options, argdtypes)
code_for_op = {
UnaryOps.RELU: 'max(A, (float)0.)', UnaryOps.EXP: 'exp(A)', UnaryOps.LOG: 'log(A)', UnaryOps.NEG: '-A', UnaryOps.SIGN: 'sign(A)',
UnaryOps.NOOP: "A", UnaryOps.RELU: "max(A, (float)0.)", UnaryOps.EXP: "exp(A)", UnaryOps.LOG: "log(A)", UnaryOps.NEG: "-A", UnaryOps.SIGN: "sign(A)",
BinaryOps.ADD: "A+B", BinaryOps.SUB: "A-B", BinaryOps.MUL: "A*B", BinaryOps.DIV: "B/A", BinaryOps.POW: "pow(A,B)", BinaryOps.CMPEQ: "(A==B)"
}
def unary_op(op, x):
ret = GPUBuffer(x.shape)
unop = clbuild("unop", """
__kernel void unop(__global const float4 *a_g, __global float4 *res_g) {
int gid = get_global_id(0);
float4 A = a_g[gid];
res_g[gid] = convert_float4("""+code_for_op[op]+""");
}""")
unop([roundup(prod(ret.shape))//4], None, x.cl, ret.cl)
def contiguous_view(x:GPUBuffer, name:str):
return f"inline float get_{name}(__global const float *x, int gid) {{ int valid = 1; int idx = gid; {x.st.expr().replace('//', '/')}; return valid ? x[idx] : 0.0;}}"
def elementwise_op(bufs: List[Tuple[str, GPUBuffer]], code):
assert all(buf.shape == bufs[0][1].shape for _, buf in bufs)
ret = GPUBuffer(bufs[0][1].shape)
ewop = clbuild("ewop", '\n'.join([contiguous_view(buf, name) for name, buf in bufs])+
"__kernel void ewop(__global float *res_g, "+','.join([f"__global const float *{name}_g" for name, _ in bufs])+") {"+
"int gid = get_global_id(0);"+
'\n'.join([f"float {name} = get_{name}({name}_g, gid);" for name, _ in bufs])+
f"res_g[gid] = {code}; }}")
ewop([prod(ret.shape)], None, ret.cl, *[buf.cl for _, buf in bufs])
return ret
def binary_op(op, x, y):
ret = GPUBuffer(x.shape)
assert x.shape == ret.shape and y.shape == ret.shape
binop = clbuild("binop", """
__kernel void binop(__global const float4 *a_g, __global const float4 *b_g, __global float4 *res_g) {
int gid = get_global_id(0);
float4 A = a_g[gid];
float4 B = b_g[gid];
res_g[gid] = convert_float4("""+code_for_op[op]+""");
}""")
binop([roundup(prod(ret.shape))//4], None, x.cl, y.cl, ret.cl)
return ret
def unary_op(op, x): return elementwise_op([("A", x)], code_for_op[op])
def binary_op(op, x, y): return elementwise_op([("A", x), ("B", y)], code_for_op[op])
def contiguous(x:GPUBuffer): return x if x.st.contiguous else unary_op(UnaryOps.NOOP, x)
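contiguous_view is the heart of this change: for each input it emits an inline get_<name> accessor that pushes the flat output index gid through the buffer's ShapeTracker expression (x.st.expr()), so a kernel can read permuted, strided, or padded buffers in place. elementwise_op stitches one accessor per input into a single kernel, unary_op and binary_op collapse into one-liners over it, and contiguous() becomes a NOOP elementwise op that forces a real copy only when asked. A small NumPy analogue of that index translation, assuming only that the view shares storage with its base array (illustrative, not the GPU code):

import numpy as np

def make_accessor(view):
    # Analogue of contiguous_view/get_<name>: map a flat index in the view's own
    # row-major order through the view's strides and read the underlying storage,
    # instead of copying the data into a contiguous buffer first.
    base = view.base if view.base is not None else view
    flat, itemsize = base.ravel(), view.itemsize
    offset = (view.__array_interface__["data"][0] - base.__array_interface__["data"][0]) // itemsize
    strides = [s // itemsize for s in view.strides]
    def get(gid):
        idx, rem = offset, gid
        for dim, stride in zip(reversed(view.shape), reversed(strides)):
            idx += (rem % dim) * stride
            rem //= dim
        return flat[idx]
    return get

x = np.arange(12.0).reshape(3, 4)
get_A = make_accessor(x.T)             # transposed view: a pure movement op, no copy made
assert all(get_A(i) == x.T.ravel()[i] for i in range(x.size))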
def reduce_op(op, inp, new_shape):
ret = GPUBuffer(new_shape)
@ -108,12 +104,12 @@ def reduce_op(op, inp, new_shape):
acc *= shp
# TODO: support multistage reduces
prg = """
prg = contiguous_view(inp, 'A')+"""
__kernel void reduce(__global const float *a_g, __global float *res_g) {
int gid = get_global_id(0); int idx = gid;"""+view.expr.replace('//', '/')+""";
float out = """+start+""";\n"""+ \
'\n'.join(loop_start[::-1])+"""
float a = a_g[idx];
float a = get_A(a_g, idx);
"""+code+""";\n"""+ \
'\n'.join(loop_end)+"""
res_g[gid] = out;
@ -121,26 +117,20 @@ def reduce_op(op, inp, new_shape):
clbuild("reduce", prg)([prod(ret.shape)], None, inp.cl, ret.cl)
return ret
def contiguous(x, ret=None):
if ret is None: ret = GPUBuffer(x.st.shape)
clbuild("contiguous", """__kernel void contiguous(__global const float *x, __global float *ret) {
int gid = get_global_id(0); int valid = 1; int idx = gid; """+x.st.expr().replace('//', '/')+""";
ret[gid] = valid ? x[idx] : 0.0; // should never be out-of-bounds accesses
}""")([prod(ret.shape)], None, x.cl, ret.cl)
return ret
def movement_op(op, x, arg=None):
def movement_op(op, x, arg):
ret = GPUBuffer(x.st, x)
ret.st.movement_op(op, arg)
if ret.st.contiguous: return ret
else: return contiguous(ret)
return ret
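Because reduce_op now also reads its input through get_A (the contiguous_view prepended to its kernel above), movement_op no longer needs a fallback copy: it just records the reshape/permute/slice in the ShapeTracker and returns a buffer aliasing the same storage. That is the same zero-copy trick NumPy views use (illustrative, not tinygrad code):

import numpy as np
x = np.zeros((4, 8), np.float32)
y = x.transpose(1, 0)                  # movement op: new shape and strides, same storage
assert y.base is x and not y.flags["C_CONTIGUOUS"]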
def processing_op(op,x,w,C):
ret = GPUBuffer((C.bs, C.cout, C.oy, C.ox))
assert op == ProcessingOps.CONV, f"{op} isn't supported"
ints = ''.join(f"int {x} = {getattr(C, x)};" for x in ["H", "W", "cin", "ys", "xs", "dx", "dy", "px", "py"])
params = [(f"int {x}", getattr(C, x)) for x in ["groups", "rcout", "oy", "ox", "iy", "ix"]]
conv_prg = clbuild("conv", """
__kernel void conv(__global const float* restrict input, __global const float* restrict weight, __global float* restrict output,
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs, int dx, int dy, int px, int py) {
"""+','.join([x[0] for x in params])+""") {
"""+ints+"""
int B = get_global_id(0)/(groups*rcout); // range 0-bs
int g = (get_global_id(0)/rcout)%groups;
int c = get_global_id(0) % rcout;
@ -154,24 +144,22 @@ def processing_op(op,x,w,C):
float acc = 0.0;
for (int ci = 0; ci < cin; ci++) {
#ifdef ONEBYONE
acc += input[B*groups*cin*iy*ix + g*cin*iy*ix + ci*iy*ix + IY*ix + IX] * \
weight[g*rcout*cin + c*cin + ci];
#else
for (int y = 0; y < H; y++) { for (int x = 0; x < W; x++) {
int idx_y = y*dy + IY - py;
int idx_x = x*dx + IX - px;
#ifdef ALLVALID
acc += input[B*groups*cin*iy*ix + g*cin*iy*ix + ci*iy*ix + idx_y*ix + idx_x] * \
weight[g*rcout*cin*H*W + c*cin*H*W + ci*H*W + y*W + x];
#else
int valid = (idx_y >= 0 && idx_y < iy && idx_x >= 0 && idx_x < ix);
acc += valid ? input[B*groups*cin*iy*ix + g*cin*iy*ix + ci*iy*ix + idx_y*ix + idx_x] * \
weight[g*rcout*cin*H*W + c*cin*H*W + ci*H*W + y*W + x] : 0.0;
} }
acc += valid * input[B*groups*cin*iy*ix + g*cin*iy*ix + ci*iy*ix + clamp(idx_y, 0, iy-1)*ix + clamp(idx_x, 0, ix-1)] * \
weight[g*rcout*cin*H*W + c*cin*H*W + ci*H*W + y*W + x];
#endif
} }
}
output[gid] = acc;
}""",
options=tuple(["-DONEBYONE"]) if C.H == 1 and C.W == 1 and C.px == 0 and C.py == 0 else tuple(),
argdtypes=tuple([None, None, None] + [np.int32]*16))
conv_prg([C.bs*C.cout, C.oy, C.ox], None, x.cl, w.cl, ret.cl,
*[x for x in list(C[0:12])+[C.dx, C.dy, C.px, C.py]])
options=tuple(["-DALLVALID"]) if C.px == 0 and C.py == 0 else tuple(),
argdtypes=tuple([None, None, None] + [np.int32]*len(params)))
conv_prg([C.bs*C.cout, C.oy, C.ox], None, contiguous(x).cl, contiguous(w).cl, ret.cl, *[x[1] for x in params])
return ret
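Two things change in the conv kernel. The padding path is now branch-free: indices are clamped into the image and the loaded value is multiplied by a 0/1 valid flag instead of going through a ternary select, and -DALLVALID drops even that check whenever px == py == 0. On the launch side, the per-call constants (H, W, cin, strides, dilation, padding) are baked straight into the source via ints, only the params list is passed as kernel arguments, and x and w are forced contiguous before the call. A NumPy sketch of the clamp-and-mask load (illustrative, not the OpenCL):

import numpy as np

def padded_read(img, idx_y, idx_x):
    # Always load something in bounds, then zero it out if the original index was in the padding.
    iy, ix = img.shape
    valid = (0 <= idx_y < iy) and (0 <= idx_x < ix)
    return valid * img[min(max(idx_y, 0), iy - 1), min(max(idx_x, 0), ix - 1)]

img = np.arange(9.0).reshape(3, 3)
assert padded_read(img, 1, 1) == img[1, 1]
assert padded_read(img, -1, 2) == 0.0  # out-of-bounds row contributes nothing to the accumulator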

View File

@ -1,6 +1,6 @@
# TODO: move Device to here and proxy buffer call
from enum import Enum
UnaryOps = Enum("UnaryOps", ["RELU", "EXP", "LOG", "NEG", "SIGN"])
UnaryOps = Enum("UnaryOps", ["NOOP", "RELU", "EXP", "LOG", "NEG", "SIGN"])
BinaryOps = Enum("BinaryOps", ["ADD", "SUB", "MUL", "DIV", "POW", "CMPEQ"])
ReduceOps = Enum("ReduceOps", ["SUM", "MAX"])
MovementOps = Enum("MovementOps", ["RESHAPE", "PERMUTE", "SLICE", "EXPAND", "FLIP"])
@ -46,9 +46,11 @@ def log_op(optype, op, ret, inp, dashed=False):
G.add_edge(nm(x), nm(ret), label=sop, color='#808080' if dashed else '', style='dashed' if dashed else '')
if 'label' not in G.nodes[nm(x)]: G.nodes[nm(x)]['label'] = str(x.shape)
if nm(ret) not in G.nodes: G.add_node(nm(ret))
st = getattr(ret, "st", None)
non_contiguous = st is not None and not st.contiguous
G.nodes[nm(ret)]['label'] = str(ret.shape)
G.nodes[nm(ret)]['fillcolor'] = top_colors[optype]
G.nodes[nm(ret)]['style'] = 'filled'
G.nodes[nm(ret)]['fillcolor'] = top_colors[optype] + ('80' if non_contiguous else '')
G.nodes[nm(ret)]['style'] = 'filled, dashed' if non_contiguous else 'filled'
class Ops:
def unary_op(ctx, op:UnaryOps, x):
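The last hunk is the "draw non contig" item: in the graph debug output, a result whose ShapeTracker is not contiguous gets a dashed outline and a translucent fill, done by appending "80" to the hex color to form an 8-digit #RRGGBBAA value (about 50% alpha, which graphviz accepts). For instance, with an assumed placeholder color rather than a real top_colors entry:

non_contiguous = True
fillcolor = "#80ff80" + ("80" if non_contiguous else "")   # "#80ff8080": same hue, half transparent
style = "filled, dashed" if non_contiguous else "filled"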