FPGA Based Accelerator for Tinygrad (#258)

* ops_risk

* risk sim

* guessing is for winners

* minor

* better

* matmul with risk

* conv doesn't work

* closer

* conv2d works

* ops_risk

* opt2 works

* opt1 may not be possible

* opt1 is a mulacc

* arty

* attosoc example building on mac

* minor

* riscv assembler

* gucci gang

* we got C code

* not a scam

* hello

* make risk mergeable into master

* unop support
George Hotz 2021-06-07 17:45:09 -07:00 committed by GitHub
parent 77ba198b57
commit 2075fdeb4f
19 changed files with 4074 additions and 1 deletion


@@ -9,6 +9,13 @@ Why aren't the other accelerators 3D like this?
--
Tesla chip
96x96 array
32 MiB SRAM
--
SNPE is using 4x4x4 -> 4x4 (64 FMAs) in the convs.
Then it's accumulating in that matrix.
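A rough numpy sketch of that pattern (illustrative only, not taken from SNPE): a persistent 4x4 output tile accumulating 4x4 @ 4x4 products, 64 FMAs per step.

import numpy as np
SZ = 4
acc = np.zeros((SZ, SZ), dtype=np.float32)           # persistent output tile
for tap in range(3 * 3):                             # e.g. the 9 taps of a 3x3 conv
    a = np.random.rand(SZ, SZ).astype(np.float32)    # stand-in activation tile
    b = np.random.rand(SZ, SZ).astype(np.float32)    # stand-in weight tile
    acc += a @ b                                     # one 4x4x4 step = 64 FMAs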

206
extra/ops_risk.py Normal file

@@ -0,0 +1,206 @@
import numpy as np
from tinygrad.tensor import Function
from extra.risk import *
# ************* unary ops *************
class ReLU(Function):
def forward(ctx, input):
ctx.save_for_backward(input)
return risk_unop(input, UnaryOps.RELU)
def backward(ctx, grad_output):
input, = ctx.saved_tensors
return risk_binop(grad_output, risk_unop(input, UnaryOps.GT0), BinaryOps.MUL)
class Log(Function):
def forward(ctx, input):
ctx.save_for_backward(input)
return risk_unop(input, UnaryOps.LOG)
def backward(ctx, grad_output):
input, = ctx.saved_tensors
return risk_binop(grad_output, input, BinaryOps.DIV)
class Exp(Function):
def forward(ctx, input):
ret = risk_unop(input, UnaryOps.EXP)
ctx.save_for_backward(ret)
return ret
def backward(ctx, grad_output):
ret, = ctx.saved_tensors
return risk_binop(grad_output, ret, BinaryOps.MUL)
# ************* processing ops *************
class Matmul(Function):
def forward(ctx, input, weight):
ctx.save_for_backward(input, weight)
return risk_matmul(input, weight)
def backward(ctx, grad_output):
input, weight = ctx.saved_tensors
grad_input = risk_matmul(grad_output, weight, transpose_w=True)
grad_weight = risk_matmul(input, grad_output, transpose_x=True)
return grad_input, grad_weight
class Conv2D(Function):
def forward(ctx, x, w, stride=1, groups=1):
if type(ctx.stride) == int:
ctx.stride = (ctx.stride, ctx.stride)
cout,cin,H,W = w.shape
ys,xs = ctx.stride
bs,cin_ = x.shape[0], x.shape[1]
iy,ix = x.shape[2],x.shape[3]
oy,ox = (x.shape[2]-(H-ys))//ys, (x.shape[3]-(W-xs))//xs
assert cin*ctx.groups == cin_
assert cout % ctx.groups == 0
rcout = cout//ctx.groups
# if H == 1 and W == 1 and ctx.groups == 1 and ctx.stride == (1,1):
gx = x.reshape(bs,ctx.groups,cin,x.shape[2],x.shape[3])
tx = np.lib.stride_tricks.as_strided(gx,
shape=(bs, ctx.groups, cin, oy, ox, H, W),
strides=(*gx.strides[0:3], gx.strides[3]*ys, gx.strides[4]*xs, *gx.strides[3:5]),
writeable=False,
)
tw = w.reshape(ctx.groups, rcout, cin, H, W)
ctx.save_for_backward(tx, tw, x.shape)
print((*gx.strides[0:3], gx.strides[3]*ys, gx.strides[4]*xs, *gx.strides[3:5]))
"""
ret = np.zeros((bs,ctx.groups,oy,ox,rcout),dtype=x.dtype)
for g in range(ctx.groups):
#ijYXyx,kjyx -> iYXk ->ikYX
ret[:,g] += np.tensordot(tx[:,g], tw[g], ((1,4,5),(1,2,3)))
print(bs, ctx.groups, cin)
return np.moveaxis(ret,4,2).reshape(bs, cout, oy, ox)
"""
riski_dmar(SLOT(0), x) # bs, groups, cin, x.shape[2], x.shape[3]
riski_dmar(SLOT(1), w) # groups, rcout, cin, H, W
risk_reset_counts()
print(bs, ctx.groups, rcout, oy, ox, cin, H, W)
for B in range(0, bs):
if cin == 1 and rcout == 1 and ctx.groups > 1:
# hmm, this doesn't work, it's not a matmul
# you always have to loop over the groups, since they aren't joint
# the idea would be to collapse the HxW into the matmul, but you'd be limited to 9 for 3x3
# and while the load is easy in the weight matrix, it's hard in the image matrix (3 strides)
# and only the diagonal of the matrix would be useful! groups aren't channels!
# [(1, 144, 58, 58), (144, 1, 3, 3)] -> (1, 144, 56, 56)
# what does a grouped 1x1 conv look like?
# bs x groups x yx -- groups x 1 --> bs x groups x yx
# it looks like a broadcasted multiply
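# rough numpy picture of what this branch computes (cin == rcout == 1):
#   out[b, g, Y, X] = sum over (y, x) of x[b, g, Y*ys + y, X*xs + x] * w[g, 0, y, x]
# i.e. an elementwise multiply-accumulate broadcast across the group axis,
# which is why the loop below uses riski_mulacc() instead of riski_matmul()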
print("opt1")
# x: bs x groups x iy x ix
# w: groups x H x W
# out: bs x groups x oy x ox
# ix x groups x groups
for g in range(0, groups, SZ):
for Y in range(0, oy):
for X in range(0, ox, SZ):
IY,IX = Y*ys,X*xs
riski_mov(Reg.MATMUL_OUTPUT, Reg.ZERO)
for y in range(IY, IY+H):
for x in range(IX, IX+W):
riski_load(Reg.MATMUL_INPUT,
SLOT(0) + B*groups*iy*ix + g*iy*ix + y*ix + x,
xs, iy*ix, min(SZ, ox-X), min(SZ, groups-g))
# 0 here is for broadcasting
riski_load(Reg.MATMUL_WEIGHTS,
SLOT(1) + g*H*W + (y-IY)*W + (x-IX),
0, H*W, SZ, min(SZ, groups-g))
riski_mulacc()
#risk_regdump()
riski_store(Reg.MATMUL_OUTPUT,
SLOT(2) + B*groups*oy*ox + g*oy*ox + Y*ox + X,
1, oy*ox, min(SZ, ox-X), min(SZ, groups-g))
elif H == 1 and W == 1 and xs == 1 and ys == 1:
print("opt2")
# oxy x cin x rcout -- unstrided 1x1
# this is a simple matmul
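# sketch of the mapping used here: per (batch, group) the unstrided 1x1 conv is
#   out[yx, rcout] = x[yx, cin] @ w[rcout, cin].T   with yx = oy*ox flattened,
# so the loops below just walk that matmul in SZ x SZ tiles
# (the weight tile is loaded pre-transposed via the stride arguments of riski_load)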
for g in range(0, groups):
for c in range(0, rcout, SZ):
yx = oy*ox
assert yx == iy*ix
for YX in range(0, oy*ox, SZ): # these are next to each other
# inner conv
riski_mov(Reg.MATMUL_OUTPUT, Reg.ZERO)
for ci in range(0, cin, SZ):
riski_load(Reg.MATMUL_INPUT,
SLOT(0) + B*groups*cin*yx + g*cin*yx + ci*yx + YX,
1, yx, min(SZ, yx-YX), min(SZ, cin-ci))
riski_load(Reg.MATMUL_WEIGHTS,
SLOT(1) + g*rcout*cin + c*cin + ci,
1, cin, min(SZ, cin-ci), min(SZ, rcout-c))
riski_matmul()
riski_store(Reg.MATMUL_OUTPUT,
SLOT(2) + B*groups*rcout*yx + g*rcout*yx + c*yx + YX,
1, yx, min(SZ, yx-YX), min(SZ, rcout-c))
else:
print("unoptimized")
# ox x cin x rcout -- unoptimized
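# general case: for every kernel tap (y, x), load a strided (ox x cin) image tile
# and the matching (cin x rcout) weight tile, accumulate their product into
# MATMUL_OUTPUT over all cin tiles and taps, then store one (ox x rcout) output tile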
for g in range(0, groups):
for c in range(0, rcout, SZ):
for Y in range(0, oy):
for X in range(0, ox, SZ):
IY,IX = Y*ys,X*xs
# inner conv
riski_mov(Reg.MATMUL_OUTPUT, Reg.ZERO)
for ci in range(0, cin, SZ):
# not a loop in 1x1 convs, 9 in 3x3, 25 in 5x5
for y in range(IY, IY+H):
for x in range(IX, IX+W):
riski_load(Reg.MATMUL_INPUT,
SLOT(0) + B*groups*cin*iy*ix + g*cin*iy*ix + ci*iy*ix + y*ix + x,
xs, iy*ix, min(SZ, ox-X), min(SZ, cin-ci))
riski_load(Reg.MATMUL_WEIGHTS,
SLOT(1) + g*rcout*cin*H*W + c*cin*H*W + ci*H*W + (y-IY)*W + (x-IX),
H*W, cin*H*W, min(SZ, cin-ci), min(SZ, rcout-c))
riski_matmul()
riski_store(Reg.MATMUL_OUTPUT,
SLOT(2) + B*groups*rcout*oy*ox + g*rcout*oy*ox + c*oy*ox + Y*ox + X,
1, oy*ox, min(SZ, ox-X), min(SZ, rcout-c))
risk_print_counts()
#print(x.shape, w.shape, "->", ret.shape)
return riski_dmaw(SLOT(2), (bs, cout, oy, ox))
def backward(ctx, grad_output):
bs,_,oy,ox = grad_output.shape
tx, tw, x_shape = ctx.saved_tensors
_,rcout,cin,H,W = tw.shape
ys,xs = ctx.stride
OY,OX = x_shape[2:4]
ggg = grad_output.reshape(bs,ctx.groups,rcout,oy,ox)
gdw = np.zeros((ctx.groups,rcout,cin,H,W), dtype=tx.dtype)
for g in range(ctx.groups):
#'ikYX,ijYXyx -> kjyx'
gdw[g] += np.tensordot(ggg[:,g], tx[:,g], ((0,2,3),(0,2,3)))
# needs to be optimized
gdx = np.zeros((bs,ctx.groups,cin,OY,OX), dtype=tx.dtype)
for k in range(oy*ox):
Y, X = k//ox, k%ox
iY,iX = Y*ys, X*xs
#gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw)
for g in range(ctx.groups):
tg = np.dot(ggg[:,g,:,Y,X].reshape(bs, -1), tw[g].reshape(rcout, -1))
gdx[:, g, :, iY:iY+H, iX:iX+W] += tg.reshape((bs, cin, H, W))
return gdx.reshape((bs, ctx.groups*cin, OY, OX)), gdw.reshape((ctx.groups*rcout, cin, H, W))
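A quick way to exercise these ops end to end (a sketch, not part of this commit): the tensor.py hunk at the bottom of this diff registers ops_risk when RISK is set in the environment, so something like the following should route matmul/relu through the simulator when run from the tinygrad repo root.

import os
os.environ["RISK"] = "1"           # must be set before tinygrad.tensor is imported
import numpy as np
from tinygrad.tensor import Tensor

x = Tensor(np.random.uniform(size=(4, 32)).astype(np.float32))
w = Tensor(np.random.uniform(size=(32, 8)).astype(np.float32))
out = x.matmul(w).relu()           # dispatches to risk_matmul / risk_unop above
print(out.data.shape)              # expected: (4, 8)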

283
extra/risk.py Executable file

@@ -0,0 +1,283 @@
#!/usr/bin/env python3
# RISK architecture is going to change everything
# implement on S7t-VG6 (lol, too much $$$)
# Arty A7-100T
# 256 MB of DDR3 with 2.6 GB/s of RAM bandwidth (vs 512 GB/s on S7t-VG6)
# 255K 19-bit elements
import functools
import numpy as np
from collections import defaultdict
# 32x32 * 32x32 -> 32x32 matmul = 65536 FLOPS @ 1 GHz = 64 TOPS
# mulacc is 2048 FLOPS, 32x less
# 32x32 (aka 1024 element) ALU
# 1024 wide permute
# 1024 wide load/store (1 cycle to SRAM)
# all in elements, aka TF32 (19 bits)
# targets:
# matmul input
# matmul weights
# ALU
# permute
# 1024x1024x4x19 bits = 10MB
# fully strided
# load1024 <target>, <address>, <stride x (32)>, <stride y (32)>
# 4 slots
# <input> <weight> <output> <empty>
# <empty> <output> <input> <weight>
# <weight> <input> <empty> <output>
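# worked example of the strided load (this is what riski_load below implements):
#   element (y, x) of a tile comes from sram[address + y*stride_y + x*stride_x],
#   so a row-major M x K tile at addr loads with stride_y=K, stride_x=1,
#   and the transposed view of the same tile is simply stride_y=1, stride_x=K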
SZ = 32
SLOTSIZE = 1024*1024*2 # 5MB, for 20MB total
sram = np.zeros((SLOTSIZE*4), dtype=np.float32)
regfile = {}
SLOT = lambda x: x*SLOTSIZE
from enum import Enum
class Reg(Enum):
ZERO = 0
# can the ALU use the same registers?
MATMUL_INPUT = 1
MATMUL_WEIGHTS = 2
MATMUL_OUTPUT = 3
# this should be a generic function
class UnaryOps(Enum):
RELU = 0
EXP = 1
LOG = 2
GT0 = 3
class BinaryOps(Enum):
ADD = 0
SUB = 1
MUL = 2
DIV = 3
MULACC = 4
for t in Reg:
regfile[t] = np.zeros((SZ, SZ), dtype=np.float32)
# *** profiler ***
cnts = defaultdict(int)
tcnts = defaultdict(int)
utils = defaultdict(int)
maxdma = 0
def count(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
cnts[func.__name__] += 1
tcnts[func.__name__] += 1
return func(*args, **kwargs)
return wrapper
import atexit
@atexit.register
def risk_print_counts():
print(cnts)
print(tcnts)
print(utils)
util_n = sum([k[0]*k[1]*v for k,v in utils.items()])
util_d = sum([SZ*SZ*v for k,v in utils.items()])
print("%.2f GOPS %d maxdma" % ((tcnts['riski_matmul']*SZ*SZ*SZ*2 + tcnts['riski_mulacc']*SZ*SZ*2)*1e-9, maxdma))
print("ran in %.2f us with util %.2f%% total %.2f us" % (sum(cnts.values())*1e-3, util_n*100/(util_d+1), sum(tcnts.values())*1e-3))
def risk_reset_counts():
global cnts, utils
cnts = defaultdict(int)
utils = defaultdict(int)
def risk_regdump():
print("\n***** regdump *****")
print(regfile[Reg.MATMUL_INPUT])
print(regfile[Reg.MATMUL_WEIGHTS])
print(regfile[Reg.MATMUL_OUTPUT])
# *** instructions ***
@count
def riski_unop(op):
if op == UnaryOps.RELU:
regfile[Reg.MATMUL_OUTPUT] = np.maximum(regfile[Reg.MATMUL_INPUT], 0)
elif op == UnaryOps.LOG:
regfile[Reg.MATMUL_OUTPUT] = np.log(regfile[Reg.MATMUL_INPUT])
elif op == UnaryOps.EXP:
regfile[Reg.MATMUL_OUTPUT] = np.exp(regfile[Reg.MATMUL_INPUT])
elif op == UnaryOps.GT0:
regfile[Reg.MATMUL_OUTPUT] = (regfile[Reg.MATMUL_INPUT] >= 0)
@count
def riski_add():
regfile[Reg.MATMUL_OUTPUT] = regfile[Reg.MATMUL_INPUT] + regfile[Reg.MATMUL_WEIGHTS]
@count
def riski_sub():
regfile[Reg.MATMUL_OUTPUT] = regfile[Reg.MATMUL_INPUT] - regfile[Reg.MATMUL_WEIGHTS]
@count
def riski_mul():
regfile[Reg.MATMUL_OUTPUT] = regfile[Reg.MATMUL_INPUT] * regfile[Reg.MATMUL_WEIGHTS]
@count
def riski_div():
regfile[Reg.MATMUL_OUTPUT] = regfile[Reg.MATMUL_INPUT] / regfile[Reg.MATMUL_WEIGHTS]
@count
def riski_mulacc():
regfile[Reg.MATMUL_OUTPUT] += regfile[Reg.MATMUL_INPUT] * regfile[Reg.MATMUL_WEIGHTS]
binops = {BinaryOps.ADD: riski_add,
BinaryOps.SUB: riski_sub,
BinaryOps.MUL: riski_mul,
BinaryOps.DIV: riski_div,
BinaryOps.MULACC: riski_mulacc}
@count
def riski_matmul():
#print("LLL:\n",regfile[Reg.MATMUL_INPUT],"\n",regfile[Reg.MATMUL_WEIGHTS])
regfile[Reg.MATMUL_OUTPUT] += \
regfile[Reg.MATMUL_INPUT] @ \
regfile[Reg.MATMUL_WEIGHTS]
@count
def riski_mov(tout, tin):
regfile[tout][:] = regfile[tin]
@count
def riski_load(target, address, stride_y=SZ, stride_x=1, len_y=SZ, len_x=SZ):
global util_n, util_d
utils[(len_y, len_x)] += 1
d = regfile[target]
d[:] = 0
for y in range(0, len_y):
for x in range(0, len_x):
d[y, x] = sram[address + y*stride_y + x*stride_x]
@count
def riski_store(target, address, stride_y=SZ, stride_x=1, len_y=SZ, len_x=SZ):
d = regfile[target]
for y in range(0, len_y):
for x in range(0, len_x):
sram[address + y*stride_y + x*stride_x] = d[y, x]
@count
def riski_dmar(address, arr):
global maxdma
arr = arr.reshape(-1)
assert(arr.shape[0] <= SLOTSIZE)
maxdma = max(maxdma, arr.shape[0])
print("DMAR %d elements" % arr.shape[0])
sram[address:address+arr.shape[0]] = arr
@count
def riski_dmaw(address, shp):
print("DMAW %d elements" % np.prod(shp))
return np.copy(sram[address:address+np.prod(shp)].reshape(shp))
# *** RISK-5 code ***
def risk_unop(x, op):
riski_dmar(SLOT(0), x)
cnt = np.prod(x.shape)
for i in range(0, np.prod(x.shape), SZ*SZ):
riski_load(Reg.MATMUL_INPUT, SLOT(0)+i)
riski_unop(op)
riski_store(Reg.MATMUL_OUTPUT, SLOT(2)+i)
return riski_dmaw(SLOT(2), x.shape)
def risk_binop(x, w, op):
riski_dmar(SLOT(0), x)
riski_dmar(SLOT(1), w)
for i in range(0, np.prod(x.shape), SZ*SZ):
riski_load(Reg.MATMUL_INPUT, SLOT(0)+i)
riski_load(Reg.MATMUL_WEIGHTS, SLOT(1)+i)
binops[op]()
riski_store(Reg.MATMUL_OUTPUT, SLOT(2)+i)
return riski_dmaw(SLOT(2), x.shape)
def risk_matmul(x, w, transpose_x=False, transpose_w=False):
# copy matrices into SRAM
# x is M x K
# w is K x N
# out is M x N
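# tiling scheme: M, N and K are walked in SZ-sized blocks; each riski_matmul call
# accumulates one SZ x SZ block of the output, and the transpose flags are handled
# entirely by swapping the stride arguments passed to riski_load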
riski_dmar(SLOT(0), x)
riski_dmar(SLOT(1), w)
if transpose_x:
K,M = x.shape[-2], x.shape[-1]
else:
M,K = x.shape[-2], x.shape[-1]
if transpose_w:
N = w.shape[-2]
assert w.shape[-1] == K
else:
N = w.shape[-1]
assert w.shape[-2] == K
cnt = np.prod(x.shape[0:-2]) if len(x.shape) > 2 else 1
# do matmul
for c in range(cnt):
for m in range(0, M, SZ):
for n in range(0, N, SZ):
riski_mov(Reg.MATMUL_OUTPUT, Reg.ZERO)
for k in range(0, K, SZ):
if transpose_x:
riski_load(Reg.MATMUL_INPUT, SLOT(0)+c*M*K + k*M+m, 1, M, min(SZ, M-m), min(SZ, K-k))
else:
riski_load(Reg.MATMUL_INPUT, SLOT(0)+c*M*K + m*K+k, K, 1, min(SZ, M-m), min(SZ, K-k))
if transpose_w:
riski_load(Reg.MATMUL_WEIGHTS, SLOT(1)+c*K*N + n*K+k, 1, K, min(SZ, K-k), min(SZ, N-n))
else:
riski_load(Reg.MATMUL_WEIGHTS, SLOT(1)+c*K*N + k*N+n, N, 1, min(SZ, K-k), min(SZ, N-n))
riski_matmul()
riski_store(Reg.MATMUL_OUTPUT, SLOT(2)+c*M*N + m*N+n, N, 1, min(SZ, M-m), min(SZ, N-n))
# copy back from SRAM
return riski_dmaw(SLOT(2), (*x.shape[0:-2],M,N))
import unittest
class TestRisk(unittest.TestCase):
def test_matmul_even(self):
x = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
w = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
np.testing.assert_allclose(x @ w, risk_matmul(x, w), rtol=1e-5)
def test_matmul_small(self):
x = np.array([[1,2,3],[4,5,6],[7,8,9]])
w = np.array([[-1,-2,-3],[-4,-5,-6],[-7,-8,-9]])
np.testing.assert_allclose(x @ w, risk_matmul(x, w), rtol=1e-5)
def test_matmul_uneven(self):
x = np.random.uniform(size=(47, 79)).astype(np.float32)
w = np.random.uniform(size=(79, 42)).astype(np.float32)
np.testing.assert_allclose(x @ w, risk_matmul(x, w), rtol=1e-5)
def test_matmul_transpose(self):
x = np.random.uniform(size=(33, 33)).astype(np.float32)
w = np.random.uniform(size=(33, 33)).astype(np.float32)
np.testing.assert_allclose(x @ w, risk_matmul(x, w), rtol=1e-5)
np.testing.assert_allclose(x.T @ w, risk_matmul(x, w, True), rtol=1e-5)
np.testing.assert_allclose(x @ w.T, risk_matmul(x, w, False, True), rtol=1e-5)
np.testing.assert_allclose(x.T @ w.T, risk_matmul(x, w, True, True), rtol=1e-5)
def test_matmul_transpose_uneven_w(self):
x = np.random.uniform(size=(47, 79)).astype(np.float32)
w = np.random.uniform(size=(42, 79)).astype(np.float32)
np.testing.assert_allclose(x @ w.T, risk_matmul(x, w, transpose_w=True), rtol=1e-5)
def test_matmul_transpose_uneven_x(self):
x = np.random.uniform(size=(79, 47)).astype(np.float32)
w = np.random.uniform(size=(79, 42)).astype(np.float32)
np.testing.assert_allclose(x.T @ w, risk_matmul(x, w, transpose_x=True), rtol=1e-5)
if __name__ == "__main__":
np.random.seed(1337)
unittest.main(verbosity=2)

1
fpga/.gitignore vendored Normal file

@@ -0,0 +1 @@
out/

5
fpga/all.sh Executable file

@@ -0,0 +1,5 @@
#!/bin/bash -e
./riscv.sh
./build.sh
./prog.sh

24
fpga/build.sh Executable file

@@ -0,0 +1,24 @@
#!/usr/bin/env bash
set -ex
mkdir -p out
cd out
BASE=/Users/taylor/fun/fpga
# yosys commit 82f5829aba108be4a3786e7a237fd7bcebe61eb6
# build normally
$BASE/yosys/yosys -p "synth_xilinx -flatten -nowidelut -abc9 -arch xc7 -top top; write_json attosoc.json" ../src/attosoc.v ../src/attosoc_top.v ../src/simpleuart.v
# nextpnr-xilinx 0be5cc19f3261101730ce9274720aaf3784f83e2
# cmake -DARCH=xilinx -DBUILD_GUI=no -DBUILD_PYTHON=no -DUSE_OPENMP=No .
# python3 xilinx/python/bbaexport.py --device xc7a100tcsg324-1 --bba xilinx/xc7a100t.bba
# ./bbasm -l xilinx/xc7a100t.bba xilinx/xc7a100t.bin
$BASE/nextpnr-xilinx/nextpnr-xilinx --chipdb $BASE/nextpnr-xilinx/xilinx/xc7a100t.bin --xdc ../src/arty.xdc --json attosoc.json --write attosoc_routed.json --fasm attosoc.fasm
XRAY_UTILS_DIR=$BASE/prjxray/utils
XRAY_TOOLS_DIR=$BASE/prjxray/build/tools
XRAY_DATABASE_DIR=$BASE/prjxray/database
"${XRAY_UTILS_DIR}/fasm2frames.py" --db-root "${XRAY_DATABASE_DIR}/artix7" --part xc7a100tcsg324-1 attosoc.fasm > attosoc.frames
"${XRAY_TOOLS_DIR}/xc7frames2bit" --part_file "${XRAY_DATABASE_DIR}/artix7/xc7a100tcsg324-1/part.yaml" --part_name xc7a100tcsg324-1 --frm_file attosoc.frames --output_file attosoc.bit

15
fpga/console.py Executable file

@@ -0,0 +1,15 @@
#!/usr/bin/env python3
import time
import pyftdi.serialext
port = pyftdi.serialext.serial_for_url('ftdi://ftdi:2232h/2', baudrate=115200)
print(port)
while 1:
#port.write(b'a')
data = port.read(1)
print(data)
time.sleep(0.01)

27
fpga/digilent_arty.cfg Normal file

@@ -0,0 +1,27 @@
#
# Digilent Arty with Xilinx Artix-7 FPGA
#
# http://store.digilentinc.com/arty-artix-7-fpga-development-board-for-makers-and-hobbyists/
#
# iManufacturer 1 Digilent
# iProduct 2 Digilent USB Device
# iSerial 3 210319A28C7F
interface ftdi
ftdi_device_desc "Digilent USB Device"
ftdi_vid_pid 0x0403 0x6010
# channel 1 does not have any functionality
ftdi_channel 0
# just TCK TDI TDO TMS, no reset
ftdi_layout_init 0x0088 0x008b
reset_config none
adapter_khz 10000
source [find cpld/xilinx-xc7.cfg]
source [find cpld/jtagspi.cfg]
init
xc7_program xc7.tap
pld load 0 out/attosoc.bit
exit

3
fpga/prog.sh Executable file

@@ -0,0 +1,3 @@
#!/bin/bash
openocd -d -f digilent_arty.cfg

10
fpga/riscv.sh Executable file

@@ -0,0 +1,10 @@
#!/bin/bash -e
cd out
riscv64-unknown-elf-gcc -Os -march=rv32i -mabi=ilp32 -nostdlib ../src/main.c
#riscv64-unknown-elf-as ../src/riscv.asm
riscv64-unknown-elf-objdump -d a.out
riscv64-unknown-elf-objcopy -O binary a.out a.asm
xxd a.asm
python -c 'import struct; dat = open("a.asm", "rb").read(); print("\n".join(["%08x" % c for c in struct.unpack("I"*(len(dat)//4), dat)]))' > ../src/firmware.hex

1
fpga/src/.gitignore vendored Normal file

@@ -0,0 +1 @@
firmware.bin

58
fpga/src/arty.xdc Normal file

@@ -0,0 +1,58 @@
# R
set_property LOC G6 [get_ports led[0]]
set_property LOC G3 [get_ports led[1]]
set_property LOC J3 [get_ports led[2]]
set_property LOC K1 [get_ports led[3]]
# G
set_property LOC F6 [get_ports led[4]]
set_property LOC J4 [get_ports led[5]]
set_property LOC J2 [get_ports led[6]]
set_property LOC H6 [get_ports led[7]]
# B
set_property LOC E1 [get_ports led[8]]
set_property LOC G4 [get_ports led[9]]
set_property LOC H4 [get_ports led[10]]
set_property LOC K2 [get_ports led[11]]
# second row
# set_property LOC H5 [get_ports led[12]]
# set_property LOC J5 [get_ports led[13]]
# set_property LOC T9 [get_ports led[14]]
# set_property LOC T10 [get_ports led[15]]
set_property IOSTANDARD LVCMOS33 [get_ports led[0]]
set_property IOSTANDARD LVCMOS33 [get_ports led[1]]
set_property IOSTANDARD LVCMOS33 [get_ports led[2]]
set_property IOSTANDARD LVCMOS33 [get_ports led[3]]
set_property IOSTANDARD LVCMOS33 [get_ports led[4]]
set_property IOSTANDARD LVCMOS33 [get_ports led[5]]
set_property IOSTANDARD LVCMOS33 [get_ports led[6]]
set_property IOSTANDARD LVCMOS33 [get_ports led[7]]
set_property IOSTANDARD LVCMOS33 [get_ports led[8]]
set_property IOSTANDARD LVCMOS33 [get_ports led[9]]
set_property IOSTANDARD LVCMOS33 [get_ports led[10]]
set_property IOSTANDARD LVCMOS33 [get_ports led[11]]
# led[12..15] are not present on the 12-bit led port of top; keep these commented
# out like their LOC lines above
# set_property IOSTANDARD LVCMOS33 [get_ports led[12]]
# set_property IOSTANDARD LVCMOS33 [get_ports led[13]]
# set_property IOSTANDARD LVCMOS33 [get_ports led[14]]
# set_property IOSTANDARD LVCMOS33 [get_ports led[15]]
set_property LOC A8 [get_ports sw[0]]
set_property LOC C11 [get_ports sw[1]]
set_property LOC C10 [get_ports sw[2]]
set_property LOC A10 [get_ports sw[3]]
set_property IOSTANDARD LVCMOS33 [get_ports sw[0]]
set_property IOSTANDARD LVCMOS33 [get_ports sw[1]]
set_property IOSTANDARD LVCMOS33 [get_ports sw[2]]
set_property IOSTANDARD LVCMOS33 [get_ports sw[3]]
set_property LOC E3 [get_ports clk_i]
set_property IOSTANDARD LVCMOS33 [get_ports clk_i]
set_property LOC A9 [get_ports ser_rx]
set_property IOSTANDARD LVCMOS33 [get_ports ser_rx]
set_property LOC D10 [get_ports ser_tx]
set_property IOSTANDARD LVCMOS33 [get_ports ser_tx]

3136
fpga/src/attosoc.v Normal file

File diff suppressed because it is too large.

48
fpga/src/attosoc_top.v Normal file

@@ -0,0 +1,48 @@
module top (
input clk_i,
input [3:0] sw,
output [11:0] led,
output ser_tx,
input ser_rx
);
//assign led = {&sw, |sw, ^sw, ~^sw};
reg clk50 = 1'b0;
always @(posedge clk_i)
clk50 <= ~clk50;
wire clk;
BUFGCTRL bufg_i (
.I0(clk50),
.CE0(1'b1),
.S0(1'b1),
.O(clk)
);
// wire clk = clk_i;
//reg clkdiv;
//reg [22:0] ctr;
//always @(posedge clk) {clkdiv, ctr} <= ctr + 1'b1;
wire [7:0] soc_led;
attosoc soc_i(
.clk(clk),
.reset(sw[0]),
.led(soc_led),
.ser_tx(ser_tx),
.ser_rx(ser_rx)
);
// this maps 2 bits to each LED
generate
genvar i;
for (i = 0; i < 4; i++) begin
assign led[0 + i] = soc_led[2 * i]; // R
assign led[4 + i] = soc_led[(2 * i) + 1]; // G
assign led[8 + i] = &soc_led[2 * i +: 2]; // B
end
endgenerate
endmodule

46
fpga/src/firmware.hex Normal file

@@ -0,0 +1,46 @@
fd010113
01312e23
01412c23
01512a23
01612823
02112623
02812423
02912223
03212023
01712623
020007b7
1b200713
00e7a223
00400a13
06800a93
06500b13
06c00993
00800913
00100493
02000437
06f00b93
00942023
fff90913
03c000ef
00149493
fe0900e3
ff4496e3
03c000ef
01542423
034000ef
01642423
02c000ef
01342423
024000ef
01342423
01c000ef
01742423
fc1ff06f
001002b7
fff28293
fe029ee3
00008067
000102b7
fff28293
fe029ee3
00008067

46
fpga/src/main.c Normal file

@@ -0,0 +1,46 @@
#include <stdint.h>
#define reg_leds (*(volatile uint32_t*)0x02000000)
#define reg_uart_clkdiv (*(volatile uint32_t*)0x02000004)
#define reg_uart_data (*(volatile uint32_t*)0x02000008)
void delay();
void sdelay();
int main() {
// 50 mhz clock
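// divider = clock / baud = 50,000,000 / 115,200 ~= 434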
reg_uart_clkdiv = 434;
while (1) {
for (int i = 1; i < 0x100; i <<= 1) {
if (i == 4) {
sdelay();
reg_uart_data = 'h';
sdelay();
reg_uart_data = 'e';
sdelay();
reg_uart_data = 'l';
sdelay();
reg_uart_data = 'l';
sdelay();
reg_uart_data = 'o';
}
reg_leds = i;
delay();
}
}
}
void __attribute__ ((noinline)) delay() {
asm ("lui t0, 0x100\n"
"lop:"
"addi t0,t0,-0x1\n"
"bne t0,zero,lop\n"
::);
}
void __attribute__ ((noinline)) sdelay() {
asm ("lui t0, 0x10\n"
"lop2:"
"addi t0,t0,-0x1\n"
"bne t0,zero,lop2\n"
::);
}

137
fpga/src/simpleuart.v Normal file

@@ -0,0 +1,137 @@
/*
* PicoSoC - A simple example SoC using PicoRV32
*
* Copyright (C) 2017 Clifford Wolf <clifford@clifford.at>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*/
module simpleuart #(parameter integer DEFAULT_DIV = 1) (
input clk,
input resetn,
output ser_tx,
input ser_rx,
input [3:0] reg_div_we,
input [31:0] reg_div_di,
output [31:0] reg_div_do,
input reg_dat_we,
input reg_dat_re,
input [31:0] reg_dat_di,
output [31:0] reg_dat_do,
output reg_dat_wait
);
reg [31:0] cfg_divider;
reg [3:0] recv_state;
reg [31:0] recv_divcnt;
reg [7:0] recv_pattern;
reg [7:0] recv_buf_data;
reg recv_buf_valid;
reg [9:0] send_pattern;
reg [3:0] send_bitcnt;
reg [31:0] send_divcnt;
reg send_dummy;
assign reg_div_do = cfg_divider;
assign reg_dat_wait = reg_dat_we && (send_bitcnt || send_dummy);
assign reg_dat_do = recv_buf_valid ? recv_buf_data : ~0;
always @(posedge clk) begin
if (!resetn) begin
cfg_divider <= DEFAULT_DIV;
end else begin
if (reg_div_we[0]) cfg_divider[ 7: 0] <= reg_div_di[ 7: 0];
if (reg_div_we[1]) cfg_divider[15: 8] <= reg_div_di[15: 8];
if (reg_div_we[2]) cfg_divider[23:16] <= reg_div_di[23:16];
if (reg_div_we[3]) cfg_divider[31:24] <= reg_div_di[31:24];
end
end
always @(posedge clk) begin
if (!resetn) begin
recv_state <= 0;
recv_divcnt <= 0;
recv_pattern <= 0;
recv_buf_data <= 0;
recv_buf_valid <= 0;
end else begin
recv_divcnt <= recv_divcnt + 1;
if (reg_dat_re)
recv_buf_valid <= 0;
case (recv_state)
0: begin
if (!ser_rx)
recv_state <= 1;
recv_divcnt <= 0;
end
1: begin
if (2*recv_divcnt > cfg_divider) begin
recv_state <= 2;
recv_divcnt <= 0;
end
end
10: begin
if (recv_divcnt > cfg_divider) begin
recv_buf_data <= recv_pattern;
recv_buf_valid <= 1;
recv_state <= 0;
end
end
default: begin
if (recv_divcnt > cfg_divider) begin
recv_pattern <= {ser_rx, recv_pattern[7:1]};
recv_state <= recv_state + 1;
recv_divcnt <= 0;
end
end
endcase
end
end
assign ser_tx = send_pattern[0];
always @(posedge clk) begin
if (reg_div_we)
send_dummy <= 1;
send_divcnt <= send_divcnt + 1;
if (!resetn) begin
send_pattern <= ~0;
send_bitcnt <= 0;
send_divcnt <= 0;
send_dummy <= 1;
end else begin
if (send_dummy && !send_bitcnt) begin
send_pattern <= ~0;
send_bitcnt <= 15;
send_divcnt <= 0;
send_dummy <= 0;
end else
if (reg_dat_we && !send_bitcnt) begin
send_pattern <= {1'b1, reg_dat_di[7:0], 1'b0};
send_bitcnt <= 10;
send_divcnt <= 0;
end else
if (send_divcnt > cfg_divider && send_bitcnt) begin
send_pattern <= {1'b1, send_pattern[9:1]};
send_bitcnt <= send_bitcnt - 1;
send_divcnt <= 0;
end
end
end
endmodule


@@ -161,6 +161,22 @@ class TestOps(unittest.TestCase):
lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(),
lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), atol=1e-4, grad_rtol=1e-5)
def test_grouped_conv2d(self):
groups = 2
helper_test_op([(1,2,5,5), (groups,1,3,3)],
lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(),
lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), atol=1e-4, grad_rtol=1e-5, forward_only=True)
def test_fancy_conv2d(self):
bs = 2
cin = 3
cout = 1
groups = 3
H,W = 3,3
helper_test_op([(bs,cin,11,28), (groups*cout,cin//groups,H,W)],
lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(),
lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), atol=1e-4, grad_rtol=1e-5, forward_only=True)
def test_strided_conv2d(self):
bs = 4
cin = 3
@@ -191,4 +207,5 @@ class TestOps(unittest.TestCase):
lambda x: Tensor.avg_pool2d(x, kernel_size=ksz), rtol=1e-5)
if __name__ == '__main__':
np.random.seed(1337)
unittest.main(verbosity=2)


@@ -144,7 +144,7 @@ class Tensor:
for t0 in reversed(self.deepwalk()):
assert (t0.grad is not None)
with ProfileOp(t0._ctx.__class__.__name__, [t0.grad], backward=True) as po:
po.output = grads = t0._ctx.backward(t0._ctx, t0.grad.data)
grads = t0._ctx.backward(t0._ctx, t0.grad.data)
if len(t0._ctx.parents) == 1:
grads = [grads]
for t, g in zip(t0._ctx.parents, grads):
@@ -355,6 +355,9 @@ def _register_ops(namespace, device=Device.CPU):
from tinygrad import ops_cpu
_register_ops(ops_cpu)
if os.getenv("RISK", None) is not None:
from extra import ops_risk
_register_ops(ops_risk)
try:
import pyopencl as cl
# TODO: move this import to require_init_gpu?