modeld: Move from SNPE to tinygrad (#25207)

* compiling, won't work yet
* running with inputs and outputs
* there's some magic chance this works
* no more dlc, include onnx
* yolo tests plz
* bump tinygrad
* files_common + delete dlc
* tinygrad_repo -> tinygrad
* pre commit config
* llops needed
* extra in files_common
* bump tinygrad
* fix indent
* tinygrad/nn/__init__
* tinygrad_repo
* bump tinygrad repo
* bump tinygrad
* bump with native_exp, match maybe
* native_explog is argument
* pyopencl no cache
* 5% chance this matches
* work in float32?
* bump tinygrad
* fix build
* no __init__
* fix recip
* dumb hack
* adding thneed PC support
* fix pc segfault
* pc thneed is working
* to_image
* prints stuff with debug=2
* it sort of works
* copy host ptr is simpler
* bug fix
* build on c3
* this correct?
* reenable float16
* fix private, fixup copy_inputs internal
* bump tinygrad and update ref commit
* fix OPTWG on PC
* maybe fix non determinism
* revert model replay ref commit
* comments, init zeroed out buffers
* upd ref commit
* bump tinygrad to fix initial image
* try this ref

Co-authored-by: Comma Device <device@comma.ai>
old-commit-hash: 40d6f4b65c
2022-09-01 10:31:14 -07:00 · 2022-09-01 10:31:14 -07:00 · 29cd51b876
parent 3308e740ea
commit 29cd51b876
17 changed files with 143 additions and 39 deletions
--- a/.gitignore
+++ b/.gitignore
@ -36,6 +36,7 @@ a.out
 config.json
 clcache
 compile_commands.json
 compare_runtime*.html
 persist
 board/obj/
--- a/.gitmodules
+++ b/.gitmodules
@ -16,3 +16,6 @@
 [submodule "body"]
  path = body
  url = ../../commaai/body.git
 [submodule "tinygrad"]
  path = tinygrad_repo
  url = https://github.com/geohot/tinygrad.git
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -28,7 +28,7 @@ repos:
    rev: v0.931
    hooks:
    -   id: mypy
-        exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)/'
+        exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)/|(tinygrad/)|(tinygrad_repo/)'
        additional_dependencies: ['types-PyYAML', 'lxml', 'numpy', 'types-atomicwrites', 'types-pycurl', 'types-requests', 'types-certifi']
        args:
        - --warn-redundant-casts
@ -40,7 +40,7 @@ repos:
    rev: 4.0.1
    hooks:
    -   id: flake8
-        exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)|(selfdrive/debug/)/'
+        exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)|(tinygrad/)|(tinygrad_repo/)|(selfdrive/debug/)/'
        additional_dependencies: ['flake8-no-implicit-concat']
        args:
        - --indent-size=2
@ -55,7 +55,7 @@ repos:
        entry: pylint
        language: system
        types: [python]
-        exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(laika_repo/)|(rednose_repo/)'
+        exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(laika_repo/)|(rednose_repo/)|(tinygrad/)|(tinygrad_repo/)'
        args:
        - -rn
        - -sn
--- a/5
+++ b/5
@ -49,6 +49,11 @@ AddOption('--no-thneed',
          dest='no_thneed',
          help='avoid using thneed')
 AddOption('--pc-thneed',
          action='store_true',
          dest='pc_thneed',
          help='use thneed on pc')
 AddOption('--no-test',
          action='store_false',
          dest='test',
--- a/common/clutil.cc
+++ b/common/clutil.cc
@ -78,7 +78,8 @@ cl_program cl_program_from_file(cl_context ctx, cl_device_id device_id, const ch
 }
 cl_program cl_program_from_source(cl_context ctx, cl_device_id device_id, const std::string& src, const char* args) {
-  cl_program prg = CL_CHECK_ERR(clCreateProgramWithSource(ctx, 1, (const char*[]){src.c_str()}, NULL, &err));
+  const char *csrc = src.c_str();
  cl_program prg = CL_CHECK_ERR(clCreateProgramWithSource(ctx, 1, &csrc, NULL, &err));
  if (int err = clBuildProgram(prg, 1, &device_id, args, NULL, NULL); err != 0) {
    cl_print_build_errors(prg, device_id);
    assert(0);
@ -87,7 +88,7 @@ cl_program cl_program_from_source(cl_context ctx, cl_device_id device_id, const
 }
 cl_program cl_program_from_binary(cl_context ctx, cl_device_id device_id, const uint8_t* binary, size_t length, const char* args) {
-  cl_program prg = CL_CHECK_ERR(clCreateProgramWithBinary(ctx, 1, &device_id, &length, (const uint8_t*[]){binary}, NULL, &err));
+  cl_program prg = CL_CHECK_ERR(clCreateProgramWithBinary(ctx, 1, &device_id, &length, &binary, NULL, &err));
  if (int err = clBuildProgram(prg, 1, &device_id, args, NULL, NULL); err != 0) {
    cl_print_build_errors(prg, device_id);
    assert(0);
--- a/release/build_release.sh
+++ b/release/build_release.sh
@ -78,7 +78,7 @@ find . -name 'moc_*' -delete
 find . -name '__pycache__' -delete
 rm -rf panda/board panda/certs panda/crypto
 rm -rf .sconsign.dblite Jenkinsfile release/
-rm selfdrive/modeld/models/supercombo.dlc
+rm selfdrive/modeld/models/supercombo.onnx
 # Move back signed panda fw
 mkdir -p panda/board/obj
--- a/release/files_common
+++ b/release/files_common
@ -352,7 +352,7 @@ selfdrive/modeld/models/driving.cc
 selfdrive/modeld/models/driving.h
 selfdrive/modeld/models/dmonitoring.cc
 selfdrive/modeld/models/dmonitoring.h
-selfdrive/modeld/models/supercombo.dlc
+selfdrive/modeld/models/supercombo.onnx
 selfdrive/modeld/models/dmonitoring_model_q.dlc
 selfdrive/modeld/transforms/loadyuv.cc
@ -561,3 +561,16 @@ opendbc/vw_mqb_2010.dbc
 opendbc/tesla_can.dbc
 opendbc/tesla_radar.dbc
 opendbc/tesla_powertrain.dbc
 tinygrad_repo/openpilot/compile.py
 tinygrad_repo/accel/opencl/*
 tinygrad_repo/extra/onnx.py
 tinygrad_repo/extra/utils.py
 tinygrad_repo/tinygrad/llops/ops_gpu.py
 tinygrad_repo/tinygrad/llops/ops_opencl.py
 tinygrad_repo/tinygrad/helpers.py
 tinygrad_repo/tinygrad/mlops.py
 tinygrad_repo/tinygrad/ops.py
 tinygrad_repo/tinygrad/shapetracker.py
 tinygrad_repo/tinygrad/tensor.py
 tinygrad_repo/tinygrad/nn/__init__.py
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@ -62,25 +62,65 @@ else:
 common_model = lenv.Object(common_src)
 # build thneed model
 if use_thneed and arch == "larch64":
  fn = File("models/supercombo").abspath
  compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs)
  cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}.thneed --binary --optimize"
  lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"])
  kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels")
  cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path})
  kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")]
  cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd)
 lenv.Program('_dmonitoringmodeld', [
    "dmonitoringmodeld.cc",
    "models/dmonitoring.cc",
  ]+common_model, LIBS=libs)
-lenv.Program('_modeld', [
+# build thneed model
 if use_thneed and arch == "larch64" or GetOption('pc_thneed'):
  fn = File("models/supercombo").abspath
  if GetOption('pc_thneed'):
    cmd = f"cd {Dir('#').abspath}/tinygrad_repo && NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed"
  else:
    cmd = f"cd {Dir('#').abspath}/tinygrad_repo && FLOAT16=1 PYOPENCL_NO_CACHE=1 MATMUL=1 NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed"
  # is there a better way than listing all of tinygrad?
  lenv.Command(fn + ".thneed", [fn + ".onnx",
    "#tinygrad_repo/openpilot/compile.py",
    "#tinygrad_repo/accel/opencl/conv.cl",
    "#tinygrad_repo/accel/opencl/matmul.cl",
    "#tinygrad_repo/accel/opencl/ops_opencl.py",
    "#tinygrad_repo/accel/opencl/preprocessing.py",
    "#tinygrad_repo/extra/onnx.py",
    "#tinygrad_repo/extra/utils.py",
    "#tinygrad_repo/tinygrad/llops/ops_gpu.py",
    "#tinygrad_repo/tinygrad/llops/ops_opencl.py",
    "#tinygrad_repo/tinygrad/helpers.py",
    "#tinygrad_repo/tinygrad/mlops.py",
    "#tinygrad_repo/tinygrad/ops.py",
    "#tinygrad_repo/tinygrad/shapetracker.py",
    "#tinygrad_repo/tinygrad/tensor.py",
    "#tinygrad_repo/tinygrad/nn/__init__.py"
  ], cmd)
  # old thneed compiler. TODO: remove this once tinygrad stuff is stable
  #compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs)
  #cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}.thneed --binary --optimize"
  #lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"])
  #kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels")
  #cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path})
  #kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")]
  #cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd)
 llenv = lenv.Clone()
 if GetOption('pc_thneed'):
  pc_thneed_src = [
    "thneed/thneed_common.cc",
    "thneed/thneed_pc.cc",
    "thneed/serialize.cc",
    "runners/thneedmodel.cc",
  ]
  llenv['CFLAGS'].append("-DUSE_THNEED")
  llenv['CXXFLAGS'].append("-DUSE_THNEED")
  common_model += llenv.Object(pc_thneed_src)
  libs += ['dl']
 llenv.Program('_modeld', [
    "modeld.cc",
    "models/driving.cc",
  ]+common_model, LIBS=libs + transformations)
--- a/selfdrive/modeld/models/supercombo.dlc
+++ b/selfdrive/modeld/models/supercombo.dlc
@ -1,3 +0,0 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:93d265fc88f05746ce47257e15fc2afe43b250b44715313049f829e8aa30a9d6
 size 94302331
--- a/selfdrive/modeld/thneed/serialize.cc
+++ b/selfdrive/modeld/thneed/serialize.cc
@ -33,11 +33,14 @@ void Thneed::load(const char *filename) {
      assert(mobj["needs_load"].bool_value() == false);
    } else {
      if (mobj["needs_load"].bool_value()) {
        //printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
        clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
        if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
        ptr += sz;
      } else {
-        clbuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, NULL);
+        // TODO: is there a faster way to init zeroed out buffers?
        void *host_zeros = calloc(sz, 1);
        clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL);
        free(host_zeros);
      }
    }
    assert(clbuf != NULL);
--- a/selfdrive/modeld/thneed/thneed.h
+++ b/selfdrive/modeld/thneed/thneed.h
@ -122,7 +122,7 @@ class Thneed {
    // all CL kernels
    void find_inputs_outputs();
-    void copy_inputs(float **finputs);
+    void copy_inputs(float **finputs, bool internal=false);
    void copy_output(float *foutput);
    cl_int clexec();
    vector<shared_ptr<CLQueuedKernel> > kq;
--- a/selfdrive/modeld/thneed/thneed_common.cc
+++ b/selfdrive/modeld/thneed/thneed_common.cc
@ -30,17 +30,16 @@ cl_int Thneed::clexec() {
  return clFinish(command_queue);
 }
-void Thneed::copy_inputs(float **finputs) {
+void Thneed::copy_inputs(float **finputs, bool internal) {
  //cl_int ret;
  for (int idx = 0; idx < inputs.size(); ++idx) {
    if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]);
-    // TODO: fix thneed caching
+    if (internal) {
-    if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
+      // if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer
-    //if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
+      if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
-
+    } else {
-    // HACK
+      if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
-    //if (input_sizes[idx] == 16) memset((char*)inputs[idx] + 8, 0, 8);
+    }
  }
 }
@ -202,8 +201,8 @@ void CLQueuedKernel::debug_print(bool verbose) {
            assert(slice_pitch == 0);
            clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
-            size_t sz;
+            size_t sz = 0;
-            clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
+            if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
            printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
          } else {
            size_t sz;
--- a/selfdrive/modeld/thneed/thneed_pc.cc
+++ b/selfdrive/modeld/thneed/thneed_pc.cc
@ -0,0 +1,40 @@
 #include "selfdrive/modeld/thneed/thneed.h"
 #include <cassert>
 #include "common/clutil.h"
 #include "common/timing.h"
 // Construct a Thneed that uses the given OpenCL context.
 //   do_clinit: when true, clinit() is called during construction.
 //   _context:  OpenCL context stored in the `context` member.
 // The debug verbosity is read from the THNEED_DEBUG environment variable and
 // defaults to 0 when the variable is not set.
 Thneed::Thneed(bool do_clinit, cl_context _context) {
  context = _context;
  if (do_clinit) clinit();
  // THNEED_DEBUG is parsed with atoi, so a non-numeric value yields 0
  char *thneed_debug_env = getenv("THNEED_DEBUG");
  debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
 }
 // Run the model once: copy the inputs in, execute the queued commands, then
 // copy the output out.
 //   finputs: array of input pointers, forwarded to copy_inputs().
 //   foutput: destination pointer, forwarded to copy_output().
 //   slow:    not referenced by this implementation.
 // When debug >= 1 the wall time of the whole call is printed in microseconds.
 void Thneed::execute(float **finputs, float *foutput, bool slow) {
  // tb/te are only written and read under the same `debug >= 1` condition
  uint64_t tb, te;
  if (debug >= 1) tb = nanos_since_boot();
  // ****** copy inputs
  // only finputs is passed, so copy_inputs' second parameter keeps its default
  copy_inputs(finputs);
  // ****** run commands
  // NOTE: the cl_int returned by clexec() is not checked here
  clexec();
  // ****** copy outputs
  copy_output(foutput);
  if (debug >= 1) {
    te = nanos_since_boot();
    // nanoseconds -> microseconds
    printf("model exec in %lu us\n", (te-tb)/1000);
  }
 }
 // No-op: this implementation of stop() has an empty body.
 void Thneed::stop() {
 }
 // Intentionally empty in this implementation; it performs no input/output
 // discovery (see the note in the body).
 void Thneed::find_inputs_outputs() {
  // thneed on PC doesn't work on old style inputs/outputs
 }
--- a/selfdrive/modeld/thneed/thneed_qcom2.cc
+++ b/selfdrive/modeld/thneed/thneed_qcom2.cc
@ -269,7 +269,7 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
  if (debug >= 1) tb = nanos_since_boot();
  // ****** copy inputs
-  copy_inputs(finputs);
+  copy_inputs(finputs, true);
  // ****** set power constraint
  int ret;
--- a/selfdrive/test/process_replay/model_replay_ref_commit
+++ b/selfdrive/test/process_replay/model_replay_ref_commit
@ -1 +1 @@
-ca90e11f8d59902af38d3785ddd91a27d0fbb411
+cffb4e720b0379bedd4ff802912d998ace775c37
--- a/1
+++ b/1
@ -0,0 +1 @@
 tinygrad_repo/tinygrad
--- a/1
+++ b/1
@ -0,0 +1 @@
 Subproject commit 2e9b7637b3c3c8895fda9f964215db3a35fe3441
`@ -1 +1 @@`
	`ca90e11f8d59902af38d3785ddd91a27d0fbb411`	`cffb4e720b0379bedd4ff802912d998ace775c37`
		`@ -0,0 +1 @@`
							`Subproject commit 2e9b7637b3c3c8895fda9f964215db3a35fe3441`