diff --git a/.gitignore b/.gitignore index 6aee0ed8e0..062358ef24 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ a.out config.json clcache compile_commands.json +compare_runtime*.html persist board/obj/ diff --git a/.gitmodules b/.gitmodules index bc439b451c..26f93ef164 100644 --- a/.gitmodules +++ b/.gitmodules @@ -16,3 +16,6 @@ [submodule "body"] path = body url = ../../commaai/body.git +[submodule "tinygrad"] + path = tinygrad_repo + url = https://github.com/geohot/tinygrad.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 50554acceb..347216f2fb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,7 +28,7 @@ repos: rev: v0.931 hooks: - id: mypy - exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)/' + exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)/|(tinygrad/)|(tinygrad_repo/)' additional_dependencies: ['types-PyYAML', 'lxml', 'numpy', 'types-atomicwrites', 'types-pycurl', 'types-requests', 'types-certifi'] args: - --warn-redundant-casts @@ -40,7 +40,7 @@ repos: rev: 4.0.1 hooks: - id: flake8 - exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)|(selfdrive/debug/)/' + exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)|(tinygrad/)|(tinygrad_repo/)|(selfdrive/debug/)/' additional_dependencies: ['flake8-no-implicit-concat'] args: - --indent-size=2 @@ -55,7 +55,7 @@ repos: entry: pylint language: system types: [python] - exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(laika_repo/)|(rednose_repo/)' + exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(laika_repo/)|(rednose_repo/)|(tinygrad/)|(tinygrad_repo/)' args: - -rn - -sn diff --git a/SConstruct b/SConstruct index d4155f4f83..66a94f9b1b 100644 --- a/SConstruct +++ b/SConstruct @@ -49,6 +49,11 @@ AddOption('--no-thneed', dest='no_thneed', help='avoid using thneed') +AddOption('--pc-thneed', + action='store_true', + dest='pc_thneed', + help='use thneed on pc') + AddOption('--no-test', action='store_false', dest='test', diff --git a/common/clutil.cc b/common/clutil.cc index b8f9fde4cf..9d3447d807 100644 --- a/common/clutil.cc +++ b/common/clutil.cc @@ -78,7 +78,8 @@ cl_program cl_program_from_file(cl_context ctx, cl_device_id device_id, const ch } cl_program cl_program_from_source(cl_context ctx, cl_device_id device_id, const std::string& src, const char* args) { - cl_program prg = CL_CHECK_ERR(clCreateProgramWithSource(ctx, 1, (const char*[]){src.c_str()}, NULL, &err)); + const char *csrc = src.c_str(); + cl_program prg = CL_CHECK_ERR(clCreateProgramWithSource(ctx, 1, &csrc, NULL, &err)); if (int err = clBuildProgram(prg, 1, &device_id, args, NULL, NULL); err != 0) { cl_print_build_errors(prg, device_id); assert(0); @@ -87,7 +88,7 @@ cl_program cl_program_from_source(cl_context ctx, cl_device_id device_id, const } cl_program cl_program_from_binary(cl_context ctx, cl_device_id device_id, const uint8_t* binary, size_t length, const char* args) { - cl_program prg = CL_CHECK_ERR(clCreateProgramWithBinary(ctx, 1, &device_id, &length, (const uint8_t*[]){binary}, NULL, &err)); + cl_program prg = CL_CHECK_ERR(clCreateProgramWithBinary(ctx, 1, &device_id, &length, &binary, NULL, &err)); if (int err = clBuildProgram(prg, 1, &device_id, args, NULL, NULL); err != 0) { cl_print_build_errors(prg, device_id); assert(0); diff --git a/release/build_release.sh b/release/build_release.sh index 1000e607fa..80106eefb2 100755 --- a/release/build_release.sh +++ b/release/build_release.sh @@ -78,7 +78,7 @@ find . -name 'moc_*' -delete find . -name '__pycache__' -delete rm -rf panda/board panda/certs panda/crypto rm -rf .sconsign.dblite Jenkinsfile release/ -rm selfdrive/modeld/models/supercombo.dlc +rm selfdrive/modeld/models/supercombo.onnx # Move back signed panda fw mkdir -p panda/board/obj diff --git a/release/files_common b/release/files_common index ca6e91fb65..5be07b1c75 100644 --- a/release/files_common +++ b/release/files_common @@ -352,7 +352,7 @@ selfdrive/modeld/models/driving.cc selfdrive/modeld/models/driving.h selfdrive/modeld/models/dmonitoring.cc selfdrive/modeld/models/dmonitoring.h -selfdrive/modeld/models/supercombo.dlc +selfdrive/modeld/models/supercombo.onnx selfdrive/modeld/models/dmonitoring_model_q.dlc selfdrive/modeld/transforms/loadyuv.cc @@ -561,3 +561,16 @@ opendbc/vw_mqb_2010.dbc opendbc/tesla_can.dbc opendbc/tesla_radar.dbc opendbc/tesla_powertrain.dbc + +tinygrad_repo/openpilot/compile.py +tinygrad_repo/accel/opencl/* +tinygrad_repo/extra/onnx.py +tinygrad_repo/extra/utils.py +tinygrad_repo/tinygrad/llops/ops_gpu.py +tinygrad_repo/tinygrad/llops/ops_opencl.py +tinygrad_repo/tinygrad/helpers.py +tinygrad_repo/tinygrad/mlops.py +tinygrad_repo/tinygrad/ops.py +tinygrad_repo/tinygrad/shapetracker.py +tinygrad_repo/tinygrad/tensor.py +tinygrad_repo/tinygrad/nn/__init__.py diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript index 4feb17f238..2544607aa4 100644 --- a/selfdrive/modeld/SConscript +++ b/selfdrive/modeld/SConscript @@ -62,25 +62,65 @@ else: common_model = lenv.Object(common_src) -# build thneed model -if use_thneed and arch == "larch64": - fn = File("models/supercombo").abspath - compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs) - cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}.thneed --binary --optimize" - - lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"]) - kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels") - cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path}) - - kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")] - cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd) - lenv.Program('_dmonitoringmodeld', [ "dmonitoringmodeld.cc", "models/dmonitoring.cc", ]+common_model, LIBS=libs) -lenv.Program('_modeld', [ +# build thneed model +if use_thneed and arch == "larch64" or GetOption('pc_thneed'): + fn = File("models/supercombo").abspath + + if GetOption('pc_thneed'): + cmd = f"cd {Dir('#').abspath}/tinygrad_repo && NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed" + else: + cmd = f"cd {Dir('#').abspath}/tinygrad_repo && FLOAT16=1 PYOPENCL_NO_CACHE=1 MATMUL=1 NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed" + + # is there a better way then listing all of tinygrad? + lenv.Command(fn + ".thneed", [fn + ".onnx", + "#tinygrad_repo/openpilot/compile.py", + "#tinygrad_repo/accel/opencl/conv.cl", + "#tinygrad_repo/accel/opencl/matmul.cl", + "#tinygrad_repo/accel/opencl/ops_opencl.py", + "#tinygrad_repo/accel/opencl/preprocessing.py", + "#tinygrad_repo/extra/onnx.py", + "#tinygrad_repo/extra/utils.py", + "#tinygrad_repo/tinygrad/llops/ops_gpu.py", + "#tinygrad_repo/tinygrad/llops/ops_opencl.py", + "#tinygrad_repo/tinygrad/helpers.py", + "#tinygrad_repo/tinygrad/mlops.py", + "#tinygrad_repo/tinygrad/ops.py", + "#tinygrad_repo/tinygrad/shapetracker.py", + "#tinygrad_repo/tinygrad/tensor.py", + "#tinygrad_repo/tinygrad/nn/__init__.py" + ], cmd) + + # old thneed compiler. TODO: remove this once tinygrad stuff is stable + + #compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs) + #cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}.thneed --binary --optimize" + + #lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"]) + #kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels") + #cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path}) + + #kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")] + #cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd) + +llenv = lenv.Clone() +if GetOption('pc_thneed'): + pc_thneed_src = [ + "thneed/thneed_common.cc", + "thneed/thneed_pc.cc", + "thneed/serialize.cc", + "runners/thneedmodel.cc", + ] + llenv['CFLAGS'].append("-DUSE_THNEED") + llenv['CXXFLAGS'].append("-DUSE_THNEED") + common_model += llenv.Object(pc_thneed_src) + libs += ['dl'] + +llenv.Program('_modeld', [ "modeld.cc", "models/driving.cc", ]+common_model, LIBS=libs + transformations) diff --git a/selfdrive/modeld/models/supercombo.dlc b/selfdrive/modeld/models/supercombo.dlc deleted file mode 100644 index fe133523fc..0000000000 --- a/selfdrive/modeld/models/supercombo.dlc +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:93d265fc88f05746ce47257e15fc2afe43b250b44715313049f829e8aa30a9d6 -size 94302331 diff --git a/selfdrive/modeld/thneed/serialize.cc b/selfdrive/modeld/thneed/serialize.cc index 0b8b02c6ac..afc84ee769 100644 --- a/selfdrive/modeld/thneed/serialize.cc +++ b/selfdrive/modeld/thneed/serialize.cc @@ -33,11 +33,14 @@ void Thneed::load(const char *filename) { assert(mobj["needs_load"].bool_value() == false); } else { if (mobj["needs_load"].bool_value()) { - //printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr); clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL); + if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr); ptr += sz; } else { - clbuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, NULL); + // TODO: is there a faster way to init zeroed out buffers? + void *host_zeros = calloc(sz, 1); + clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL); + free(host_zeros); } } assert(clbuf != NULL); diff --git a/selfdrive/modeld/thneed/thneed.h b/selfdrive/modeld/thneed/thneed.h index 19dc46e8f6..2a5800f302 100644 --- a/selfdrive/modeld/thneed/thneed.h +++ b/selfdrive/modeld/thneed/thneed.h @@ -122,7 +122,7 @@ class Thneed { // all CL kernels void find_inputs_outputs(); - void copy_inputs(float **finputs); + void copy_inputs(float **finputs, bool internal=false); void copy_output(float *foutput); cl_int clexec(); vector > kq; diff --git a/selfdrive/modeld/thneed/thneed_common.cc b/selfdrive/modeld/thneed/thneed_common.cc index b751cdf665..a3f5c908f9 100644 --- a/selfdrive/modeld/thneed/thneed_common.cc +++ b/selfdrive/modeld/thneed/thneed_common.cc @@ -30,17 +30,16 @@ cl_int Thneed::clexec() { return clFinish(command_queue); } -void Thneed::copy_inputs(float **finputs) { - //cl_int ret; +void Thneed::copy_inputs(float **finputs, bool internal) { for (int idx = 0; idx < inputs.size(); ++idx) { if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]); - // TODO: fix thneed caching - if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]); - //if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL)); - - // HACK - //if (input_sizes[idx] == 16) memset((char*)inputs[idx] + 8, 0, 8); + if (internal) { + // if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer + if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]); + } else { + if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL)); + } } } @@ -202,8 +201,8 @@ void CLQueuedKernel::debug_print(bool verbose) { assert(slice_pitch == 0); clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL); - size_t sz; - clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL); + size_t sz = 0; + if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL); printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz); } else { size_t sz; diff --git a/selfdrive/modeld/thneed/thneed_pc.cc b/selfdrive/modeld/thneed/thneed_pc.cc new file mode 100644 index 0000000000..e32dd289ec --- /dev/null +++ b/selfdrive/modeld/thneed/thneed_pc.cc @@ -0,0 +1,40 @@ +#include "selfdrive/modeld/thneed/thneed.h" + +#include + +#include "common/clutil.h" +#include "common/timing.h" + +Thneed::Thneed(bool do_clinit, cl_context _context) { + context = _context; + if (do_clinit) clinit(); + char *thneed_debug_env = getenv("THNEED_DEBUG"); + debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0; +} + +void Thneed::execute(float **finputs, float *foutput, bool slow) { + uint64_t tb, te; + if (debug >= 1) tb = nanos_since_boot(); + + // ****** copy inputs + copy_inputs(finputs); + + // ****** run commands + clexec(); + + // ****** copy outputs + copy_output(foutput); + + if (debug >= 1) { + te = nanos_since_boot(); + printf("model exec in %lu us\n", (te-tb)/1000); + } +} + +void Thneed::stop() { +} + +void Thneed::find_inputs_outputs() { + // thneed on PC doesn't work on old style inputs/outputs +} + diff --git a/selfdrive/modeld/thneed/thneed_qcom2.cc b/selfdrive/modeld/thneed/thneed_qcom2.cc index e79bb77edf..f35317d2a7 100644 --- a/selfdrive/modeld/thneed/thneed_qcom2.cc +++ b/selfdrive/modeld/thneed/thneed_qcom2.cc @@ -269,7 +269,7 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) { if (debug >= 1) tb = nanos_since_boot(); // ****** copy inputs - copy_inputs(finputs); + copy_inputs(finputs, true); // ****** set power constraint int ret; diff --git a/selfdrive/test/process_replay/model_replay_ref_commit b/selfdrive/test/process_replay/model_replay_ref_commit index 80be9b464f..e2ee2e7bb6 100644 --- a/selfdrive/test/process_replay/model_replay_ref_commit +++ b/selfdrive/test/process_replay/model_replay_ref_commit @@ -1 +1 @@ -ca90e11f8d59902af38d3785ddd91a27d0fbb411 +cffb4e720b0379bedd4ff802912d998ace775c37 diff --git a/tinygrad b/tinygrad new file mode 120000 index 0000000000..cb003823c6 --- /dev/null +++ b/tinygrad @@ -0,0 +1 @@ +tinygrad_repo/tinygrad \ No newline at end of file diff --git a/tinygrad_repo b/tinygrad_repo new file mode 160000 index 0000000000..2e9b7637b3 --- /dev/null +++ b/tinygrad_repo @@ -0,0 +1 @@ +Subproject commit 2e9b7637b3c3c8895fda9f964215db3a35fe3441