modeld: Move from SNPE to tinygrad (#25207)

* compiling, won't work yet

* running with inputs and outputs

* there's some magic chance this works

* no more dlc, include onnx

* yolo tests plz

* bump tinygrad

* files_common + delete dlc

* tinygrad_repo -> tinygrad

* pre commit config

* llops needed

* extra in files_common

* bump tinygrad

* fix indent

* tinygrad/nn/__init__

* tinygrad_repo

* bump tinygrad repo

* bump tinygrad

* bump with native_exp, match maybe

* native_explog is argument

* pyopencl no cache

* 5% chance this matches

* work in float32?

* bump tinygrad

* fix build

* no __init__

* fix recip

* dumb hack

* adding thneed PC support

* fix pc segfault

* pc thneed is working

* to_image

* prints stuff with debug=2

* it sort of works

* copy host ptr is simpler

* bug fix

* build on c3

* this correct?

* reenable float16

* fix private, fixup copy_inputs internal

* bump tinygrad and update ref commit

* fix OPTWG on PC

* maybe fix non determinism

* revert model replay ref commit

* comments, init zeroed out buffers

* upd ref commit

* bump tinygrad to fix initial image

* try this ref

Co-authored-by: Comma Device <device@comma.ai>
old-commit-hash: 40d6f4b65c
This commit is contained in:
George Hotz 2022-09-01 10:31:14 -07:00 committed by GitHub
parent 3308e740ea
commit 29cd51b876
17 changed files with 143 additions and 39 deletions

1
.gitignore vendored
View File

@ -36,6 +36,7 @@ a.out
config.json config.json
clcache clcache
compile_commands.json compile_commands.json
compare_runtime*.html
persist persist
board/obj/ board/obj/

3
.gitmodules vendored
View File

@ -16,3 +16,6 @@
[submodule "body"] [submodule "body"]
path = body path = body
url = ../../commaai/body.git url = ../../commaai/body.git
[submodule "tinygrad"]
path = tinygrad_repo
url = https://github.com/geohot/tinygrad.git

View File

@ -28,7 +28,7 @@ repos:
rev: v0.931 rev: v0.931
hooks: hooks:
- id: mypy - id: mypy
exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)/' exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)/|(tinygrad/)|(tinygrad_repo/)'
additional_dependencies: ['types-PyYAML', 'lxml', 'numpy', 'types-atomicwrites', 'types-pycurl', 'types-requests', 'types-certifi'] additional_dependencies: ['types-PyYAML', 'lxml', 'numpy', 'types-atomicwrites', 'types-pycurl', 'types-requests', 'types-certifi']
args: args:
- --warn-redundant-casts - --warn-redundant-casts
@ -40,7 +40,7 @@ repos:
rev: 4.0.1 rev: 4.0.1
hooks: hooks:
- id: flake8 - id: flake8
exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)|(selfdrive/debug/)/' exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)|(tinygrad/)|(tinygrad_repo/)|(selfdrive/debug/)/'
additional_dependencies: ['flake8-no-implicit-concat'] additional_dependencies: ['flake8-no-implicit-concat']
args: args:
- --indent-size=2 - --indent-size=2
@ -55,7 +55,7 @@ repos:
entry: pylint entry: pylint
language: system language: system
types: [python] types: [python]
exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(laika_repo/)|(rednose_repo/)' exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(laika_repo/)|(rednose_repo/)|(tinygrad/)|(tinygrad_repo/)'
args: args:
- -rn - -rn
- -sn - -sn

View File

@ -49,6 +49,11 @@ AddOption('--no-thneed',
dest='no_thneed', dest='no_thneed',
help='avoid using thneed') help='avoid using thneed')
AddOption('--pc-thneed',
action='store_true',
dest='pc_thneed',
help='use thneed on pc')
AddOption('--no-test', AddOption('--no-test',
action='store_false', action='store_false',
dest='test', dest='test',

View File

@ -78,7 +78,8 @@ cl_program cl_program_from_file(cl_context ctx, cl_device_id device_id, const ch
} }
cl_program cl_program_from_source(cl_context ctx, cl_device_id device_id, const std::string& src, const char* args) { cl_program cl_program_from_source(cl_context ctx, cl_device_id device_id, const std::string& src, const char* args) {
cl_program prg = CL_CHECK_ERR(clCreateProgramWithSource(ctx, 1, (const char*[]){src.c_str()}, NULL, &err)); const char *csrc = src.c_str();
cl_program prg = CL_CHECK_ERR(clCreateProgramWithSource(ctx, 1, &csrc, NULL, &err));
if (int err = clBuildProgram(prg, 1, &device_id, args, NULL, NULL); err != 0) { if (int err = clBuildProgram(prg, 1, &device_id, args, NULL, NULL); err != 0) {
cl_print_build_errors(prg, device_id); cl_print_build_errors(prg, device_id);
assert(0); assert(0);
@ -87,7 +88,7 @@ cl_program cl_program_from_source(cl_context ctx, cl_device_id device_id, const
} }
cl_program cl_program_from_binary(cl_context ctx, cl_device_id device_id, const uint8_t* binary, size_t length, const char* args) { cl_program cl_program_from_binary(cl_context ctx, cl_device_id device_id, const uint8_t* binary, size_t length, const char* args) {
cl_program prg = CL_CHECK_ERR(clCreateProgramWithBinary(ctx, 1, &device_id, &length, (const uint8_t*[]){binary}, NULL, &err)); cl_program prg = CL_CHECK_ERR(clCreateProgramWithBinary(ctx, 1, &device_id, &length, &binary, NULL, &err));
if (int err = clBuildProgram(prg, 1, &device_id, args, NULL, NULL); err != 0) { if (int err = clBuildProgram(prg, 1, &device_id, args, NULL, NULL); err != 0) {
cl_print_build_errors(prg, device_id); cl_print_build_errors(prg, device_id);
assert(0); assert(0);

View File

@ -78,7 +78,7 @@ find . -name 'moc_*' -delete
find . -name '__pycache__' -delete find . -name '__pycache__' -delete
rm -rf panda/board panda/certs panda/crypto rm -rf panda/board panda/certs panda/crypto
rm -rf .sconsign.dblite Jenkinsfile release/ rm -rf .sconsign.dblite Jenkinsfile release/
rm selfdrive/modeld/models/supercombo.dlc rm selfdrive/modeld/models/supercombo.onnx
# Move back signed panda fw # Move back signed panda fw
mkdir -p panda/board/obj mkdir -p panda/board/obj

View File

@ -352,7 +352,7 @@ selfdrive/modeld/models/driving.cc
selfdrive/modeld/models/driving.h selfdrive/modeld/models/driving.h
selfdrive/modeld/models/dmonitoring.cc selfdrive/modeld/models/dmonitoring.cc
selfdrive/modeld/models/dmonitoring.h selfdrive/modeld/models/dmonitoring.h
selfdrive/modeld/models/supercombo.dlc selfdrive/modeld/models/supercombo.onnx
selfdrive/modeld/models/dmonitoring_model_q.dlc selfdrive/modeld/models/dmonitoring_model_q.dlc
selfdrive/modeld/transforms/loadyuv.cc selfdrive/modeld/transforms/loadyuv.cc
@ -561,3 +561,16 @@ opendbc/vw_mqb_2010.dbc
opendbc/tesla_can.dbc opendbc/tesla_can.dbc
opendbc/tesla_radar.dbc opendbc/tesla_radar.dbc
opendbc/tesla_powertrain.dbc opendbc/tesla_powertrain.dbc
tinygrad_repo/openpilot/compile.py
tinygrad_repo/accel/opencl/*
tinygrad_repo/extra/onnx.py
tinygrad_repo/extra/utils.py
tinygrad_repo/tinygrad/llops/ops_gpu.py
tinygrad_repo/tinygrad/llops/ops_opencl.py
tinygrad_repo/tinygrad/helpers.py
tinygrad_repo/tinygrad/mlops.py
tinygrad_repo/tinygrad/ops.py
tinygrad_repo/tinygrad/shapetracker.py
tinygrad_repo/tinygrad/tensor.py
tinygrad_repo/tinygrad/nn/__init__.py

View File

@ -62,25 +62,65 @@ else:
common_model = lenv.Object(common_src) common_model = lenv.Object(common_src)
# build thneed model
if use_thneed and arch == "larch64":
fn = File("models/supercombo").abspath
compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs)
cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}.thneed --binary --optimize"
lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"])
kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels")
cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path})
kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")]
cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd)
lenv.Program('_dmonitoringmodeld', [ lenv.Program('_dmonitoringmodeld', [
"dmonitoringmodeld.cc", "dmonitoringmodeld.cc",
"models/dmonitoring.cc", "models/dmonitoring.cc",
]+common_model, LIBS=libs) ]+common_model, LIBS=libs)
lenv.Program('_modeld', [ # build thneed model
if use_thneed and arch == "larch64" or GetOption('pc_thneed'):
fn = File("models/supercombo").abspath
if GetOption('pc_thneed'):
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed"
else:
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && FLOAT16=1 PYOPENCL_NO_CACHE=1 MATMUL=1 NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed"
# is there a better way than listing all of tinygrad?
lenv.Command(fn + ".thneed", [fn + ".onnx",
"#tinygrad_repo/openpilot/compile.py",
"#tinygrad_repo/accel/opencl/conv.cl",
"#tinygrad_repo/accel/opencl/matmul.cl",
"#tinygrad_repo/accel/opencl/ops_opencl.py",
"#tinygrad_repo/accel/opencl/preprocessing.py",
"#tinygrad_repo/extra/onnx.py",
"#tinygrad_repo/extra/utils.py",
"#tinygrad_repo/tinygrad/llops/ops_gpu.py",
"#tinygrad_repo/tinygrad/llops/ops_opencl.py",
"#tinygrad_repo/tinygrad/helpers.py",
"#tinygrad_repo/tinygrad/mlops.py",
"#tinygrad_repo/tinygrad/ops.py",
"#tinygrad_repo/tinygrad/shapetracker.py",
"#tinygrad_repo/tinygrad/tensor.py",
"#tinygrad_repo/tinygrad/nn/__init__.py"
], cmd)
# old thneed compiler. TODO: remove this once tinygrad stuff is stable
#compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs)
#cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}.thneed --binary --optimize"
#lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"])
#kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels")
#cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path})
#kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")]
#cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd)
llenv = lenv.Clone()
if GetOption('pc_thneed'):
pc_thneed_src = [
"thneed/thneed_common.cc",
"thneed/thneed_pc.cc",
"thneed/serialize.cc",
"runners/thneedmodel.cc",
]
llenv['CFLAGS'].append("-DUSE_THNEED")
llenv['CXXFLAGS'].append("-DUSE_THNEED")
common_model += llenv.Object(pc_thneed_src)
libs += ['dl']
llenv.Program('_modeld', [
"modeld.cc", "modeld.cc",
"models/driving.cc", "models/driving.cc",
]+common_model, LIBS=libs + transformations) ]+common_model, LIBS=libs + transformations)

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:93d265fc88f05746ce47257e15fc2afe43b250b44715313049f829e8aa30a9d6
size 94302331

View File

@ -33,11 +33,14 @@ void Thneed::load(const char *filename) {
assert(mobj["needs_load"].bool_value() == false); assert(mobj["needs_load"].bool_value() == false);
} else { } else {
if (mobj["needs_load"].bool_value()) { if (mobj["needs_load"].bool_value()) {
//printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL); clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
ptr += sz; ptr += sz;
} else { } else {
clbuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, NULL); // TODO: is there a faster way to init zeroed out buffers?
void *host_zeros = calloc(sz, 1);
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL);
free(host_zeros);
} }
} }
assert(clbuf != NULL); assert(clbuf != NULL);

View File

@ -122,7 +122,7 @@ class Thneed {
// all CL kernels // all CL kernels
void find_inputs_outputs(); void find_inputs_outputs();
void copy_inputs(float **finputs); void copy_inputs(float **finputs, bool internal=false);
void copy_output(float *foutput); void copy_output(float *foutput);
cl_int clexec(); cl_int clexec();
vector<shared_ptr<CLQueuedKernel> > kq; vector<shared_ptr<CLQueuedKernel> > kq;

View File

@ -30,17 +30,16 @@ cl_int Thneed::clexec() {
return clFinish(command_queue); return clFinish(command_queue);
} }
void Thneed::copy_inputs(float **finputs) { void Thneed::copy_inputs(float **finputs, bool internal) {
//cl_int ret;
for (int idx = 0; idx < inputs.size(); ++idx) { for (int idx = 0; idx < inputs.size(); ++idx) {
if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]); if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]);
// TODO: fix thneed caching if (internal) {
if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]); // if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer
//if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL)); if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
} else {
// HACK if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
//if (input_sizes[idx] == 16) memset((char*)inputs[idx] + 8, 0, 8); }
} }
} }
@ -202,8 +201,8 @@ void CLQueuedKernel::debug_print(bool verbose) {
assert(slice_pitch == 0); assert(slice_pitch == 0);
clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL); clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
size_t sz; size_t sz = 0;
clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL); if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz); printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
} else { } else {
size_t sz; size_t sz;

View File

@ -0,0 +1,40 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <cassert>
#include "common/clutil.h"
#include "common/timing.h"
// Constructs a Thneed bound to the given OpenCL context.
//   do_clinit - when true, clinit() is called once the context has been stored
//   _context  - value assigned to the `context` member
// The debug level is read from the THNEED_DEBUG environment variable and parsed
// with atoi(); it is 0 when the variable is not set.
Thneed::Thneed(bool do_clinit, cl_context _context) {
  context = _context;
  if (do_clinit) {
    clinit();
  }

  const char *debug_level = getenv("THNEED_DEBUG");
  if (debug_level == NULL) {
    debug = 0;
  } else {
    debug = atoi(debug_level);
  }
}
// Runs one model pass: copy_inputs(finputs), then clexec(), then copy_output(foutput).
//   finputs - array of host input pointers, forwarded unchanged to copy_inputs()
//   foutput - host output pointer, forwarded unchanged to copy_output()
//   slow    - not used by this implementation
// When debug >= 1 the elapsed time of the whole call is printed in microseconds.
void Thneed::execute(float **finputs, float *foutput, bool slow) {
  // zero-initialised so the start timestamp is never left indeterminate
  uint64_t tb = 0;
  if (debug >= 1) tb = nanos_since_boot();

  // ****** copy inputs
  copy_inputs(finputs);

  // ****** run commands
  // NOTE(review): clexec() returns a cl_int that is not checked here -- confirm this is intended
  clexec();

  // ****** copy outputs
  copy_output(foutput);

  if (debug >= 1) {
    const uint64_t te = nanos_since_boot();
    // uint64_t is not `unsigned long` on every platform; cast so the argument always matches %llu
    printf("model exec in %llu us\n", static_cast<unsigned long long>((te - tb) / 1000));
  }
}
// No-op: this implementation of stop() has an empty body.
void Thneed::stop() {
}
// No-op in this file: no input/output discovery is performed here.
void Thneed::find_inputs_outputs() {
// thneed on PC doesn't work on old style inputs/outputs
}

View File

@ -269,7 +269,7 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
if (debug >= 1) tb = nanos_since_boot(); if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs // ****** copy inputs
copy_inputs(finputs); copy_inputs(finputs, true);
// ****** set power constraint // ****** set power constraint
int ret; int ret;

View File

@ -1 +1 @@
ca90e11f8d59902af38d3785ddd91a27d0fbb411 cffb4e720b0379bedd4ff802912d998ace775c37

1
tinygrad Symbolic link
View File

@ -0,0 +1 @@
tinygrad_repo/tinygrad

1
tinygrad_repo Submodule

@ -0,0 +1 @@
Subproject commit 2e9b7637b3c3c8895fda9f964215db3a35fe3441