modeld: Move from SNPE to tinygrad (#25207)

* compiling, won't work yet

* running with inputs and outputs

* there's some magic chance this works

* no more dlc, include onnx

* yolo tests plz

* bump tinygrad

* files_common + delete dlc

* tinygrad_repo -> tinygrad

* pre commit config

* llops needed

* extra in files_common

* bump tinygrad

* fix indent

* tinygrad/nn/__init__

* tinygrad_repo

* bump tinygrad repo

* bump tinygrad

* bump with native_exp, match maybe

* native_explog is argument

* pyopencl no cache

* 5% chance this matches

* work in float32?

* bump tinygrad

* fix build

* no __init__

* fix recip

* dumb hack

* adding thneed PC support

* fix pc segfault

* pc thneed is working

* to_image

* prints stuff with debug=2

* it sort of works

* copy host ptr is simpler

* bug fix

* build on c3

* this correct?

* reenable float16

* fix private, fixup copy_inputs internal

* bump tinygrad and update ref commit

* fix OPTWG on PC

* maybe fix non determinism

* revert model replay ref commit

* comments, init zeroed out buffers

* upd ref commit

* bump tinygrad to fix initial image

* try this ref

Co-authored-by: Comma Device <device@comma.ai>
This commit is contained in:
George Hotz 2022-09-01 10:31:14 -07:00 committed by GitHub
parent 4bb399ba3c
commit 40d6f4b65c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 143 additions and 39 deletions

1
.gitignore vendored
View File

@ -36,6 +36,7 @@ a.out
config.json
clcache
compile_commands.json
compare_runtime*.html
persist
board/obj/

3
.gitmodules vendored
View File

@ -16,3 +16,6 @@
[submodule "body"]
path = body
url = ../../commaai/body.git
[submodule "tinygrad"]
path = tinygrad_repo
url = https://github.com/geohot/tinygrad.git

View File

@ -28,7 +28,7 @@ repos:
rev: v0.931
hooks:
- id: mypy
exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)/'
exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)/|(tinygrad/)|(tinygrad_repo/)'
additional_dependencies: ['types-PyYAML', 'lxml', 'numpy', 'types-atomicwrites', 'types-pycurl', 'types-requests', 'types-certifi']
args:
- --warn-redundant-casts
@ -40,7 +40,7 @@ repos:
rev: 4.0.1
hooks:
- id: flake8
exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)|(selfdrive/debug/)/'
exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(opendbc/)|(laika_repo/)|(rednose_repo/)|(tinygrad/)|(tinygrad_repo/)|(selfdrive/debug/)/'
additional_dependencies: ['flake8-no-implicit-concat']
args:
- --indent-size=2
@ -55,7 +55,7 @@ repos:
entry: pylint
language: system
types: [python]
exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(laika_repo/)|(rednose_repo/)'
exclude: '^(pyextra/)|(cereal/)|(rednose/)|(panda/)|(laika/)|(laika_repo/)|(rednose_repo/)|(tinygrad/)|(tinygrad_repo/)'
args:
- -rn
- -sn

View File

@ -49,6 +49,11 @@ AddOption('--no-thneed',
dest='no_thneed',
help='avoid using thneed')
AddOption('--pc-thneed',
action='store_true',
dest='pc_thneed',
help='use thneed on pc')
AddOption('--no-test',
action='store_false',
dest='test',

View File

@ -78,7 +78,8 @@ cl_program cl_program_from_file(cl_context ctx, cl_device_id device_id, const ch
}
cl_program cl_program_from_source(cl_context ctx, cl_device_id device_id, const std::string& src, const char* args) {
cl_program prg = CL_CHECK_ERR(clCreateProgramWithSource(ctx, 1, (const char*[]){src.c_str()}, NULL, &err));
const char *csrc = src.c_str();
cl_program prg = CL_CHECK_ERR(clCreateProgramWithSource(ctx, 1, &csrc, NULL, &err));
if (int err = clBuildProgram(prg, 1, &device_id, args, NULL, NULL); err != 0) {
cl_print_build_errors(prg, device_id);
assert(0);
@ -87,7 +88,7 @@ cl_program cl_program_from_source(cl_context ctx, cl_device_id device_id, const
}
cl_program cl_program_from_binary(cl_context ctx, cl_device_id device_id, const uint8_t* binary, size_t length, const char* args) {
cl_program prg = CL_CHECK_ERR(clCreateProgramWithBinary(ctx, 1, &device_id, &length, (const uint8_t*[]){binary}, NULL, &err));
cl_program prg = CL_CHECK_ERR(clCreateProgramWithBinary(ctx, 1, &device_id, &length, &binary, NULL, &err));
if (int err = clBuildProgram(prg, 1, &device_id, args, NULL, NULL); err != 0) {
cl_print_build_errors(prg, device_id);
assert(0);

View File

@ -78,7 +78,7 @@ find . -name 'moc_*' -delete
find . -name '__pycache__' -delete
rm -rf panda/board panda/certs panda/crypto
rm -rf .sconsign.dblite Jenkinsfile release/
rm selfdrive/modeld/models/supercombo.dlc
rm selfdrive/modeld/models/supercombo.onnx
# Move back signed panda fw
mkdir -p panda/board/obj

View File

@ -352,7 +352,7 @@ selfdrive/modeld/models/driving.cc
selfdrive/modeld/models/driving.h
selfdrive/modeld/models/dmonitoring.cc
selfdrive/modeld/models/dmonitoring.h
selfdrive/modeld/models/supercombo.dlc
selfdrive/modeld/models/supercombo.onnx
selfdrive/modeld/models/dmonitoring_model_q.dlc
selfdrive/modeld/transforms/loadyuv.cc
@ -561,3 +561,16 @@ opendbc/vw_mqb_2010.dbc
opendbc/tesla_can.dbc
opendbc/tesla_radar.dbc
opendbc/tesla_powertrain.dbc
tinygrad_repo/openpilot/compile.py
tinygrad_repo/accel/opencl/*
tinygrad_repo/extra/onnx.py
tinygrad_repo/extra/utils.py
tinygrad_repo/tinygrad/llops/ops_gpu.py
tinygrad_repo/tinygrad/llops/ops_opencl.py
tinygrad_repo/tinygrad/helpers.py
tinygrad_repo/tinygrad/mlops.py
tinygrad_repo/tinygrad/ops.py
tinygrad_repo/tinygrad/shapetracker.py
tinygrad_repo/tinygrad/tensor.py
tinygrad_repo/tinygrad/nn/__init__.py

View File

@ -62,25 +62,65 @@ else:
common_model = lenv.Object(common_src)
# build thneed model
if use_thneed and arch == "larch64":
fn = File("models/supercombo").abspath
compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs)
cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}.thneed --binary --optimize"
lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"])
kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels")
cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path})
kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")]
cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd)
lenv.Program('_dmonitoringmodeld', [
"dmonitoringmodeld.cc",
"models/dmonitoring.cc",
]+common_model, LIBS=libs)
lenv.Program('_modeld', [
# build thneed model
if use_thneed and arch == "larch64" or GetOption('pc_thneed'):
fn = File("models/supercombo").abspath
if GetOption('pc_thneed'):
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed"
else:
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && FLOAT16=1 PYOPENCL_NO_CACHE=1 MATMUL=1 NATIVE_EXPLOG=1 OPTWG=1 UNSAFE_FLOAT4=1 DEBUGCL=1 python3 openpilot/compile.py {fn}.onnx {fn}.thneed"
# is there a better way than listing all of tinygrad?
lenv.Command(fn + ".thneed", [fn + ".onnx",
"#tinygrad_repo/openpilot/compile.py",
"#tinygrad_repo/accel/opencl/conv.cl",
"#tinygrad_repo/accel/opencl/matmul.cl",
"#tinygrad_repo/accel/opencl/ops_opencl.py",
"#tinygrad_repo/accel/opencl/preprocessing.py",
"#tinygrad_repo/extra/onnx.py",
"#tinygrad_repo/extra/utils.py",
"#tinygrad_repo/tinygrad/llops/ops_gpu.py",
"#tinygrad_repo/tinygrad/llops/ops_opencl.py",
"#tinygrad_repo/tinygrad/helpers.py",
"#tinygrad_repo/tinygrad/mlops.py",
"#tinygrad_repo/tinygrad/ops.py",
"#tinygrad_repo/tinygrad/shapetracker.py",
"#tinygrad_repo/tinygrad/tensor.py",
"#tinygrad_repo/tinygrad/nn/__init__.py"
], cmd)
# old thneed compiler. TODO: remove this once tinygrad stuff is stable
#compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs)
#cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}.thneed --binary --optimize"
#lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"])
#kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels")
#cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path})
#kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")]
#cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd)
llenv = lenv.Clone()
if GetOption('pc_thneed'):
pc_thneed_src = [
"thneed/thneed_common.cc",
"thneed/thneed_pc.cc",
"thneed/serialize.cc",
"runners/thneedmodel.cc",
]
llenv['CFLAGS'].append("-DUSE_THNEED")
llenv['CXXFLAGS'].append("-DUSE_THNEED")
common_model += llenv.Object(pc_thneed_src)
libs += ['dl']
llenv.Program('_modeld', [
"modeld.cc",
"models/driving.cc",
]+common_model, LIBS=libs + transformations)

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:93d265fc88f05746ce47257e15fc2afe43b250b44715313049f829e8aa30a9d6
size 94302331

View File

@ -33,11 +33,14 @@ void Thneed::load(const char *filename) {
assert(mobj["needs_load"].bool_value() == false);
} else {
if (mobj["needs_load"].bool_value()) {
//printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
ptr += sz;
} else {
clbuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, NULL);
// TODO: is there a faster way to init zeroed out buffers?
void *host_zeros = calloc(sz, 1);
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL);
free(host_zeros);
}
}
assert(clbuf != NULL);

View File

@ -122,7 +122,7 @@ class Thneed {
// all CL kernels
void find_inputs_outputs();
void copy_inputs(float **finputs);
void copy_inputs(float **finputs, bool internal=false);
void copy_output(float *foutput);
cl_int clexec();
vector<shared_ptr<CLQueuedKernel> > kq;

View File

@ -30,17 +30,16 @@ cl_int Thneed::clexec() {
return clFinish(command_queue);
}
void Thneed::copy_inputs(float **finputs) {
//cl_int ret;
void Thneed::copy_inputs(float **finputs, bool internal) {
for (int idx = 0; idx < inputs.size(); ++idx) {
if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]);
// TODO: fix thneed caching
if (internal) {
// if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer
if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
//if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
// HACK
//if (input_sizes[idx] == 16) memset((char*)inputs[idx] + 8, 0, 8);
} else {
if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
}
}
}
@ -202,8 +201,8 @@ void CLQueuedKernel::debug_print(bool verbose) {
assert(slice_pitch == 0);
clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
size_t sz;
clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
size_t sz = 0;
if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
} else {
size_t sz;

View File

@ -0,0 +1,40 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <cassert>
#include "common/clutil.h"
#include "common/timing.h"
// Constructs a Thneed bound to the supplied OpenCL context.
//   do_clinit: when true, clinit() is invoked during construction
//   _context:  OpenCL context stored on the instance
// The debug level is taken from the THNEED_DEBUG environment variable
// (parsed with atoi) and is 0 when the variable is not set.
Thneed::Thneed(bool do_clinit, cl_context _context) {
  context = _context;
  if (do_clinit) {
    clinit();
  }

  const char *debug_level = getenv("THNEED_DEBUG");
  if (debug_level == nullptr) {
    debug = 0;
  } else {
    debug = atoi(debug_level);
  }
}
// Runs the model once: copies the inputs in, executes the queued CL commands,
// and copies the output back out.
//   finputs: array of input pointers, forwarded to copy_inputs()
//   foutput: output destination, forwarded to copy_output()
//   slow:    not referenced by this PC implementation
// When debug >= 1, prints the time elapsed between two nanos_since_boot()
// readings taken around the call, in microseconds.
void Thneed::execute(float **finputs, float *foutput, bool slow) {
  // Zero-initialised so the timestamps are never left indeterminate.
  uint64_t tb = 0;
  uint64_t te = 0;
  if (debug >= 1) tb = nanos_since_boot();

  // ****** copy inputs
  copy_inputs(finputs);

  // ****** run commands
  // NOTE(review): clexec() returns a cl_int status that is discarded here.
  clexec();

  // ****** copy outputs
  copy_output(foutput);

  if (debug >= 1) {
    te = nanos_since_boot();
    // uint64_t is not `unsigned long` on every platform, so cast to a type
    // whose printf specifier is fixed instead of passing it to %lu.
    printf("model exec in %llu us\n", static_cast<unsigned long long>((te - tb) / 1000));
  }
}
// Intentionally a no-op in this implementation.
void Thneed::stop() {}
// Intentionally empty: thneed on PC doesn't work on old style inputs/outputs,
// so this implementation does nothing.
void Thneed::find_inputs_outputs() {}

View File

@ -269,7 +269,7 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs);
copy_inputs(finputs, true);
// ****** set power constraint
int ret;

View File

@ -1 +1 @@
ca90e11f8d59902af38d3785ddd91a27d0fbb411
cffb4e720b0379bedd4ff802912d998ace775c37

1
tinygrad Symbolic link
View File

@ -0,0 +1 @@
tinygrad_repo/tinygrad

1
tinygrad_repo Submodule

@ -0,0 +1 @@
Subproject commit 2e9b7637b3c3c8895fda9f964215db3a35fe3441