Revert ISP image processing + tinygrad bump (#34020)

* Revert "Replace ThneedModel with TinygradModel (#33532)"

This reverts commit da952e9b64.

* Revert "camerad: move E + D cams image pipelines to the IFE (#33959)"

This reverts commit f2a1cce42b.
This commit is contained in:
Adeeb Shihadeh 2024-11-13 19:27:11 -08:00 committed by GitHub
parent 3dc970960d
commit d9d57e5d6f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
27 changed files with 1020 additions and 150 deletions

View File

@ -55,7 +55,7 @@ whitelist = [
"tools/joystick/", "tools/joystick/",
"tools/longitudinal_maneuvers/", "tools/longitudinal_maneuvers/",
"tinygrad_repo/examples/openpilot/compile3.py", "tinygrad_repo/openpilot/compile2.py",
"tinygrad_repo/extra/onnx.py", "tinygrad_repo/extra/onnx.py",
"tinygrad_repo/extra/onnx_ops.py", "tinygrad_repo/extra/onnx_ops.py",
"tinygrad_repo/extra/thneed.py", "tinygrad_repo/extra/thneed.py",

View File

@ -13,6 +13,15 @@ common_src = [
"transforms/transform.cc", "transforms/transform.cc",
] ]
thneed_src_common = [
"thneed/thneed_common.cc",
"thneed/serialize.cc",
]
thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc
# SNPE except on Mac and ARM Linux # SNPE except on Mac and ARM Linux
snpe_lib = [] snpe_lib = []
if arch != "Darwin" and arch != "aarch64": if arch != "Darwin" and arch != "aarch64":
@ -50,18 +59,20 @@ fn = File("models/supercombo").abspath
cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx' cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files, cmd) lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files, cmd)
# Compile tinygrad model # Build thneed model
# TODO this is all super hacky if arch == "larch64" or GetOption('pc_thneed'):
pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"' tinygrad_opts = []
if arch == 'larch64': if not GetOption('pc_thneed'):
device_string = 'QCOM=1' # use FLOAT16 on device for speed + don't cache the CL kernels for space
elif arch == 'Darwin' or arch == 'aarch64': tinygrad_opts += ["FLOAT16=1", "PYOPENCL_NO_CACHE=1"]
device_string = 'CLANG=1 IMAGE=0' cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn}.onnx {fn}.thneed"
else:
device_string = 'GPU=1'
for model_name in ['supercombo', 'dmonitoring_model']: lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)
fn = File(f"models/{model_name}").abspath
cmd = f'{pythonpath_string} {device_string} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl'
lenv.Command(fn + "_tinygrad.pkl", [fn + ".onnx"] + tinygrad_files, cmd)
fn_dm = File("models/dmonitoring_model").abspath
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn_dm}.onnx {fn_dm}.thneed"
lenv.Command(fn_dm + ".thneed", [fn_dm + ".onnx"] + tinygrad_files, cmd)
thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])

View File

@ -1,4 +1,10 @@
#!/usr/bin/env bash #!/usr/bin/env bash
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)" DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
cd "$DIR/../../"
if [ -f "$DIR/libthneed.so" ]; then
export LD_PRELOAD="$DIR/libthneed.so"
fi
exec "$DIR/dmonitoringmodeld.py" "$@" exec "$DIR/dmonitoringmodeld.py" "$@"

View File

@ -1,16 +1,8 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
from openpilot.system.hardware import TICI
## TODO this is hack
if TICI:
GPU_BACKEND = 'QCOM'
else:
GPU_BACKEND = 'GPU'
os.environ[GPU_BACKEND] = '1'
import gc import gc
import math import math
import time import time
import pickle
import ctypes import ctypes
import numpy as np import numpy as np
from pathlib import Path from pathlib import Path
@ -22,11 +14,9 @@ from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
from openpilot.common.swaglog import cloudlog from openpilot.common.swaglog import cloudlog
from openpilot.common.params import Params from openpilot.common.params import Params
from openpilot.common.realtime import set_realtime_priority from openpilot.common.realtime import set_realtime_priority
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext #, cl_from_visionbuf from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext
from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid
#from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
from tinygrad.tensor import Tensor
#from tinygrad.dtype import dtypes
CALIB_LEN = 3 CALIB_LEN = 3
MODEL_WIDTH = 1440 MODEL_WIDTH = 1440
@ -36,7 +26,9 @@ OUTPUT_SIZE = 84 + FEATURE_LEN
PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld" PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED') SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
MODEL_PKL_PATH = Path(__file__).parent / 'models/dmonitoring_model_tinygrad.pkl' MODEL_PATHS = {
ModelRunner.THNEED: Path(__file__).parent / 'models/dmonitoring_model.thneed',
ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'}
class DriverStateResult(ctypes.Structure): class DriverStateResult(ctypes.Structure):
_fields_ = [ _fields_ = [
@ -67,32 +59,33 @@ class DMonitoringModelResult(ctypes.Structure):
class ModelState: class ModelState:
inputs: dict[str, np.ndarray] inputs: dict[str, np.ndarray]
output: np.ndarray output: np.ndarray
model: ModelRunner
def __init__(self, cl_ctx): def __init__(self, cl_ctx):
assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float) assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)
self.numpy_inputs = {'calib': np.zeros((1, CALIB_LEN), dtype=np.float32), self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32)
'input_img': np.zeros((1,MODEL_HEIGHT * MODEL_WIDTH), dtype=np.uint8)} self.inputs = {
self.img = None 'input_img': np.zeros(MODEL_HEIGHT * MODEL_WIDTH, dtype=np.uint8),
'calib': np.zeros(CALIB_LEN, dtype=np.float32)}
self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, cl_ctx)
with open(MODEL_PKL_PATH, "rb") as f: self.model.addInput("input_img", None)
self.model_run = pickle.load(f) self.model.addInput("calib", self.inputs['calib'])
def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]: def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
self.numpy_inputs['calib'][0,:] = calib self.inputs['calib'][:] = calib
t1 = time.perf_counter()
# TODO use opencl buffer directly to make tensor
v_offset = buf.height - MODEL_HEIGHT v_offset = buf.height - MODEL_HEIGHT
h_offset = (buf.width - MODEL_WIDTH) // 2 h_offset = (buf.width - MODEL_WIDTH) // 2
buf_data = buf.data.reshape(-1, buf.stride) buf_data = buf.data.reshape(-1, buf.stride)
self.numpy_inputs['input_img'][:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH].reshape((1, -1)) input_data = self.inputs['input_img'].reshape(MODEL_HEIGHT, MODEL_WIDTH)
input_data[:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH]
tensor_inputs = {k: Tensor(v) for k,v in self.numpy_inputs.items()}
output = self.model_run(**tensor_inputs)['outputs'].numpy().flatten()
self.model.setInputBuffer("input_img", self.inputs['input_img'].view(np.float32))
t1 = time.perf_counter()
self.model.execute()
t2 = time.perf_counter() t2 = time.perf_counter()
return output, t2 - t1 return self.output, t2 - t1
def fill_driver_state(msg, ds_result: DriverStateResult): def fill_driver_state(msg, ds_result: DriverStateResult):

View File

@ -1,12 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
from openpilot.system.hardware import TICI
## TODO this is hack
if TICI:
GPU_BACKEND = 'QCOM'
else:
GPU_BACKEND = 'GPU'
os.environ[GPU_BACKEND] = '1'
import time import time
import pickle import pickle
import numpy as np import numpy as np
@ -25,24 +18,21 @@ from openpilot.common.transformations.camera import DEVICE_CAMERAS
from openpilot.common.transformations.model import get_warp_matrix from openpilot.common.transformations.model import get_warp_matrix
from openpilot.system import sentry from openpilot.system import sentry
from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
from openpilot.selfdrive.modeld.parse_model_outputs import Parser from openpilot.selfdrive.modeld.parse_model_outputs import Parser
from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
from openpilot.selfdrive.modeld.constants import ModelConstants from openpilot.selfdrive.modeld.constants import ModelConstants
from openpilot.selfdrive.modeld.models.commonmodel_pyx import ModelFrame, CLContext from openpilot.selfdrive.modeld.models.commonmodel_pyx import ModelFrame, CLContext
from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
from tinygrad.tensor import Tensor
from tinygrad.dtype import dtypes
PROCESS_NAME = "selfdrive.modeld.modeld" PROCESS_NAME = "selfdrive.modeld.modeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED') SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
MODEL_PATH = Path(__file__).parent / 'models/supercombo.onnx' MODEL_PATHS = {
MODEL_PKL_PATH = Path(__file__).parent / 'models/supercombo_tinygrad.pkl' ModelRunner.THNEED: Path(__file__).parent / 'models/supercombo.thneed',
ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}
METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl' METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'
# TODO: should not hardcoded
IMG_INPUT_SHAPE = (1, 12, 128, 256)
class FrameMeta: class FrameMeta:
frame_id: int = 0 frame_id: int = 0
@ -59,6 +49,7 @@ class ModelState:
inputs: dict[str, np.ndarray] inputs: dict[str, np.ndarray]
output: np.ndarray output: np.ndarray
prev_desire: np.ndarray # for tracking the rising edge of the pulse prev_desire: np.ndarray # for tracking the rising edge of the pulse
model: ModelRunner
def __init__(self, context: CLContext): def __init__(self, context: CLContext):
self.frame = ModelFrame(context) self.frame = ModelFrame(context)
@ -69,14 +60,13 @@ class ModelState:
self.prev_desired_curv_20hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32) self.prev_desired_curv_20hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32)
# img buffers are managed in openCL transform code # img buffers are managed in openCL transform code
self.numpy_inputs = { self.inputs = {
'desire': np.zeros((1, (ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.DESIRE_LEN), dtype=np.float32), 'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
'traffic_convention': np.zeros((1, ModelConstants.TRAFFIC_CONVENTION_LEN), dtype=np.float32), 'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
'lateral_control_params': np.zeros((1, ModelConstants.LATERAL_CONTROL_PARAMS_LEN), dtype=np.float32), 'lateral_control_params': np.zeros(ModelConstants.LATERAL_CONTROL_PARAMS_LEN, dtype=np.float32),
'prev_desired_curv': np.zeros((1,(ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32), 'prev_desired_curv': np.zeros(ModelConstants.PREV_DESIRED_CURV_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
'features_buffer': np.zeros((1, ModelConstants.HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32), 'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
} }
self.img_inputs = {} # type: ignore
with open(METADATA_PATH, 'rb') as f: with open(METADATA_PATH, 'rb') as f:
model_metadata = pickle.load(f) model_metadata = pickle.load(f)
@ -86,8 +76,11 @@ class ModelState:
self.output = np.zeros(net_output_size, dtype=np.float32) self.output = np.zeros(net_output_size, dtype=np.float32)
self.parser = Parser() self.parser = Parser()
with open(MODEL_PKL_PATH, "rb") as f: self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, context)
self.model_run = pickle.load(f) self.model.addInput("input_imgs", None)
self.model.addInput("big_input_imgs", None)
for k,v in self.inputs.items():
self.model.addInput(k, v)
def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]: def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()} parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
@ -104,27 +97,18 @@ class ModelState:
self.desire_20Hz[:-1] = self.desire_20Hz[1:] self.desire_20Hz[:-1] = self.desire_20Hz[1:]
self.desire_20Hz[-1] = new_desire self.desire_20Hz[-1] = new_desire
self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape((1,25,4,-1)).max(axis=2) self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()
self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention'] self.inputs['traffic_convention'][:] = inputs['traffic_convention']
self.numpy_inputs['lateral_control_params'][:] = inputs['lateral_control_params'] self.inputs['lateral_control_params'][:] = inputs['lateral_control_params']
input_imgs_cl = self.frame.prepare(buf, transform.flatten())
big_input_imgs_cl = self.wide_frame.prepare(wbuf, transform_wide.flatten())
if TICI: self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
# The imgs tensors are backed by opencl memory, only need init once self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
if 'input_imgs' not in self.img_inputs:
self.img_inputs['input_imgs'] = qcom_tensor_from_opencl_address(input_imgs_cl.mem_address, IMG_INPUT_SHAPE, dtype=dtypes.uint8)
self.img_inputs['big_input_imgs'] = qcom_tensor_from_opencl_address(big_input_imgs_cl.mem_address, IMG_INPUT_SHAPE, dtype=dtypes.uint8)
else:
self.img_inputs['input_imgs'] = Tensor(self.frame.buffer_from_cl(input_imgs_cl)).reshape(IMG_INPUT_SHAPE)
self.img_inputs['big_input_imgs'] = Tensor(self.wide_frame.buffer_from_cl(big_input_imgs_cl)).reshape(IMG_INPUT_SHAPE)
tensor_inputs = {**self.img_inputs, **{k: Tensor(v) for k,v in self.numpy_inputs.items()}}
if prepare_only: if prepare_only:
return None return None
self.output = self.model_run(**tensor_inputs)['outputs'].numpy().flatten() self.model.execute()
outputs = self.parser.parse_outputs(self.slice_outputs(self.output)) outputs = self.parser.parse_outputs(self.slice_outputs(self.output))
self.full_features_20Hz[:-1] = self.full_features_20Hz[1:] self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
@ -134,9 +118,9 @@ class ModelState:
self.prev_desired_curv_20hz[-1] = outputs['desired_curvature'][0, :] self.prev_desired_curv_20hz[-1] = outputs['desired_curvature'][0, :]
idxs = np.arange(-4,-100,-4)[::-1] idxs = np.arange(-4,-100,-4)[::-1]
self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[idxs] self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
# TODO model only uses last value now, once that changes we need to input strided action history buffer # TODO model only uses last value now, once that changes we need to input strided action history buffer
self.numpy_inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = 0. * self.prev_desired_curv_20hz[-4, :] self.inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = 0. * self.prev_desired_curv_20hz[-4, :]
return outputs return outputs
@ -205,7 +189,7 @@ def main(demo=False):
cloudlog.info("modeld got CarParams: %s", CP.carName) cloudlog.info("modeld got CarParams: %s", CP.carName)
# TODO this needs more thought, use .2s extra for now to estimate other delays # TODO this needs more thought, use .2s extra for now to estimate other delays
steer_delay = .2 steer_delay = CP.steerActuatorDelay + .2
DH = DesireHelper() DH = DesireHelper()

View File

@ -8,7 +8,6 @@
ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) { ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
input_frames = std::make_unique<uint8_t[]>(buf_size); input_frames = std::make_unique<uint8_t[]>(buf_size);
input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err)); q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err)); y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
@ -23,7 +22,7 @@ ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT); loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
} }
cl_mem* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection) { uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
transform_queue(&this->transform, q, transform_queue(&this->transform, q,
yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset, yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection); y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);
@ -32,19 +31,19 @@ cl_mem* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, in
CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr)); CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
} }
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl); loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);
if (output == NULL) {
CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr));
CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
} else {
copy_queue(&loadyuv, q, img_buffer_20hz_cl, *output, 0, 0, frame_size_bytes);
copy_queue(&loadyuv, q, last_img_cl, *output, 0, frame_size_bytes, frame_size_bytes);
copy_queue(&loadyuv, q, img_buffer_20hz_cl, input_frames_cl, 0, 0, frame_size_bytes); // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
copy_queue(&loadyuv, q, last_img_cl, input_frames_cl, 0, frame_size_bytes, frame_size_bytes); clFinish(q);
return NULL;
// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready. }
clFinish(q);
return &input_frames_cl;
}
uint8_t* ModelFrame::buffer_from_cl(cl_mem *in_frames) {
CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, MODEL_FRAME_SIZE * 2 * sizeof(uint8_t), &input_frames[0], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
} }
ModelFrame::~ModelFrame() { ModelFrame::~ModelFrame() {

View File

@ -20,8 +20,7 @@ class ModelFrame {
public: public:
ModelFrame(cl_device_id device_id, cl_context context); ModelFrame(cl_device_id device_id, cl_context context);
~ModelFrame(); ~ModelFrame();
cl_mem* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform); uint8_t* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);
uint8_t* buffer_from_cl(cl_mem *in_frames);
const int MODEL_WIDTH = 512; const int MODEL_WIDTH = 512;
const int MODEL_HEIGHT = 256; const int MODEL_HEIGHT = 256;
@ -33,7 +32,7 @@ private:
Transform transform; Transform transform;
LoadYUVState loadyuv; LoadYUVState loadyuv;
cl_command_queue q; cl_command_queue q;
cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl, input_frames_cl; cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl;
cl_buffer_region region; cl_buffer_region region;
std::unique_ptr<uint8_t[]> input_frames; std::unique_ptr<uint8_t[]> input_frames;
}; };

View File

@ -15,5 +15,4 @@ cdef extern from "selfdrive/modeld/models/commonmodel.h":
cppclass ModelFrame: cppclass ModelFrame:
int buf_size int buf_size
ModelFrame(cl_device_id, cl_context) ModelFrame(cl_device_id, cl_context)
cl_mem * prepare(cl_mem, int, int, int, int, mat3) unsigned char * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)
unsigned char * buffer_from_cl(cl_mem*);

View File

@ -4,7 +4,6 @@
import numpy as np import numpy as np
cimport numpy as cnp cimport numpy as cnp
from libc.string cimport memcpy from libc.string cimport memcpy
from libc.stdint cimport uintptr_t
from msgq.visionipc.visionipc cimport cl_mem from msgq.visionipc.visionipc cimport cl_mem
from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext
@ -24,13 +23,6 @@ cdef class CLMem:
mem.mem = <cl_mem*> cmem mem.mem = <cl_mem*> cmem
return mem return mem
@property
def mem_address(self):
return <uintptr_t>(self.mem)
def cl_from_visionbuf(VisionBuf buf):
return CLMem.create(<void*>&buf.buf.buf_cl)
cdef class ModelFrame: cdef class ModelFrame:
cdef cppModelFrame * frame cdef cppModelFrame * frame
@ -40,14 +32,14 @@ cdef class ModelFrame:
def __dealloc__(self): def __dealloc__(self):
del self.frame del self.frame
def prepare(self, VisionBuf buf, float[:] projection): def prepare(self, VisionBuf buf, float[:] projection, CLMem output):
cdef mat3 cprojection cdef mat3 cprojection
memcpy(cprojection.v, &projection[0], 9*sizeof(float)) memcpy(cprojection.v, &projection[0], 9*sizeof(float))
cdef cl_mem * data cdef unsigned char * data
data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection) if output is None:
return CLMem.create(data) data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, NULL)
else:
def buffer_from_cl(self, CLMem in_frames): data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, output.mem)
cdef unsigned char * data2 if not data:
data2 = self.frame.buffer_from_cl(in_frames.mem) return None
return np.asarray(<cnp.uint8_t[:self.frame.buf_size]> data2) return np.asarray(<cnp.uint8_t[:self.frame.buf_size]> data)

View File

@ -3,18 +3,18 @@ from openpilot.system.hardware import TICI
from openpilot.selfdrive.modeld.runners.runmodel_pyx import RunModel, Runtime from openpilot.selfdrive.modeld.runners.runmodel_pyx import RunModel, Runtime
assert Runtime assert Runtime
USE_TINYGRAD = int(os.getenv('USE_TINYGRAD', str(int(TICI)))) USE_THNEED = int(os.getenv('USE_THNEED', str(int(TICI))))
USE_SNPE = int(os.getenv('USE_SNPE', str(int(TICI)))) USE_SNPE = int(os.getenv('USE_SNPE', str(int(TICI))))
class ModelRunner(RunModel): class ModelRunner(RunModel):
TINYGRAD = 'TINYGRAD' THNEED = 'THNEED'
SNPE = 'SNPE' SNPE = 'SNPE'
ONNX = 'ONNX' ONNX = 'ONNX'
def __new__(cls, paths, *args, **kwargs): def __new__(cls, paths, *args, **kwargs):
if ModelRunner.TINYGRAD in paths and USE_TINYGRAD: if ModelRunner.THNEED in paths and USE_THNEED:
from openpilot.selfdrive.modeld.runners.tinygradmodel import TinygradModel as Runner from openpilot.selfdrive.modeld.runners.thneedmodel_pyx import ThneedModel as Runner
runner_type = ModelRunner.TINYGRAD runner_type = ModelRunner.THNEED
elif ModelRunner.SNPE in paths and USE_SNPE: elif ModelRunner.SNPE in paths and USE_SNPE:
from openpilot.selfdrive.modeld.runners.snpemodel_pyx import SNPEModel as Runner from openpilot.selfdrive.modeld.runners.snpemodel_pyx import SNPEModel as Runner
runner_type = ModelRunner.SNPE runner_type = ModelRunner.SNPE

View File

@ -5,7 +5,6 @@ from libcpp.string cimport string
from .runmodel cimport USE_CPU_RUNTIME, USE_GPU_RUNTIME, USE_DSP_RUNTIME from .runmodel cimport USE_CPU_RUNTIME, USE_GPU_RUNTIME, USE_DSP_RUNTIME
from selfdrive.modeld.models.commonmodel_pyx cimport CLMem from selfdrive.modeld.models.commonmodel_pyx cimport CLMem
import numpy as np
class Runtime: class Runtime:
CPU = USE_CPU_RUNTIME CPU = USE_CPU_RUNTIME
@ -22,12 +21,11 @@ cdef class RunModel:
else: else:
self.model.addInput(name, NULL, 0) self.model.addInput(name, NULL, 0)
def setInputBuffer(self, string name, unsigned char[:] input_buffer): def setInputBuffer(self, string name, float[:] buffer):
cdef int num_floats = len(input_buffer) // sizeof(float) if buffer is not None:
cdef float* float_ptr = <float*> &input_buffer[0] self.model.setInputBuffer(name, &buffer[0], len(buffer))
cdef float[:] float_buffer_view = <float[:num_floats]> float_ptr else:
if float_buffer_view is not None: self.model.setInputBuffer(name, NULL, 0)
self.model.setInputBuffer(name, &float_buffer_view[0], num_floats)
def getCLBuffer(self, string name): def getCLBuffer(self, string name):
cdef void * cl_buf = self.model.getCLBuffer(name) cdef void * cl_buf = self.model.getCLBuffer(name)

View File

@ -0,0 +1,58 @@
#include "selfdrive/modeld/runners/thneedmodel.h"
#include <string>
#include "common/swaglog.h"
ThneedModel::ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool luse_tf8, cl_context context) {
thneed = new Thneed(true, context);
thneed->load(path.c_str());
thneed->clexec();
recorded = false;
output = _output;
}
void* ThneedModel::getCLBuffer(const std::string name) {
int index = -1;
for (int i = 0; i < inputs.size(); i++) {
if (name == inputs[i]->name) {
index = i;
break;
}
}
if (index == -1) {
LOGE("Tried to get CL buffer for input `%s` but no input with this name exists", name.c_str());
assert(false);
}
if (thneed->input_clmem.size() >= inputs.size()) {
return &thneed->input_clmem[inputs.size() - index - 1];
} else {
return nullptr;
}
}
void ThneedModel::execute() {
if (!recorded) {
thneed->record = true;
float *input_buffers[inputs.size()];
for (int i = 0; i < inputs.size(); i++) {
input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
}
thneed->copy_inputs(input_buffers);
thneed->clexec();
thneed->copy_output(output);
thneed->stop();
recorded = true;
} else {
float *input_buffers[inputs.size()];
for (int i = 0; i < inputs.size(); i++) {
input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
}
thneed->execute(input_buffers, output);
}
}

View File

@ -0,0 +1,17 @@
#pragma once
#include <string>
#include "selfdrive/modeld/runners/runmodel.h"
#include "selfdrive/modeld/thneed/thneed.h"
class ThneedModel : public RunModel {
public:
ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
void *getCLBuffer(const std::string name);
void execute();
private:
Thneed *thneed = NULL;
bool recorded;
float *output;
};

View File

@ -0,0 +1,9 @@
# distutils: language = c++
from libcpp.string cimport string
from msgq.visionipc.visionipc cimport cl_context
cdef extern from "selfdrive/modeld/runners/thneedmodel.h":
cdef cppclass ThneedModel:
ThneedModel(string, float*, size_t, int, bool, cl_context)

View File

@ -0,0 +1,14 @@
# distutils: language = c++
# cython: c_string_encoding=ascii, language_level=3
from libcpp cimport bool
from libcpp.string cimport string
from .thneedmodel cimport ThneedModel as cppThneedModel
from selfdrive.modeld.models.commonmodel_pyx cimport CLContext
from selfdrive.modeld.runners.runmodel_pyx cimport RunModel
from selfdrive.modeld.runners.runmodel cimport RunModel as cppRunModel
cdef class ThneedModel(RunModel):
def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
self.model = <cppRunModel *> new cppThneedModel(path, &output[0], len(output), runtime, use_tf8, context.context)

View File

@ -1,8 +0,0 @@
from tinygrad.tensor import Tensor
from tinygrad.helpers import to_mv
def qcom_tensor_from_opencl_address(opencl_address, shape, dtype):
cl_buf_desc_ptr = to_mv(opencl_address, 8).cast('Q')[0]
rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw gpu pointer.
return Tensor.from_blob(rawbuf_ptr, shape, dtype=dtype, device='QCOM')

View File

@ -0,0 +1,8 @@
thneed is an SNPE accelerator. I know SNPE is already an accelerator, but sometimes things need to go even faster..
It runs on the local device, and caches a single model run. Then it replays it, but fast.
thneed slices through abstraction layers like a fish.
You need a thneed.

View File

View File

@ -0,0 +1,154 @@
#include <cassert>
#include <set>
#include "third_party/json11/json11.hpp"
#include "common/util.h"
#include "common/clutil.h"
#include "common/swaglog.h"
#include "selfdrive/modeld/thneed/thneed.h"
using namespace json11;
extern map<cl_program, string> g_program_source;
void Thneed::load(const char *filename) {
LOGD("Thneed::load: loading from %s\n", filename);
string buf = util::read_file(filename);
int jsz = *(int *)buf.data();
string jsonerr;
string jj(buf.data() + sizeof(int), jsz);
Json jdat = Json::parse(jj, jsonerr);
map<cl_mem, cl_mem> real_mem;
real_mem[NULL] = NULL;
int ptr = sizeof(int)+jsz;
for (auto &obj : jdat["objects"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem clbuf = NULL;
if (mobj["buffer_id"].string_value().size() > 0) {
// image buffer must already be allocated
clbuf = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(mobj["needs_load"].bool_value() == false);
} else {
if (mobj["needs_load"].bool_value()) {
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
ptr += sz;
} else {
// TODO: is there a faster way to init zeroed out buffers?
void *host_zeros = calloc(sz, 1);
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL);
free(host_zeros);
}
}
assert(clbuf != NULL);
if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
cl_image_desc desc = {0};
desc.image_type = (mobj["arg_type"] == "image2d_t") ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D_BUFFER;
desc.image_width = mobj["width"].int_value();
desc.image_height = mobj["height"].int_value();
desc.image_row_pitch = mobj["row_pitch"].int_value();
assert(sz == desc.image_height*desc.image_row_pitch);
#ifdef QCOM2
desc.buffer = clbuf;
#else
// TODO: we are creating unused buffers on PC
clReleaseMemObject(clbuf);
#endif
cl_image_format format = {0};
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = mobj["float32"].bool_value() ? CL_FLOAT : CL_HALF_FLOAT;
cl_int errcode;
#ifndef QCOM2
if (mobj["needs_load"].bool_value()) {
clbuf = clCreateImage(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, &format, &desc, &buf[ptr-sz], &errcode);
} else {
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
}
#else
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
#endif
if (clbuf == NULL) {
LOGE("clError: %s create image %zux%zu rp %zu with buffer %p\n", cl_get_error_string(errcode),
desc.image_width, desc.image_height, desc.image_row_pitch, desc.buffer);
}
assert(clbuf != NULL);
}
real_mem[*(cl_mem*)(mobj["id"].string_value().data())] = clbuf;
}
map<string, cl_program> g_programs;
for (const auto &[name, source] : jdat["programs"].object_items()) {
if (debug >= 1) printf("building %s with size %zu\n", name.c_str(), source.string_value().size());
g_programs[name] = cl_program_from_source(context, device_id, source.string_value());
}
for (auto &obj : jdat["inputs"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem aa = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
input_clmem.push_back(aa);
input_sizes.push_back(sz);
LOGD("Thneed::load: adding input %s with size %d\n", mobj["name"].string_value().data(), sz);
cl_int cl_err;
void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &cl_err);
if (cl_err != CL_SUCCESS) LOGE("clError: %s map %p %d\n", cl_get_error_string(cl_err), aa, sz);
assert(cl_err == CL_SUCCESS);
inputs.push_back(ret);
}
for (auto &obj : jdat["outputs"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
LOGD("Thneed::save: adding output with size %d\n", sz);
// TODO: support multiple outputs
output = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(output != NULL);
}
for (auto &obj : jdat["binaries"].array_items()) {
string name = obj["name"].string_value();
size_t length = obj["length"].int_value();
if (debug >= 1) printf("binary %s with size %zu\n", name.c_str(), length);
g_programs[name] = cl_program_from_binary(context, device_id, (const uint8_t*)&buf[ptr], length);
ptr += length;
}
for (auto &obj : jdat["kernels"].array_items()) {
auto gws = obj["global_work_size"];
auto lws = obj["local_work_size"];
auto kk = shared_ptr<CLQueuedKernel>(new CLQueuedKernel(this));
kk->name = obj["name"].string_value();
kk->program = g_programs[kk->name];
kk->work_dim = obj["work_dim"].int_value();
for (int i = 0; i < kk->work_dim; i++) {
kk->global_work_size[i] = gws[i].int_value();
kk->local_work_size[i] = lws[i].int_value();
}
kk->num_args = obj["num_args"].int_value();
for (int i = 0; i < kk->num_args; i++) {
string arg = obj["args"].array_items()[i].string_value();
int arg_size = obj["args_size"].array_items()[i].int_value();
kk->args_size.push_back(arg_size);
if (arg_size == 8) {
cl_mem val = *(cl_mem*)(arg.data());
val = real_mem[val];
kk->args.push_back(string((char*)&val, sizeof(val)));
} else {
kk->args.push_back(arg);
}
}
kq.push_back(kk);
}
clFinish(command_queue);
}

View File

@ -0,0 +1,133 @@
#pragma once
#ifndef __user
#define __user __attribute__(())
#endif
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>
#include <CL/cl.h>
#include "third_party/linux/include/msm_kgsl.h"
using namespace std;
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);
namespace json11 {
class Json;
}
class Thneed;
class GPUMalloc {
public:
GPUMalloc(int size, int fd);
~GPUMalloc();
void *alloc(int size);
private:
uint64_t base;
int remaining;
};
class CLQueuedKernel {
public:
CLQueuedKernel(Thneed *lthneed) { thneed = lthneed; }
CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size);
cl_int exec();
void debug_print(bool verbose);
int get_arg_num(const char *search_arg_name);
cl_program program;
string name;
cl_uint num_args;
vector<string> arg_names;
vector<string> arg_types;
vector<string> args;
vector<int> args_size;
cl_kernel kernel = NULL;
json11::Json to_json() const;
cl_uint work_dim;
size_t global_work_size[3] = {0};
size_t local_work_size[3] = {0};
private:
Thneed *thneed;
};
class CachedIoctl {
public:
virtual void exec() {}
};
class CachedSync: public CachedIoctl {
public:
CachedSync(Thneed *lthneed, string ldata) { thneed = lthneed; data = ldata; }
void exec();
private:
Thneed *thneed;
string data;
};
class CachedCommand: public CachedIoctl {
public:
CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
void exec();
private:
void disassemble(int cmd_index);
struct kgsl_gpu_command cache;
unique_ptr<kgsl_command_object[]> cmds;
unique_ptr<kgsl_command_object[]> objs;
Thneed *thneed;
vector<shared_ptr<CLQueuedKernel> > kq;
};
class Thneed {
public:
Thneed(bool do_clinit=false, cl_context _context = NULL);
void stop();
void execute(float **finputs, float *foutput, bool slow=false);
void wait();
vector<cl_mem> input_clmem;
vector<void *> inputs;
vector<size_t> input_sizes;
cl_mem output = NULL;
cl_context context = NULL;
cl_command_queue command_queue;
cl_device_id device_id;
int context_id;
// protected?
bool record = false;
int debug;
int timestamp;
#ifdef QCOM2
unique_ptr<GPUMalloc> ram;
vector<unique_ptr<CachedIoctl> > cmds;
int fd;
#endif
// all CL kernels
void copy_inputs(float **finputs, bool internal=false);
void copy_output(float *foutput);
cl_int clexec();
vector<shared_ptr<CLQueuedKernel> > kq;
// pending CL kernels
vector<shared_ptr<CLQueuedKernel> > ckq;
// loading
void load(const char *filename);
private:
void clinit();
};

View File

@ -0,0 +1,216 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <cassert>
#include <cstring>
#include <map>
#include "common/clutil.h"
#include "common/timing.h"
map<pair<cl_kernel, int>, string> g_args;
map<pair<cl_kernel, int>, int> g_args_size;
map<cl_program, string> g_program_source;
void Thneed::stop() {
//printf("Thneed::stop: recorded %lu commands\n", cmds.size());
record = false;
}
void Thneed::clinit() {
device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT);
if (context == NULL) context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, NULL, NULL, &err));
//cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
command_queue = CL_CHECK_ERR(clCreateCommandQueueWithProperties(context, device_id, props, &err));
printf("Thneed::clinit done\n");
}
cl_int Thneed::clexec() {
if (debug >= 1) printf("Thneed::clexec: running %lu queued kernels\n", kq.size());
for (auto &k : kq) {
if (record) ckq.push_back(k);
cl_int ret = k->exec();
assert(ret == CL_SUCCESS);
}
return clFinish(command_queue);
}
void Thneed::copy_inputs(float **finputs, bool internal) {
for (int idx = 0; idx < inputs.size(); ++idx) {
if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]);
if (internal) {
// if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer
if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
} else {
if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
}
}
}
void Thneed::copy_output(float *foutput) {
if (output != NULL) {
size_t sz;
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (debug >= 1) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
CL_CHECK(clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL));
} else {
printf("CAUTION: model output is NULL, does it have no outputs?\n");
}
}
// *********** CLQueuedKernel ***********
CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size) {
thneed = lthneed;
kernel = _kernel;
work_dim = _work_dim;
assert(work_dim <= 3);
for (int i = 0; i < work_dim; i++) {
global_work_size[i] = _global_work_size[i];
local_work_size[i] = _local_work_size[i];
}
char _name[0x100];
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(_name), _name, NULL);
name = string(_name);
clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
// get args
for (int i = 0; i < num_args; i++) {
char arg_name[0x100] = {0};
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
args.push_back(g_args[make_pair(kernel, i)]);
args_size.push_back(g_args_size[make_pair(kernel, i)]);
}
// get program
clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, NULL);
}
int CLQueuedKernel::get_arg_num(const char *search_arg_name) {
for (int i = 0; i < num_args; i++) {
if (arg_names[i] == search_arg_name) return i;
}
printf("failed to find %s in %s\n", search_arg_name, name.c_str());
assert(false);
}
cl_int CLQueuedKernel::exec() {
if (kernel == NULL) {
kernel = clCreateKernel(program, name.c_str(), NULL);
arg_names.clear();
arg_types.clear();
for (int j = 0; j < num_args; j++) {
char arg_name[0x100] = {0};
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
cl_int ret;
if (args[j].size() != 0) {
assert(args[j].size() == args_size[j]);
ret = thneed_clSetKernelArg(kernel, j, args[j].size(), args[j].data());
} else {
ret = thneed_clSetKernelArg(kernel, j, args_size[j], NULL);
}
assert(ret == CL_SUCCESS);
}
}
if (thneed->debug >= 1) {
debug_print(thneed->debug >= 2);
}
return clEnqueueNDRangeKernel(thneed->command_queue,
kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL);
}
void CLQueuedKernel::debug_print(bool verbose) {
printf("%p %56s -- ", kernel, name.c_str());
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]);
}
printf(" -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", local_work_size[i]);
}
printf("\n");
if (verbose) {
for (int i = 0; i < num_args; i++) {
string arg = args[i];
printf(" %s %s", arg_types[i].c_str(), arg_names[i].c_str());
void *arg_value = (void*)arg.data();
int arg_size = arg.size();
if (arg_size == 0) {
printf(" (size) %d", args_size[i]);
} else if (arg_size == 1) {
printf(" = %d", *((char*)arg_value));
} else if (arg_size == 2) {
printf(" = %d", *((short*)arg_value));
} else if (arg_size == 4) {
if (arg_types[i] == "float") {
printf(" = %f", *((float*)arg_value));
} else {
printf(" = %d", *((int*)arg_value));
}
} else if (arg_size == 8) {
cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
printf(" = %p", val);
if (val != NULL) {
cl_mem_object_type obj_type;
clGetMemObjectInfo(val, CL_MEM_TYPE, sizeof(obj_type), &obj_type, NULL);
if (arg_types[i] == "image2d_t" || arg_types[i] == "image1d_t" || obj_type == CL_MEM_OBJECT_IMAGE2D) {
cl_image_format format;
size_t width, height, depth, array_size, row_pitch, slice_pitch;
cl_mem buf;
clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
assert(format.image_channel_order == CL_RGBA);
assert(format.image_channel_data_type == CL_HALF_FLOAT || format.image_channel_data_type == CL_FLOAT);
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
assert(depth == 0);
assert(array_size == 0);
assert(slice_pitch == 0);
clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
size_t sz = 0;
if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
} else {
size_t sz;
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" buffer %zu", sz);
}
}
}
printf("\n");
}
}
}
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
g_args_size[make_pair(kernel, arg_index)] = arg_size;
if (arg_value != NULL) {
g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size);
} else {
g_args[make_pair(kernel, arg_index)] = string("");
}
cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
return ret;
}

View File

@ -0,0 +1,32 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <cassert>
#include "common/clutil.h"
#include "common/timing.h"
Thneed::Thneed(bool do_clinit, cl_context _context) {
context = _context;
if (do_clinit) clinit();
char *thneed_debug_env = getenv("THNEED_DEBUG");
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs);
// ****** run commands
clexec();
// ****** copy outputs
copy_output(foutput);
if (debug >= 1) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}

View File

@ -0,0 +1,258 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <dlfcn.h>
#include <sys/mman.h>
#include <cassert>
#include <cerrno>
#include <cstring>
#include <map>
#include <string>
#include "common/clutil.h"
#include "common/timing.h"
Thneed *g_thneed = NULL;
int g_fd = -1;
void hexdump(uint8_t *d, int len) {
assert((len%4) == 0);
printf(" dumping %p len 0x%x\n", d, len);
for (int i = 0; i < len/4; i++) {
if (i != 0 && (i%0x10) == 0) printf("\n");
printf("%8x ", d[i]);
}
printf("\n");
}
// *********** ioctl interceptor ***********
extern "C" {
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
#undef ioctl
int ioctl(int filedes, unsigned long request, void *argp) {
request &= 0xFFFFFFFF; // needed on QCOM2
if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
Thneed *thneed = g_thneed;
// save the fd
if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
// note that this runs always, even without a thneed object
if (request == IOCTL_KGSL_DRAWCTXT_CREATE) {
struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp;
create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK;
create->flags |= 6 << KGSL_CONTEXT_PRIORITY_SHIFT; // priority from 1-15, 1 is max priority
printf("IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x\n", create->flags);
}
if (thneed != NULL) {
if (request == IOCTL_KGSL_GPU_COMMAND) {
struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
if (thneed->record) {
thneed->timestamp = cmd->timestamp;
thneed->context_id = cmd->context_id;
thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd)));
}
if (thneed->debug >= 1) {
printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u numcmds: %d numobjs: %d\n",
thneed->cmds.size(),
cmd->flags,
cmd->context_id, cmd->timestamp, cmd->numcmds, cmd->numobjs);
}
} else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
if (thneed->debug >= 2) {
printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
for (int i = 0; i < cmd->count; i++) {
printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
}
printf("\n");
}
if (thneed->record) {
thneed->cmds.push_back(unique_ptr<CachedSync>(new
CachedSync(thneed, string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count))));
}
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
if (thneed->debug >= 1) {
printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n",
cmd->context_id, cmd->timestamp, cmd->timeout);
}
} else if (request == IOCTL_KGSL_SETPROPERTY) {
if (thneed->debug >= 1) {
struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
if (thneed->debug >= 2) {
hexdump((uint8_t *)prop->value, prop->sizebytes);
if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
hexdump((uint8_t *)constraint->data, constraint->size);
}
}
}
} else if (request == IOCTL_KGSL_DRAWCTXT_CREATE || request == IOCTL_KGSL_DRAWCTXT_DESTROY) {
// this happens
} else if (request == IOCTL_KGSL_GPUOBJ_ALLOC || request == IOCTL_KGSL_GPUOBJ_FREE) {
// this happens
} else {
if (thneed->debug >= 1) {
printf("other ioctl %lx\n", request);
}
}
}
int ret = my_ioctl(filedes, request, argp);
// NOTE: This error message goes into stdout and messes up pyenv
// if (ret != 0) printf("ioctl returned %d with errno %d\n", ret, errno);
return ret;
}
}
// *********** GPUMalloc ***********
GPUMalloc::GPUMalloc(int size, int fd) {
struct kgsl_gpuobj_alloc alloc;
memset(&alloc, 0, sizeof(alloc));
alloc.size = size;
alloc.flags = 0x10000a00;
ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);
void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000);
assert(addr != MAP_FAILED);
base = (uint64_t)addr;
remaining = size;
}
GPUMalloc::~GPUMalloc() {
// TODO: free the GPU malloced area
}
void *GPUMalloc::alloc(int size) {
void *ret = (void*)base;
size = (size+0xff) & (~0xFF);
assert(size <= remaining);
remaining -= size;
base += size;
return ret;
}
// *********** CachedSync, at the ioctl layer ***********
void CachedSync::exec() {
struct kgsl_gpuobj_sync cmd;
cmd.objs = (uint64_t)data.data();
cmd.obj_len = data.length();
cmd.count = data.length() / sizeof(struct kgsl_gpuobj_sync_obj);
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
assert(ret == 0);
}
// *********** CachedCommand, at the ioctl layer ***********
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
thneed = lthneed;
assert(cmd->numsyncs == 0);
memcpy(&cache, cmd, sizeof(cache));
if (cmd->numcmds > 0) {
cmds = make_unique<struct kgsl_command_object[]>(cmd->numcmds);
memcpy(cmds.get(), (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*cmd->numcmds);
cache.cmdlist = (uint64_t)cmds.get();
for (int i = 0; i < cmd->numcmds; i++) {
void *nn = thneed->ram->alloc(cmds[i].size);
memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
cmds[i].gpuaddr = (uint64_t)nn;
}
}
if (cmd->numobjs > 0) {
objs = make_unique<struct kgsl_command_object[]>(cmd->numobjs);
memcpy(objs.get(), (void *)cmd->objlist, sizeof(struct kgsl_command_object)*cmd->numobjs);
cache.objlist = (uint64_t)objs.get();
for (int i = 0; i < cmd->numobjs; i++) {
void *nn = thneed->ram->alloc(objs[i].size);
memset(nn, 0, objs[i].size);
objs[i].gpuaddr = (uint64_t)nn;
}
}
kq = thneed->ckq;
thneed->ckq.clear();
}
void CachedCommand::exec() {
cache.timestamp = ++thneed->timestamp;
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
if (thneed->debug >= 1) printf("CachedCommand::exec got %d\n", ret);
if (thneed->debug >= 2) {
for (auto &it : kq) {
it->debug_print(false);
}
}
assert(ret == 0);
}
// *********** Thneed ***********
Thneed::Thneed(bool do_clinit, cl_context _context) {
// TODO: QCOM2 actually requires a different context
//context = _context;
if (do_clinit) clinit();
assert(g_fd != -1);
fd = g_fd;
ram = make_unique<GPUMalloc>(0x80000, fd);
timestamp = -1;
g_thneed = this;
char *thneed_debug_env = getenv("THNEED_DEBUG");
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
}
void Thneed::wait() {
struct kgsl_device_waittimestamp_ctxtid wait;
wait.context_id = context_id;
wait.timestamp = timestamp;
wait.timeout = -1;
uint64_t tb = nanos_since_boot();
int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot();
if (debug >= 1) printf("wait %d after %lu us\n", wret, (te-tb)/1000);
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs, true);
// ****** run commands
int i = 0;
for (auto &it : cmds) {
++i;
if (debug >= 1) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
it->exec();
if ((i == cmds.size()) || slow) wait();
}
// ****** copy outputs
copy_output(foutput);
if (debug >= 1) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}

View File

@ -36,7 +36,7 @@ CPU usage budget
TEST_DURATION = 25 TEST_DURATION = 25
LOG_OFFSET = 8 LOG_OFFSET = 8
MAX_TOTAL_CPU = 275. # total for all 8 cores MAX_TOTAL_CPU = 265. # total for all 8 cores
PROCS = { PROCS = {
# Baseline CPU usage by process # Baseline CPU usage by process
"selfdrive.controls.controlsd": 16.0, "selfdrive.controls.controlsd": 16.0,
@ -50,8 +50,8 @@ PROCS = {
"selfdrive.locationd.paramsd": 9.0, "selfdrive.locationd.paramsd": 9.0,
"./sensord": 7.0, "./sensord": 7.0,
"selfdrive.controls.radard": 2.0, "selfdrive.controls.radard": 2.0,
"selfdrive.modeld.modeld": 22.0, "selfdrive.modeld.modeld": 17.0,
"selfdrive.modeld.dmonitoringmodeld": 21.0, "selfdrive.modeld.dmonitoringmodeld": 11.0,
"system.hardware.hardwared": 4.0, "system.hardware.hardwared": 4.0,
"selfdrive.locationd.calibrationd": 2.0, "selfdrive.locationd.calibrationd": 2.0,
"selfdrive.locationd.torqued": 5.0, "selfdrive.locationd.torqued": 5.0,
@ -361,15 +361,13 @@ class TestOnroad:
result += "------------------------------------------------\n" result += "------------------------------------------------\n"
result += "----------------- Model Timing -----------------\n" result += "----------------- Model Timing -----------------\n"
result += "------------------------------------------------\n" result += "------------------------------------------------\n"
# TODO: Decrease again when tinygrad speeds ups # TODO: this went up when plannerd cpu usage increased, why?
cfgs = [ cfgs = [
("modelV2", 0.050, 0.040), ("modelV2", 0.050, 0.036),
("driverStateV2", 0.050, 0.026), ("driverStateV2", 0.050, 0.026),
] ]
for (s, instant_max, avg_max) in cfgs: for (s, instant_max, avg_max) in cfgs:
ts = [getattr(m, s).modelExecutionTime for m in self.msgs[s]] ts = [getattr(m, s).modelExecutionTime for m in self.msgs[s]]
# TODO some tinygrad init happens in first iteration
ts = ts[1:]
assert max(ts) < instant_max, f"high '{s}' execution time: {max(ts)}" assert max(ts) < instant_max, f"high '{s}' execution time: {max(ts)}"
assert np.mean(ts) < avg_max, f"high avg '{s}' execution time: {np.mean(ts)}" assert np.mean(ts) < avg_max, f"high avg '{s}' execution time: {np.mean(ts)}"
result += f"'{s}' execution time: min {min(ts):.5f}s\n" result += f"'{s}' execution time: min {min(ts):.5f}s\n"

View File

@ -55,7 +55,7 @@ public:
float fl_pix = 0; float fl_pix = 0;
CameraState(SpectraMaster *master, const CameraConfig &config) : camera(master, config, config.stream_type == VISION_STREAM_ROAD) {}; CameraState(SpectraMaster *master, const CameraConfig &config) : camera(master, config, true /*config.stream_type == VISION_STREAM_ROAD*/) {};
~CameraState(); ~CameraState();
void init(VisionIpcServer *v, cl_device_id device_id, cl_context ctx); void init(VisionIpcServer *v, cl_device_id device_id, cl_context ctx);
void update_exposure_score(float desired_ev, int exp_t, int exp_g_idx, float exp_gain); void update_exposure_score(float desired_ev, int exp_t, int exp_g_idx, float exp_gain);

View File

@ -31,7 +31,7 @@ class Proc:
PROCS = [ PROCS = [
Proc(['camerad'], 1.75, msgs=['roadCameraState', 'wideRoadCameraState', 'driverCameraState']), Proc(['camerad'], 2.1, msgs=['roadCameraState', 'wideRoadCameraState', 'driverCameraState']),
Proc(['modeld'], 1.12, atol=0.2, msgs=['modelV2']), Proc(['modeld'], 1.12, atol=0.2, msgs=['modelV2']),
Proc(['dmonitoringmodeld'], 0.5, msgs=['driverStateV2']), Proc(['dmonitoringmodeld'], 0.5, msgs=['driverStateV2']),
Proc(['encoderd'], 0.23, msgs=[]), Proc(['encoderd'], 0.23, msgs=[]),

@ -1 +1 @@
Subproject commit ad119af6a511373e1c016a6525ab733f14a60c51 Subproject commit 9dda6d260db0255750bacff61e3cee1e580567e1