From 593c3a0c8e69686ee821b2a0b2cfa9901b67dc27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Harald=20Sch=C3=A4fer?= Date: Fri, 6 Feb 2026 14:13:46 -0800 Subject: [PATCH] Calibrate in tg (#36621) * squash * bump tg * fix linmt * Ready to merge * cleaner * match modeld * more dead stuff --- selfdrive/modeld/SConscript | 49 ++--- selfdrive/modeld/compile_warp.py | 206 ++++++++++++++++++ selfdrive/modeld/dmonitoringmodeld.py | 27 ++- selfdrive/modeld/modeld.py | 47 ++-- selfdrive/modeld/models/commonmodel.cc | 64 ------ selfdrive/modeld/models/commonmodel.h | 97 --------- selfdrive/modeld/models/commonmodel.pxd | 27 --- selfdrive/modeld/models/commonmodel_pyx.pxd | 13 -- selfdrive/modeld/models/commonmodel_pyx.pyx | 74 ------- selfdrive/modeld/transforms/loadyuv.cc | 76 ------- selfdrive/modeld/transforms/loadyuv.cl | 47 ---- selfdrive/modeld/transforms/loadyuv.h | 20 -- selfdrive/modeld/transforms/transform.cc | 97 --------- selfdrive/modeld/transforms/transform.cl | 54 ----- selfdrive/modeld/transforms/transform.h | 25 --- selfdrive/test/process_replay/model_replay.py | 2 +- .../test/process_replay/process_replay.py | 15 +- 17 files changed, 280 insertions(+), 660 deletions(-) create mode 100755 selfdrive/modeld/compile_warp.py delete mode 100644 selfdrive/modeld/models/commonmodel.cc delete mode 100644 selfdrive/modeld/models/commonmodel.h delete mode 100644 selfdrive/modeld/models/commonmodel.pxd delete mode 100644 selfdrive/modeld/models/commonmodel_pyx.pxd delete mode 100644 selfdrive/modeld/models/commonmodel_pyx.pyx delete mode 100644 selfdrive/modeld/transforms/loadyuv.cc delete mode 100644 selfdrive/modeld/transforms/loadyuv.cl delete mode 100644 selfdrive/modeld/transforms/loadyuv.h delete mode 100644 selfdrive/modeld/transforms/transform.cc delete mode 100644 selfdrive/modeld/transforms/transform.cl delete mode 100644 selfdrive/modeld/transforms/transform.h diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript index 91f3597447..72d3f95c41 100644 --- a/selfdrive/modeld/SConscript +++ b/selfdrive/modeld/SConscript @@ -1,35 +1,10 @@ import os import glob -Import('env', 'envCython', 'arch', 'cereal', 'messaging', 'common', 'visionipc') +Import('env', 'arch') lenv = env.Clone() -lenvCython = envCython.Clone() -libs = [cereal, messaging, visionipc, common, 'capnp', 'kj', 'pthread'] -frameworks = [] - -common_src = [ - "models/commonmodel.cc", - "transforms/loadyuv.cc", - "transforms/transform.cc", -] - -# OpenCL is a framework on Mac -if arch == "Darwin": - frameworks += ['OpenCL'] -else: - libs += ['OpenCL'] - -# Set path definitions -for pathdef, fn in {'TRANSFORM': 'transforms/transform.cl', 'LOADYUV': 'transforms/loadyuv.cl'}.items(): - for xenv in (lenv, lenvCython): - xenv['CXXFLAGS'].append(f'-D{pathdef}_PATH=\\"{File(fn).abspath}\\"') - -# Compile cython -cython_libs = envCython["LIBS"] + libs -commonmodel_lib = lenv.Library('commonmodel', common_src) -lenvCython.Program('models/commonmodel_pyx.so', 'models/commonmodel_pyx.pyx', LIBS=[commonmodel_lib, *cython_libs], FRAMEWORKS=frameworks) -tinygrad_files = sorted(["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "/**", recursive=True, root_dir=env.Dir("#").abspath) if 'pycache' not in x]) +tinygrad_files = ["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "/**", recursive=True, root_dir=env.Dir("#").abspath) if 'pycache' not in x] # Get model metadata for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']: @@ -38,22 +13,30 @@ for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']: cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx' lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files + script_files, cmd) +# compile warp +tg_flags = { + 'larch64': 'DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0', + 'Darwin': f'DEV=CPU HOME={os.path.expanduser("~")}', # tinygrad calls brew which needs a $HOME in the env +}.get(arch, 'DEV=CPU CPU_LLVM=1') +image_flag = { + 'larch64': 'IMAGE=2', +}.get(arch, 'IMAGE=0') +script_files = [File(Dir("#selfdrive/modeld").File("compile_warp.py").abspath)] +cmd = f'{tg_flags} python3 {Dir("#selfdrive/modeld").abspath}/compile_warp.py ' +lenv.Command(fn + "warp_tinygrad.pkl", tinygrad_files + script_files, cmd) + def tg_compile(flags, model_name): pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"' fn = File(f"models/{model_name}").abspath return lenv.Command( fn + "_tinygrad.pkl", [fn + ".onnx"] + tinygrad_files, - f'{pythonpath_string} {flags} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl' + f'{pythonpath_string} {flags} {image_flag} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl' ) # Compile small models for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']: - flags = { - 'larch64': 'DEV=QCOM FLOAT16=1 NOLOCALS=1 IMAGE=2 JIT_BATCH_SIZE=0', - 'Darwin': f'DEV=CPU HOME={os.path.expanduser("~")}', # tinygrad calls brew which needs a $HOME in the env - }.get(arch, 'DEV=CPU CPU_LLVM=1') - tg_compile(flags, model_name) + tg_compile(tg_flags, model_name) # Compile BIG model if USB GPU is available if "USBGPU" in os.environ: diff --git a/selfdrive/modeld/compile_warp.py b/selfdrive/modeld/compile_warp.py new file mode 100755 index 0000000000..5adb60e624 --- /dev/null +++ b/selfdrive/modeld/compile_warp.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +import time +import pickle +import numpy as np +from pathlib import Path +from tinygrad.tensor import Tensor +from tinygrad.helpers import Context +from tinygrad.device import Device +from tinygrad.engine.jit import TinyJit + +from openpilot.system.camerad.cameras.nv12_info import get_nv12_info +from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE +from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye + +MODELS_DIR = Path(__file__).parent / 'models' + +CAMERA_CONFIGS = [ + (_ar_ox_fisheye.width, _ar_ox_fisheye.height), # tici: 1928x1208 + (_os_fisheye.width, _os_fisheye.height), # mici: 1344x760 +] + +UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32) +UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX) + +IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2) + + +def warp_pkl_path(w, h): + return MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl' + + +def dm_warp_pkl_path(w, h): + return MODELS_DIR / f'dm_warp_{w}x{h}_tinygrad.pkl' + + +def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad, ratio): + w_dst, h_dst = dst_shape + h_src, w_src = src_shape + + x = Tensor.arange(w_dst).reshape(1, w_dst).expand(h_dst, w_dst) + y = Tensor.arange(h_dst).reshape(h_dst, 1).expand(h_dst, w_dst) + ones = Tensor.ones_like(x) + dst_coords = x.reshape(1, -1).cat(y.reshape(1, -1)).cat(ones.reshape(1, -1)) + + src_coords = M_inv @ dst_coords + src_coords = src_coords / src_coords[2:3, :] + + x_nn_clipped = Tensor.round(src_coords[0]).clip(0, w_src - 1).cast('int') + y_nn_clipped = Tensor.round(src_coords[1]).clip(0, h_src - 1).cast('int') + idx = y_nn_clipped * w_src + (y_nn_clipped * ratio).cast('int') * stride_pad + x_nn_clipped + + sampled = src_flat[idx] + return sampled + + +def frames_to_tensor(frames, model_w, model_h): + H = (frames.shape[0] * 2) // 3 + W = frames.shape[1] + in_img1 = Tensor.cat(frames[0:H:2, 0::2], + frames[1:H:2, 0::2], + frames[0:H:2, 1::2], + frames[1:H:2, 1::2], + frames[H:H+H//4].reshape((H//2, W//2)), + frames[H+H//4:H+H//2].reshape((H//2, W//2)), dim=0).reshape((6, H//2, W//2)) + return in_img1 + + +def make_frame_prepare(cam_w, cam_h, model_w, model_h): + stride, y_height, _, _ = get_nv12_info(cam_w, cam_h) + uv_offset = stride * y_height + stride_pad = stride - cam_w + + def frame_prepare_tinygrad(input_frame, M_inv): + tg_scale = Tensor(UV_SCALE_MATRIX) + M_inv_uv = tg_scale @ M_inv @ Tensor(UV_SCALE_MATRIX_INV) + with Context(SPLIT_REDUCEOP=0): + y = warp_perspective_tinygrad(input_frame[:cam_h*stride], + M_inv, (model_w, model_h), + (cam_h, cam_w), stride_pad, 1).realize() + u = warp_perspective_tinygrad(input_frame[uv_offset:uv_offset + (cam_h//4)*stride], + M_inv_uv, (model_w//2, model_h//2), + (cam_h//2, cam_w//2), stride_pad, 0.5).realize() + v = warp_perspective_tinygrad(input_frame[uv_offset + (cam_h//4)*stride:uv_offset + (cam_h//2)*stride], + M_inv_uv, (model_w//2, model_h//2), + (cam_h//2, cam_w//2), stride_pad, 0.5).realize() + yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w)) + tensor = frames_to_tensor(yuv, model_w, model_h) + return tensor + return frame_prepare_tinygrad + + +def make_update_img_input(frame_prepare, model_w, model_h): + def update_img_input_tinygrad(tensor, frame, M_inv): + M_inv = M_inv.to(Device.DEFAULT) + new_img = frame_prepare(frame, M_inv) + full_buffer = tensor[6:].cat(new_img, dim=0).contiguous() + return full_buffer, Tensor.cat(full_buffer[:6], full_buffer[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2) + return update_img_input_tinygrad + + +def make_update_both_imgs(frame_prepare, model_w, model_h): + update_img = make_update_img_input(frame_prepare, model_w, model_h) + + def update_both_imgs_tinygrad(calib_img_buffer, new_img, M_inv, + calib_big_img_buffer, new_big_img, M_inv_big): + calib_img_buffer, calib_img_pair = update_img(calib_img_buffer, new_img, M_inv) + calib_big_img_buffer, calib_big_img_pair = update_img(calib_big_img_buffer, new_big_img, M_inv_big) + return calib_img_buffer, calib_img_pair, calib_big_img_buffer, calib_big_img_pair + return update_both_imgs_tinygrad + + +def make_warp_dm(cam_w, cam_h, dm_w, dm_h): + stride, y_height, _, _ = get_nv12_info(cam_w, cam_h) + stride_pad = stride - cam_w + + def warp_dm(input_frame, M_inv): + M_inv = M_inv.to(Device.DEFAULT) + with Context(SPLIT_REDUCEOP=0): + result = warp_perspective_tinygrad(input_frame[:cam_h*stride], M_inv, (dm_w, dm_h), (cam_h, cam_w), stride_pad, 1).reshape(-1, dm_h * dm_w) + return result + return warp_dm + + +def compile_modeld_warp(cam_w, cam_h): + model_w, model_h = MEDMODEL_INPUT_SIZE + _, _, _, yuv_size = get_nv12_info(cam_w, cam_h) + + print(f"Compiling modeld warp for {cam_w}x{cam_h}...") + + frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h) + update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h) + update_img_jit = TinyJit(update_both_imgs, prune=True) + + full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize() + big_full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize() + full_buffer_np = np.zeros(IMG_BUFFER_SHAPE, dtype=np.uint8) + big_full_buffer_np = np.zeros(IMG_BUFFER_SHAPE, dtype=np.uint8) + + for i in range(10): + new_frame_np = (32 * np.random.randn(yuv_size).astype(np.float32) + 128).clip(0, 255).astype(np.uint8) + img_inputs = [full_buffer, + Tensor.from_blob(new_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(), + Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')] + new_big_frame_np = (32 * np.random.randn(yuv_size).astype(np.float32) + 128).clip(0, 255).astype(np.uint8) + big_img_inputs = [big_full_buffer, + Tensor.from_blob(new_big_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(), + Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')] + inputs = img_inputs + big_img_inputs + Device.default.synchronize() + + inputs_np = [x.numpy() for x in inputs] + inputs_np[0] = full_buffer_np + inputs_np[3] = big_full_buffer_np + + st = time.perf_counter() + out = update_img_jit(*inputs) + full_buffer = out[0].contiguous().realize().clone() + big_full_buffer = out[2].contiguous().realize().clone() + mt = time.perf_counter() + Device.default.synchronize() + et = time.perf_counter() + print(f" [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms") + + pkl_path = warp_pkl_path(cam_w, cam_h) + with open(pkl_path, "wb") as f: + pickle.dump(update_img_jit, f) + print(f" Saved to {pkl_path}") + + jit = pickle.load(open(pkl_path, "rb")) + jit(*inputs) + + +def compile_dm_warp(cam_w, cam_h): + dm_w, dm_h = DM_INPUT_SIZE + _, _, _, yuv_size = get_nv12_info(cam_w, cam_h) + + print(f"Compiling DM warp for {cam_w}x{cam_h}...") + + warp_dm = make_warp_dm(cam_w, cam_h, dm_w, dm_h) + warp_dm_jit = TinyJit(warp_dm, prune=True) + + for i in range(10): + inputs = [Tensor.from_blob((32 * Tensor.randn(yuv_size,) + 128).cast(dtype='uint8').realize().numpy().ctypes.data, (yuv_size,), dtype='uint8'), + Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')] + Device.default.synchronize() + st = time.perf_counter() + warp_dm_jit(*inputs) + mt = time.perf_counter() + Device.default.synchronize() + et = time.perf_counter() + print(f" [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms") + + pkl_path = dm_warp_pkl_path(cam_w, cam_h) + with open(pkl_path, "wb") as f: + pickle.dump(warp_dm_jit, f) + print(f" Saved to {pkl_path}") + + +def run_and_save_pickle(): + for cam_w, cam_h in CAMERA_CONFIGS: + compile_modeld_warp(cam_w, cam_h) + compile_dm_warp(cam_w, cam_h) + + +if __name__ == "__main__": + run_and_save_pickle() diff --git a/selfdrive/modeld/dmonitoringmodeld.py b/selfdrive/modeld/dmonitoringmodeld.py index fca762c69b..f12c13081b 100755 --- a/selfdrive/modeld/dmonitoringmodeld.py +++ b/selfdrive/modeld/dmonitoringmodeld.py @@ -3,7 +3,6 @@ import os from openpilot.system.hardware import TICI os.environ['DEV'] = 'QCOM' if TICI else 'CPU' from tinygrad.tensor import Tensor -from tinygrad.dtype import dtypes import time import pickle import numpy as np @@ -12,19 +11,19 @@ from pathlib import Path from cereal import messaging from cereal.messaging import PubMaster, SubMaster from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf +from msgq.visionipc.visionipc_pyx import CLContext from openpilot.common.swaglog import cloudlog from openpilot.common.realtime import config_realtime_process from openpilot.common.transformations.model import dmonitoringmodel_intrinsics from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye -from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext, MonitoringModelFrame +from openpilot.system.camerad.cameras.nv12_info import get_nv12_info from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid, safe_exp -from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld" SEND_RAW_PRED = os.getenv('SEND_RAW_PRED') MODEL_PKL_PATH = Path(__file__).parent / 'models/dmonitoring_model_tinygrad.pkl' METADATA_PATH = Path(__file__).parent / 'models/dmonitoring_model_metadata.pkl' - +MODELS_DIR = Path(__file__).parent / 'models' class ModelState: inputs: dict[str, np.ndarray] @@ -36,12 +35,15 @@ class ModelState: self.input_shapes = model_metadata['input_shapes'] self.output_slices = model_metadata['output_slices'] - self.frame = MonitoringModelFrame(cl_ctx) self.numpy_inputs = { 'calib': np.zeros(self.input_shapes['calib'], dtype=np.float32), } + self.warp_inputs_np = {'transform': np.zeros((3,3), dtype=np.float32)} + self.warp_inputs = {k: Tensor(v, device='NPY') for k,v in self.warp_inputs_np.items()} + self.frame_buf_params = None self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()} + self.image_warp = None with open(MODEL_PKL_PATH, "rb") as f: self.model_run = pickle.load(f) @@ -50,14 +52,15 @@ class ModelState: t1 = time.perf_counter() - input_img_cl = self.frame.prepare(buf, transform.flatten()) - if TICI: - # The imgs tensors are backed by opencl memory, only need init once - if 'input_img' not in self.tensor_inputs: - self.tensor_inputs['input_img'] = qcom_tensor_from_opencl_address(input_img_cl.mem_address, self.input_shapes['input_img'], dtype=dtypes.uint8) - else: - self.tensor_inputs['input_img'] = Tensor(self.frame.buffer_from_cl(input_img_cl).reshape(self.input_shapes['input_img']), dtype=dtypes.uint8).realize() + if self.image_warp is None: + self.frame_buf_params = get_nv12_info(buf.width, buf.height) + warp_path = MODELS_DIR / f'dm_warp_{buf.width}x{buf.height}_tinygrad.pkl' + with open(warp_path, "rb") as f: + self.image_warp = pickle.load(f) + self.warp_inputs['frame'] = Tensor.from_blob(buf.data.ctypes.data, (self.frame_buf_params[3],), dtype='uint8').realize() + self.warp_inputs_np['transform'][:] = transform[:] + self.tensor_inputs['input_img'] = self.image_warp(self.warp_inputs['frame'], self.warp_inputs['transform']).realize() output = self.model_run(**self.tensor_inputs).contiguous().realize().uop.base.buffer.numpy() diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py index 1f347dc32a..d9445c8d33 100755 --- a/selfdrive/modeld/modeld.py +++ b/selfdrive/modeld/modeld.py @@ -7,7 +7,6 @@ if USBGPU: os.environ['DEV'] = 'AMD' os.environ['AMD_IFACE'] = 'USB' from tinygrad.tensor import Tensor -from tinygrad.dtype import dtypes import time import pickle import numpy as np @@ -16,20 +15,20 @@ from cereal import car, log from pathlib import Path from cereal.messaging import PubMaster, SubMaster from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf +from msgq.visionipc.visionipc_pyx import CLContext from opendbc.car.car_helpers import get_demo_car_params from openpilot.common.swaglog import cloudlog from openpilot.common.params import Params from openpilot.common.filter_simple import FirstOrderFilter from openpilot.common.realtime import config_realtime_process, DT_MDL from openpilot.common.transformations.camera import DEVICE_CAMERAS +from openpilot.system.camerad.cameras.nv12_info import get_nv12_info from openpilot.common.transformations.model import get_warp_matrix from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan from openpilot.selfdrive.modeld.parse_model_outputs import Parser from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState from openpilot.selfdrive.modeld.constants import ModelConstants, Plan -from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext -from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address PROCESS_NAME = "selfdrive.modeld.modeld" @@ -39,11 +38,15 @@ VISION_PKL_PATH = Path(__file__).parent / 'models/driving_vision_tinygrad.pkl' POLICY_PKL_PATH = Path(__file__).parent / 'models/driving_policy_tinygrad.pkl' VISION_METADATA_PATH = Path(__file__).parent / 'models/driving_vision_metadata.pkl' POLICY_METADATA_PATH = Path(__file__).parent / 'models/driving_policy_metadata.pkl' +MODELS_DIR = Path(__file__).parent / 'models' LAT_SMOOTH_SECONDS = 0.0 LONG_SMOOTH_SECONDS = 0.3 MIN_LAT_CONTROL_SPEED = 0.3 +IMG_QUEUE_SHAPE = (6*(ModelConstants.MODEL_RUN_FREQ//ModelConstants.MODEL_CONTEXT_FREQ + 1), 128, 256) +assert IMG_QUEUE_SHAPE[0] == 30 + def get_action_from_model(model_output: dict[str, np.ndarray], prev_action: log.ModelDataV2.Action, lat_action_t: float, long_action_t: float, v_ego: float) -> log.ModelDataV2.Action: @@ -136,7 +139,6 @@ class InputQueues: return out class ModelState: - frames: dict[str, DrivingModelFrame] inputs: dict[str, np.ndarray] output: np.ndarray prev_desire: np.ndarray # for tracking the rising edge of the pulse @@ -155,7 +157,6 @@ class ModelState: self.policy_output_slices = policy_metadata['output_slices'] policy_output_size = policy_metadata['output_shapes']['outputs'][1] - self.frames = {name: DrivingModelFrame(context, ModelConstants.MODEL_RUN_FREQ//ModelConstants.MODEL_CONTEXT_FREQ) for name in self.vision_input_names} self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32) # policy inputs @@ -166,11 +167,17 @@ class ModelState: self.full_input_queues.reset() # img buffers are managed in openCL transform code - self.vision_inputs: dict[str, Tensor] = {} + self.img_queues = {'img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(), + 'big_img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),} + self.full_frames : dict[str, Tensor] = {} + self.transforms_np = {k: np.zeros((3,3), dtype=np.float32) for k in self.img_queues} + self.transforms = {k: Tensor(v, device='NPY').realize() for k, v in self.transforms_np.items()} self.vision_output = np.zeros(vision_output_size, dtype=np.float32) self.policy_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()} self.policy_output = np.zeros(policy_output_size, dtype=np.float32) self.parser = Parser() + self.frame_buf_params : dict[str, tuple[int, int, int, int]] = {} + self.update_imgs = None with open(VISION_PKL_PATH, "rb") as f: self.vision_run = pickle.load(f) @@ -188,23 +195,28 @@ class ModelState: inputs['desire_pulse'][0] = 0 new_desire = np.where(inputs['desire_pulse'] - self.prev_desire > .99, inputs['desire_pulse'], 0) self.prev_desire[:] = inputs['desire_pulse'] + if self.update_imgs is None: + for key in bufs.keys(): + w, h = bufs[key].width, bufs[key].height + self.frame_buf_params[key] = get_nv12_info(w, h) + warp_path = MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl' + with open(warp_path, "rb") as f: + self.update_imgs = pickle.load(f) - imgs_cl = {name: self.frames[name].prepare(bufs[name], transforms[name].flatten()) for name in self.vision_input_names} - if TICI and not USBGPU: - # The imgs tensors are backed by opencl memory, only need init once - for key in imgs_cl: - if key not in self.vision_inputs: - self.vision_inputs[key] = qcom_tensor_from_opencl_address(imgs_cl[key].mem_address, self.vision_input_shapes[key], dtype=dtypes.uint8) - else: - for key in imgs_cl: - frame_input = self.frames[key].buffer_from_cl(imgs_cl[key]).reshape(self.vision_input_shapes[key]) - self.vision_inputs[key] = Tensor(frame_input, dtype=dtypes.uint8).realize() + for key in bufs.keys(): + self.full_frames[key] = Tensor.from_blob(bufs[key].data.ctypes.data, (self.frame_buf_params[key][3],), dtype='uint8').realize() + self.transforms_np[key][:,:] = transforms[key][:,:] + + out = self.update_imgs(self.img_queues['img'], self.full_frames['img'], self.transforms['img'], + self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img']) + self.img_queues['img'], self.img_queues['big_img'], = out[0].realize(), out[2].realize() + vision_inputs = {'img': out[1], 'big_img': out[3]} if prepare_only: return None - self.vision_output = self.vision_run(**self.vision_inputs).contiguous().realize().uop.base.buffer.numpy() + self.vision_output = self.vision_run(**vision_inputs).contiguous().realize().uop.base.buffer.numpy() vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(self.vision_output, self.vision_output_slices)) self.full_input_queues.enqueue({'features_buffer': vision_outputs_dict['hidden_state'], 'desire_pulse': new_desire}) @@ -214,7 +226,6 @@ class ModelState: self.policy_output = self.policy_run(**self.policy_inputs).contiguous().realize().uop.base.buffer.numpy() policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(self.policy_output, self.policy_output_slices)) - combined_outputs_dict = {**vision_outputs_dict, **policy_outputs_dict} if SEND_RAW_PRED: combined_outputs_dict['raw_pred'] = np.concatenate([self.vision_output.copy(), self.policy_output.copy()]) diff --git a/selfdrive/modeld/models/commonmodel.cc b/selfdrive/modeld/models/commonmodel.cc deleted file mode 100644 index d3341e76ec..0000000000 --- a/selfdrive/modeld/models/commonmodel.cc +++ /dev/null @@ -1,64 +0,0 @@ -#include "selfdrive/modeld/models/commonmodel.h" - -#include -#include - -#include "common/clutil.h" - -DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context, int _temporal_skip) : ModelFrame(device_id, context) { - input_frames = std::make_unique(buf_size); - temporal_skip = _temporal_skip; - input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err)); - img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (temporal_skip+1)*frame_size_bytes, NULL, &err)); - region.origin = temporal_skip * frame_size_bytes; - region.size = frame_size_bytes; - last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err)); - - loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT); - init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT); -} - -cl_mem* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) { - run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection); - - for (int i = 0; i < temporal_skip; i++) { - CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr)); - } - loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl); - - copy_queue(&loadyuv, q, img_buffer_20hz_cl, input_frames_cl, 0, 0, frame_size_bytes); - copy_queue(&loadyuv, q, last_img_cl, input_frames_cl, 0, frame_size_bytes, frame_size_bytes); - - // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready. - clFinish(q); - return &input_frames_cl; -} - -DrivingModelFrame::~DrivingModelFrame() { - deinit_transform(); - loadyuv_destroy(&loadyuv); - CL_CHECK(clReleaseMemObject(input_frames_cl)); - CL_CHECK(clReleaseMemObject(img_buffer_20hz_cl)); - CL_CHECK(clReleaseMemObject(last_img_cl)); - CL_CHECK(clReleaseCommandQueue(q)); -} - - -MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) { - input_frames = std::make_unique(buf_size); - input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err)); - - init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT); -} - -cl_mem* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) { - run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection); - clFinish(q); - return &y_cl; -} - -MonitoringModelFrame::~MonitoringModelFrame() { - deinit_transform(); - CL_CHECK(clReleaseMemObject(input_frame_cl)); - CL_CHECK(clReleaseCommandQueue(q)); -} diff --git a/selfdrive/modeld/models/commonmodel.h b/selfdrive/modeld/models/commonmodel.h deleted file mode 100644 index 176d7eb6dc..0000000000 --- a/selfdrive/modeld/models/commonmodel.h +++ /dev/null @@ -1,97 +0,0 @@ -#pragma once - -#include -#include -#include - -#include - -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#ifdef __APPLE__ -#include -#else -#include -#endif - -#include "common/mat.h" -#include "selfdrive/modeld/transforms/loadyuv.h" -#include "selfdrive/modeld/transforms/transform.h" - -class ModelFrame { -public: - ModelFrame(cl_device_id device_id, cl_context context) { - q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err)); - } - virtual ~ModelFrame() {} - virtual cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) { return NULL; } - uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) { - CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr)); - clFinish(q); - return &input_frames[0]; - } - - int MODEL_WIDTH; - int MODEL_HEIGHT; - int MODEL_FRAME_SIZE; - int buf_size; - -protected: - cl_mem y_cl, u_cl, v_cl; - Transform transform; - cl_command_queue q; - std::unique_ptr input_frames; - - void init_transform(cl_device_id device_id, cl_context context, int model_width, int model_height) { - y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, model_width * model_height, NULL, &err)); - u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err)); - v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err)); - transform_init(&transform, context, device_id); - } - - void deinit_transform() { - transform_destroy(&transform); - CL_CHECK(clReleaseMemObject(v_cl)); - CL_CHECK(clReleaseMemObject(u_cl)); - CL_CHECK(clReleaseMemObject(y_cl)); - } - - void run_transform(cl_mem yuv_cl, int model_width, int model_height, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) { - transform_queue(&transform, q, - yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset, - y_cl, u_cl, v_cl, model_width, model_height, projection); - } -}; - -class DrivingModelFrame : public ModelFrame { -public: - DrivingModelFrame(cl_device_id device_id, cl_context context, int _temporal_skip); - ~DrivingModelFrame(); - cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection); - - const int MODEL_WIDTH = 512; - const int MODEL_HEIGHT = 256; - const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2; - const int buf_size = MODEL_FRAME_SIZE * 2; // 2 frames are temporal_skip frames apart - const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t); - -private: - LoadYUVState loadyuv; - cl_mem img_buffer_20hz_cl, last_img_cl, input_frames_cl; - cl_buffer_region region; - int temporal_skip; -}; - -class MonitoringModelFrame : public ModelFrame { -public: - MonitoringModelFrame(cl_device_id device_id, cl_context context); - ~MonitoringModelFrame(); - cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection); - - const int MODEL_WIDTH = 1440; - const int MODEL_HEIGHT = 960; - const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT; - const int buf_size = MODEL_FRAME_SIZE; - -private: - cl_mem input_frame_cl; -}; diff --git a/selfdrive/modeld/models/commonmodel.pxd b/selfdrive/modeld/models/commonmodel.pxd deleted file mode 100644 index 4ac64d9172..0000000000 --- a/selfdrive/modeld/models/commonmodel.pxd +++ /dev/null @@ -1,27 +0,0 @@ -# distutils: language = c++ - -from msgq.visionipc.visionipc cimport cl_device_id, cl_context, cl_mem - -cdef extern from "common/mat.h": - cdef struct mat3: - float v[9] - -cdef extern from "common/clutil.h": - cdef unsigned long CL_DEVICE_TYPE_DEFAULT - cl_device_id cl_get_device_id(unsigned long) - cl_context cl_create_context(cl_device_id) - void cl_release_context(cl_context) - -cdef extern from "selfdrive/modeld/models/commonmodel.h": - cppclass ModelFrame: - int buf_size - unsigned char * buffer_from_cl(cl_mem*, int); - cl_mem * prepare(cl_mem, int, int, int, int, mat3) - - cppclass DrivingModelFrame: - int buf_size - DrivingModelFrame(cl_device_id, cl_context, int) - - cppclass MonitoringModelFrame: - int buf_size - MonitoringModelFrame(cl_device_id, cl_context) diff --git a/selfdrive/modeld/models/commonmodel_pyx.pxd b/selfdrive/modeld/models/commonmodel_pyx.pxd deleted file mode 100644 index 0bb798625b..0000000000 --- a/selfdrive/modeld/models/commonmodel_pyx.pxd +++ /dev/null @@ -1,13 +0,0 @@ -# distutils: language = c++ - -from msgq.visionipc.visionipc cimport cl_mem -from msgq.visionipc.visionipc_pyx cimport CLContext as BaseCLContext - -cdef class CLContext(BaseCLContext): - pass - -cdef class CLMem: - cdef cl_mem * mem - - @staticmethod - cdef create(void*) diff --git a/selfdrive/modeld/models/commonmodel_pyx.pyx b/selfdrive/modeld/models/commonmodel_pyx.pyx deleted file mode 100644 index 5b7d11bc71..0000000000 --- a/selfdrive/modeld/models/commonmodel_pyx.pyx +++ /dev/null @@ -1,74 +0,0 @@ -# distutils: language = c++ -# cython: c_string_encoding=ascii, language_level=3 - -import numpy as np -cimport numpy as cnp -from libc.string cimport memcpy -from libc.stdint cimport uintptr_t - -from msgq.visionipc.visionipc cimport cl_mem -from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext -from .commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context, cl_release_context -from .commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, MonitoringModelFrame as cppMonitoringModelFrame - - -cdef class CLContext(BaseCLContext): - def __cinit__(self): - self.device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT) - self.context = cl_create_context(self.device_id) - - def __dealloc__(self): - if self.context: - cl_release_context(self.context) - -cdef class CLMem: - @staticmethod - cdef create(void * cmem): - mem = CLMem() - mem.mem = cmem - return mem - - @property - def mem_address(self): - return (self.mem) - -def cl_from_visionbuf(VisionBuf buf): - return CLMem.create(&buf.buf.buf_cl) - - -cdef class ModelFrame: - cdef cppModelFrame * frame - cdef int buf_size - - def __dealloc__(self): - del self.frame - - def prepare(self, VisionBuf buf, float[:] projection): - cdef mat3 cprojection - memcpy(cprojection.v, &projection[0], 9*sizeof(float)) - cdef cl_mem * data - data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection) - return CLMem.create(data) - - def buffer_from_cl(self, CLMem in_frames): - cdef unsigned char * data2 - data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size) - return np.asarray( data2) - - -cdef class DrivingModelFrame(ModelFrame): - cdef cppDrivingModelFrame * _frame - - def __cinit__(self, CLContext context, int temporal_skip): - self._frame = new cppDrivingModelFrame(context.device_id, context.context, temporal_skip) - self.frame = (self._frame) - self.buf_size = self._frame.buf_size - -cdef class MonitoringModelFrame(ModelFrame): - cdef cppMonitoringModelFrame * _frame - - def __cinit__(self, CLContext context): - self._frame = new cppMonitoringModelFrame(context.device_id, context.context) - self.frame = (self._frame) - self.buf_size = self._frame.buf_size - diff --git a/selfdrive/modeld/transforms/loadyuv.cc b/selfdrive/modeld/transforms/loadyuv.cc deleted file mode 100644 index c93f5cd038..0000000000 --- a/selfdrive/modeld/transforms/loadyuv.cc +++ /dev/null @@ -1,76 +0,0 @@ -#include "selfdrive/modeld/transforms/loadyuv.h" - -#include -#include -#include - -void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height) { - memset(s, 0, sizeof(*s)); - - s->width = width; - s->height = height; - - char args[1024]; - snprintf(args, sizeof(args), - "-cl-fast-relaxed-math -cl-denorms-are-zero " - "-DTRANSFORMED_WIDTH=%d -DTRANSFORMED_HEIGHT=%d", - width, height); - cl_program prg = cl_program_from_file(ctx, device_id, LOADYUV_PATH, args); - - s->loadys_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loadys", &err)); - s->loaduv_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loaduv", &err)); - s->copy_krnl = CL_CHECK_ERR(clCreateKernel(prg, "copy", &err)); - - // done with this - CL_CHECK(clReleaseProgram(prg)); -} - -void loadyuv_destroy(LoadYUVState* s) { - CL_CHECK(clReleaseKernel(s->loadys_krnl)); - CL_CHECK(clReleaseKernel(s->loaduv_krnl)); - CL_CHECK(clReleaseKernel(s->copy_krnl)); -} - -void loadyuv_queue(LoadYUVState* s, cl_command_queue q, - cl_mem y_cl, cl_mem u_cl, cl_mem v_cl, - cl_mem out_cl) { - cl_int global_out_off = 0; - - CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl)); - CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl)); - CL_CHECK(clSetKernelArg(s->loadys_krnl, 2, sizeof(cl_int), &global_out_off)); - - const size_t loadys_work_size = (s->width*s->height)/8; - CL_CHECK(clEnqueueNDRangeKernel(q, s->loadys_krnl, 1, NULL, - &loadys_work_size, NULL, 0, 0, NULL)); - - const size_t loaduv_work_size = ((s->width/2)*(s->height/2))/8; - global_out_off += (s->width*s->height); - - CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &u_cl)); - CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl)); - CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off)); - - CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL, - &loaduv_work_size, NULL, 0, 0, NULL)); - - global_out_off += (s->width/2)*(s->height/2); - - CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &v_cl)); - CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl)); - CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off)); - - CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL, - &loaduv_work_size, NULL, 0, 0, NULL)); -} - -void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst, - size_t src_offset, size_t dst_offset, size_t size) { - CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &src)); - CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_mem), &dst)); - CL_CHECK(clSetKernelArg(s->copy_krnl, 2, sizeof(cl_int), &src_offset)); - CL_CHECK(clSetKernelArg(s->copy_krnl, 3, sizeof(cl_int), &dst_offset)); - const size_t copy_work_size = size/8; - CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL, - ©_work_size, NULL, 0, 0, NULL)); -} \ No newline at end of file diff --git a/selfdrive/modeld/transforms/loadyuv.cl b/selfdrive/modeld/transforms/loadyuv.cl deleted file mode 100644 index 970187a6d7..0000000000 --- a/selfdrive/modeld/transforms/loadyuv.cl +++ /dev/null @@ -1,47 +0,0 @@ -#define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2)) - -__kernel void loadys(__global uchar8 const * const Y, - __global uchar * out, - int out_offset) -{ - const int gid = get_global_id(0); - const int ois = gid * 8; - const int oy = ois / TRANSFORMED_WIDTH; - const int ox = ois % TRANSFORMED_WIDTH; - - const uchar8 ys = Y[gid]; - - // 02 - // 13 - - __global uchar* outy0; - __global uchar* outy1; - if ((oy & 1) == 0) { - outy0 = out + out_offset; //y0 - outy1 = out + out_offset + UV_SIZE*2; //y2 - } else { - outy0 = out + out_offset + UV_SIZE; //y1 - outy1 = out + out_offset + UV_SIZE*3; //y3 - } - - vstore4(ys.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2); - vstore4(ys.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2); -} - -__kernel void loaduv(__global uchar8 const * const in, - __global uchar8 * out, - int out_offset) -{ - const int gid = get_global_id(0); - const uchar8 inv = in[gid]; - out[gid + out_offset / 8] = inv; -} - -__kernel void copy(__global uchar8 * in, - __global uchar8 * out, - int in_offset, - int out_offset) -{ - const int gid = get_global_id(0); - out[gid + out_offset / 8] = in[gid + in_offset / 8]; -} diff --git a/selfdrive/modeld/transforms/loadyuv.h b/selfdrive/modeld/transforms/loadyuv.h deleted file mode 100644 index 659059cd25..0000000000 --- a/selfdrive/modeld/transforms/loadyuv.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -#include "common/clutil.h" - -typedef struct { - int width, height; - cl_kernel loadys_krnl, loaduv_krnl, copy_krnl; -} LoadYUVState; - -void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height); - -void loadyuv_destroy(LoadYUVState* s); - -void loadyuv_queue(LoadYUVState* s, cl_command_queue q, - cl_mem y_cl, cl_mem u_cl, cl_mem v_cl, - cl_mem out_cl); - - -void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst, - size_t src_offset, size_t dst_offset, size_t size); \ No newline at end of file diff --git a/selfdrive/modeld/transforms/transform.cc b/selfdrive/modeld/transforms/transform.cc deleted file mode 100644 index 305643cf42..0000000000 --- a/selfdrive/modeld/transforms/transform.cc +++ /dev/null @@ -1,97 +0,0 @@ -#include "selfdrive/modeld/transforms/transform.h" - -#include -#include - -#include "common/clutil.h" - -void transform_init(Transform* s, cl_context ctx, cl_device_id device_id) { - memset(s, 0, sizeof(*s)); - - cl_program prg = cl_program_from_file(ctx, device_id, TRANSFORM_PATH, ""); - s->krnl = CL_CHECK_ERR(clCreateKernel(prg, "warpPerspective", &err)); - // done with this - CL_CHECK(clReleaseProgram(prg)); - - s->m_y_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err)); - s->m_uv_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err)); -} - -void transform_destroy(Transform* s) { - CL_CHECK(clReleaseMemObject(s->m_y_cl)); - CL_CHECK(clReleaseMemObject(s->m_uv_cl)); - CL_CHECK(clReleaseKernel(s->krnl)); -} - -void transform_queue(Transform* s, - cl_command_queue q, - cl_mem in_yuv, int in_width, int in_height, int in_stride, int in_uv_offset, - cl_mem out_y, cl_mem out_u, cl_mem out_v, - int out_width, int out_height, - const mat3& projection) { - const int zero = 0; - - // sampled using pixel center origin - // (because that's how fastcv and opencv does it) - - mat3 projection_y = projection; - - // in and out uv is half the size of y. - mat3 projection_uv = transform_scale_buffer(projection, 0.5); - - CL_CHECK(clEnqueueWriteBuffer(q, s->m_y_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_y.v, 0, NULL, NULL)); - CL_CHECK(clEnqueueWriteBuffer(q, s->m_uv_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_uv.v, 0, NULL, NULL)); - - const int in_y_width = in_width; - const int in_y_height = in_height; - const int in_y_px_stride = 1; - const int in_uv_width = in_width/2; - const int in_uv_height = in_height/2; - const int in_uv_px_stride = 2; - const int in_u_offset = in_uv_offset; - const int in_v_offset = in_uv_offset + 1; - - const int out_y_width = out_width; - const int out_y_height = out_height; - const int out_uv_width = out_width/2; - const int out_uv_height = out_height/2; - - CL_CHECK(clSetKernelArg(s->krnl, 0, sizeof(cl_mem), &in_yuv)); // src - CL_CHECK(clSetKernelArg(s->krnl, 1, sizeof(cl_int), &in_stride)); // src_row_stride - CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_y_px_stride)); // src_px_stride - CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &zero)); // src_offset - CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_y_height)); // src_rows - CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_y_width)); // src_cols - CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_y)); // dst - CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_y_width)); // dst_row_stride - CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero)); // dst_offset - CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_y_height)); // dst_rows - CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_y_width)); // dst_cols - CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_y_cl)); // M - - const size_t work_size_y[2] = {(size_t)out_y_width, (size_t)out_y_height}; - - CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL, - (const size_t*)&work_size_y, NULL, 0, 0, NULL)); - - const size_t work_size_uv[2] = {(size_t)out_uv_width, (size_t)out_uv_height}; - - CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_uv_px_stride)); // src_px_stride - CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_u_offset)); // src_offset - CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_uv_height)); // src_rows - CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_uv_width)); // src_cols - CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_u)); // dst - CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_uv_width)); // dst_row_stride - CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero)); // dst_offset - CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_uv_height)); // dst_rows - CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_uv_width)); // dst_cols - CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_uv_cl)); // M - - CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL, - (const size_t*)&work_size_uv, NULL, 0, 0, NULL)); - CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_v_offset)); // src_ofset - CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_v)); // dst - - CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL, - (const size_t*)&work_size_uv, NULL, 0, 0, NULL)); -} diff --git a/selfdrive/modeld/transforms/transform.cl b/selfdrive/modeld/transforms/transform.cl deleted file mode 100644 index 2ca25920cd..0000000000 --- a/selfdrive/modeld/transforms/transform.cl +++ /dev/null @@ -1,54 +0,0 @@ -#define INTER_BITS 5 -#define INTER_TAB_SIZE (1 << INTER_BITS) -#define INTER_SCALE 1.f / INTER_TAB_SIZE - -#define INTER_REMAP_COEF_BITS 15 -#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS) - -__kernel void warpPerspective(__global const uchar * src, - int src_row_stride, int src_px_stride, int src_offset, int src_rows, int src_cols, - __global uchar * dst, - int dst_row_stride, int dst_offset, int dst_rows, int dst_cols, - __constant float * M) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if (dx < dst_cols && dy < dst_rows) - { - float X0 = M[0] * dx + M[1] * dy + M[2]; - float Y0 = M[3] * dx + M[4] * dy + M[5]; - float W = M[6] * dx + M[7] * dy + M[8]; - W = W != 0.0f ? INTER_TAB_SIZE / W : 0.0f; - int X = rint(X0 * W), Y = rint(Y0 * W); - - int sx = convert_short_sat(X >> INTER_BITS); - int sy = convert_short_sat(Y >> INTER_BITS); - - short sx_clamp = clamp(sx, 0, src_cols - 1); - short sx_p1_clamp = clamp(sx + 1, 0, src_cols - 1); - short sy_clamp = clamp(sy, 0, src_rows - 1); - short sy_p1_clamp = clamp(sy + 1, 0, src_rows - 1); - int v0 = convert_int(src[mad24(sy_clamp, src_row_stride, src_offset + sx_clamp*src_px_stride)]); - int v1 = convert_int(src[mad24(sy_clamp, src_row_stride, src_offset + sx_p1_clamp*src_px_stride)]); - int v2 = convert_int(src[mad24(sy_p1_clamp, src_row_stride, src_offset + sx_clamp*src_px_stride)]); - int v3 = convert_int(src[mad24(sy_p1_clamp, src_row_stride, src_offset + sx_p1_clamp*src_px_stride)]); - - short ay = (short)(Y & (INTER_TAB_SIZE - 1)); - short ax = (short)(X & (INTER_TAB_SIZE - 1)); - float taby = 1.f/INTER_TAB_SIZE*ay; - float tabx = 1.f/INTER_TAB_SIZE*ax; - - int dst_index = mad24(dy, dst_row_stride, dst_offset + dx); - - int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ); - int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ); - int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ); - int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE ); - - int val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; - - uchar pix = convert_uchar_sat((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS); - dst[dst_index] = pix; - } -} diff --git a/selfdrive/modeld/transforms/transform.h b/selfdrive/modeld/transforms/transform.h deleted file mode 100644 index 771a7054b3..0000000000 --- a/selfdrive/modeld/transforms/transform.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#ifdef __APPLE__ -#include -#else -#include -#endif - -#include "common/mat.h" - -typedef struct { - cl_kernel krnl; - cl_mem m_y_cl, m_uv_cl; -} Transform; - -void transform_init(Transform* s, cl_context ctx, cl_device_id device_id); - -void transform_destroy(Transform* transform); - -void transform_queue(Transform* s, cl_command_queue q, - cl_mem yuv, int in_width, int in_height, int in_stride, int in_uv_offset, - cl_mem out_y, cl_mem out_u, cl_mem out_v, - int out_width, int out_height, - const mat3& projection); diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py index 9ba599bac9..c242328400 100755 --- a/selfdrive/test/process_replay/model_replay.py +++ b/selfdrive/test/process_replay/model_replay.py @@ -34,7 +34,7 @@ GITHUB = GithubUtils(API_TOKEN, DATA_TOKEN) EXEC_TIMINGS = [ # model, instant max, average max - ("modelV2", 0.035, 0.025), + ("modelV2", 0.035, 0.028), ("driverStateV2", 0.02, 0.015), ] diff --git a/selfdrive/test/process_replay/process_replay.py b/selfdrive/test/process_replay/process_replay.py index 8af72e5f4e..5143334bc6 100755 --- a/selfdrive/test/process_replay/process_replay.py +++ b/selfdrive/test/process_replay/process_replay.py @@ -4,6 +4,7 @@ import time import copy import heapq import signal +import numpy as np from collections import Counter from dataclasses import dataclass, field from itertools import islice @@ -23,6 +24,7 @@ from openpilot.common.params import Params from openpilot.common.prefix import OpenpilotPrefix from openpilot.common.timeout import Timeout from openpilot.common.realtime import DT_CTRL +from openpilot.system.camerad.cameras.nv12_info import get_nv12_info from openpilot.system.manager.process_config import managed_processes from openpilot.selfdrive.test.process_replay.vision_meta import meta_from_camera_state, available_streams from openpilot.selfdrive.test.process_replay.migration import migrate_all @@ -203,7 +205,8 @@ class ProcessContainer: if meta.camera_state in self.cfg.vision_pubs: assert frs[meta.camera_state].pix_fmt == 'nv12' frame_size = (frs[meta.camera_state].w, frs[meta.camera_state].h) - vipc_server.create_buffers(meta.stream, 2, *frame_size) + stride, y_height, _, yuv_size = get_nv12_info(frame_size[0], frame_size[1]) + vipc_server.create_buffers_with_sizes(meta.stream, 2, frame_size[0], frame_size[1], yuv_size, stride, stride * y_height) vipc_server.start_listener() self.vipc_server = vipc_server @@ -300,7 +303,15 @@ class ProcessContainer: camera_meta = meta_from_camera_state(m.which()) assert frs is not None img = frs[m.which()].get(camera_state.frameId) - self.vipc_server.send(camera_meta.stream, img.flatten().tobytes(), + + h, w = frs[m.which()].h, frs[m.which()].w + stride, y_height, _, yuv_size = get_nv12_info(w, h) + uv_offset = stride * y_height + padded_img = np.zeros((yuv_size), dtype=np.uint8).reshape((-1, stride)) + padded_img[:h, :w] = img[:h * w].reshape((-1, w)) + padded_img[uv_offset // stride:uv_offset // stride + h // 2, :w] = img[h * w:].reshape((-1, w)) + + self.vipc_server.send(camera_meta.stream, padded_img.flatten().tobytes(), camera_state.frameId, camera_state.timestampSof, camera_state.timestampEof) self.msg_queue = []