mirror of
https://github.com/sunnypilot/sunnypilot.git
synced 2026-02-18 13:04:01 +08:00
Calibrate in tg (#36621)
* squash * bump tg * fix linmt * Ready to merge * cleaner * match modeld * more dead stuff
This commit is contained in:
@@ -1,35 +1,10 @@
|
||||
import os
|
||||
import glob
|
||||
|
||||
Import('env', 'envCython', 'arch', 'cereal', 'messaging', 'common', 'visionipc')
|
||||
Import('env', 'arch')
|
||||
lenv = env.Clone()
|
||||
lenvCython = envCython.Clone()
|
||||
|
||||
libs = [cereal, messaging, visionipc, common, 'capnp', 'kj', 'pthread']
|
||||
frameworks = []
|
||||
|
||||
common_src = [
|
||||
"models/commonmodel.cc",
|
||||
"transforms/loadyuv.cc",
|
||||
"transforms/transform.cc",
|
||||
]
|
||||
|
||||
# OpenCL is a framework on Mac
|
||||
if arch == "Darwin":
|
||||
frameworks += ['OpenCL']
|
||||
else:
|
||||
libs += ['OpenCL']
|
||||
|
||||
# Set path definitions
|
||||
for pathdef, fn in {'TRANSFORM': 'transforms/transform.cl', 'LOADYUV': 'transforms/loadyuv.cl'}.items():
|
||||
for xenv in (lenv, lenvCython):
|
||||
xenv['CXXFLAGS'].append(f'-D{pathdef}_PATH=\\"{File(fn).abspath}\\"')
|
||||
|
||||
# Compile cython
|
||||
cython_libs = envCython["LIBS"] + libs
|
||||
commonmodel_lib = lenv.Library('commonmodel', common_src)
|
||||
lenvCython.Program('models/commonmodel_pyx.so', 'models/commonmodel_pyx.pyx', LIBS=[commonmodel_lib, *cython_libs], FRAMEWORKS=frameworks)
|
||||
tinygrad_files = sorted(["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "/**", recursive=True, root_dir=env.Dir("#").abspath) if 'pycache' not in x])
|
||||
tinygrad_files = ["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "/**", recursive=True, root_dir=env.Dir("#").abspath) if 'pycache' not in x]
|
||||
|
||||
# Get model metadata
|
||||
for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']:
|
||||
@@ -38,22 +13,30 @@ for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']:
|
||||
cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
|
||||
lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files + script_files, cmd)
|
||||
|
||||
# compile warp
|
||||
tg_flags = {
|
||||
'larch64': 'DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0',
|
||||
'Darwin': f'DEV=CPU HOME={os.path.expanduser("~")}', # tinygrad calls brew which needs a $HOME in the env
|
||||
}.get(arch, 'DEV=CPU CPU_LLVM=1')
|
||||
image_flag = {
|
||||
'larch64': 'IMAGE=2',
|
||||
}.get(arch, 'IMAGE=0')
|
||||
script_files = [File(Dir("#selfdrive/modeld").File("compile_warp.py").abspath)]
|
||||
cmd = f'{tg_flags} python3 {Dir("#selfdrive/modeld").abspath}/compile_warp.py '
|
||||
lenv.Command(fn + "warp_tinygrad.pkl", tinygrad_files + script_files, cmd)
|
||||
|
||||
def tg_compile(flags, model_name):
|
||||
pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
|
||||
fn = File(f"models/{model_name}").abspath
|
||||
return lenv.Command(
|
||||
fn + "_tinygrad.pkl",
|
||||
[fn + ".onnx"] + tinygrad_files,
|
||||
f'{pythonpath_string} {flags} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl'
|
||||
f'{pythonpath_string} {flags} {image_flag} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl'
|
||||
)
|
||||
|
||||
# Compile small models
|
||||
for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']:
|
||||
flags = {
|
||||
'larch64': 'DEV=QCOM FLOAT16=1 NOLOCALS=1 IMAGE=2 JIT_BATCH_SIZE=0',
|
||||
'Darwin': f'DEV=CPU HOME={os.path.expanduser("~")}', # tinygrad calls brew which needs a $HOME in the env
|
||||
}.get(arch, 'DEV=CPU CPU_LLVM=1')
|
||||
tg_compile(flags, model_name)
|
||||
tg_compile(tg_flags, model_name)
|
||||
|
||||
# Compile BIG model if USB GPU is available
|
||||
if "USBGPU" in os.environ:
|
||||
|
||||
206
selfdrive/modeld/compile_warp.py
Executable file
206
selfdrive/modeld/compile_warp.py
Executable file
@@ -0,0 +1,206 @@
|
||||
#!/usr/bin/env python3
|
||||
import time
|
||||
import pickle
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.helpers import Context
|
||||
from tinygrad.device import Device
|
||||
from tinygrad.engine.jit import TinyJit
|
||||
|
||||
from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
|
||||
from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
|
||||
from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
|
||||
|
||||
MODELS_DIR = Path(__file__).parent / 'models'
|
||||
|
||||
CAMERA_CONFIGS = [
|
||||
(_ar_ox_fisheye.width, _ar_ox_fisheye.height), # tici: 1928x1208
|
||||
(_os_fisheye.width, _os_fisheye.height), # mici: 1344x760
|
||||
]
|
||||
|
||||
UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
|
||||
UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
|
||||
|
||||
IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
|
||||
|
||||
|
||||
def warp_pkl_path(w, h):
  """Return the pickle path for the modeld warp JIT compiled for a w x h camera."""
  filename = f'warp_{w}x{h}_tinygrad.pkl'
  return MODELS_DIR / filename
|
||||
|
||||
|
||||
def dm_warp_pkl_path(w, h):
  """Return the pickle path for the driver-monitoring warp JIT compiled for a w x h camera."""
  filename = f'dm_warp_{w}x{h}_tinygrad.pkl'
  return MODELS_DIR / filename
|
||||
|
||||
|
||||
def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad, ratio):
  """Nearest-neighbor perspective warp of one image plane, in tinygrad.

  Args:
    src_flat: 1-D Tensor holding the source plane row-major, with `stride_pad`
      extra elements of padding at the end of each row.
    M_inv: 3x3 Tensor mapping homogeneous destination pixel coords back to
      source coords (the inverse warp matrix).
    dst_shape: (w_dst, h_dst) of the output plane.
    src_shape: (h_src, w_src) of the source plane — note the (h, w) order
      differs from dst_shape's (w, h).
    stride_pad: source row padding (stride - width).
    ratio: row-count scale used when accounting for padding in the flat index
      (1 for the Y plane, 0.5 for the half-resolution U/V planes).

  Returns:
    1-D Tensor of h_dst * w_dst sampled values.
  """
  w_dst, h_dst = dst_shape
  h_src, w_src = src_shape

  # Build a (3, h_dst*w_dst) grid of homogeneous destination pixel coordinates.
  x = Tensor.arange(w_dst).reshape(1, w_dst).expand(h_dst, w_dst)
  y = Tensor.arange(h_dst).reshape(h_dst, 1).expand(h_dst, w_dst)
  ones = Tensor.ones_like(x)
  dst_coords = x.reshape(1, -1).cat(y.reshape(1, -1)).cat(ones.reshape(1, -1))

  # Back-project every destination pixel into source space and dehomogenize.
  src_coords = M_inv @ dst_coords
  src_coords = src_coords / src_coords[2:3, :]

  # Nearest-neighbor sample positions, clamped to the source bounds.
  x_nn_clipped = Tensor.round(src_coords[0]).clip(0, w_src - 1).cast('int')
  y_nn_clipped = Tensor.round(src_coords[1]).clip(0, h_src - 1).cast('int')
  # Flat index into the stride-padded buffer: each row adds w_src payload
  # elements plus (row * ratio) * stride_pad padding elements.
  idx = y_nn_clipped * w_src + (y_nn_clipped * ratio).cast('int') * stride_pad + x_nn_clipped

  sampled = src_flat[idx]
  return sampled
|
||||
|
||||
|
||||
def frames_to_tensor(frames, model_w, model_h):
  """Repack a planar YUV420 image of shape ((3/2)*H, W) into the 6-channel model layout.

  Channels 0-3 are the four phase-shifted 2x2 subsamples of the Y plane,
  channels 4-5 are the U and V planes; each channel is (H/2, W/2).

  NOTE(review): model_w/model_h are accepted but not used in this function.
  """
  H = (frames.shape[0] * 2) // 3  # Y-plane height; U+V occupy the remaining H/2 rows
  W = frames.shape[1]
  in_img1 = Tensor.cat(frames[0:H:2, 0::2],
                       frames[1:H:2, 0::2],
                       frames[0:H:2, 1::2],
                       frames[1:H:2, 1::2],
                       frames[H:H+H//4].reshape((H//2, W//2)),
                       frames[H+H//4:H+H//2].reshape((H//2, W//2)), dim=0).reshape((6, H//2, W//2))
  return in_img1
|
||||
|
||||
|
||||
def make_frame_prepare(cam_w, cam_h, model_w, model_h):
  """Build a closure that warps a raw NV12 camera frame into the model's 6-channel input.

  cam_w/cam_h describe the camera frame; model_w/model_h the warped output size.
  The NV12 stride/offset geometry is captured once here so the returned
  function is purely tensor ops (JIT-friendly).
  """
  stride, y_height, _, _ = get_nv12_info(cam_w, cam_h)
  uv_offset = stride * y_height  # U plane starts right after the padded Y plane
  stride_pad = stride - cam_w

  def frame_prepare_tinygrad(input_frame, M_inv):
    # The chroma planes are half resolution, so the warp matrix must be
    # conjugated by the 0.5x pixel-coordinate scale.
    tg_scale = Tensor(UV_SCALE_MATRIX)
    M_inv_uv = tg_scale @ M_inv @ Tensor(UV_SCALE_MATRIX_INV)
    with Context(SPLIT_REDUCEOP=0):
      # Warp Y, U, V separately; ratio=1 for Y, 0.5 for the half-height chroma rows.
      y = warp_perspective_tinygrad(input_frame[:cam_h*stride],
                                    M_inv, (model_w, model_h),
                                    (cam_h, cam_w), stride_pad, 1).realize()
      u = warp_perspective_tinygrad(input_frame[uv_offset:uv_offset + (cam_h//4)*stride],
                                    M_inv_uv, (model_w//2, model_h//2),
                                    (cam_h//2, cam_w//2), stride_pad, 0.5).realize()
      v = warp_perspective_tinygrad(input_frame[uv_offset + (cam_h//4)*stride:uv_offset + (cam_h//2)*stride],
                                    M_inv_uv, (model_w//2, model_h//2),
                                    (cam_h//2, cam_w//2), stride_pad, 0.5).realize()
      # Reassemble a planar (3/2 * model_h, model_w) YUV image, then repack
      # into the 6-channel model layout.
      yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w))
      tensor = frames_to_tensor(yuv, model_w, model_h)
    return tensor
  return frame_prepare_tinygrad
|
||||
|
||||
|
||||
def make_update_img_input(frame_prepare, model_w, model_h):
  """Build a closure that warps a new frame and rolls it into the image queue.

  The returned function takes (queue_tensor, raw_frame, M_inv) and returns
  (updated_queue, model_input) where model_input stacks the oldest and newest
  6-channel frames into shape (1, 12, model_h//2, model_w//2).
  """
  def update_img_input_tinygrad(tensor, frame, M_inv):
    M_inv = M_inv.to(Device.DEFAULT)
    new_img = frame_prepare(frame, M_inv)
    # Drop the oldest 6 channels and append the new frame's 6 channels.
    full_buffer = tensor[6:].cat(new_img, dim=0).contiguous()
    return full_buffer, Tensor.cat(full_buffer[:6], full_buffer[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2)
  return update_img_input_tinygrad
|
||||
|
||||
|
||||
def make_update_both_imgs(frame_prepare, model_w, model_h):
  """Build a closure that updates the narrow and wide image queues in one call.

  Returns a function of (queue, frame, M_inv, big_queue, big_frame, M_inv_big)
  yielding the two updated queues plus the two stacked model-input pairs — a
  single entry point so the whole update can be JIT-compiled together.
  """
  update_img = make_update_img_input(frame_prepare, model_w, model_h)

  def update_both_imgs_tinygrad(calib_img_buffer, new_img, M_inv,
                                calib_big_img_buffer, new_big_img, M_inv_big):
    calib_img_buffer, calib_img_pair = update_img(calib_img_buffer, new_img, M_inv)
    calib_big_img_buffer, calib_big_img_pair = update_img(calib_big_img_buffer, new_big_img, M_inv_big)
    return calib_img_buffer, calib_img_pair, calib_big_img_buffer, calib_big_img_pair
  return update_both_imgs_tinygrad
|
||||
|
||||
|
||||
def make_warp_dm(cam_w, cam_h, dm_w, dm_h):
  """Build a closure that warps the Y plane of an NV12 frame for the DM model.

  Only luma is warped (no U/V), producing a (1, dm_h*dm_w) tensor.
  """
  stride, y_height, _, _ = get_nv12_info(cam_w, cam_h)
  stride_pad = stride - cam_w

  def warp_dm(input_frame, M_inv):
    M_inv = M_inv.to(Device.DEFAULT)
    with Context(SPLIT_REDUCEOP=0):
      result = warp_perspective_tinygrad(input_frame[:cam_h*stride], M_inv, (dm_w, dm_h), (cam_h, cam_w), stride_pad, 1).reshape(-1, dm_h * dm_w)
    return result
  return warp_dm
|
||||
|
||||
|
||||
def compile_modeld_warp(cam_w, cam_h):
  """Compile, warm up, and pickle the modeld image-warp JIT for a cam_w x cam_h camera.

  Runs the JIT 10 times on random frames (printing enqueue/total timings),
  dumps it to warp_pkl_path(cam_w, cam_h), then reloads the pickle and runs it
  once as a sanity check.
  """
  model_w, model_h = MEDMODEL_INPUT_SIZE
  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)

  print(f"Compiling modeld warp for {cam_w}x{cam_h}...")

  frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
  update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
  update_img_jit = TinyJit(update_both_imgs, prune=True)

  # Device-side rolling image queues plus host-side mirrors.
  full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
  big_full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
  full_buffer_np = np.zeros(IMG_BUFFER_SHAPE, dtype=np.uint8)
  big_full_buffer_np = np.zeros(IMG_BUFFER_SHAPE, dtype=np.uint8)

  for i in range(10):
    # Random noise frames around mid-gray stand in for real camera data.
    new_frame_np = (32 * np.random.randn(yuv_size).astype(np.float32) + 128).clip(0, 255).astype(np.uint8)
    img_inputs = [full_buffer,
                  Tensor.from_blob(new_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(),
                  Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
    new_big_frame_np = (32 * np.random.randn(yuv_size).astype(np.float32) + 128).clip(0, 255).astype(np.uint8)
    big_img_inputs = [big_full_buffer,
                      Tensor.from_blob(new_big_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(),
                      Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
    inputs = img_inputs + big_img_inputs
    Device.default.synchronize()

    # NOTE(review): inputs_np is built but never read afterwards — looks like
    # leftover from a removed numpy reference check. Kept because .numpy()
    # forces realization of the inputs before timing; confirm and delete.
    inputs_np = [x.numpy() for x in inputs]
    inputs_np[0] = full_buffer_np
    inputs_np[3] = big_full_buffer_np

    st = time.perf_counter()
    out = update_img_jit(*inputs)
    # Clone the updated queues so the next iteration feeds fresh buffers.
    full_buffer = out[0].contiguous().realize().clone()
    big_full_buffer = out[2].contiguous().realize().clone()
    mt = time.perf_counter()
    Device.default.synchronize()
    et = time.perf_counter()
    print(f" [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")

  pkl_path = warp_pkl_path(cam_w, cam_h)
  with open(pkl_path, "wb") as f:
    pickle.dump(update_img_jit, f)
  print(f" Saved to {pkl_path}")

  # Round-trip check: the pickled JIT must load and run.
  # (Fixed: the file handle from open() was previously never closed.)
  with open(pkl_path, "rb") as f:
    jit = pickle.load(f)
  jit(*inputs)
|
||||
|
||||
|
||||
def compile_dm_warp(cam_w, cam_h):
  """Compile, warm up, and pickle the driver-monitoring warp JIT for a cam_w x cam_h camera.

  Runs the JIT 10 times on random frames (printing enqueue/total timings) and
  dumps it to dm_warp_pkl_path(cam_w, cam_h).
  """
  dm_w, dm_h = DM_INPUT_SIZE
  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)

  print(f"Compiling DM warp for {cam_w}x{cam_h}...")

  warp_dm = make_warp_dm(cam_w, cam_h, dm_w, dm_h)
  warp_dm_jit = TinyJit(warp_dm, prune=True)

  for i in range(10):
    # Random mid-gray noise frame plus a random warp matrix as stand-in inputs.
    inputs = [Tensor.from_blob((32 * Tensor.randn(yuv_size,) + 128).cast(dtype='uint8').realize().numpy().ctypes.data, (yuv_size,), dtype='uint8'),
              Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
    Device.default.synchronize()
    st = time.perf_counter()
    warp_dm_jit(*inputs)
    mt = time.perf_counter()
    Device.default.synchronize()
    et = time.perf_counter()
    print(f" [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")

  pkl_path = dm_warp_pkl_path(cam_w, cam_h)
  with open(pkl_path, "wb") as f:
    pickle.dump(warp_dm_jit, f)
  print(f" Saved to {pkl_path}")
|
||||
|
||||
|
||||
def run_and_save_pickle():
  """Compile and pickle the modeld and DM warp JITs for every supported camera."""
  for width, height in CAMERA_CONFIGS:
    compile_modeld_warp(width, height)
    compile_dm_warp(width, height)
|
||||
|
||||
|
||||
# Script entry point: precompile all warp pickles.
if __name__ == "__main__":
  run_and_save_pickle()
|
||||
@@ -3,7 +3,6 @@ import os
|
||||
from openpilot.system.hardware import TICI
|
||||
os.environ['DEV'] = 'QCOM' if TICI else 'CPU'
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.dtype import dtypes
|
||||
import time
|
||||
import pickle
|
||||
import numpy as np
|
||||
@@ -12,19 +11,19 @@ from pathlib import Path
|
||||
from cereal import messaging
|
||||
from cereal.messaging import PubMaster, SubMaster
|
||||
from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
|
||||
from msgq.visionipc.visionipc_pyx import CLContext
|
||||
from openpilot.common.swaglog import cloudlog
|
||||
from openpilot.common.realtime import config_realtime_process
|
||||
from openpilot.common.transformations.model import dmonitoringmodel_intrinsics
|
||||
from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
|
||||
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext, MonitoringModelFrame
|
||||
from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
|
||||
from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid, safe_exp
|
||||
from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
|
||||
|
||||
PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
|
||||
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
|
||||
MODEL_PKL_PATH = Path(__file__).parent / 'models/dmonitoring_model_tinygrad.pkl'
|
||||
METADATA_PATH = Path(__file__).parent / 'models/dmonitoring_model_metadata.pkl'
|
||||
|
||||
MODELS_DIR = Path(__file__).parent / 'models'
|
||||
|
||||
class ModelState:
|
||||
inputs: dict[str, np.ndarray]
|
||||
@@ -36,12 +35,15 @@ class ModelState:
|
||||
self.input_shapes = model_metadata['input_shapes']
|
||||
self.output_slices = model_metadata['output_slices']
|
||||
|
||||
self.frame = MonitoringModelFrame(cl_ctx)
|
||||
self.numpy_inputs = {
|
||||
'calib': np.zeros(self.input_shapes['calib'], dtype=np.float32),
|
||||
}
|
||||
|
||||
self.warp_inputs_np = {'transform': np.zeros((3,3), dtype=np.float32)}
|
||||
self.warp_inputs = {k: Tensor(v, device='NPY') for k,v in self.warp_inputs_np.items()}
|
||||
self.frame_buf_params = None
|
||||
self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
|
||||
self.image_warp = None
|
||||
with open(MODEL_PKL_PATH, "rb") as f:
|
||||
self.model_run = pickle.load(f)
|
||||
|
||||
@@ -50,14 +52,15 @@ class ModelState:
|
||||
|
||||
t1 = time.perf_counter()
|
||||
|
||||
input_img_cl = self.frame.prepare(buf, transform.flatten())
|
||||
if TICI:
|
||||
# The imgs tensors are backed by opencl memory, only need init once
|
||||
if 'input_img' not in self.tensor_inputs:
|
||||
self.tensor_inputs['input_img'] = qcom_tensor_from_opencl_address(input_img_cl.mem_address, self.input_shapes['input_img'], dtype=dtypes.uint8)
|
||||
else:
|
||||
self.tensor_inputs['input_img'] = Tensor(self.frame.buffer_from_cl(input_img_cl).reshape(self.input_shapes['input_img']), dtype=dtypes.uint8).realize()
|
||||
if self.image_warp is None:
|
||||
self.frame_buf_params = get_nv12_info(buf.width, buf.height)
|
||||
warp_path = MODELS_DIR / f'dm_warp_{buf.width}x{buf.height}_tinygrad.pkl'
|
||||
with open(warp_path, "rb") as f:
|
||||
self.image_warp = pickle.load(f)
|
||||
self.warp_inputs['frame'] = Tensor.from_blob(buf.data.ctypes.data, (self.frame_buf_params[3],), dtype='uint8').realize()
|
||||
|
||||
self.warp_inputs_np['transform'][:] = transform[:]
|
||||
self.tensor_inputs['input_img'] = self.image_warp(self.warp_inputs['frame'], self.warp_inputs['transform']).realize()
|
||||
|
||||
output = self.model_run(**self.tensor_inputs).contiguous().realize().uop.base.buffer.numpy()
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ if USBGPU:
|
||||
os.environ['DEV'] = 'AMD'
|
||||
os.environ['AMD_IFACE'] = 'USB'
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.dtype import dtypes
|
||||
import time
|
||||
import pickle
|
||||
import numpy as np
|
||||
@@ -16,20 +15,20 @@ from cereal import car, log
|
||||
from pathlib import Path
|
||||
from cereal.messaging import PubMaster, SubMaster
|
||||
from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
|
||||
from msgq.visionipc.visionipc_pyx import CLContext
|
||||
from opendbc.car.car_helpers import get_demo_car_params
|
||||
from openpilot.common.swaglog import cloudlog
|
||||
from openpilot.common.params import Params
|
||||
from openpilot.common.filter_simple import FirstOrderFilter
|
||||
from openpilot.common.realtime import config_realtime_process, DT_MDL
|
||||
from openpilot.common.transformations.camera import DEVICE_CAMERAS
|
||||
from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
|
||||
from openpilot.common.transformations.model import get_warp_matrix
|
||||
from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
|
||||
from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan
|
||||
from openpilot.selfdrive.modeld.parse_model_outputs import Parser
|
||||
from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
|
||||
from openpilot.selfdrive.modeld.constants import ModelConstants, Plan
|
||||
from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext
|
||||
from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
|
||||
|
||||
|
||||
PROCESS_NAME = "selfdrive.modeld.modeld"
|
||||
@@ -39,11 +38,15 @@ VISION_PKL_PATH = Path(__file__).parent / 'models/driving_vision_tinygrad.pkl'
|
||||
POLICY_PKL_PATH = Path(__file__).parent / 'models/driving_policy_tinygrad.pkl'
|
||||
VISION_METADATA_PATH = Path(__file__).parent / 'models/driving_vision_metadata.pkl'
|
||||
POLICY_METADATA_PATH = Path(__file__).parent / 'models/driving_policy_metadata.pkl'
|
||||
MODELS_DIR = Path(__file__).parent / 'models'
|
||||
|
||||
LAT_SMOOTH_SECONDS = 0.0
|
||||
LONG_SMOOTH_SECONDS = 0.3
|
||||
MIN_LAT_CONTROL_SPEED = 0.3
|
||||
|
||||
IMG_QUEUE_SHAPE = (6*(ModelConstants.MODEL_RUN_FREQ//ModelConstants.MODEL_CONTEXT_FREQ + 1), 128, 256)
|
||||
assert IMG_QUEUE_SHAPE[0] == 30
|
||||
|
||||
|
||||
def get_action_from_model(model_output: dict[str, np.ndarray], prev_action: log.ModelDataV2.Action,
|
||||
lat_action_t: float, long_action_t: float, v_ego: float) -> log.ModelDataV2.Action:
|
||||
@@ -136,7 +139,6 @@ class InputQueues:
|
||||
return out
|
||||
|
||||
class ModelState:
|
||||
frames: dict[str, DrivingModelFrame]
|
||||
inputs: dict[str, np.ndarray]
|
||||
output: np.ndarray
|
||||
prev_desire: np.ndarray # for tracking the rising edge of the pulse
|
||||
@@ -155,7 +157,6 @@ class ModelState:
|
||||
self.policy_output_slices = policy_metadata['output_slices']
|
||||
policy_output_size = policy_metadata['output_shapes']['outputs'][1]
|
||||
|
||||
self.frames = {name: DrivingModelFrame(context, ModelConstants.MODEL_RUN_FREQ//ModelConstants.MODEL_CONTEXT_FREQ) for name in self.vision_input_names}
|
||||
self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
|
||||
|
||||
# policy inputs
|
||||
@@ -166,11 +167,17 @@ class ModelState:
|
||||
self.full_input_queues.reset()
|
||||
|
||||
# img buffers are managed in openCL transform code
|
||||
self.vision_inputs: dict[str, Tensor] = {}
|
||||
self.img_queues = {'img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),
|
||||
'big_img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),}
|
||||
self.full_frames : dict[str, Tensor] = {}
|
||||
self.transforms_np = {k: np.zeros((3,3), dtype=np.float32) for k in self.img_queues}
|
||||
self.transforms = {k: Tensor(v, device='NPY').realize() for k, v in self.transforms_np.items()}
|
||||
self.vision_output = np.zeros(vision_output_size, dtype=np.float32)
|
||||
self.policy_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
|
||||
self.policy_output = np.zeros(policy_output_size, dtype=np.float32)
|
||||
self.parser = Parser()
|
||||
self.frame_buf_params : dict[str, tuple[int, int, int, int]] = {}
|
||||
self.update_imgs = None
|
||||
|
||||
with open(VISION_PKL_PATH, "rb") as f:
|
||||
self.vision_run = pickle.load(f)
|
||||
@@ -188,23 +195,28 @@ class ModelState:
|
||||
inputs['desire_pulse'][0] = 0
|
||||
new_desire = np.where(inputs['desire_pulse'] - self.prev_desire > .99, inputs['desire_pulse'], 0)
|
||||
self.prev_desire[:] = inputs['desire_pulse']
|
||||
if self.update_imgs is None:
|
||||
for key in bufs.keys():
|
||||
w, h = bufs[key].width, bufs[key].height
|
||||
self.frame_buf_params[key] = get_nv12_info(w, h)
|
||||
warp_path = MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
|
||||
with open(warp_path, "rb") as f:
|
||||
self.update_imgs = pickle.load(f)
|
||||
|
||||
imgs_cl = {name: self.frames[name].prepare(bufs[name], transforms[name].flatten()) for name in self.vision_input_names}
|
||||
|
||||
if TICI and not USBGPU:
|
||||
# The imgs tensors are backed by opencl memory, only need init once
|
||||
for key in imgs_cl:
|
||||
if key not in self.vision_inputs:
|
||||
self.vision_inputs[key] = qcom_tensor_from_opencl_address(imgs_cl[key].mem_address, self.vision_input_shapes[key], dtype=dtypes.uint8)
|
||||
else:
|
||||
for key in imgs_cl:
|
||||
frame_input = self.frames[key].buffer_from_cl(imgs_cl[key]).reshape(self.vision_input_shapes[key])
|
||||
self.vision_inputs[key] = Tensor(frame_input, dtype=dtypes.uint8).realize()
|
||||
for key in bufs.keys():
|
||||
self.full_frames[key] = Tensor.from_blob(bufs[key].data.ctypes.data, (self.frame_buf_params[key][3],), dtype='uint8').realize()
|
||||
self.transforms_np[key][:,:] = transforms[key][:,:]
|
||||
|
||||
out = self.update_imgs(self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
|
||||
self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'])
|
||||
self.img_queues['img'], self.img_queues['big_img'], = out[0].realize(), out[2].realize()
|
||||
vision_inputs = {'img': out[1], 'big_img': out[3]}
|
||||
|
||||
if prepare_only:
|
||||
return None
|
||||
|
||||
self.vision_output = self.vision_run(**self.vision_inputs).contiguous().realize().uop.base.buffer.numpy()
|
||||
self.vision_output = self.vision_run(**vision_inputs).contiguous().realize().uop.base.buffer.numpy()
|
||||
vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(self.vision_output, self.vision_output_slices))
|
||||
|
||||
self.full_input_queues.enqueue({'features_buffer': vision_outputs_dict['hidden_state'], 'desire_pulse': new_desire})
|
||||
@@ -214,7 +226,6 @@ class ModelState:
|
||||
|
||||
self.policy_output = self.policy_run(**self.policy_inputs).contiguous().realize().uop.base.buffer.numpy()
|
||||
policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(self.policy_output, self.policy_output_slices))
|
||||
|
||||
combined_outputs_dict = {**vision_outputs_dict, **policy_outputs_dict}
|
||||
if SEND_RAW_PRED:
|
||||
combined_outputs_dict['raw_pred'] = np.concatenate([self.vision_output.copy(), self.policy_output.copy()])
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
#include "selfdrive/modeld/models/commonmodel.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
|
||||
#include "common/clutil.h"
|
||||
|
||||
// Allocates the driving model's frame buffers: a (temporal_skip+1)-slot 20Hz
// ring of model-sized frames plus the packed 2-frame buffer handed to the model.
DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context, int _temporal_skip) : ModelFrame(device_id, context) {
  input_frames = std::make_unique<uint8_t[]>(buf_size);
  temporal_skip = _temporal_skip;
  input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
  img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (temporal_skip+1)*frame_size_bytes, NULL, &err));
  // last_img_cl is a sub-buffer aliasing the newest slot of the ring.
  region.origin = temporal_skip * frame_size_bytes;
  region.size = frame_size_bytes;
  last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err));

  loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
}
|
||||
|
||||
// Warps the camera frame into model space, shifts the 20Hz ring buffer by one
// slot, loads the new frame into the newest slot, then packs the oldest and
// newest frames into input_frames_cl (the model's 2-frame input).
cl_mem* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);

  // Shift each ring slot one position toward the front (slot 0 is the oldest).
  for (int i = 0; i < temporal_skip; i++) {
    CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
  }
  loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);

  // Pack [oldest, newest] into the model input buffer.
  copy_queue(&loadyuv, q, img_buffer_20hz_cl, input_frames_cl, 0, 0, frame_size_bytes);
  copy_queue(&loadyuv, q, last_img_cl, input_frames_cl, 0, frame_size_bytes, frame_size_bytes);

  // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
  clFinish(q);
  return &input_frames_cl;
}
|
||||
|
||||
// Releases the CL buffers and the command queue created in the constructor.
DrivingModelFrame::~DrivingModelFrame() {
  deinit_transform();
  loadyuv_destroy(&loadyuv);
  CL_CHECK(clReleaseMemObject(input_frames_cl));
  CL_CHECK(clReleaseMemObject(img_buffer_20hz_cl));
  CL_CHECK(clReleaseMemObject(last_img_cl));
  CL_CHECK(clReleaseCommandQueue(q));
}
|
||||
|
||||
|
||||
// Allocates a single model-sized input buffer; the DM model takes one frame.
MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
  input_frames = std::make_unique<uint8_t[]>(buf_size);
  input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));

  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
}
|
||||
|
||||
// Warps the camera frame and returns the transformed Y plane directly
// (the DM model consumes luma only — no U/V packing here).
cl_mem* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
  clFinish(q);
  return &y_cl;
}
|
||||
|
||||
// Releases the input buffer and the command queue created in the constructor.
MonitoringModelFrame::~MonitoringModelFrame() {
  deinit_transform();
  CL_CHECK(clReleaseMemObject(input_frame_cl));
  CL_CHECK(clReleaseCommandQueue(q));
}
|
||||
@@ -1,97 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <cfloat>
|
||||
#include <cstdlib>
|
||||
#include <cassert>
|
||||
|
||||
#include <memory>
|
||||
|
||||
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
|
||||
#include "common/mat.h"
|
||||
#include "selfdrive/modeld/transforms/loadyuv.h"
|
||||
#include "selfdrive/modeld/transforms/transform.h"
|
||||
|
||||
// Base class for model frame preparation. Owns the OpenCL command queue and
// the Y/U/V intermediate buffers used by the warp transform; subclasses set
// the model dimensions and implement prepare().
class ModelFrame {
public:
  ModelFrame(cl_device_id device_id, cl_context context) {
    q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
  }
  virtual ~ModelFrame() {}
  // Warp + pack one camera frame; returns a pointer to the model's CL input buffer.
  virtual cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) { return NULL; }
  // Blocking read of a CL buffer into the host-side staging buffer.
  // NOTE: returns a pointer into input_frames, valid until the next call.
  uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) {
    CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr));
    clFinish(q);
    return &input_frames[0];
  }

  // Shadowed by const members in the subclasses with the concrete model sizes.
  int MODEL_WIDTH;
  int MODEL_HEIGHT;
  int MODEL_FRAME_SIZE;
  int buf_size;

protected:
  cl_mem y_cl, u_cl, v_cl;
  Transform transform;
  cl_command_queue q;
  std::unique_ptr<uint8_t[]> input_frames;

  // Allocates the Y/U/V plane buffers (U/V at half resolution) and the transform state.
  void init_transform(cl_device_id device_id, cl_context context, int model_width, int model_height) {
    y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, model_width * model_height, NULL, &err));
    u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
    v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
    transform_init(&transform, context, device_id);
  }

  void deinit_transform() {
    transform_destroy(&transform);
    CL_CHECK(clReleaseMemObject(v_cl));
    CL_CHECK(clReleaseMemObject(u_cl));
    CL_CHECK(clReleaseMemObject(y_cl));
  }

  // Enqueues the perspective warp from the raw NV12 frame into y_cl/u_cl/v_cl.
  void run_transform(cl_mem yuv_cl, int model_width, int model_height, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
    transform_queue(&transform, q,
                    yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
                    y_cl, u_cl, v_cl, model_width, model_height, projection);
  }
};
|
||||
|
||||
// Frame preparation for the driving models: keeps a small temporal ring of
// warped frames and feeds the model a packed [oldest, newest] frame pair.
class DrivingModelFrame : public ModelFrame {
public:
  DrivingModelFrame(cl_device_id device_id, cl_context context, int _temporal_skip);
  ~DrivingModelFrame();
  cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection);

  const int MODEL_WIDTH = 512;
  const int MODEL_HEIGHT = 256;
  const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;  // YUV420: 1.5 bytes/pixel
  const int buf_size = MODEL_FRAME_SIZE * 2; // 2 frames are temporal_skip frames apart
  const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t);

private:
  LoadYUVState loadyuv;
  cl_mem img_buffer_20hz_cl, last_img_cl, input_frames_cl;
  cl_buffer_region region;  // sub-buffer region for last_img_cl
  int temporal_skip;
};
|
||||
|
||||
// Frame preparation for the driver-monitoring model: single luma-only frame.
class MonitoringModelFrame : public ModelFrame {
public:
  MonitoringModelFrame(cl_device_id device_id, cl_context context);
  ~MonitoringModelFrame();
  cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection);

  const int MODEL_WIDTH = 1440;
  const int MODEL_HEIGHT = 960;
  const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT;  // Y plane only
  const int buf_size = MODEL_FRAME_SIZE;

private:
  cl_mem input_frame_cl;
};
|
||||
@@ -1,27 +0,0 @@
|
||||
# distutils: language = c++
# Cython declarations mirroring selfdrive/modeld/models/commonmodel.h
# and the clutil helpers, for use by commonmodel_pyx.pyx.

from msgq.visionipc.visionipc cimport cl_device_id, cl_context, cl_mem

cdef extern from "common/mat.h":
  cdef struct mat3:
    float v[9]

cdef extern from "common/clutil.h":
  cdef unsigned long CL_DEVICE_TYPE_DEFAULT
  cl_device_id cl_get_device_id(unsigned long)
  cl_context cl_create_context(cl_device_id)
  void cl_release_context(cl_context)

cdef extern from "selfdrive/modeld/models/commonmodel.h":
  # Base frame-preparation class; subclasses below add their constructors.
  cppclass ModelFrame:
    int buf_size
    unsigned char * buffer_from_cl(cl_mem*, int);
    cl_mem * prepare(cl_mem, int, int, int, int, mat3)

  cppclass DrivingModelFrame:
    int buf_size
    DrivingModelFrame(cl_device_id, cl_context, int)

  cppclass MonitoringModelFrame:
    int buf_size
    MonitoringModelFrame(cl_device_id, cl_context)
|
||||
@@ -1,13 +0,0 @@
|
||||
# distutils: language = c++
|
||||
|
||||
from msgq.visionipc.visionipc cimport cl_mem
|
||||
from msgq.visionipc.visionipc_pyx cimport CLContext as BaseCLContext
|
||||
|
||||
cdef class CLContext(BaseCLContext):
|
||||
pass
|
||||
|
||||
cdef class CLMem:
|
||||
cdef cl_mem * mem
|
||||
|
||||
@staticmethod
|
||||
cdef create(void*)
|
||||
@@ -1,74 +0,0 @@
|
||||
# distutils: language = c++
|
||||
# cython: c_string_encoding=ascii, language_level=3
|
||||
|
||||
import numpy as np
|
||||
cimport numpy as cnp
|
||||
from libc.string cimport memcpy
|
||||
from libc.stdint cimport uintptr_t
|
||||
|
||||
from msgq.visionipc.visionipc cimport cl_mem
|
||||
from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext
|
||||
from .commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context, cl_release_context
|
||||
from .commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, MonitoringModelFrame as cppMonitoringModelFrame
|
||||
|
||||
|
||||
cdef class CLContext(BaseCLContext):
|
||||
def __cinit__(self):
|
||||
self.device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT)
|
||||
self.context = cl_create_context(self.device_id)
|
||||
|
||||
def __dealloc__(self):
|
||||
if self.context:
|
||||
cl_release_context(self.context)
|
||||
|
||||
cdef class CLMem:
|
||||
@staticmethod
|
||||
cdef create(void * cmem):
|
||||
mem = CLMem()
|
||||
mem.mem = <cl_mem*> cmem
|
||||
return mem
|
||||
|
||||
@property
|
||||
def mem_address(self):
|
||||
return <uintptr_t>(self.mem)
|
||||
|
||||
def cl_from_visionbuf(VisionBuf buf):
|
||||
return CLMem.create(<void*>&buf.buf.buf_cl)
|
||||
|
||||
|
||||
cdef class ModelFrame:
|
||||
cdef cppModelFrame * frame
|
||||
cdef int buf_size
|
||||
|
||||
def __dealloc__(self):
|
||||
del self.frame
|
||||
|
||||
def prepare(self, VisionBuf buf, float[:] projection):
|
||||
cdef mat3 cprojection
|
||||
memcpy(cprojection.v, &projection[0], 9*sizeof(float))
|
||||
cdef cl_mem * data
|
||||
data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection)
|
||||
return CLMem.create(data)
|
||||
|
||||
def buffer_from_cl(self, CLMem in_frames):
|
||||
cdef unsigned char * data2
|
||||
data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size)
|
||||
return np.asarray(<cnp.uint8_t[:self.buf_size]> data2)
|
||||
|
||||
|
||||
cdef class DrivingModelFrame(ModelFrame):
|
||||
cdef cppDrivingModelFrame * _frame
|
||||
|
||||
def __cinit__(self, CLContext context, int temporal_skip):
|
||||
self._frame = new cppDrivingModelFrame(context.device_id, context.context, temporal_skip)
|
||||
self.frame = <cppModelFrame*>(self._frame)
|
||||
self.buf_size = self._frame.buf_size
|
||||
|
||||
cdef class MonitoringModelFrame(ModelFrame):
|
||||
cdef cppMonitoringModelFrame * _frame
|
||||
|
||||
def __cinit__(self, CLContext context):
|
||||
self._frame = new cppMonitoringModelFrame(context.device_id, context.context)
|
||||
self.frame = <cppModelFrame*>(self._frame)
|
||||
self.buf_size = self._frame.buf_size
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
#include "selfdrive/modeld/transforms/loadyuv.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height) {
|
||||
memset(s, 0, sizeof(*s));
|
||||
|
||||
s->width = width;
|
||||
s->height = height;
|
||||
|
||||
char args[1024];
|
||||
snprintf(args, sizeof(args),
|
||||
"-cl-fast-relaxed-math -cl-denorms-are-zero "
|
||||
"-DTRANSFORMED_WIDTH=%d -DTRANSFORMED_HEIGHT=%d",
|
||||
width, height);
|
||||
cl_program prg = cl_program_from_file(ctx, device_id, LOADYUV_PATH, args);
|
||||
|
||||
s->loadys_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loadys", &err));
|
||||
s->loaduv_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loaduv", &err));
|
||||
s->copy_krnl = CL_CHECK_ERR(clCreateKernel(prg, "copy", &err));
|
||||
|
||||
// done with this
|
||||
CL_CHECK(clReleaseProgram(prg));
|
||||
}
|
||||
|
||||
void loadyuv_destroy(LoadYUVState* s) {
|
||||
CL_CHECK(clReleaseKernel(s->loadys_krnl));
|
||||
CL_CHECK(clReleaseKernel(s->loaduv_krnl));
|
||||
CL_CHECK(clReleaseKernel(s->copy_krnl));
|
||||
}
|
||||
|
||||
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
|
||||
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
|
||||
cl_mem out_cl) {
|
||||
cl_int global_out_off = 0;
|
||||
|
||||
CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loadys_krnl, 2, sizeof(cl_int), &global_out_off));
|
||||
|
||||
const size_t loadys_work_size = (s->width*s->height)/8;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->loadys_krnl, 1, NULL,
|
||||
&loadys_work_size, NULL, 0, 0, NULL));
|
||||
|
||||
const size_t loaduv_work_size = ((s->width/2)*(s->height/2))/8;
|
||||
global_out_off += (s->width*s->height);
|
||||
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &u_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
|
||||
&loaduv_work_size, NULL, 0, 0, NULL));
|
||||
|
||||
global_out_off += (s->width/2)*(s->height/2);
|
||||
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &v_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
|
||||
&loaduv_work_size, NULL, 0, 0, NULL));
|
||||
}
|
||||
|
||||
void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst,
|
||||
size_t src_offset, size_t dst_offset, size_t size) {
|
||||
CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &src));
|
||||
CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_mem), &dst));
|
||||
CL_CHECK(clSetKernelArg(s->copy_krnl, 2, sizeof(cl_int), &src_offset));
|
||||
CL_CHECK(clSetKernelArg(s->copy_krnl, 3, sizeof(cl_int), &dst_offset));
|
||||
const size_t copy_work_size = size/8;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
|
||||
©_work_size, NULL, 0, 0, NULL));
|
||||
}
|
||||
@@ -1,47 +0,0 @@
|
||||
#define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2))
|
||||
|
||||
__kernel void loadys(__global uchar8 const * const Y,
|
||||
__global uchar * out,
|
||||
int out_offset)
|
||||
{
|
||||
const int gid = get_global_id(0);
|
||||
const int ois = gid * 8;
|
||||
const int oy = ois / TRANSFORMED_WIDTH;
|
||||
const int ox = ois % TRANSFORMED_WIDTH;
|
||||
|
||||
const uchar8 ys = Y[gid];
|
||||
|
||||
// 02
|
||||
// 13
|
||||
|
||||
__global uchar* outy0;
|
||||
__global uchar* outy1;
|
||||
if ((oy & 1) == 0) {
|
||||
outy0 = out + out_offset; //y0
|
||||
outy1 = out + out_offset + UV_SIZE*2; //y2
|
||||
} else {
|
||||
outy0 = out + out_offset + UV_SIZE; //y1
|
||||
outy1 = out + out_offset + UV_SIZE*3; //y3
|
||||
}
|
||||
|
||||
vstore4(ys.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
|
||||
vstore4(ys.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
|
||||
}
|
||||
|
||||
__kernel void loaduv(__global uchar8 const * const in,
|
||||
__global uchar8 * out,
|
||||
int out_offset)
|
||||
{
|
||||
const int gid = get_global_id(0);
|
||||
const uchar8 inv = in[gid];
|
||||
out[gid + out_offset / 8] = inv;
|
||||
}
|
||||
|
||||
__kernel void copy(__global uchar8 * in,
|
||||
__global uchar8 * out,
|
||||
int in_offset,
|
||||
int out_offset)
|
||||
{
|
||||
const int gid = get_global_id(0);
|
||||
out[gid + out_offset / 8] = in[gid + in_offset / 8];
|
||||
}
|
||||
@@ -1,20 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "common/clutil.h"
|
||||
|
||||
typedef struct {
|
||||
int width, height;
|
||||
cl_kernel loadys_krnl, loaduv_krnl, copy_krnl;
|
||||
} LoadYUVState;
|
||||
|
||||
void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height);
|
||||
|
||||
void loadyuv_destroy(LoadYUVState* s);
|
||||
|
||||
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
|
||||
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
|
||||
cl_mem out_cl);
|
||||
|
||||
|
||||
void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst,
|
||||
size_t src_offset, size_t dst_offset, size_t size);
|
||||
@@ -1,97 +0,0 @@
|
||||
#include "selfdrive/modeld/transforms/transform.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
#include "common/clutil.h"
|
||||
|
||||
void transform_init(Transform* s, cl_context ctx, cl_device_id device_id) {
|
||||
memset(s, 0, sizeof(*s));
|
||||
|
||||
cl_program prg = cl_program_from_file(ctx, device_id, TRANSFORM_PATH, "");
|
||||
s->krnl = CL_CHECK_ERR(clCreateKernel(prg, "warpPerspective", &err));
|
||||
// done with this
|
||||
CL_CHECK(clReleaseProgram(prg));
|
||||
|
||||
s->m_y_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err));
|
||||
s->m_uv_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err));
|
||||
}
|
||||
|
||||
void transform_destroy(Transform* s) {
|
||||
CL_CHECK(clReleaseMemObject(s->m_y_cl));
|
||||
CL_CHECK(clReleaseMemObject(s->m_uv_cl));
|
||||
CL_CHECK(clReleaseKernel(s->krnl));
|
||||
}
|
||||
|
||||
void transform_queue(Transform* s,
|
||||
cl_command_queue q,
|
||||
cl_mem in_yuv, int in_width, int in_height, int in_stride, int in_uv_offset,
|
||||
cl_mem out_y, cl_mem out_u, cl_mem out_v,
|
||||
int out_width, int out_height,
|
||||
const mat3& projection) {
|
||||
const int zero = 0;
|
||||
|
||||
// sampled using pixel center origin
|
||||
// (because that's how fastcv and opencv does it)
|
||||
|
||||
mat3 projection_y = projection;
|
||||
|
||||
// in and out uv is half the size of y.
|
||||
mat3 projection_uv = transform_scale_buffer(projection, 0.5);
|
||||
|
||||
CL_CHECK(clEnqueueWriteBuffer(q, s->m_y_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_y.v, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueWriteBuffer(q, s->m_uv_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_uv.v, 0, NULL, NULL));
|
||||
|
||||
const int in_y_width = in_width;
|
||||
const int in_y_height = in_height;
|
||||
const int in_y_px_stride = 1;
|
||||
const int in_uv_width = in_width/2;
|
||||
const int in_uv_height = in_height/2;
|
||||
const int in_uv_px_stride = 2;
|
||||
const int in_u_offset = in_uv_offset;
|
||||
const int in_v_offset = in_uv_offset + 1;
|
||||
|
||||
const int out_y_width = out_width;
|
||||
const int out_y_height = out_height;
|
||||
const int out_uv_width = out_width/2;
|
||||
const int out_uv_height = out_height/2;
|
||||
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 0, sizeof(cl_mem), &in_yuv)); // src
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 1, sizeof(cl_int), &in_stride)); // src_row_stride
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_y_px_stride)); // src_px_stride
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &zero)); // src_offset
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_y_height)); // src_rows
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_y_width)); // src_cols
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_y)); // dst
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_y_width)); // dst_row_stride
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero)); // dst_offset
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_y_height)); // dst_rows
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_y_width)); // dst_cols
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_y_cl)); // M
|
||||
|
||||
const size_t work_size_y[2] = {(size_t)out_y_width, (size_t)out_y_height};
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
|
||||
(const size_t*)&work_size_y, NULL, 0, 0, NULL));
|
||||
|
||||
const size_t work_size_uv[2] = {(size_t)out_uv_width, (size_t)out_uv_height};
|
||||
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_uv_px_stride)); // src_px_stride
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_u_offset)); // src_offset
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_uv_height)); // src_rows
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_uv_width)); // src_cols
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_u)); // dst
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_uv_width)); // dst_row_stride
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero)); // dst_offset
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_uv_height)); // dst_rows
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_uv_width)); // dst_cols
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_uv_cl)); // M
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
|
||||
(const size_t*)&work_size_uv, NULL, 0, 0, NULL));
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_v_offset)); // src_ofset
|
||||
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_v)); // dst
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
|
||||
(const size_t*)&work_size_uv, NULL, 0, 0, NULL));
|
||||
}
|
||||
@@ -1,54 +0,0 @@
|
||||
#define INTER_BITS 5
|
||||
#define INTER_TAB_SIZE (1 << INTER_BITS)
|
||||
#define INTER_SCALE 1.f / INTER_TAB_SIZE
|
||||
|
||||
#define INTER_REMAP_COEF_BITS 15
|
||||
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
|
||||
|
||||
__kernel void warpPerspective(__global const uchar * src,
|
||||
int src_row_stride, int src_px_stride, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * dst,
|
||||
int dst_row_stride, int dst_offset, int dst_rows, int dst_cols,
|
||||
__constant float * M)
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
|
||||
if (dx < dst_cols && dy < dst_rows)
|
||||
{
|
||||
float X0 = M[0] * dx + M[1] * dy + M[2];
|
||||
float Y0 = M[3] * dx + M[4] * dy + M[5];
|
||||
float W = M[6] * dx + M[7] * dy + M[8];
|
||||
W = W != 0.0f ? INTER_TAB_SIZE / W : 0.0f;
|
||||
int X = rint(X0 * W), Y = rint(Y0 * W);
|
||||
|
||||
int sx = convert_short_sat(X >> INTER_BITS);
|
||||
int sy = convert_short_sat(Y >> INTER_BITS);
|
||||
|
||||
short sx_clamp = clamp(sx, 0, src_cols - 1);
|
||||
short sx_p1_clamp = clamp(sx + 1, 0, src_cols - 1);
|
||||
short sy_clamp = clamp(sy, 0, src_rows - 1);
|
||||
short sy_p1_clamp = clamp(sy + 1, 0, src_rows - 1);
|
||||
int v0 = convert_int(src[mad24(sy_clamp, src_row_stride, src_offset + sx_clamp*src_px_stride)]);
|
||||
int v1 = convert_int(src[mad24(sy_clamp, src_row_stride, src_offset + sx_p1_clamp*src_px_stride)]);
|
||||
int v2 = convert_int(src[mad24(sy_p1_clamp, src_row_stride, src_offset + sx_clamp*src_px_stride)]);
|
||||
int v3 = convert_int(src[mad24(sy_p1_clamp, src_row_stride, src_offset + sx_p1_clamp*src_px_stride)]);
|
||||
|
||||
short ay = (short)(Y & (INTER_TAB_SIZE - 1));
|
||||
short ax = (short)(X & (INTER_TAB_SIZE - 1));
|
||||
float taby = 1.f/INTER_TAB_SIZE*ay;
|
||||
float tabx = 1.f/INTER_TAB_SIZE*ax;
|
||||
|
||||
int dst_index = mad24(dy, dst_row_stride, dst_offset + dx);
|
||||
|
||||
int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
|
||||
int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
|
||||
int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
|
||||
int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
|
||||
|
||||
int val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
|
||||
|
||||
uchar pix = convert_uchar_sat((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS);
|
||||
dst[dst_index] = pix;
|
||||
}
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
|
||||
#include "common/mat.h"
|
||||
|
||||
typedef struct {
|
||||
cl_kernel krnl;
|
||||
cl_mem m_y_cl, m_uv_cl;
|
||||
} Transform;
|
||||
|
||||
void transform_init(Transform* s, cl_context ctx, cl_device_id device_id);
|
||||
|
||||
void transform_destroy(Transform* transform);
|
||||
|
||||
void transform_queue(Transform* s, cl_command_queue q,
|
||||
cl_mem yuv, int in_width, int in_height, int in_stride, int in_uv_offset,
|
||||
cl_mem out_y, cl_mem out_u, cl_mem out_v,
|
||||
int out_width, int out_height,
|
||||
const mat3& projection);
|
||||
@@ -34,7 +34,7 @@ GITHUB = GithubUtils(API_TOKEN, DATA_TOKEN)
|
||||
|
||||
EXEC_TIMINGS = [
|
||||
# model, instant max, average max
|
||||
("modelV2", 0.035, 0.025),
|
||||
("modelV2", 0.035, 0.028),
|
||||
("driverStateV2", 0.02, 0.015),
|
||||
]
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import time
|
||||
import copy
|
||||
import heapq
|
||||
import signal
|
||||
import numpy as np
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from itertools import islice
|
||||
@@ -23,6 +24,7 @@ from openpilot.common.params import Params
|
||||
from openpilot.common.prefix import OpenpilotPrefix
|
||||
from openpilot.common.timeout import Timeout
|
||||
from openpilot.common.realtime import DT_CTRL
|
||||
from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
|
||||
from openpilot.system.manager.process_config import managed_processes
|
||||
from openpilot.selfdrive.test.process_replay.vision_meta import meta_from_camera_state, available_streams
|
||||
from openpilot.selfdrive.test.process_replay.migration import migrate_all
|
||||
@@ -203,7 +205,8 @@ class ProcessContainer:
|
||||
if meta.camera_state in self.cfg.vision_pubs:
|
||||
assert frs[meta.camera_state].pix_fmt == 'nv12'
|
||||
frame_size = (frs[meta.camera_state].w, frs[meta.camera_state].h)
|
||||
vipc_server.create_buffers(meta.stream, 2, *frame_size)
|
||||
stride, y_height, _, yuv_size = get_nv12_info(frame_size[0], frame_size[1])
|
||||
vipc_server.create_buffers_with_sizes(meta.stream, 2, frame_size[0], frame_size[1], yuv_size, stride, stride * y_height)
|
||||
vipc_server.start_listener()
|
||||
|
||||
self.vipc_server = vipc_server
|
||||
@@ -300,7 +303,15 @@ class ProcessContainer:
|
||||
camera_meta = meta_from_camera_state(m.which())
|
||||
assert frs is not None
|
||||
img = frs[m.which()].get(camera_state.frameId)
|
||||
self.vipc_server.send(camera_meta.stream, img.flatten().tobytes(),
|
||||
|
||||
h, w = frs[m.which()].h, frs[m.which()].w
|
||||
stride, y_height, _, yuv_size = get_nv12_info(w, h)
|
||||
uv_offset = stride * y_height
|
||||
padded_img = np.zeros((yuv_size), dtype=np.uint8).reshape((-1, stride))
|
||||
padded_img[:h, :w] = img[:h * w].reshape((-1, w))
|
||||
padded_img[uv_offset // stride:uv_offset // stride + h // 2, :w] = img[h * w:].reshape((-1, w))
|
||||
|
||||
self.vipc_server.send(camera_meta.stream, padded_img.flatten().tobytes(),
|
||||
camera_state.frameId, camera_state.timestampSof, camera_state.timestampEof)
|
||||
self.msg_queue = []
|
||||
|
||||
|
||||
Reference in New Issue
Block a user