exec DM model with gpu (#33609)

* half

old-commit-hash: 9f72eca003d4637ca7fe847ebaf925c694fc2e84

* optimed

old-commit-hash: 6e36e2a12e09275ec21d1590012a92b05ca52ff5

* thneed

old-commit-hash: 419a06c09c0c767d828bcd1e118bc575898c343a

* exec

old-commit-hash: 0059c27ec11b076a37f65d604ed135ea6541b1a6

* runner

old-commit-hash: 34232ada94450ce541eaef546197fa219810a891

* runs but

old-commit-hash: 3db37c00b6a64908293b4de8d8b56e80308cd8f2

* it is 01

old-commit-hash: a160d81eb1a7e77abbef959b44f602610f68f665

* np

old-commit-hash: c1caff6ba648cc2c0094c71b2ea074f01c3c2dc8

* module url

old-commit-hash: 6f4902c4d384263a53e2c1d14d93b5ff864b6a5f

* new

old-commit-hash: 779ae79b1bc3df6374fb6663ac8592e107a6e504

* ds fast

* is this work

* correction

* real timing

* no reg

* interim gather

* 0e4a9c7b

* fa69be01, and halve

* list

* cleanup

* slightly faster

* setproctitle

* expected

* replay ref

* more powar

* reluctantly

* bump tg

* 8

* less

* less

* bump tg

* better than exp

* closer

* cc

* see diff

* commits

* was right

* to 32 cast

* remove dlc file

* support both

* dspExecutionTime -> gpuExecutionTime

* ignore

* time ref

* ref commit

* last

---------

Co-authored-by: Comma Device <device@comma.ai>
This commit is contained in:
ZwX1616 2024-09-26 16:40:44 -07:00 committed by GitHub
parent e2f9942633
commit 876f192112
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 53 additions and 35 deletions

1
.gitattributes vendored
View File

@ -2,7 +2,6 @@
# to move existing files into LFS: # to move existing files into LFS:
# git add --renormalize . # git add --renormalize .
*.dlc filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text *.onnx filter=lfs diff=lfs merge=lfs -text
*.svg filter=lfs diff=lfs merge=lfs -text *.svg filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text *.png filter=lfs diff=lfs merge=lfs -text

View File

@ -2012,7 +2012,8 @@ struct Joystick {
struct DriverStateV2 { struct DriverStateV2 {
frameId @0 :UInt32; frameId @0 :UInt32;
modelExecutionTime @1 :Float32; modelExecutionTime @1 :Float32;
dspExecutionTime @2 :Float32; dspExecutionTimeDEPRECATED @2 :Float32;
gpuExecutionTime @8 :Float32;
rawPredictions @3 :Data; rawPredictions @3 :Data;
poorVisionProb @4 :Float32; poorVisionProb @4 :Float32;

View File

@ -69,6 +69,10 @@ if arch == "larch64" or GetOption('pc_thneed'):
lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd) lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)
fn_dm = File("models/dmonitoring_model").abspath
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn_dm}.onnx {fn_dm}.thneed"
lenv.Command(fn_dm + ".thneed", [fn_dm + ".onnx"] + tinygrad_files, cmd)
thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl']) thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc']) thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL']) lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])

View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
cd "$DIR/../../"
if [ -f "$DIR/libthneed.so" ]; then
export LD_PRELOAD="$DIR/libthneed.so"
fi
exec "$DIR/dmonitoringmodeld.py" "$@"

View File

@ -6,6 +6,7 @@ import time
import ctypes import ctypes
import numpy as np import numpy as np
from pathlib import Path from pathlib import Path
from setproctitle import setproctitle
from cereal import messaging from cereal import messaging
from cereal.messaging import PubMaster, SubMaster from cereal.messaging import PubMaster, SubMaster
@ -14,16 +15,19 @@ from openpilot.common.swaglog import cloudlog
from openpilot.common.params import Params from openpilot.common.params import Params
from openpilot.common.realtime import set_realtime_priority from openpilot.common.realtime import set_realtime_priority
from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext
from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid
CALIB_LEN = 3 CALIB_LEN = 3
REG_SCALE = 0.25
MODEL_WIDTH = 1440 MODEL_WIDTH = 1440
MODEL_HEIGHT = 960 MODEL_HEIGHT = 960
OUTPUT_SIZE = 84 FEATURE_LEN = 512
OUTPUT_SIZE = 84 + FEATURE_LEN
PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED') SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
MODEL_PATHS = { MODEL_PATHS = {
ModelRunner.SNPE: Path(__file__).parent / 'models/dmonitoring_model_q.dlc', ModelRunner.THNEED: Path(__file__).parent / 'models/dmonitoring_model.thneed',
ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'} ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'}
class DriverStateResult(ctypes.Structure): class DriverStateResult(ctypes.Structure):
@ -49,21 +53,22 @@ class DMonitoringModelResult(ctypes.Structure):
("driver_state_lhd", DriverStateResult), ("driver_state_lhd", DriverStateResult),
("driver_state_rhd", DriverStateResult), ("driver_state_rhd", DriverStateResult),
("poor_vision_prob", ctypes.c_float), ("poor_vision_prob", ctypes.c_float),
("wheel_on_right_prob", ctypes.c_float)] ("wheel_on_right_prob", ctypes.c_float),
("features", ctypes.c_float*FEATURE_LEN)]
class ModelState: class ModelState:
inputs: dict[str, np.ndarray] inputs: dict[str, np.ndarray]
output: np.ndarray output: np.ndarray
model: ModelRunner model: ModelRunner
def __init__(self): def __init__(self, cl_ctx):
assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float) assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)
self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32) self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32)
self.inputs = { self.inputs = {
'input_img': np.zeros(MODEL_HEIGHT * MODEL_WIDTH, dtype=np.uint8), 'input_img': np.zeros(MODEL_HEIGHT * MODEL_WIDTH, dtype=np.uint8),
'calib': np.zeros(CALIB_LEN, dtype=np.float32)} 'calib': np.zeros(CALIB_LEN, dtype=np.float32)}
self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.DSP, True, None) self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, cl_ctx)
self.model.addInput("input_img", None) self.model.addInput("input_img", None)
self.model.addInput("calib", self.inputs['calib']) self.model.addInput("calib", self.inputs['calib'])
@ -76,17 +81,17 @@ class ModelState:
input_data = self.inputs['input_img'].reshape(MODEL_HEIGHT, MODEL_WIDTH) input_data = self.inputs['input_img'].reshape(MODEL_HEIGHT, MODEL_WIDTH)
input_data[:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH] input_data[:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH]
t1 = time.perf_counter()
self.model.setInputBuffer("input_img", self.inputs['input_img'].view(np.float32)) self.model.setInputBuffer("input_img", self.inputs['input_img'].view(np.float32))
t1 = time.perf_counter()
self.model.execute() self.model.execute()
t2 = time.perf_counter() t2 = time.perf_counter()
return self.output, t2 - t1 return self.output, t2 - t1
def fill_driver_state(msg, ds_result: DriverStateResult): def fill_driver_state(msg, ds_result: DriverStateResult):
msg.faceOrientation = [x * REG_SCALE for x in ds_result.face_orientation] msg.faceOrientation = list(ds_result.face_orientation)
msg.faceOrientationStd = [math.exp(x) for x in ds_result.face_orientation_std] msg.faceOrientationStd = [math.exp(x) for x in ds_result.face_orientation_std]
msg.facePosition = [x * REG_SCALE for x in ds_result.face_position[:2]] msg.facePosition = list(ds_result.face_position[:2])
msg.facePositionStd = [math.exp(x) for x in ds_result.face_position_std[:2]] msg.facePositionStd = [math.exp(x) for x in ds_result.face_position_std[:2]]
msg.faceProb = float(sigmoid(ds_result.face_prob)) msg.faceProb = float(sigmoid(ds_result.face_prob))
msg.leftEyeProb = float(sigmoid(ds_result.left_eye_prob)) msg.leftEyeProb = float(sigmoid(ds_result.left_eye_prob))
@ -98,13 +103,13 @@ def fill_driver_state(msg, ds_result: DriverStateResult):
msg.readyProb = [float(sigmoid(x)) for x in ds_result.ready_prob] msg.readyProb = [float(sigmoid(x)) for x in ds_result.ready_prob]
msg.notReadyProb = [float(sigmoid(x)) for x in ds_result.not_ready_prob] msg.notReadyProb = [float(sigmoid(x)) for x in ds_result.not_ready_prob]
def get_driverstate_packet(model_output: np.ndarray, frame_id: int, location_ts: int, execution_time: float, dsp_execution_time: float): def get_driverstate_packet(model_output: np.ndarray, frame_id: int, location_ts: int, execution_time: float, gpu_execution_time: float):
model_result = ctypes.cast(model_output.ctypes.data, ctypes.POINTER(DMonitoringModelResult)).contents model_result = ctypes.cast(model_output.ctypes.data, ctypes.POINTER(DMonitoringModelResult)).contents
msg = messaging.new_message('driverStateV2', valid=True) msg = messaging.new_message('driverStateV2', valid=True)
ds = msg.driverStateV2 ds = msg.driverStateV2
ds.frameId = frame_id ds.frameId = frame_id
ds.modelExecutionTime = execution_time ds.modelExecutionTime = execution_time
ds.dspExecutionTime = dsp_execution_time ds.gpuExecutionTime = gpu_execution_time
ds.poorVisionProb = float(sigmoid(model_result.poor_vision_prob)) ds.poorVisionProb = float(sigmoid(model_result.poor_vision_prob))
ds.wheelOnRightProb = float(sigmoid(model_result.wheel_on_right_prob)) ds.wheelOnRightProb = float(sigmoid(model_result.wheel_on_right_prob))
ds.rawPredictions = model_output.tobytes() if SEND_RAW_PRED else b'' ds.rawPredictions = model_output.tobytes() if SEND_RAW_PRED else b''
@ -115,14 +120,16 @@ def get_driverstate_packet(model_output: np.ndarray, frame_id: int, location_ts:
def main(): def main():
gc.disable() gc.disable()
setproctitle(PROCESS_NAME)
set_realtime_priority(1) set_realtime_priority(1)
model = ModelState() cl_context = CLContext()
model = ModelState(cl_context)
cloudlog.warning("models loaded, dmonitoringmodeld starting") cloudlog.warning("models loaded, dmonitoringmodeld starting")
Params().put_bool("DmModelInitialized", True) Params().put_bool("DmModelInitialized", True)
cloudlog.warning("connecting to driver stream") cloudlog.warning("connecting to driver stream")
vipc_client = VisionIpcClient("camerad", VisionStreamType.VISION_STREAM_DRIVER, True) vipc_client = VisionIpcClient("camerad", VisionStreamType.VISION_STREAM_DRIVER, True, cl_context)
while not vipc_client.connect(False): while not vipc_client.connect(False):
time.sleep(0.1) time.sleep(0.1)
assert vipc_client.is_connected() assert vipc_client.is_connected()
@ -144,10 +151,10 @@ def main():
calib[:] = np.array(sm["liveCalibration"].rpyCalib) calib[:] = np.array(sm["liveCalibration"].rpyCalib)
t1 = time.perf_counter() t1 = time.perf_counter()
model_output, dsp_execution_time = model.run(buf, calib) model_output, gpu_execution_time = model.run(buf, calib)
t2 = time.perf_counter() t2 = time.perf_counter()
pm.send("driverStateV2", get_driverstate_packet(model_output, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, dsp_execution_time)) pm.send("driverStateV2", get_driverstate_packet(model_output, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, gpu_execution_time))
# print("dmonitoring process: %.2fms, from last %.2fms\n" % (t2 - t1, t1 - last)) # print("dmonitoring process: %.2fms, from last %.2fms\n" % (t2 - t1, t1 - last))
# last = t1 # last = t1

View File

@ -1,2 +1,2 @@
5ec97a39-0095-4cea-adfa-6d72b1966cc1 fa69be01-b430-4504-9d72-7dcb058eb6dd
26cac7a9757a27c783a365403040a1bd27ccdaea d9fb22d1c4fa3ca3d201dbc8edf1d0f0918e53e6

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1 version https://git-lfs.github.com/spec/v1
oid sha256:3dd3982940d823c4fbb0429b733a0b78b0688d7d67aa76ff7b754a3e2f3d8683 oid sha256:50efe6451a3fb3fa04b6bb0e846544533329bd46ecefe9e657e91214dee2aaeb
size 16132780 size 7196502

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7c26f13816b143f5bb29ac2980f8557bd5687a75729e4d895313fb9a5a1f0f46
size 4488449

View File

@ -67,7 +67,6 @@ class ONNXModel(RunModel):
def __init__(self, path, output, runtime, use_tf8, cl_context): def __init__(self, path, output, runtime, use_tf8, cl_context):
self.inputs = {} self.inputs = {}
self.output = output self.output = output
self.use_tf8 = use_tf8
self.session = create_ort_session(path, fp16_to_fp32=True) self.session = create_ort_session(path, fp16_to_fp32=True)
self.input_names = [x.name for x in self.session.get_inputs()] self.input_names = [x.name for x in self.session.get_inputs()]
@ -91,7 +90,7 @@ class ONNXModel(RunModel):
return None return None
def execute(self): def execute(self):
inputs = {k: (v.view(np.uint8) / 255. if self.use_tf8 and k == 'input_img' else v) for k,v in self.inputs.items()} inputs = {k: v.view(self.input_dtypes[k]) for k,v in self.inputs.items()}
inputs = {k: v.reshape(self.input_shapes[k]).astype(self.input_dtypes[k]) for k,v in inputs.items()} inputs = {k: v.reshape(self.input_shapes[k]).astype(self.input_dtypes[k]) for k,v in inputs.items()}
outputs = self.session.run(None, inputs) outputs = self.session.run(None, inputs)
assert len(outputs) == 1, "Only single model outputs are supported" assert len(outputs) == 1, "Only single model outputs are supported"

View File

@ -33,8 +33,8 @@ class DRIVER_MONITOR_SETTINGS:
self._SG_THRESHOLD = 0.9 self._SG_THRESHOLD = 0.9
self._BLINK_THRESHOLD = 0.865 self._BLINK_THRESHOLD = 0.865
self._EE_THRESH11 = 0.25 self._EE_THRESH11 = 0.4
self._EE_THRESH12 = 7.5 self._EE_THRESH12 = 15.0
self._EE_MAX_OFFSET1 = 0.06 self._EE_MAX_OFFSET1 = 0.06
self._EE_MIN_OFFSET1 = 0.025 self._EE_MIN_OFFSET1 = 0.025
self._EE_THRESH21 = 0.01 self._EE_THRESH21 = 0.01

View File

@ -109,7 +109,7 @@ if __name__ == "__main__":
'modelV2.frameDropPerc', 'modelV2.frameDropPerc',
'modelV2.modelExecutionTime', 'modelV2.modelExecutionTime',
'driverStateV2.modelExecutionTime', 'driverStateV2.modelExecutionTime',
'driverStateV2.dspExecutionTime' 'driverStateV2.gpuExecutionTime'
] ]
if PC: if PC:
# TODO We ignore whole bunch so we can compare important stuff # TODO We ignore whole bunch so we can compare important stuff

View File

@ -1 +1 @@
666448fce191e196aac68d06e29a0745e6620db9 7cd64f431b814adfa11118643efe3822c496922b

View File

@ -585,7 +585,7 @@ CONFIGS = [
proc_name="dmonitoringmodeld", proc_name="dmonitoringmodeld",
pubs=["liveCalibration", "driverCameraState"], pubs=["liveCalibration", "driverCameraState"],
subs=["driverStateV2"], subs=["driverStateV2"],
ignore=["logMonoTime", "driverStateV2.modelExecutionTime", "driverStateV2.dspExecutionTime"], ignore=["logMonoTime", "driverStateV2.modelExecutionTime", "driverStateV2.gpuExecutionTime"],
should_recv_callback=dmonitoringmodeld_rcv_callback, should_recv_callback=dmonitoringmodeld_rcv_callback,
tolerance=NUMPY_TOLERANCE, tolerance=NUMPY_TOLERANCE,
processing_time=0.020, processing_time=0.020,

View File

@ -32,6 +32,7 @@ CPU usage budget
* total CPU usage of openpilot (sum(PROCS.values()) * total CPU usage of openpilot (sum(PROCS.values())
should not exceed MAX_TOTAL_CPU should not exceed MAX_TOTAL_CPU
""" """
MAX_TOTAL_CPU = 265. # total for all 8 cores MAX_TOTAL_CPU = 265. # total for all 8 cores
PROCS = { PROCS = {
# Baseline CPU usage by process # Baseline CPU usage by process
@ -312,7 +313,7 @@ class TestOnroad:
assert max(mems) - min(mems) <= 3.0 assert max(mems) - min(mems) <= 3.0
def test_gpu_usage(self): def test_gpu_usage(self):
assert self.gpu_procs == {"weston", "ui", "camerad", "selfdrive.modeld.modeld"} assert self.gpu_procs == {"weston", "ui", "camerad", "selfdrive.modeld.modeld", "selfdrive.modeld.dmonitoringmodeld"}
def test_camera_processing_time(self): def test_camera_processing_time(self):
result = "\n" result = "\n"

View File

@ -34,7 +34,7 @@ class Proc:
PROCS = [ PROCS = [
Proc(['camerad'], 2.1, msgs=['roadCameraState', 'wideRoadCameraState', 'driverCameraState']), Proc(['camerad'], 2.1, msgs=['roadCameraState', 'wideRoadCameraState', 'driverCameraState']),
Proc(['modeld'], 1.12, atol=0.2, msgs=['modelV2']), Proc(['modeld'], 1.12, atol=0.2, msgs=['modelV2']),
Proc(['dmonitoringmodeld'], 0.4, msgs=['driverStateV2']), Proc(['dmonitoringmodeld'], 0.5, msgs=['driverStateV2']),
Proc(['encoderd'], 0.23, msgs=[]), Proc(['encoderd'], 0.23, msgs=[]),
] ]

View File

@ -70,7 +70,7 @@ procs = [
PythonProcess("micd", "system.micd", iscar), PythonProcess("micd", "system.micd", iscar),
PythonProcess("timed", "system.timed", always_run, enabled=not PC), PythonProcess("timed", "system.timed", always_run, enabled=not PC),
PythonProcess("dmonitoringmodeld", "selfdrive.modeld.dmonitoringmodeld", driverview, enabled=(not PC or WEBCAM)), NativeProcess("dmonitoringmodeld", "selfdrive/modeld", ["./dmonitoringmodeld"], driverview, enabled=(not PC or WEBCAM)),
NativeProcess("encoderd", "system/loggerd", ["./encoderd"], only_onroad), NativeProcess("encoderd", "system/loggerd", ["./encoderd"], only_onroad),
NativeProcess("stream_encoderd", "system/loggerd", ["./encoderd", "--stream"], notcar), NativeProcess("stream_encoderd", "system/loggerd", ["./encoderd", "--stream"], notcar),
NativeProcess("loggerd", "system/loggerd", ["./loggerd"], logging), NativeProcess("loggerd", "system/loggerd", ["./loggerd"], logging),

@ -1 +1 @@
Subproject commit f51aa0fc7cdbac710e640172db280cfb747d2718 Subproject commit 3e15fa0daefae75e2ddef98f82be5b5d37820631