2021-03-05 18:03:23 +08:00
|
|
|
import importlib
|
|
|
|
import os
|
|
|
|
import signal
|
2021-11-17 19:02:31 +08:00
|
|
|
import struct
|
2021-03-05 18:03:23 +08:00
|
|
|
import time
|
|
|
|
import subprocess
|
2022-05-04 14:30:35 +08:00
|
|
|
from typing import Optional, Callable, List, ValuesView
|
2021-03-05 18:03:23 +08:00
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
from multiprocessing import Process
|
|
|
|
|
|
|
|
from setproctitle import setproctitle # pylint: disable=no-name-in-module
|
|
|
|
|
2023-08-21 11:49:55 +08:00
|
|
|
from cereal import car, log
|
2021-03-05 18:03:23 +08:00
|
|
|
import cereal.messaging as messaging
|
2023-08-21 11:49:55 +08:00
|
|
|
import openpilot.selfdrive.sentry as sentry
|
|
|
|
from openpilot.common.basedir import BASEDIR
|
|
|
|
from openpilot.common.params import Params
|
|
|
|
from openpilot.system.swaglog import cloudlog
|
2021-03-05 18:03:23 +08:00
|
|
|
|
2021-03-09 11:17:46 +08:00
|
|
|
# Path prefix of the per-process watchdog files; the child's pid is appended to it.
WATCHDOG_FN = "/dev/shm/wd_"
# Watchdog-triggered restarts are active unless NO_WATCHDOG is present in the environment.
ENABLE_WATCHDOG = "NO_WATCHDOG" not in os.environ
|
|
|
|
|
2021-03-05 18:03:23 +08:00
|
|
|
|
2021-12-29 01:07:00 +08:00
|
|
|
def launcher(proc: str, name: str) -> None:
  """Import module `proc` and run its main() as a managed child process.

  Args:
    proc: dotted module path to import; also used as the process title.
    name: daemon name bound to cloudlog and set as the sentry "daemon" tag.

  A KeyboardInterrupt is logged and swallowed. Any other Exception is
  reported to sentry and then re-raised.
  """
  try:
    # Load the module that implements the process.
    module = importlib.import_module(proc)

    # Show the module path as the process title.
    setproctitle(proc)

    # We forked, so the messaging context has to be recreated.
    messaging.context = messaging.Context()

    # Attach the daemon name to logs and crash reports.
    cloudlog.bind(daemon=name)
    sentry.set_tag("daemon", name)

    # Run the process entry point.
    module.main()
  except KeyboardInterrupt:
    cloudlog.warning(f"child {proc} got SIGINT")
  except Exception:
    # sys.excepthook doesn't play nice with threads, so the crash handler
    # can't be installed globally; report the exception from here instead.
    sentry.capture_exception()
    raise
|
|
|
|
|
|
|
|
|
2022-01-14 12:23:08 +08:00
|
|
|
def nativelauncher(pargs: List[str], cwd: str, name: str) -> None:
  """Replace the current process with the native program described by `pargs`.

  Args:
    pargs: argument vector; pargs[0] is the program looked up by execvp.
    cwd: directory to change into before exec'ing.
    name: daemon name exported to the program as MANAGER_DAEMON.

  Raises:
    OSError: if changing directory or exec'ing fails.
  """
  # Exported first so the exec'd program inherits it.
  os.environ.update(MANAGER_DAEMON=name)

  # Switch directory, then exec; on success execvp does not return.
  os.chdir(cwd)
  os.execvp(pargs[0], pargs)
|
|
|
|
|
|
|
|
|
2021-12-29 01:07:00 +08:00
|
|
|
def join_process(process: Process, timeout: float) -> None:
  """Wait until `process` has an exit code or `timeout` seconds have passed.

  Process().join(timeout) will hang due to a python 3 bug
  (https://bugs.python.org/issue28382), so the exit code is polled instead.

  Args:
    process: the process whose `exitcode` is polled.
    timeout: maximum time to wait, in seconds.
  """
  deadline = time.monotonic() + timeout
  while time.monotonic() < deadline and process.exitcode is None:
    time.sleep(0.001)
|
|
|
|
|
|
|
|
|
|
|
|
class ManagerProcess(ABC):
  """Abstract base for a process supervised by the manager.

  Subclasses provide prepare() and start(). This class implements stopping,
  signalling, watchdog checking and state reporting for the child in `proc`.
  """
  daemon = False
  # When True, stop() uses SIGKILL rather than SIGINT as its default signal.
  sigkill = False
  # Run conditions evaluated by ensure_running().
  onroad = True
  offroad = False
  callback: Optional[Callable[[bool, Params, car.CarParams], bool]] = None
  # Handle of the current child, or None when no child is tracked.
  proc: Optional[Process] = None
  enabled = True
  name = ""

  # Last value read from the watchdog file; divided by 1e9 before being
  # compared against time.monotonic().
  last_watchdog_time = 0
  # Maximum allowed watchdog age in seconds; None disables the check.
  watchdog_max_dt: Optional[int] = None
  # Set once a fresh watchdog value was observed for the current child.
  watchdog_seen = False
  # True between sending a stop signal and observing the child's exit.
  shutting_down = False

  @abstractmethod
  def prepare(self) -> None:
    """Do any work needed before the process can be started."""

  @abstractmethod
  def start(self) -> None:
    """Start the process if it is not already running."""

  def restart(self) -> None:
    """Stop the process with SIGKILL and start it again."""
    self.stop(sig=signal.SIGKILL)
    self.start()

  def check_watchdog(self, started: bool) -> None:
    """Restart the process if its watchdog file has gone stale.

    Does nothing when no `watchdog_max_dt` is configured or no child exists.
    A restart only happens after a fresh value was seen at least once for
    this child and ENABLE_WATCHDOG is set.

    Args:
      started: only used in the timeout log message.
    """
    if self.watchdog_max_dt is None or self.proc is None:
      return

    try:
      watchdog_path = f"{WATCHDOG_FN}{self.proc.pid}"
      with open(watchdog_path, "rb") as wd_file:
        # TODO: why can't pylint find struct.unpack?
        (self.last_watchdog_time,) = struct.unpack('Q', wd_file.read())  # pylint: disable=no-member
    except Exception:
      # Best effort: keep the previous timestamp if the file can't be read.
      pass

    elapsed = time.monotonic() - self.last_watchdog_time / 1e9

    if elapsed <= self.watchdog_max_dt:
      self.watchdog_seen = True
      return

    if self.watchdog_seen and ENABLE_WATCHDOG:
      cloudlog.error(f"Watchdog timeout for {self.name} (exitcode {self.proc.exitcode}) restarting ({started=})")
      self.restart()

  def stop(self, retry: bool = True, block: bool = True, sig: Optional[signal.Signals] = None) -> Optional[int]:
    """Stop the child process and return its exit code.

    Args:
      retry: if the child is still alive after the 5 second wait, send
        SIGKILL and join it.
      block: when False, return right after the first stop signal is sent.
        NOTE(review): a later call made while still shutting down waits like
        a blocking call — confirm this nesting is intended.
      sig: signal to send; defaults to SIGKILL if `self.sigkill` is set,
        otherwise SIGINT.

    Returns:
      The exit code, or None when there is no child, the call returned early
      because `block` is False, or the child has not exited.
    """
    if self.proc is None:
      return None

    if self.proc.exitcode is None:
      if not self.shutting_down:
        cloudlog.info(f"killing {self.name}")
        stop_sig = sig
        if stop_sig is None:
          stop_sig = signal.SIGKILL if self.sigkill else signal.SIGINT
        self.signal(stop_sig)
        self.shutting_down = True

        if not block:
          return None

      join_process(self.proc, 5)

      # If process failed to die send SIGKILL
      if self.proc.exitcode is None and retry:
        cloudlog.info(f"killing {self.name} with SIGKILL")
        self.signal(signal.SIGKILL)
        self.proc.join()

    exit_code = self.proc.exitcode
    cloudlog.info(f"{self.name} is dead with {exit_code}")

    # Only forget the child once it has really exited.
    if self.proc.exitcode is not None:
      self.shutting_down = False
      self.proc = None

    return exit_code

  def signal(self, sig: int) -> None:
    """Send `sig` to the child, if there is one that can still be signalled."""
    child = self.proc
    if child is None:
      return

    # Don't signal if already exited
    if child.exitcode is not None and child.pid is not None:
      return

    # Can't signal if we don't have a pid
    pid = child.pid
    if pid is None:
      return

    cloudlog.info(f"sending signal {sig} to {self.name}")
    os.kill(pid, sig)

  def get_process_state_msg(self):
    """Build a ManagerState.ProcessState message describing this process.

    Only `name` is filled in when no child is tracked.
    """
    msg = log.ManagerState.ProcessState.new_message()
    msg.name = self.name
    if self.proc:
      msg.running = self.proc.is_alive()
      msg.shouldBeRunning = not self.shutting_down
      msg.pid = self.proc.pid or 0
      msg.exitCode = self.proc.exitcode or 0
    return msg
|
|
|
|
|
|
|
|
|
|
|
|
class NativeProcess(ManagerProcess):
  """Managed process that exec's a command line from a directory under BASEDIR."""

  def __init__(self, name, cwd, cmdline, enabled=True, onroad=True, offroad=False, callback=None, unkillable=False, sigkill=False, watchdog_max_dt=None):
    # What to run and where (cwd is joined onto BASEDIR in start()).
    self.name = name
    self.cwd = cwd
    self.cmdline = cmdline
    self.launcher = nativelauncher

    # When to run it.
    self.enabled = enabled
    self.onroad = onroad
    self.offroad = offroad
    self.callback = callback

    # How to stop and supervise it.
    self.unkillable = unkillable
    self.sigkill = sigkill
    self.watchdog_max_dt = watchdog_max_dt

  def prepare(self) -> None:
    """Nothing to prepare for a native process."""

  def start(self) -> None:
    """Fork a child that exec's `cmdline`, unless a child is already tracked."""
    # A previous non-blocking stop may still be pending; finish it first.
    if self.shutting_down:
      self.stop()

    if self.proc is not None:
      return

    workdir = os.path.join(BASEDIR, self.cwd)
    cloudlog.info(f"starting process {self.name}")
    self.proc = Process(name=self.name, target=self.launcher, args=(self.cmdline, workdir, self.name))
    self.proc.start()

    # Fresh child: reset watchdog and shutdown tracking.
    self.watchdog_seen = False
    self.shutting_down = False
|
2021-03-05 18:03:23 +08:00
|
|
|
|
|
|
|
|
|
|
|
class PythonProcess(ManagerProcess):
  """Managed process that imports a Python module in a forked child and runs its main()."""

  def __init__(self, name, module, enabled=True, onroad=True, offroad=False, callback=None, unkillable=False, sigkill=False, watchdog_max_dt=None):
    # What to run.
    self.name = name
    self.module = module
    self.launcher = launcher

    # When to run it.
    self.enabled = enabled
    self.onroad = onroad
    self.offroad = offroad
    self.callback = callback

    # How to stop and supervise it.
    self.unkillable = unkillable
    self.sigkill = sigkill
    self.watchdog_max_dt = watchdog_max_dt

  def prepare(self) -> None:
    """Import the module ahead of time; skipped for disabled processes."""
    if not self.enabled:
      return
    cloudlog.info(f"preimporting {self.module}")
    importlib.import_module(self.module)

  def start(self) -> None:
    """Fork a child that runs the module, unless a child is already tracked."""
    # A previous non-blocking stop may still be pending; finish it first.
    if self.shutting_down:
      self.stop()

    if self.proc is not None:
      return

    cloudlog.info(f"starting python {self.module}")
    self.proc = Process(name=self.name, target=self.launcher, args=(self.module, self.name))
    self.proc.start()

    # Fresh child: reset watchdog and shutdown tracking.
    self.watchdog_seen = False
    self.shutting_down = False
|
2021-03-05 18:03:23 +08:00
|
|
|
|
|
|
|
|
|
|
|
class DaemonProcess(ManagerProcess):
  """Python process that has to stay running across manager restart.

  This is used for athena so you don't lose SSH access when restarting manager.
  The pid of the spawned daemon is stored in the param `param_name`, and
  start() consults it to avoid launching a second copy.
  """

  def __init__(self, name, module, param_name, enabled=True):
    self.name = name
    self.module = module
    self.param_name = param_name
    self.enabled = enabled
    self.onroad = True
    self.offroad = True
    # Created lazily on the first start().
    self.params = None

  def prepare(self) -> None:
    """Nothing to prepare for a daemon."""

  def _is_running(self, pid: str) -> bool:
    """Return True if `pid` is a live process whose command line mentions our module."""
    try:
      pid_num = int(pid)
    except ValueError:
      # A corrupt stored pid must not crash the manager; treat it as not running.
      return False

    try:
      # Signal 0 sends nothing; it only checks that the pid can be signalled.
      os.kill(pid_num, 0)
      with open(f'/proc/{pid_num}/cmdline') as f:
        return self.module in f.read()
    except OSError:
      # Process is dead (FileNotFoundError is an OSError too).
      return False

  def start(self) -> None:
    """Spawn the daemon unless the pid stored in params still belongs to it.

    The daemon is launched with `python -m <module>` in its own process group,
    with stdin/stdout/stderr connected to the null device, and its pid is
    written back to the param.
    """
    if self.params is None:
      self.params = Params()

    pid = self.params.get(self.param_name, encoding='utf-8')
    if pid is not None and self._is_running(pid):
      # daemon is running
      return

    cloudlog.info(f"starting daemon {self.name}")
    # subprocess.DEVNULL avoids leaving three open /dev/null handles in the manager.
    proc = subprocess.Popen(['python', '-m', self.module],  # pylint: disable=subprocess-popen-preexec-fn
                            stdin=subprocess.DEVNULL,
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL,
                            preexec_fn=os.setpgrp)

    self.params.put(self.param_name, str(proc.pid))

  def stop(self, retry=True, block=True, sig=None) -> None:
    """Deliberately a no-op: the daemon is never stopped by the manager."""
|
|
|
|
|
|
|
|
|
2022-05-04 14:30:35 +08:00
|
|
|
def ensure_running(procs: ValuesView[ManagerProcess], started: bool, params=None, CP: car.CarParams=None,
                   not_run: Optional[List[str]]=None) -> List[ManagerProcess]:
  """Start every process that should run now and stop the others.

  A process should run when its onroad/offroad flag matches `started`, or when
  its callback returns a truthy value. It is blocked when it is disabled or
  listed in `not_run`. The watchdog of every process is checked afterwards.

  Args:
    procs: the managed processes to evaluate.
    started: True when onroad, False when offroad.
    params: passed to process callbacks; callbacks are skipped when None.
    CP: passed to process callbacks; callbacks are skipped when None.
    not_run: names of processes that must not be started.

  Returns:
    The processes for which start() was called, in iteration order.
  """
  if not_run is None:
    not_run = []

  running = []
  for p in procs:
    # Conditions that make a process run
    run = any((
      p.offroad and not started,
      p.onroad and started,
    ))
    # Identity checks: membership in a tuple would compare with ==, which
    # calls __eq__ on arbitrary params/CP objects.
    if p.callback is not None and params is not None and CP is not None:
      run = run or p.callback(started, params, CP)

    # Conditions that block a process from starting
    run = run and not any((
      not p.enabled,
      p.name in not_run,
    ))

    if run:
      p.start()
      running.append(p)
    else:
      # Non-blocking so one slow process doesn't stall the whole loop.
      p.stop(block=False)

    p.check_watchdog(started)

  return running
|