diff --git a/release/files_common b/release/files_common index dbe110183f..91abab0de2 100644 --- a/release/files_common +++ b/release/files_common @@ -196,6 +196,8 @@ selfdrive/common/clutil.cc selfdrive/common/clutil.h selfdrive/common/params.h selfdrive/common/params.cc +selfdrive/common/watchdog.cc +selfdrive/common/watchdog.h selfdrive/common/modeldata.h selfdrive/common/mat.h diff --git a/selfdrive/common/SConscript b/selfdrive/common/SConscript index ec3603d3af..fb36e2b9f0 100644 --- a/selfdrive/common/SConscript +++ b/selfdrive/common/SConscript @@ -5,7 +5,14 @@ if SHARED: else: fxn = env.Library -common_libs = ['params.cc', 'swaglog.cc', 'util.cc', 'gpio.cc', 'i2c.cc'] +common_libs = [ + 'params.cc', + 'swaglog.cc', + 'util.cc', + 'gpio.cc', + 'i2c.cc', + 'watchdog.cc', +] _common = fxn('common', common_libs, LIBS="json11") diff --git a/selfdrive/common/util.cc b/selfdrive/common/util.cc index 49f53db33d..a9db1a156b 100644 --- a/selfdrive/common/util.cc +++ b/selfdrive/common/util.cc @@ -1,11 +1,7 @@ -#include -#include -#include -#include -#include -#include #include +#include "common/util.h" + #ifdef __linux__ #include #include @@ -45,8 +41,8 @@ void* read_file(const char* path, size_t* out_len) { return buf; } -int write_file(const char* path, const void* data, size_t size) { - int fd = open(path, O_WRONLY); +int write_file(const char* path, const void* data, size_t size, int flags, mode_t mode) { + int fd = open(path, flags, mode); if (fd == -1) { return -1; } diff --git a/selfdrive/common/util.h b/selfdrive/common/util.h index b3dfae7e73..3da88bb767 100644 --- a/selfdrive/common/util.h +++ b/selfdrive/common/util.h @@ -1,8 +1,9 @@ #pragma once -#include -#include +#include #include +#include +#include #include #include #include @@ -10,6 +11,9 @@ #include #include #include +#include +#include +#include #ifndef sighandler_t typedef void (*sighandler_t)(int sig); @@ -25,7 +29,7 @@ typedef void (*sighandler_t)(int sig); // Returns NULL on failure, otherwise the NULL-terminated file contents. // The result must be freed by the caller. void* read_file(const char* path, size_t* out_len); -int write_file(const char* path, const void* data, size_t size); +int write_file(const char* path, const void* data, size_t size, int flags=O_WRONLY, mode_t mode=0777); void set_thread_name(const char* name); diff --git a/selfdrive/common/watchdog.cc b/selfdrive/common/watchdog.cc new file mode 100644 index 0000000000..2e8afb3910 --- /dev/null +++ b/selfdrive/common/watchdog.cc @@ -0,0 +1,17 @@ +#include +#include +#include + +#include "common/timing.h" +#include "common/util.h" +#include "common/watchdog.h" + +const std::string watchdog_fn_prefix = "/dev/shm/wd_"; // + + +bool watchdog_kick(){ + std::string fn = watchdog_fn_prefix + std::to_string(getpid()); + std::string cur_t = std::to_string(nanos_since_boot()); + + int r = write_file(fn.c_str(), cur_t.data(), cur_t.length(), O_WRONLY | O_CREAT); + return r == 0; +} diff --git a/selfdrive/common/watchdog.h b/selfdrive/common/watchdog.h new file mode 100644 index 0000000000..7ed23aa0d9 --- /dev/null +++ b/selfdrive/common/watchdog.h @@ -0,0 +1,3 @@ +#pragma once + +bool watchdog_kick(); diff --git a/selfdrive/manager/process.py b/selfdrive/manager/process.py index 8edd102d57..0610f91141 100644 --- a/selfdrive/manager/process.py +++ b/selfdrive/manager/process.py @@ -12,10 +12,14 @@ import cereal.messaging as messaging import selfdrive.crash as crash from common.basedir import BASEDIR from common.params import Params +from common.realtime import sec_since_boot from selfdrive.swaglog import cloudlog from selfdrive.hardware import HARDWARE from cereal import log +WATCHDOG_FN = "/dev/shm/wd_" +ENABLE_WATCHDOG = os.getenv("NO_WATCHDOG") is None + def launcher(proc): try: @@ -61,6 +65,10 @@ class ManagerProcess(ABC): enabled = True name = "" + last_watchdog_time = 0 + watchdog_max_dt = None + watchdog_seen = False + @abstractmethod def prepare(self): pass @@ -69,6 +77,30 @@ class ManagerProcess(ABC): def start(self): pass + def restart(self): + self.stop() + self.start() + + def check_watchdog(self, started): + if self.watchdog_max_dt is None or self.proc is None: + return + + try: + fn = WATCHDOG_FN + str(self.proc.pid) + self.last_watchdog_time = int(open(fn).read()) + except Exception: + pass + + dt = sec_since_boot() - self.last_watchdog_time / 1e9 + + if dt > self.watchdog_max_dt: + # Only restart while offroad for now + if self.watchdog_seen and ENABLE_WATCHDOG and (not started): + cloudlog.error(f"Watchdog timeout for {self.name}, restarting") + self.restart() + else: + self.watchdog_seen = True + def stop(self, retry=True): if self.proc is None: return @@ -128,7 +160,7 @@ class ManagerProcess(ABC): class NativeProcess(ManagerProcess): - def __init__(self, name, cwd, cmdline, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False): + def __init__(self, name, cwd, cmdline, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False, watchdog_max_dt=None): self.name = name self.cwd = cwd self.cmdline = cmdline @@ -137,6 +169,7 @@ class NativeProcess(ManagerProcess): self.driverview = driverview self.unkillable = unkillable self.sigkill = sigkill + self.watchdog_max_dt = watchdog_max_dt def prepare(self): pass @@ -149,10 +182,11 @@ class NativeProcess(ManagerProcess): cloudlog.info("starting process %s" % self.name) self.proc = Process(name=self.name, target=nativelauncher, args=(self.cmdline, cwd)) self.proc.start() + self.watchdog_seen = False class PythonProcess(ManagerProcess): - def __init__(self, name, module, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False): + def __init__(self, name, module, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False, watchdog_max_dt=None): self.name = name self.module = module self.enabled = enabled @@ -160,6 +194,7 @@ class PythonProcess(ManagerProcess): self.driverview = driverview self.unkillable = unkillable self.sigkill = sigkill + self.watchdog_max_dt = watchdog_max_dt def prepare(self): if self.enabled: @@ -173,6 +208,7 @@ class PythonProcess(ManagerProcess): cloudlog.info("starting python %s" % self.module) self.proc = Process(name=self.name, target=launcher, args=(self.module,)) self.proc.start() + self.watchdog_seen = False class DaemonProcess(ManagerProcess): @@ -234,3 +270,6 @@ def ensure_running(procs, started, driverview=False, not_run=None): p.start() else: p.stop() + + p.check_watchdog(started) + diff --git a/selfdrive/manager/process_config.py b/selfdrive/manager/process_config.py index 692e722f25..ab036f9f44 100644 --- a/selfdrive/manager/process_config.py +++ b/selfdrive/manager/process_config.py @@ -17,7 +17,7 @@ procs = [ NativeProcess("proclogd", "selfdrive/proclogd", ["./proclogd"]), NativeProcess("sensord", "selfdrive/sensord", ["./sensord"], enabled=not PC, persistent=EON, sigkill=EON), NativeProcess("ubloxd", "selfdrive/locationd", ["./ubloxd"], enabled=(not PC or WEBCAM)), - NativeProcess("ui", "selfdrive/ui", ["./ui"], persistent=True), + NativeProcess("ui", "selfdrive/ui", ["./ui"], persistent=True, watchdog_max_dt=10), PythonProcess("calibrationd", "selfdrive.locationd.calibrationd"), PythonProcess("controlsd", "selfdrive.controls.controlsd"), PythonProcess("deleter", "selfdrive.loggerd.deleter", persistent=True), diff --git a/selfdrive/ui/android/ui.cc b/selfdrive/ui/android/ui.cc index 1a917efba0..914d7d5b48 100644 --- a/selfdrive/ui/android/ui.cc +++ b/selfdrive/ui/android/ui.cc @@ -9,6 +9,7 @@ #include "common/params.h" #include "common/touch.h" #include "common/swaglog.h" +#include "common/watchdog.h" #include "ui.hpp" #include "paint.hpp" @@ -139,6 +140,7 @@ int main(int argc, char* argv[]) { s->sound->setVolume(MIN_VOLUME); while (!do_exit) { + watchdog_kick(); if (!s->scene.started) { util::sleep_for(50); } diff --git a/selfdrive/ui/qt/home.cc b/selfdrive/ui/qt/home.cc index df3e8e4e9d..4ed0748ce1 100644 --- a/selfdrive/ui/qt/home.cc +++ b/selfdrive/ui/qt/home.cc @@ -16,6 +16,7 @@ #include "common/params.h" #include "common/timing.h" #include "common/swaglog.h" +#include "common/watchdog.h" #include "home.hpp" #include "paint.hpp" @@ -276,6 +277,7 @@ void GLWindow::timerUpdate() { ui_update(&ui_state); repaint(); + watchdog_kick(); } void GLWindow::resizeGL(int w, int h) {