Add watchdog check to manager (#20277)

* this should work but doesn't

* Only offroad

* works

* make it work offroad

* reduce diff

* cleanup

* need util

Co-authored-by: Adeeb Shihadeh <adeebshihadeh@gmail.com>
old-commit-hash: a94ba4fb8b
This commit is contained in:
Willem Melching 2021-03-09 04:17:46 +01:00 committed by GitHub
parent e80ea20b34
commit 4d6c98aa6b
10 changed files with 87 additions and 15 deletions

View File

@ -196,6 +196,8 @@ selfdrive/common/clutil.cc
selfdrive/common/clutil.h selfdrive/common/clutil.h
selfdrive/common/params.h selfdrive/common/params.h
selfdrive/common/params.cc selfdrive/common/params.cc
selfdrive/common/watchdog.cc
selfdrive/common/watchdog.h
selfdrive/common/modeldata.h selfdrive/common/modeldata.h
selfdrive/common/mat.h selfdrive/common/mat.h

View File

@ -5,7 +5,14 @@ if SHARED:
else: else:
fxn = env.Library fxn = env.Library
common_libs = ['params.cc', 'swaglog.cc', 'util.cc', 'gpio.cc', 'i2c.cc'] common_libs = [
'params.cc',
'swaglog.cc',
'util.cc',
'gpio.cc',
'i2c.cc',
'watchdog.cc',
]
_common = fxn('common', common_libs, LIBS="json11") _common = fxn('common', common_libs, LIBS="json11")

View File

@ -1,11 +1,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h> #include <errno.h>
#include "common/util.h"
#ifdef __linux__ #ifdef __linux__
#include <sys/prctl.h> #include <sys/prctl.h>
#include <sys/syscall.h> #include <sys/syscall.h>
@ -45,8 +41,8 @@ void* read_file(const char* path, size_t* out_len) {
return buf; return buf;
} }
int write_file(const char* path, const void* data, size_t size) { int write_file(const char* path, const void* data, size_t size, int flags, mode_t mode) {
int fd = open(path, O_WRONLY); int fd = open(path, flags, mode);
if (fd == -1) { if (fd == -1) {
return -1; return -1;
} }

View File

@ -1,8 +1,9 @@
#pragma once #pragma once
#include <stdio.h> #include <cstdio>
#include <unistd.h>
#include <csignal> #include <csignal>
#include <cstring>
#include <cstdlib>
#include <string> #include <string>
#include <memory> #include <memory>
#include <atomic> #include <atomic>
@ -10,6 +11,9 @@
#include <fstream> #include <fstream>
#include <thread> #include <thread>
#include <chrono> #include <chrono>
#include <cassert>
#include <unistd.h>
#include <fcntl.h>
#ifndef sighandler_t #ifndef sighandler_t
typedef void (*sighandler_t)(int sig); typedef void (*sighandler_t)(int sig);
@ -25,7 +29,7 @@ typedef void (*sighandler_t)(int sig);
// Returns NULL on failure, otherwise the NULL-terminated file contents. // Returns NULL on failure, otherwise the NULL-terminated file contents.
// The result must be freed by the caller. // The result must be freed by the caller.
void* read_file(const char* path, size_t* out_len); void* read_file(const char* path, size_t* out_len);
int write_file(const char* path, const void* data, size_t size); int write_file(const char* path, const void* data, size_t size, int flags=O_WRONLY, mode_t mode=0777);
void set_thread_name(const char* name); void set_thread_name(const char* name);

View File

@ -0,0 +1,17 @@
#include <string>
#include <cstdint>
#include <unistd.h>
#include "common/timing.h"
#include "common/util.h"
#include "common/watchdog.h"
const std::string watchdog_fn_prefix = "/dev/shm/wd_";  // + <pid>

// Stamps this process's watchdog file (watchdog_fn_prefix + pid) with the
// current nanos_since_boot() value, creating the file if it does not exist.
// Returns true when write_file() reports 0, false otherwise.
bool watchdog_kick() {
  const std::string stamp = std::to_string(nanos_since_boot());
  const std::string path = watchdog_fn_prefix + std::to_string(getpid());
  const int status = write_file(path.c_str(), stamp.data(), stamp.length(), O_WRONLY | O_CREAT);
  return status == 0;
}

View File

@ -0,0 +1,3 @@
#pragma once
// Writes the current nanos_since_boot() value to /dev/shm/wd_<pid> for the
// calling process. Returns true if the underlying write_file() call returned 0.
bool watchdog_kick();

View File

@ -12,10 +12,14 @@ import cereal.messaging as messaging
import selfdrive.crash as crash import selfdrive.crash as crash
from common.basedir import BASEDIR from common.basedir import BASEDIR
from common.params import Params from common.params import Params
from common.realtime import sec_since_boot
from selfdrive.swaglog import cloudlog from selfdrive.swaglog import cloudlog
from selfdrive.hardware import HARDWARE from selfdrive.hardware import HARDWARE
from cereal import log from cereal import log
WATCHDOG_FN = "/dev/shm/wd_"
ENABLE_WATCHDOG = os.getenv("NO_WATCHDOG") is None
def launcher(proc): def launcher(proc):
try: try:
@ -61,6 +65,10 @@ class ManagerProcess(ABC):
enabled = True enabled = True
name = "" name = ""
last_watchdog_time = 0
watchdog_max_dt = None
watchdog_seen = False
@abstractmethod @abstractmethod
def prepare(self): def prepare(self):
pass pass
@ -69,6 +77,30 @@ class ManagerProcess(ABC):
def start(self): def start(self):
pass pass
def restart(self):
  """Stop the process, then start it again."""
  self.stop()
  self.start()
def check_watchdog(self, started):
  """Restart the process if its watchdog file has gone stale.

  The watchdog file is WATCHDOG_FN + <pid> and is expected to hold an integer
  timestamp in nanoseconds on the same clock as sec_since_boot().

  Args:
    started: truthy while onroad; restarts are only performed when falsy.

  Does nothing when no watchdog_max_dt is configured or no process is running.
  """
  if self.watchdog_max_dt is None or self.proc is None:
    return

  try:
    fn = WATCHDOG_FN + str(self.proc.pid)
    # Context manager so the file handle is closed deterministically.
    with open(fn) as f:
      self.last_watchdog_time = int(f.read())
  except Exception:
    # Best effort: the file may not exist yet or may hold a partial write.
    # Keep the previous timestamp so a stale value still counts as a timeout.
    pass

  dt = sec_since_boot() - self.last_watchdog_time / 1e9

  if dt > self.watchdog_max_dt:
    # Only restart while offroad for now
    # watchdog_seen guards against restarting a process that never kicked.
    if self.watchdog_seen and ENABLE_WATCHDOG and (not started):
      cloudlog.error(f"Watchdog timeout for {self.name}, restarting")
      self.restart()
  else:
    self.watchdog_seen = True
def stop(self, retry=True): def stop(self, retry=True):
if self.proc is None: if self.proc is None:
return return
@ -128,7 +160,7 @@ class ManagerProcess(ABC):
class NativeProcess(ManagerProcess): class NativeProcess(ManagerProcess):
def __init__(self, name, cwd, cmdline, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False): def __init__(self, name, cwd, cmdline, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False, watchdog_max_dt=None):
self.name = name self.name = name
self.cwd = cwd self.cwd = cwd
self.cmdline = cmdline self.cmdline = cmdline
@ -137,6 +169,7 @@ class NativeProcess(ManagerProcess):
self.driverview = driverview self.driverview = driverview
self.unkillable = unkillable self.unkillable = unkillable
self.sigkill = sigkill self.sigkill = sigkill
self.watchdog_max_dt = watchdog_max_dt
def prepare(self): def prepare(self):
pass pass
@ -149,10 +182,11 @@ class NativeProcess(ManagerProcess):
cloudlog.info("starting process %s" % self.name) cloudlog.info("starting process %s" % self.name)
self.proc = Process(name=self.name, target=nativelauncher, args=(self.cmdline, cwd)) self.proc = Process(name=self.name, target=nativelauncher, args=(self.cmdline, cwd))
self.proc.start() self.proc.start()
self.watchdog_seen = False
class PythonProcess(ManagerProcess): class PythonProcess(ManagerProcess):
def __init__(self, name, module, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False): def __init__(self, name, module, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False, watchdog_max_dt=None):
self.name = name self.name = name
self.module = module self.module = module
self.enabled = enabled self.enabled = enabled
@ -160,6 +194,7 @@ class PythonProcess(ManagerProcess):
self.driverview = driverview self.driverview = driverview
self.unkillable = unkillable self.unkillable = unkillable
self.sigkill = sigkill self.sigkill = sigkill
self.watchdog_max_dt = watchdog_max_dt
def prepare(self): def prepare(self):
if self.enabled: if self.enabled:
@ -173,6 +208,7 @@ class PythonProcess(ManagerProcess):
cloudlog.info("starting python %s" % self.module) cloudlog.info("starting python %s" % self.module)
self.proc = Process(name=self.name, target=launcher, args=(self.module,)) self.proc = Process(name=self.name, target=launcher, args=(self.module,))
self.proc.start() self.proc.start()
self.watchdog_seen = False
class DaemonProcess(ManagerProcess): class DaemonProcess(ManagerProcess):
@ -234,3 +270,6 @@ def ensure_running(procs, started, driverview=False, not_run=None):
p.start() p.start()
else: else:
p.stop() p.stop()
p.check_watchdog(started)

View File

@ -17,7 +17,7 @@ procs = [
NativeProcess("proclogd", "selfdrive/proclogd", ["./proclogd"]), NativeProcess("proclogd", "selfdrive/proclogd", ["./proclogd"]),
NativeProcess("sensord", "selfdrive/sensord", ["./sensord"], enabled=not PC, persistent=EON, sigkill=EON), NativeProcess("sensord", "selfdrive/sensord", ["./sensord"], enabled=not PC, persistent=EON, sigkill=EON),
NativeProcess("ubloxd", "selfdrive/locationd", ["./ubloxd"], enabled=(not PC or WEBCAM)), NativeProcess("ubloxd", "selfdrive/locationd", ["./ubloxd"], enabled=(not PC or WEBCAM)),
NativeProcess("ui", "selfdrive/ui", ["./ui"], persistent=True), NativeProcess("ui", "selfdrive/ui", ["./ui"], persistent=True, watchdog_max_dt=10),
PythonProcess("calibrationd", "selfdrive.locationd.calibrationd"), PythonProcess("calibrationd", "selfdrive.locationd.calibrationd"),
PythonProcess("controlsd", "selfdrive.controls.controlsd"), PythonProcess("controlsd", "selfdrive.controls.controlsd"),
PythonProcess("deleter", "selfdrive.loggerd.deleter", persistent=True), PythonProcess("deleter", "selfdrive.loggerd.deleter", persistent=True),

View File

@ -9,6 +9,7 @@
#include "common/params.h" #include "common/params.h"
#include "common/touch.h" #include "common/touch.h"
#include "common/swaglog.h" #include "common/swaglog.h"
#include "common/watchdog.h"
#include "ui.hpp" #include "ui.hpp"
#include "paint.hpp" #include "paint.hpp"
@ -139,6 +140,7 @@ int main(int argc, char* argv[]) {
s->sound->setVolume(MIN_VOLUME); s->sound->setVolume(MIN_VOLUME);
while (!do_exit) { while (!do_exit) {
watchdog_kick();
if (!s->scene.started) { if (!s->scene.started) {
util::sleep_for(50); util::sleep_for(50);
} }

View File

@ -16,6 +16,7 @@
#include "common/params.h" #include "common/params.h"
#include "common/timing.h" #include "common/timing.h"
#include "common/swaglog.h" #include "common/swaglog.h"
#include "common/watchdog.h"
#include "home.hpp" #include "home.hpp"
#include "paint.hpp" #include "paint.hpp"
@ -276,6 +277,7 @@ void GLWindow::timerUpdate() {
ui_update(&ui_state); ui_update(&ui_state);
repaint(); repaint();
watchdog_kick();
} }
void GLWindow::resizeGL(int w, int h) { void GLWindow::resizeGL(int w, int h) {