Add watchdog check to manager (#20277)

* this should work but doesn't

* Only offroad

* works

* make it work offroad

* reduce diff

* cleanup

* need util

Co-authored-by: Adeeb Shihadeh <adeebshihadeh@gmail.com>
old-commit-hash: a94ba4fb8b
This commit is contained in:
Willem Melching 2021-03-09 04:17:46 +01:00 committed by GitHub
parent e80ea20b34
commit 4d6c98aa6b
10 changed files with 87 additions and 15 deletions

View File

@ -196,6 +196,8 @@ selfdrive/common/clutil.cc
selfdrive/common/clutil.h
selfdrive/common/params.h
selfdrive/common/params.cc
selfdrive/common/watchdog.cc
selfdrive/common/watchdog.h
selfdrive/common/modeldata.h
selfdrive/common/mat.h

View File

@ -5,7 +5,14 @@ if SHARED:
else:
fxn = env.Library
common_libs = ['params.cc', 'swaglog.cc', 'util.cc', 'gpio.cc', 'i2c.cc']
common_libs = [
'params.cc',
'swaglog.cc',
'util.cc',
'gpio.cc',
'i2c.cc',
'watchdog.cc',
]
_common = fxn('common', common_libs, LIBS="json11")

View File

@ -1,11 +1,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include "common/util.h"
#ifdef __linux__
#include <sys/prctl.h>
#include <sys/syscall.h>
@ -45,8 +41,8 @@ void* read_file(const char* path, size_t* out_len) {
return buf;
}
int write_file(const char* path, const void* data, size_t size) {
int fd = open(path, O_WRONLY);
int write_file(const char* path, const void* data, size_t size, int flags, mode_t mode) {
int fd = open(path, flags, mode);
if (fd == -1) {
return -1;
}

View File

@ -1,8 +1,9 @@
#pragma once
#include <stdio.h>
#include <unistd.h>
#include <cstdio>
#include <csignal>
#include <cstring>
#include <cstdlib>
#include <string>
#include <memory>
#include <atomic>
@ -10,6 +11,9 @@
#include <fstream>
#include <thread>
#include <chrono>
#include <cassert>
#include <unistd.h>
#include <fcntl.h>
#ifndef sighandler_t
typedef void (*sighandler_t)(int sig);
@ -25,7 +29,7 @@ typedef void (*sighandler_t)(int sig);
// Returns NULL on failure, otherwise the NULL-terminated file contents.
// The result must be freed by the caller.
void* read_file(const char* path, size_t* out_len);
int write_file(const char* path, const void* data, size_t size);
int write_file(const char* path, const void* data, size_t size, int flags=O_WRONLY, mode_t mode=0777);
void set_thread_name(const char* name);

View File

@ -0,0 +1,17 @@
#include <string>
#include <cstdint>
#include <unistd.h>
#include "common/timing.h"
#include "common/util.h"
#include "common/watchdog.h"
const std::string watchdog_fn_prefix = "/dev/shm/wd_"; // + <pid>
// Records a heartbeat for the calling process: the current nanos_since_boot()
// value is written as decimal text to "/dev/shm/wd_<pid>", creating the file
// if it does not exist yet. Returns true when write_file() reports 0.
bool watchdog_kick() {
  const std::string heartbeat_path = watchdog_fn_prefix + std::to_string(getpid());
  const std::string timestamp = std::to_string(nanos_since_boot());
  return write_file(heartbeat_path.c_str(), timestamp.data(), timestamp.length(), O_WRONLY | O_CREAT) == 0;
}

View File

@ -0,0 +1,3 @@
#pragma once
// Writes the current time since boot (in nanoseconds) to the per-process
// watchdog file /dev/shm/wd_<pid>. Returns true if the write succeeded.
bool watchdog_kick();

View File

@ -12,10 +12,14 @@ import cereal.messaging as messaging
import selfdrive.crash as crash
from common.basedir import BASEDIR
from common.params import Params
from common.realtime import sec_since_boot
from selfdrive.swaglog import cloudlog
from selfdrive.hardware import HARDWARE
from cereal import log
# Path prefix of the per-process watchdog files; the process pid is appended.
WATCHDOG_FN = "/dev/shm/wd_"
# Watchdog restarts are enabled unless the NO_WATCHDOG environment variable is set.
ENABLE_WATCHDOG = os.getenv("NO_WATCHDOG") is None
def launcher(proc):
try:
@ -61,6 +65,10 @@ class ManagerProcess(ABC):
enabled = True
name = ""
# Most recent timestamp read from the watchdog file (nanoseconds since boot).
last_watchdog_time = 0
# Maximum allowed age of the last watchdog kick, in seconds; None disables the check.
watchdog_max_dt = None
# Set once a sufficiently recent kick has been observed; reset when the process is started.
watchdog_seen = False
@abstractmethod
def prepare(self):
pass
@ -69,6 +77,30 @@ class ManagerProcess(ABC):
def start(self):
pass
def restart(self):
  """Stop the process, then start it again."""
  self.stop()
  self.start()
def check_watchdog(self, started):
  """Restart the process if its watchdog file has gone stale.

  Reads the timestamp (nanoseconds since boot) from WATCHDOG_FN + <pid> and
  compares its age against self.watchdog_max_dt (seconds). Does nothing when
  no watchdog limit is configured or no process object exists.

  Args:
    started: truthy while onroad; restarts are only performed when it is falsy.
  """
  if self.watchdog_max_dt is None or self.proc is None:
    return

  try:
    fn = WATCHDOG_FN + str(self.proc.pid)
    # Use a context manager so the file handle is closed deterministically.
    with open(fn) as f:
      self.last_watchdog_time = int(f.read())
  except Exception:
    # Best effort: if the file cannot be opened or its contents do not parse,
    # keep the previously recorded timestamp.
    pass

  # Age of the last kick in seconds (the stored timestamp is in nanoseconds).
  dt = sec_since_boot() - self.last_watchdog_time / 1e9

  if dt > self.watchdog_max_dt:
    # Only restart while offroad for now, and only after at least one fresh
    # kick has been observed.
    if self.watchdog_seen and ENABLE_WATCHDOG and (not started):
      cloudlog.error(f"Watchdog timeout for {self.name}, restarting")
      self.restart()
  else:
    self.watchdog_seen = True
def stop(self, retry=True):
if self.proc is None:
return
@ -128,7 +160,7 @@ class ManagerProcess(ABC):
class NativeProcess(ManagerProcess):
def __init__(self, name, cwd, cmdline, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False):
def __init__(self, name, cwd, cmdline, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False, watchdog_max_dt=None):
self.name = name
self.cwd = cwd
self.cmdline = cmdline
@ -137,6 +169,7 @@ class NativeProcess(ManagerProcess):
self.driverview = driverview
self.unkillable = unkillable
self.sigkill = sigkill
self.watchdog_max_dt = watchdog_max_dt
def prepare(self):
pass
@ -149,10 +182,11 @@ class NativeProcess(ManagerProcess):
cloudlog.info("starting process %s" % self.name)
self.proc = Process(name=self.name, target=nativelauncher, args=(self.cmdline, cwd))
self.proc.start()
self.watchdog_seen = False
class PythonProcess(ManagerProcess):
def __init__(self, name, module, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False):
def __init__(self, name, module, enabled=True, persistent=False, driverview=False, unkillable=False, sigkill=False, watchdog_max_dt=None):
self.name = name
self.module = module
self.enabled = enabled
@ -160,6 +194,7 @@ class PythonProcess(ManagerProcess):
self.driverview = driverview
self.unkillable = unkillable
self.sigkill = sigkill
self.watchdog_max_dt = watchdog_max_dt
def prepare(self):
if self.enabled:
@ -173,6 +208,7 @@ class PythonProcess(ManagerProcess):
cloudlog.info("starting python %s" % self.module)
self.proc = Process(name=self.name, target=launcher, args=(self.module,))
self.proc.start()
self.watchdog_seen = False
class DaemonProcess(ManagerProcess):
@ -234,3 +270,6 @@ def ensure_running(procs, started, driverview=False, not_run=None):
p.start()
else:
p.stop()
p.check_watchdog(started)

View File

@ -17,7 +17,7 @@ procs = [
NativeProcess("proclogd", "selfdrive/proclogd", ["./proclogd"]),
NativeProcess("sensord", "selfdrive/sensord", ["./sensord"], enabled=not PC, persistent=EON, sigkill=EON),
NativeProcess("ubloxd", "selfdrive/locationd", ["./ubloxd"], enabled=(not PC or WEBCAM)),
NativeProcess("ui", "selfdrive/ui", ["./ui"], persistent=True),
NativeProcess("ui", "selfdrive/ui", ["./ui"], persistent=True, watchdog_max_dt=10),
PythonProcess("calibrationd", "selfdrive.locationd.calibrationd"),
PythonProcess("controlsd", "selfdrive.controls.controlsd"),
PythonProcess("deleter", "selfdrive.loggerd.deleter", persistent=True),

View File

@ -9,6 +9,7 @@
#include "common/params.h"
#include "common/touch.h"
#include "common/swaglog.h"
#include "common/watchdog.h"
#include "ui.hpp"
#include "paint.hpp"
@ -139,6 +140,7 @@ int main(int argc, char* argv[]) {
s->sound->setVolume(MIN_VOLUME);
while (!do_exit) {
watchdog_kick();
if (!s->scene.started) {
util::sleep_for(50);
}

View File

@ -16,6 +16,7 @@
#include "common/params.h"
#include "common/timing.h"
#include "common/swaglog.h"
#include "common/watchdog.h"
#include "home.hpp"
#include "paint.hpp"
@ -276,6 +277,7 @@ void GLWindow::timerUpdate() {
ui_update(&ui_state);
repaint();
watchdog_kick();
}
void GLWindow::resizeGL(int w, int h) {