Better memory usage debugging (#37120)
This commit is contained in:
@@ -1478,6 +1478,11 @@ struct ProcLog {
|
||||
|
||||
cmdline @15 :List(Text);
|
||||
exe @16 :Text;
|
||||
|
||||
# from /proc/<pid>/smaps_rollup (proportional/private memory)
|
||||
memPss @17 :UInt64; # Pss — shared pages split by mapper count
|
||||
memPssAnon @18 :UInt64; # Pss_Anon — private anonymous (heap, stack)
|
||||
memPssShmem @19 :UInt64; # Pss_Shmem — proportional MSGQ/tmpfs share
|
||||
}
|
||||
|
||||
struct CPUTimes {
|
||||
|
||||
238
selfdrive/debug/mem_usage.py
Executable file
238
selfdrive/debug/mem_usage.py
Executable file
@@ -0,0 +1,238 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
from tabulate import tabulate
|
||||
|
||||
from openpilot.tools.lib.logreader import LogReader
|
||||
|
||||
DEMO_ROUTE = "a2a0ccea32023010|2023-07-27--13-01-19"
|
||||
MB = 1024 * 1024
|
||||
TABULATE_OPTS = dict(tablefmt="simple_grid", stralign="center", numalign="center")
|
||||
|
||||
|
||||
def _get_procs():
  """Lazily import the expected-process table from the onroad test suite."""
  # imported on demand so this debug tool doesn't pull test deps at module load
  from openpilot.selfdrive.test.test_onroad import PROCS
  return PROCS
|
||||
|
||||
|
||||
def is_openpilot_proc(name):
  """Return True when *name* looks like an openpilot process."""
  for known in _get_procs():
    if known in name:
      return True
  # catch openpilot processes not in PROCS (athenad, manager, etc.)
  if 'openpilot' in name:
    return True
  return name.startswith(('selfdrive.', 'system.'))
|
||||
|
||||
|
||||
def get_proc_name(proc):
  """Prefer the first cmdline argument; fall back to the bare process name."""
  args = list(proc.cmdline)
  return args[0] if args else proc.name
|
||||
|
||||
|
||||
def pct(val_mb, total_mb):
  """Percentage of *total_mb* represented by *val_mb*; 0 when the total is falsy."""
  if not total_mb:
    return 0
  return val_mb / total_mb * 100
|
||||
|
||||
|
||||
def has_pss(proc_logs):
  """Check if logs contain PSS data (new field, not in old logs).

  Only the last sample is inspected. Returns False for an empty list
  (the original raised IndexError there) and for logs recorded before
  the memPss field existed (AttributeError from the capnp reader).
  """
  try:
    return any(proc.memPss > 0 for proc in proc_logs[-1].procLog.procs)
  except (AttributeError, IndexError):
    return False
|
||||
|
||||
|
||||
def print_summary(proc_logs, device_states):
  """Print a system-wide memory summary from the last procLog sample.

  Returns the total system memory in MB so callers can compute percentages.
  """
  mem = proc_logs[-1].procLog.mem
  total = mem.total / MB
  used = (mem.total - mem.available) / MB
  cached = mem.cached / MB
  shared = mem.shared / MB
  buffers = mem.buffers / MB

  lines = [
    f" Total: {total:.0f} MB",
    f" Used (total-avail): {used:.0f} MB ({pct(used, total):.0f}%)",
    f" Cached: {cached:.0f} MB ({pct(cached, total):.0f}%) Buffers: {buffers:.0f} MB ({pct(buffers, total):.0f}%)",
    f" Shared/MSGQ: {shared:.0f} MB ({pct(shared, total):.0f}%)",
  ]

  if device_states:
    mem_pcts = [m.deviceState.memoryUsagePercent for m in device_states]
    lines.append(f" deviceState memory: {np.min(mem_pcts)}-{np.max(mem_pcts)}% (avg {np.mean(mem_pcts):.0f}%)")

  print("\n-- Memory Summary --")
  print("\n".join(lines))
  return total
|
||||
|
||||
|
||||
def collect_per_process_mem(proc_logs, use_pss):
  """Collect per-process memory samples. Returns {name: {metric: [values_per_sample_in_MB]}}."""
  by_proc = defaultdict(lambda: defaultdict(list))

  for msg in proc_logs:
    # aggregate within one sample first: multiple procs can share a name
    sample = defaultdict(lambda: defaultdict(float))
    for proc in msg.procLog.procs:
      entry = sample[get_proc_name(proc)]
      entry['rss'] += proc.memRss / MB
      if use_pss:
        entry['pss'] += proc.memPss / MB
        entry['pss_anon'] += proc.memPssAnon / MB
        entry['pss_shmem'] += proc.memPssShmem / MB

    # fold the per-sample aggregate into the per-process time series
    for name, metrics in sample.items():
      dest = by_proc[name]
      for metric, val in metrics.items():
        dest[metric].append(val)

  return by_proc
|
||||
|
||||
|
||||
def _has_pss_detail(by_proc) -> bool:
|
||||
"""Check if any process has non-zero pss_anon/pss_shmem (unavailable on some kernels)."""
|
||||
return any(sum(v.get('pss_anon', [])) > 0 or sum(v.get('pss_shmem', [])) > 0 for v in by_proc.values())
|
||||
|
||||
|
||||
def process_table_rows(by_proc, total_mb, use_pss, show_detail):
  """Build table rows. Returns (rows, total_row)."""
  mem_key = 'pss' if use_pss else 'rss'

  rows = []
  # largest average consumers first
  for name in sorted(by_proc, key=lambda n: np.mean(by_proc[n][mem_key]), reverse=True):
    metrics = by_proc[name]
    vals = metrics[mem_key]
    avg = round(np.mean(vals))
    row = [name, f"{avg} MB", f"{round(np.max(vals))} MB", f"{round(pct(avg, total_mb), 1)}%"]
    if show_detail:
      row += [f"{round(np.mean(metrics['pss_anon']))} MB",
              f"{round(np.mean(metrics['pss_shmem']))} MB"]
    rows.append(row)

  total_row = None
  if by_proc:
    # per-sample totals; shorter series simply stop contributing past their end
    max_samples = max(len(v[mem_key]) for v in by_proc.values())
    totals = [sum(v[mem_key][i] for v in by_proc.values() if i < len(v[mem_key]))
              for i in range(max_samples)]
    avg_total = round(np.mean(totals))
    total_row = ["TOTAL", f"{avg_total} MB", f"{round(np.max(totals))} MB", f"{round(pct(avg_total, total_mb), 1)}%"]
    if show_detail:
      total_row += [f"{round(sum(np.mean(v['pss_anon']) for v in by_proc.values()))} MB",
                    f"{round(sum(np.mean(v['pss_shmem']) for v in by_proc.values()))} MB"]

  return rows, total_row
|
||||
|
||||
|
||||
def print_process_tables(op_procs, other_procs, total_mb, use_pss):
  """Print the openpilot and non-openpilot per-process memory tables."""
  all_procs = {**op_procs, **other_procs}
  show_detail = use_pss and _has_pss_detail(all_procs)

  header = ["process", "avg", "max", "%"]
  if show_detail:
    header += ["anon", "shmem"]

  op_rows, op_total = process_table_rows(op_procs, total_mb, use_pss, show_detail)

  # filter other: >5MB avg and not bare interpreter paths (test infra noise)
  mem_key = 'pss' if use_pss else 'rss'
  other_filtered = {}
  for name, vals in other_procs.items():
    if np.mean(vals[mem_key]) <= 5.0:
      continue
    if os.path.basename(name.split()[0]) in ('python', 'python3'):
      continue
    other_filtered[name] = vals
  other_rows, other_total = process_table_rows(other_filtered, total_mb, use_pss, show_detail)

  rows = op_rows
  if op_total:
    rows.append(op_total)
  if other_rows:
    # blank separator row between the two groups
    rows.append([""] * len(header))
    rows.extend(other_rows)
  if other_total:
    other_total[0] = "TOTAL (other)"
    rows.append(other_total)

  metric = "PSS (no shared double-count)" if use_pss else "RSS (includes shared, overcounts)"
  print(f"\n-- Per-Process Memory: {metric} --")
  print(tabulate(rows, header, **TABULATE_OPTS))
|
||||
|
||||
|
||||
def print_memory_accounting(proc_logs, op_procs, other_procs, total_mb, use_pss):
  """Break the last sample's used memory into pagecache, MSGQ, processes, and kernel remainder."""
  last = proc_logs[-1].procLog.mem
  used = (last.total - last.available) / MB
  shared = last.shared / MB
  cached_buf = (last.buffers + last.cached) / MB - shared  # shared (MSGQ) is in Cached; separate it
  msgq = shared

  mem_key = 'pss' if use_pss else 'rss'
  op_total = sum(v[mem_key][-1] for v in op_procs.values()) if op_procs else 0
  other_total = sum(v[mem_key][-1] for v in other_procs.values()) if other_procs else 0
  proc_sum = op_total + other_total
  remainder = used - (cached_buf + msgq) - proc_sum
  if not use_pss:
    # RSS double-counts shared; add back once to partially correct
    remainder += shared

  def _row(label_txt, mb_val, desc):
    # one formatted table row: label, MB, % of total, description
    return [label_txt, f"{mb_val:.0f}", f"{pct(mb_val, total_mb):.1f}", desc]

  header = ["", "MB", "%", ""]
  label = "PSS" if use_pss else "RSS*"
  rows = [
    _row("Used (total - avail)", used, "memory in use by the system"),
    _row(" Cached + Buffers", cached_buf, "pagecache + fs metadata, reclaimable"),
    _row(" MSGQ (shared)", msgq, "/dev/shm tmpfs, also in process PSS"),
    _row(f" openpilot {label}", op_total, "sum of openpilot process memory"),
    _row(f" other {label}", other_total, "sum of non-openpilot process memory"),
    _row(" kernel/ION/GPU", remainder, "slab, ION/DMA-BUF, GPU, page tables"),
  ]
  note = "" if use_pss else " (*RSS overcounts shared mem)"
  print(f"\n-- Memory Accounting (last sample){note} --")
  print(tabulate(rows, header, tablefmt="simple_grid", stralign="right"))
|
||||
|
||||
|
||||
def print_report(proc_logs, device_states=None):
  """Print full memory analysis report. Can be called from tests or CLI."""
  if not proc_logs:
    print("No procLog messages found")
    return

  device_states = device_states or []
  print(f"{len(proc_logs)} procLog samples, {len(device_states)} deviceState samples")

  use_pss = has_pss(proc_logs)
  if not use_pss:
    print(" (no PSS data — re-record with updated proclogd for accurate numbers)")

  total_mb = print_summary(proc_logs, device_states)

  # split processes into openpilot vs everything else
  by_proc = collect_per_process_mem(proc_logs, use_pss)
  op_procs, other_procs = {}, {}
  for name, vals in by_proc.items():
    (op_procs if is_openpilot_proc(name) else other_procs)[name] = vals

  print_process_tables(op_procs, other_procs, total_mb, use_pss)
  print_memory_accounting(proc_logs, op_procs, other_procs, total_mb, use_pss)
|
||||
|
||||
|
||||
if __name__ == "__main__":
  parser = argparse.ArgumentParser(description="Analyze memory usage from route logs")
  parser.add_argument("route", nargs="?", default=None, help="route ID or local rlog path")
  parser.add_argument("--demo", action="store_true", help=f"use demo route ({DEMO_ROUTE})")
  args = parser.parse_args()

  if args.demo:
    route = DEMO_ROUTE
  elif args.route:
    route = args.route
  else:
    parser.error("provide a route or use --demo")

  print(f"Reading logs from: {route}")

  # single pass over the log, bucketing the two message types we need
  proc_logs, device_states = [], []
  for msg in LogReader(route):
    which = msg.which()
    if which == 'procLog':
      proc_logs.append(msg)
    elif which == 'deviceState':
      device_states.append(msg)

  print_report(proc_logs, device_states)
|
||||
@@ -56,7 +56,7 @@ PROCS = {
|
||||
"selfdrive.ui.soundd": 3.0,
|
||||
"selfdrive.ui.feedback.feedbackd": 1.0,
|
||||
"selfdrive.monitoring.dmonitoringd": 4.0,
|
||||
"system.proclogd": 3.0,
|
||||
"system.proclogd": 7.0,
|
||||
"system.logmessaged": 1.0,
|
||||
"system.tombstoned": 0,
|
||||
"system.journald": 1.0,
|
||||
@@ -282,9 +282,12 @@ class TestOnroad:
|
||||
print("\n------------------------------------------------")
|
||||
print("--------------- Memory Usage -------------------")
|
||||
print("------------------------------------------------")
|
||||
|
||||
from openpilot.selfdrive.debug.mem_usage import print_report
|
||||
print_report(self.msgs['procLog'], self.msgs['deviceState'])
|
||||
|
||||
offset = int(SERVICE_LIST['deviceState'].frequency * LOG_OFFSET)
|
||||
mems = [m.deviceState.memoryUsagePercent for m in self.msgs['deviceState'][offset:]]
|
||||
print("Overall memory usage: ", mems)
|
||||
print("MSGQ (/dev/shm/) usage: ", subprocess.check_output(["du", "-hs", "/dev/shm"]).split()[0].decode())
|
||||
|
||||
# check for big leaks. note that memory usage is
|
||||
|
||||
@@ -115,6 +115,55 @@ def _parse_proc_stat(stat: str) -> ProcStat | None:
|
||||
cloudlog.exception("failed to parse /proc/<pid>/stat")
|
||||
return None
|
||||
|
||||
class SmapsData(TypedDict):
|
||||
pss: int # bytes
|
||||
pss_anon: int # bytes
|
||||
pss_shmem: int # bytes
|
||||
|
||||
|
||||
_SMAPS_KEYS = {b'Pss:', b'Pss_Anon:', b'Pss_Shmem:'}
|
||||
|
||||
# smaps_rollup (kernel 4.14+) is ideal but missing on some BSP kernels;
|
||||
# fall back to per-VMA smaps (any kernel). Pss_Anon/Pss_Shmem only in 5.x+.
|
||||
_smaps_path: str | None = None # auto-detected on first call
|
||||
|
||||
# per-VMA smaps is expensive (kernel walks page tables for every VMA).
|
||||
# cache results and only refresh every N cycles to keep CPU low.
|
||||
_smaps_cache: dict[int, SmapsData] = {}
|
||||
_smaps_cycle = 0
|
||||
_SMAPS_EVERY = 20 # refresh every 20th cycle (40s at 0.5Hz)
|
||||
|
||||
|
||||
def _read_smaps(pid: int) -> SmapsData:
|
||||
global _smaps_path
|
||||
try:
|
||||
if _smaps_path is None:
|
||||
_smaps_path = 'smaps_rollup' if os.path.exists(f'/proc/{pid}/smaps_rollup') else 'smaps'
|
||||
|
||||
result: SmapsData = {'pss': 0, 'pss_anon': 0, 'pss_shmem': 0}
|
||||
with open(f'/proc/{pid}/{_smaps_path}', 'rb') as f:
|
||||
for line in f:
|
||||
parts = line.split()
|
||||
if len(parts) >= 2 and parts[0] in _SMAPS_KEYS:
|
||||
val = int(parts[1]) * 1024 # kB -> bytes
|
||||
if parts[0] == b'Pss:':
|
||||
result['pss'] += val
|
||||
elif parts[0] == b'Pss_Anon:':
|
||||
result['pss_anon'] += val
|
||||
elif parts[0] == b'Pss_Shmem:':
|
||||
result['pss_shmem'] += val
|
||||
return result
|
||||
except (FileNotFoundError, PermissionError, ProcessLookupError, OSError):
|
||||
return {'pss': 0, 'pss_anon': 0, 'pss_shmem': 0}
|
||||
|
||||
|
||||
def _get_smaps_cached(pid: int) -> SmapsData:
  """Return cached smaps data, refreshing every _SMAPS_EVERY cycles.

  NOTE(review): entries for exited pids are never evicted, so the cache
  grows with pid churn over a long run — confirm this stays bounded.
  """
  if _smaps_cycle == 0 or pid not in _smaps_cache:
    _smaps_cache[pid] = _read_smaps(pid)
  # the branch above guarantees the key exists, so the old `.get(pid, zeros)`
  # fallback was unreachable; index directly
  return _smaps_cache[pid]
|
||||
|
||||
|
||||
class ProcExtra(TypedDict):
|
||||
pid: int
|
||||
name: str
|
||||
@@ -189,6 +238,13 @@ def build_proc_log_message(msg) -> None:
|
||||
for j, arg in enumerate(extra['cmdline']):
|
||||
cmdline[j] = arg
|
||||
|
||||
# smaps is expensive (kernel walks page tables); skip small processes, use cache
|
||||
if r['rss'] * PAGE_SIZE > 5 * 1024 * 1024:
|
||||
smaps = _get_smaps_cached(r['pid'])
|
||||
proc.memPss = smaps['pss']
|
||||
proc.memPssAnon = smaps['pss_anon']
|
||||
proc.memPssShmem = smaps['pss_shmem']
|
||||
|
||||
cpu_times = _cpu_times()
|
||||
cpu_list = pl.init('cpuTimes', len(cpu_times))
|
||||
for i, ct in enumerate(cpu_times):
|
||||
@@ -212,6 +268,9 @@ def build_proc_log_message(msg) -> None:
|
||||
pl.mem.inactive = mem_info["Inactive:"]
|
||||
pl.mem.shared = mem_info["Shmem:"]
|
||||
|
||||
global _smaps_cycle
|
||||
_smaps_cycle = (_smaps_cycle + 1) % _SMAPS_EVERY
|
||||
|
||||
|
||||
def main() -> NoReturn:
|
||||
pm = messaging.PubMaster(['procLog'])
|
||||
|
||||
Reference in New Issue
Block a user