optimize CAN send over SPI (#2266)

* profiling
* lil faster
* chunking happens later
* little better
* prealloc is no bueno?
* cleanup
* fix usb tests

---------

Co-authored-by: Comma Device <device@comma.ai>
2026-02-19 09:43:51 +08:00 · 2025-09-02 19:26:52 -07:00
parent 819fa5854e
commit 5c1ff7bfa3
6 changed files with 35 additions and 22 deletions
--- a/board/drivers/spi.h
+++ b/board/drivers/spi.h
@@ -3,10 +3,8 @@
 #include "board/drivers/spi_declarations.h"
 #include "board/crc.h"

-#define SPI_BUF_SIZE 2048U
-// H7 DMA2 located in D2 domain, so we need to use SRAM1/SRAM2
-__attribute__((section(".sram12"))) uint8_t spi_buf_rx[SPI_BUF_SIZE];
-__attribute__((section(".sram12"))) uint8_t spi_buf_tx[SPI_BUF_SIZE];
+uint8_t spi_buf_rx[SPI_BUF_SIZE];
+uint8_t spi_buf_tx[SPI_BUF_SIZE];

 uint16_t spi_error_count = 0;

--- a/board/drivers/spi_declarations.h
+++ b/board/drivers/spi_declarations.h
@@ -8,7 +8,7 @@
 // in a tight loop, plus some buffer
 #define SPI_IRQ_RATE  16000U

-#define SPI_BUF_SIZE 2048U
+#define SPI_BUF_SIZE 4096U
 // H7 DMA2 located in D2 domain, so we need to use SRAM1/SRAM2
 __attribute__((section(".sram12"))) extern uint8_t spi_buf_rx[SPI_BUF_SIZE];
 __attribute__((section(".sram12"))) extern uint8_t spi_buf_tx[SPI_BUF_SIZE];
--- a/python/init.py
+++ b/python/init.py
@@ -32,16 +32,13 @@ def calculate_checksum(data):
    res ^= b
  return res

-def pack_can_buffer(arr, fd=False):
-  snds = [b'']
+def pack_can_buffer(arr, chunk=False, fd=False):
+  snds = [bytearray(), ]
  for address, dat, bus in arr:
-    assert len(dat) in LEN_TO_DLC
-    #logger.debug("  W 0x%x: 0x%s", address, dat.hex())
-
    extended = 1 if address >= 0x800 else 0
    data_len_code = LEN_TO_DLC[len(dat)]
    header = bytearray(CANPACKET_HEAD_SIZE)
-    word_4b = address << 3 | extended << 2
+    word_4b = (address << 3) | (extended << 2)
    header[0] = (data_len_code << 4) | (bus << 1) | int(fd)
    header[1] = word_4b & 0xFF
    header[2] = (word_4b >> 8) & 0xFF
@@ -49,9 +46,10 @@ def pack_can_buffer(arr, fd=False):
    header[4] = (word_4b >> 24) & 0xFF
    header[5] = calculate_checksum(header[:5] + dat)

-    snds[-1] += header + dat
-    if len(snds[-1]) > 256: # Limit chunks to 256 bytes
-      snds.append(b'')
+    snds[-1].extend(header)
+    snds[-1].extend(dat)
+    if chunk and len(snds[-1]) > 256:
+      snds.append(bytearray())

  return snds

@@ -729,7 +727,7 @@ class Panda:

  @ensure_can_packet_version
  def can_send_many(self, arr, *, fd=False, timeout=CAN_SEND_TIMEOUT_MS):
-    snds = pack_can_buffer(arr, fd=fd)
+    snds = pack_can_buffer(arr, chunk=(not self.spi), fd=fd)
    for tx in snds:
      while len(tx) > 0:
        bs = self._handle.bulkWrite(3, tx, timeout=timeout)
--- a/python/spi.py
+++ b/python/spi.py
@@ -29,7 +29,8 @@ CHECKSUM_START = 0xAB
 MIN_ACK_TIMEOUT_MS = 100
 MAX_XFER_RETRY_COUNT = 5

-XFER_SIZE = 0x40*31
+SPI_BUF_SIZE = 4096  # from panda/board/drivers/spi_declarations.h
+XFER_SIZE = SPI_BUF_SIZE - 0x40 # give some room for SPI protocol overhead

 DEV_PATH = "/dev/spidev0.0"

@@ -290,8 +291,9 @@ class PandaSpiHandle(BaseHandle):
    return self._transfer(0, struct.pack("<BHHH", request, value, index, length), timeout, max_rx_len=length)

  def bulkWrite(self, endpoint: int, data: bytes, timeout: int = TIMEOUT) -> int:
+    mv = memoryview(data)
    for x in range(math.ceil(len(data) / XFER_SIZE)):
-      self._transfer(endpoint, data[XFER_SIZE*x:XFER_SIZE*(x+1)], timeout)
+      self._transfer(endpoint, mv[XFER_SIZE*x:XFER_SIZE*(x+1)], timeout)
    return len(data)

  def bulkRead(self, endpoint: int, length: int, timeout: int = TIMEOUT) -> bytes:
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -1,17 +1,32 @@
 #!/usr/bin/env python3
+import io
+import os
 import time
+import pstats
+import cProfile
 from contextlib import contextmanager

 from panda import Panda, PandaDFU
 from panda.tests.hitl.helpers import get_random_can_messages


+PROFILE = "PROFILE" in os.environ
+
@contextmanager
 def print_time(desc):
+  if PROFILE:
+    pr = cProfile.Profile()
+    pr.enable()
  start = time.perf_counter()
  yield
  end = time.perf_counter()
-  print(f"{end - start:.2f}s - {desc}")
+  print(f"{end - start:.3f}s - {desc}")
+  if PROFILE:
+    pr.disable()
+    s = io.StringIO()
+    ps = pstats.Stats(pr, stream=s).sort_stats("cumtime")
+    ps.print_stats()
+    print(s.getvalue())


 if __name__ == "__main__":
--- a/tests/usbprotocol/test_comms.py
+++ b/tests/usbprotocol/test_comms.py
@@ -78,17 +78,17 @@ class TestPandaComms(unittest.TestCase):
  def test_comms_reset_tx(self):
    # store some test messages in the queue
    test_msg = (0x100, b"test", 0)
-    packed = pack_can_buffer([test_msg for _ in range(100)])
+    packed = pack_can_buffer([test_msg for _ in range(100)], chunk=True)

    # write a small chunk such that we have some overflow
    TINY_CHUNK_SIZE = 6
-    lpp.comms_can_write(packed[0][:TINY_CHUNK_SIZE], TINY_CHUNK_SIZE)
+    lpp.comms_can_write(bytes(packed[0][:TINY_CHUNK_SIZE]), TINY_CHUNK_SIZE)

    # reset the comms to clear the overflow buffer on the panda side
    lpp.comms_can_reset()

    # write a full valid chunk, which should now contain valid messages
-    lpp.comms_can_write(packed[1], len(packed[1]))
+    lpp.comms_can_write(bytes(packed[1]), len(packed[1]))

    # read the messages from the queue and make sure they're valid
    queue_msgs = []
@@ -114,7 +114,7 @@ class TestPandaComms(unittest.TestCase):
          for buf in packed:
            for i in range(0, len(buf), CHUNK_SIZE):
              chunk_len = min(CHUNK_SIZE, len(buf) - i)
-              lpp.comms_can_write(buf[i:i+chunk_len], chunk_len)
+              lpp.comms_can_write(bytes(buf[i:i+chunk_len]), chunk_len)

          # Check that they ended up in the right buffers
          queue_msgs = []