diff --git a/board/drivers/spi.h b/board/drivers/spi.h
index e657aea6..e112b3f3 100644
--- a/board/drivers/spi.h
+++ b/board/drivers/spi.h
@@ -3,10 +3,8 @@
 #include "board/drivers/spi_declarations.h"
 #include "board/crc.h"
 
-#define SPI_BUF_SIZE 2048U
-// H7 DMA2 located in D2 domain, so we need to use SRAM1/SRAM2
-__attribute__((section(".sram12"))) uint8_t spi_buf_rx[SPI_BUF_SIZE];
-__attribute__((section(".sram12"))) uint8_t spi_buf_tx[SPI_BUF_SIZE];
+uint8_t spi_buf_rx[SPI_BUF_SIZE];
+uint8_t spi_buf_tx[SPI_BUF_SIZE];
 
 uint16_t spi_error_count = 0;
 
diff --git a/board/drivers/spi_declarations.h b/board/drivers/spi_declarations.h
index f719404d..23254f0e 100644
--- a/board/drivers/spi_declarations.h
+++ b/board/drivers/spi_declarations.h
@@ -8,7 +8,7 @@
 // in a tight loop, plus some buffer
 #define SPI_IRQ_RATE  16000U
 
-#define SPI_BUF_SIZE 2048U
+#define SPI_BUF_SIZE 4096U
 // H7 DMA2 located in D2 domain, so we need to use SRAM1/SRAM2
 __attribute__((section(".sram12"))) extern uint8_t spi_buf_rx[SPI_BUF_SIZE];
 __attribute__((section(".sram12"))) extern uint8_t spi_buf_tx[SPI_BUF_SIZE];
diff --git a/python/__init__.py b/python/__init__.py
index c89582d5..a2655ada 100644
--- a/python/__init__.py
+++ b/python/__init__.py
@@ -32,16 +32,13 @@ def calculate_checksum(data):
     res ^= b
   return res
 
-def pack_can_buffer(arr, fd=False):
-  snds = [b'']
+def pack_can_buffer(arr, chunk=False, fd=False):
+  snds = [bytearray(), ]
   for address, dat, bus in arr:
-    assert len(dat) in LEN_TO_DLC
-    #logger.debug("  W 0x%x: 0x%s", address, dat.hex())
-
     extended = 1 if address >= 0x800 else 0
     data_len_code = LEN_TO_DLC[len(dat)]
     header = bytearray(CANPACKET_HEAD_SIZE)
-    word_4b = address << 3 | extended << 2
+    word_4b = (address << 3) | (extended << 2)
     header[0] = (data_len_code << 4) | (bus << 1) | int(fd)
     header[1] = word_4b & 0xFF
     header[2] = (word_4b >> 8) & 0xFF
@@ -49,9 +46,10 @@ def pack_can_buffer(arr, fd=False):
     header[4] = (word_4b >> 24) & 0xFF
     header[5] = calculate_checksum(header[:5] + dat)
 
-    snds[-1] += header + dat
-    if len(snds[-1]) > 256: # Limit chunks to 256 bytes
-      snds.append(b'')
+    snds[-1].extend(header)
+    snds[-1].extend(dat)
+    if chunk and len(snds[-1]) > 256:
+      snds.append(bytearray())
 
   return snds
 
@@ -729,7 +727,7 @@ class Panda:
 
   @ensure_can_packet_version
   def can_send_many(self, arr, *, fd=False, timeout=CAN_SEND_TIMEOUT_MS):
-    snds = pack_can_buffer(arr, fd=fd)
+    snds = pack_can_buffer(arr, chunk=(not self.spi), fd=fd)
     for tx in snds:
       while len(tx) > 0:
         bs = self._handle.bulkWrite(3, tx, timeout=timeout)
diff --git a/python/spi.py b/python/spi.py
index cee8755e..ecb95944 100644
--- a/python/spi.py
+++ b/python/spi.py
@@ -29,7 +29,8 @@ CHECKSUM_START = 0xAB
 MIN_ACK_TIMEOUT_MS = 100
 MAX_XFER_RETRY_COUNT = 5
 
-XFER_SIZE = 0x40*31
+SPI_BUF_SIZE = 4096  # from panda/board/drivers/spi_declarations.h
+XFER_SIZE = SPI_BUF_SIZE - 0x40 # give some room for SPI protocol overhead
 
 DEV_PATH = "/dev/spidev0.0"
 
@@ -290,8 +291,9 @@ class PandaSpiHandle(BaseHandle):
     return self._transfer(0, struct.pack("<BHHH", request, value, index, length), timeout, max_rx_len=length)
 
   def bulkWrite(self, endpoint: int, data: bytes, timeout: int = TIMEOUT) -> int:
+    mv = memoryview(data)
     for x in range(math.ceil(len(data) / XFER_SIZE)):
-      self._transfer(endpoint, data[XFER_SIZE*x:XFER_SIZE*(x+1)], timeout)
+      self._transfer(endpoint, mv[XFER_SIZE*x:XFER_SIZE*(x+1)], timeout)
     return len(data)
 
   def bulkRead(self, endpoint: int, length: int, timeout: int = TIMEOUT) -> bytes:
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index c2b0c85c..60633804 100755
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -1,17 +1,32 @@
 #!/usr/bin/env python3
+import io
+import os
 import time
+import pstats
+import cProfile
 from contextlib import contextmanager
 
 from panda import Panda, PandaDFU
 from panda.tests.hitl.helpers import get_random_can_messages
 
 
+PROFILE = "PROFILE" in os.environ
+
 @contextmanager
 def print_time(desc):
+  if PROFILE:
+    pr = cProfile.Profile()
+    pr.enable()
   start = time.perf_counter()
   yield
   end = time.perf_counter()
-  print(f"{end - start:.2f}s - {desc}")
+  print(f"{end - start:.3f}s - {desc}")
+  if PROFILE:
+    pr.disable()
+    s = io.StringIO()
+    ps = pstats.Stats(pr, stream=s).sort_stats("cumtime")
+    ps.print_stats()
+    print(s.getvalue())
 
 
 if __name__ == "__main__":
diff --git a/tests/usbprotocol/test_comms.py b/tests/usbprotocol/test_comms.py
index 5785db8b..446a1341 100755
--- a/tests/usbprotocol/test_comms.py
+++ b/tests/usbprotocol/test_comms.py
@@ -78,17 +78,17 @@ class TestPandaComms(unittest.TestCase):
   def test_comms_reset_tx(self):
     # store some test messages in the queue
     test_msg = (0x100, b"test", 0)
-    packed = pack_can_buffer([test_msg for _ in range(100)])
+    packed = pack_can_buffer([test_msg for _ in range(100)], chunk=True)
 
     # write a small chunk such that we have some overflow
     TINY_CHUNK_SIZE = 6
-    lpp.comms_can_write(packed[0][:TINY_CHUNK_SIZE], TINY_CHUNK_SIZE)
+    lpp.comms_can_write(bytes(packed[0][:TINY_CHUNK_SIZE]), TINY_CHUNK_SIZE)
 
     # reset the comms to clear the overflow buffer on the panda side
     lpp.comms_can_reset()
 
     # write a full valid chunk, which should now contain valid messages
-    lpp.comms_can_write(packed[1], len(packed[1]))
+    lpp.comms_can_write(bytes(packed[1]), len(packed[1]))
 
     # read the messages from the queue and make sure they're valid
     queue_msgs = []
@@ -114,7 +114,7 @@ class TestPandaComms(unittest.TestCase):
           for buf in packed:
             for i in range(0, len(buf), CHUNK_SIZE):
               chunk_len = min(CHUNK_SIZE, len(buf) - i)
-              lpp.comms_can_write(buf[i:i+chunk_len], chunk_len)
+              lpp.comms_can_write(bytes(buf[i:i+chunk_len]), chunk_len)
 
           # Check that they ended up in the right buffers
           queue_msgs = []