tinygrad/test/test_copy_speed.py

import unittest
from tinygrad import Tensor
from tinygrad import Device
from tinygrad.helpers import Timing, CI, OSX
import multiprocessing.shared_memory as shared_memory

N = 4096 if CI else 16384
class TestCopySpeed(unittest.TestCase):
  @classmethod
  def setUpClass(cls): Device[Device.DEFAULT].synchronize()

  def testCopySHMtoDefault(self):
    s = shared_memory.SharedMemory(name="test_X", create=True, size=N*N*4)
    s.close()
    if CI and not OSX:
      t = Tensor.empty(N, N, device="disk:/dev/shm/test_X").realize()
    else:
      t = Tensor.empty(N, N, device="disk:shm:test_X").realize()
    for _ in range(3):
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        with Timing("queue: "):
          t.to(Device.DEFAULT).realize()
        Device[Device.DEFAULT].synchronize()
    s.unlink()

  def testCopyCPUtoDefault(self):
    t = Tensor.rand(N, N, device="clang").realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        with Timing("queue: "):
          t.to(Device.DEFAULT).realize()
        Device[Device.DEFAULT].synchronize()

  def testCopyCPUtoDefaultFresh(self):
    print("fresh copy")
    for _ in range(3):
      t = Tensor.rand(N, N, device="clang").realize()
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): # noqa: F821
        with Timing("queue: "):
          t.to(Device.DEFAULT).realize()
        Device[Device.DEFAULT].synchronize()
      del t

  def testCopyDefaulttoCPU(self):
    t = Tensor.rand(N, N).realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        t.to('clang').realize()

  @unittest.skipIf(CI, "CI doesn't have 6 GPUs")
  @unittest.skipIf(Device.DEFAULT != "GPU", "only test this on GPU")
  def testCopyCPUto6GPUs(self):
    from tinygrad.runtime.ops_gpu import CLDevice
    if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
    t = Tensor.rand(N, N, device="clang").realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"):
        with Timing("queue: "):
          for g in range(6):
            t.to(f"gpu:{g}").realize()
        Device["gpu"].synchronize()

if __name__ == '__main__':
  unittest.main()