import unittest
import multiprocessing.shared_memory as shared_memory

from tinygrad import Tensor, Device
from tinygrad.helpers import Timing, CI, OSX

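# each test moves an N x N float32 tensor: 4096*4096*4 bytes = 64 MiB per copy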
N = 4096

class TestCopySpeed(unittest.TestCase):
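  # wait for any outstanding work on the default device so the first timed run starts clean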
  @classmethod
  def setUpClass(cls): Device[Device.DEFAULT].synchronize()

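  # copy out of POSIX shared memory (exposed through tinygrad's disk device) to the default device;
  # the inner "queue" Timing measures only the enqueue, the outer "sync" includes the synchronize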
  def testCopySHMtoDefault(self):
    s = shared_memory.SharedMemory(name="test_X", create=True, size=N*N*4)
    s.close()
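    # on Linux CI the segment is visible as a file under /dev/shm; elsewhere use the shm: naming scheme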
    if CI and not OSX:
      t = Tensor.empty(N, N, device="disk:/dev/shm/test_X").realize()
    else:
      t = Tensor.empty(N, N, device="disk:shm:test_X").realize()
    for _ in range(3):
      with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        with Timing("queue: "):
          t.to(Device.DEFAULT).realize()
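        # block until the copy actually completes, keeping the wait inside the "sync" timer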
        Device[Device.DEFAULT].synchronize()
    s.unlink()

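  # host ("clang" device) to the default device, reusing the same source buffer each run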
  def testCopyCPUtoDefault(self):
    t = Tensor.rand(N, N, device="clang").realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        with Timing("queue: "):
          t.to(Device.DEFAULT).realize()
        Device[Device.DEFAULT].synchronize()

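  # same copy, but with a freshly allocated source tensor every iteration (nothing is reused)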
  def testCopyCPUtoDefaultFresh(self):
    print("fresh copy")
    for _ in range(3):
      t = Tensor.rand(N, N, device="clang").realize()
      with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):  # noqa: F821
        with Timing("queue: "):
          t.to(Device.DEFAULT).realize()
        Device[Device.DEFAULT].synchronize()
      del t

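  # the reverse direction: default device back to host memory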
  def testCopyDefaulttoCPU(self):
    t = Tensor.rand(N, N).realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        t.to('clang').realize()

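  # fan the same host buffer out to six GPUs; the "total" figure reports aggregate bandwidth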
  @unittest.skipIf(CI, "CI doesn't have 6 GPUs")
  @unittest.skipIf(Device.DEFAULT != "GPU", "only test this on GPU")
  def testCopyCPUto6GPUs(self):
    from tinygrad.runtime.ops_gpu import CLDevice
    if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
    t = Tensor.rand(N, N, device="clang").realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"):
        with Timing("queue: "):
          for g in range(6):
            t.to(f"gpu:{g}").realize()
        Device["gpu"].synchronize()

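# runs as a plain script via unittest.main(); the target backend can be forced with
# tinygrad's device env vars (e.g. GPU=1 or CUDA=1)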
if __name__ == '__main__':
  unittest.main()