mirror of https://github.com/commaai/tinygrad.git
80 lines
2.5 KiB
Python
80 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
|
# cd extra/disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
|
|
# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
|
|
import numpy as np
|
|
from tinygrad.tensor import Tensor
|
|
from tinygrad.helpers import colored, Timing, getenv
|
|
from tinygrad.device import Device
|
|
|
|
# Device names used throughout the script: index 0 and index 1 on whatever
# backend Device.DEFAULT names.
d0 = f'{Device.DEFAULT}:0'
d1 = f'{Device.DEFAULT}:1'
|
|
|
|
def sync():
  """Call synchronize() on both benchmark devices, d0 first and then d1."""
  # The main script calls this inside its Timing blocks; presumably this makes
  # the measured time include device-side completion — confirm against
  # Device.synchronize().
  for dev_name in (d0, d1):
    Device[dev_name].synchronize()
|
|
|
|
if __name__ == "__main__":
  # Times host->device copies, device->device copies, same-device sums and
  # cross-device sums on d0/d1, copies the results back to numpy and checks
  # that every path produced the same values.
  print("GPU devices", d0, d1)
  sz = getenv("N", 1024*1024*256) # 1 GB
  # Bytes per tensor; the factor 4 assumes 4-byte elements (float32) — TODO confirm default dtype.
  nbytes = sz*4

  def gbps(total):
    """Return a Timing on_exit hook that appends the throughput for `total` bytes."""
    # NOTE(review): the "GB/sec" label is only accurate if Timing passes the
    # elapsed time to on_exit in nanoseconds — confirm against Timing.
    return lambda x: f", {total/x:.2f} GB/sec"

  def timed(label, build):
    """Time build().realize() followed by sync() under `label`; return the realized tensor."""
    with Timing(label, on_exit=gbps(nbytes)):
      out = build().realize()
      sync()
    return out

  with Timing("GPU initial sync: "):
    sync()

  # Two source tensors are created on the "clang" device, hence twice the bytes.
  # NOTE(review): indentation was lost in the reviewed copy; the two prints are
  # assumed to belong inside this timed region.
  with Timing("CPU creation: ", on_exit=gbps(nbytes*2)):
    c0 = (Tensor.ones(sz, device="clang")/2).realize()
    c1 = (Tensor.ones(sz, device="clang")/4).realize()
    print(c0.lazydata.base.realized)
    print(c1.lazydata.base.realized)

  # host -> device
  a0 = timed("CPU -> 0: ", lambda: c0.to(d0))
  b1 = timed("CPU -> 1: ", lambda: c1.to(d1))

  # cross copy. this is (sometimes) going through the CPU
  a1 = timed("0 -> 1: ", lambda: a0.to(d1))
  b0 = timed("1 -> 0: ", lambda: b1.to(d0))

  # sum
  ab0 = timed("0+0 -> 0 (sum): ", lambda: a0 + b0)
  ab1 = timed("1+1 -> 1 (sum): ", lambda: a1 + b1)

  # cross device sum (does this work?)
  abx0 = timed(colored("0+1 -> 0 (sum): ", "red"), lambda: a0 + b1.to(d0))

  abx1 = timed(colored("1+0 -> 1 (sum): ", "red"), lambda: b1 + a0.to(d1))

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=gbps(nbytes)):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=gbps(nbytes)):
    cc1 = ab1.numpy()

  # same
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # same (cross)
  print("testing (cross)")
  for crossed in (abx0, abx1):
    np.testing.assert_allclose(cc0, crossed.numpy())

  # devices
  for result in (ab0, ab1, abx0, abx1):
    print(result)
|