from tinygrad import Device, dtypes from tinygrad.helpers import getenv, colorize_float from extra.optimization.helpers import load_worlds, ast_str_to_lin from test.external.fuzz_linearizer import get_fuzz_rawbufs from tinygrad.engine.search import bufs_from_lin from tinygrad.engine.realize import CompiledRunner from tinygrad.tensor import _to_np_dtype import numpy as np if __name__ == "__main__": ast_strs = load_worlds(filter_reduce=False, filter_novariable=True) cudev = Device["CUDA"] nvdev = Device["NV"] # NUM=112 python3 test/external/speed_compare_cuda_nv.py single = getenv("NUM", -1) if single != -1: ast_strs = ast_strs[single:single+1] average_tm_cuda, average_tm_nv = 0, 0 for num,ast in enumerate(ast_strs): # cuda compile culin = ast_str_to_lin(ast, opts=cudev.renderer) culin.hand_coded_optimizations() has_bf16 = any(b.dtype == dtypes.bfloat16 for b in culin.membufs) cuda_prg = CompiledRunner(culin.to_program()) cubufs = bufs_from_lin(culin) test_cubufs = get_fuzz_rawbufs(culin) if not has_bf16 else cubufs rdr = nvdev.renderer rdr.device = "NV" nvlin = ast_str_to_lin(ast, opts=rdr) nvlin.hand_coded_optimizations() nv_prg = CompiledRunner(nvlin.to_program()) nvbufs = bufs_from_lin(nvlin) test_nvbufs = get_fuzz_rawbufs(nvlin) if not has_bf16 else nvbufs if not has_bf16: for i,rawbuf in enumerate(test_nvbufs): rawbuf.copyin(test_cubufs[i].as_buffer()) # warmup tm_cuda, tm_nv, failed = [], [], False try: cuda_prg(test_cubufs, {}, wait=True) for i in range(5): tm_cuda.append(cuda_prg(cubufs, {}, wait=True)) except RuntimeError: print("CUDA FAILED") tm_cuda = [1e9] failed = True try: nv_prg(test_nvbufs, {}, wait=True) for i in range(5): tm_nv.append(nv_prg(nvbufs, {}, wait=True)) except RuntimeError: print("NV FAILED") tm_nv = [1e9] failed = True if not failed and not has_bf16: curesult = np.frombuffer(test_cubufs[0].as_buffer(), _to_np_dtype(test_cubufs[0].dtype)) nvresult = np.frombuffer(test_nvbufs[0].as_buffer(), _to_np_dtype(test_nvbufs[0].dtype)) np.testing.assert_allclose(curesult, nvresult, rtol=1e-2, atol=1e-2) average_tm_cuda += min(tm_cuda) average_tm_nv += min(tm_nv) ratio = min(tm_nv)/min(tm_cuda) print(f"{average_tm_nv/average_tm_cuda:5.2f}x -- {num:4d} {colorize_float(ratio)} {min(tm_nv)*1e6:7.2f} us", nvlin.name) if ratio > 1.04: print(f"NV slower {ratio}", nvlin.ast, nvlin.applied_opts)