From 902f00b09561d4f9cf39fb42b29f68fddeca4a44 Mon Sep 17 00:00:00 2001 From: Yixiang Gao Date: Fri, 27 Oct 2023 19:25:59 -0500 Subject: [PATCH] adding cuda TC headers (#2165) * split cuda to renderer and add headers for tc * fix TritonRenderer * remove unused import --- tinygrad/renderer/cuda.py | 25 +++++++++++++++++++++++++ tinygrad/runtime/ops_cuda.py | 25 ++++++------------------- 2 files changed, 31 insertions(+), 19 deletions(-) create mode 100644 tinygrad/renderer/cuda.py diff --git a/tinygrad/renderer/cuda.py b/tinygrad/renderer/cuda.py new file mode 100644 index 00000000..9e6d0d60 --- /dev/null +++ b/tinygrad/renderer/cuda.py @@ -0,0 +1,25 @@ +import functools +from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage + +class CUDALanguage(CStyleLanguage): + kernel_prefix = "__global__ " + smem_prefix = "__shared__ " + smem_prefix_for_cast = False + arg_int_prefix = "const int" + barrier = "__syncthreads();" + float4 = "make_float4" + gid = [f'blockIdx.{chr(120+i)}' for i in range(3)] + lid = [f'threadIdx.{chr(120+i)}' for i in range(3)] + xid = [f'(blockIdx.{chr(120+i)}*blockDim.{chr(120+i)}+threadIdx.{chr(120+i)})' for i in range(3)] + half_prekernel = """ + #include + #include + using namespace nvcuda; + struct __align__(8) half4 { + half2 x, y; + __device__ __forceinline__ explicit half4(const float4& a): x(make_half2(__float2half(a.x), __float2half(a.y))), y(make_half2(__float2half(a.z),__float2half(a.w))) {} + __device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); } + }; + """ # if not getenv("PTX") else fromimport("tinygrad.renderer.assembly_ptx", "uops_to_ptx_asm") # assembly_ptx currently isn't supported + +CUDARenderer = functools.partial(uops_to_cstyle, CUDALanguage()) \ No newline at end of file diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py index 74c53da6..71dc74c0 100644 --- a/tinygrad/runtime/ops_cuda.py +++ b/tinygrad/runtime/ops_cuda.py @@ -1,13 +1,13 @@ -import subprocess, time, re, hashlib, tempfile, functools +import subprocess, time, re, hashlib, tempfile from pathlib import Path from typing import Optional, List, Any, Tuple import numpy as np from pycuda.compiler import compile as cuda_compile # type: ignore -from tinygrad.helpers import DEBUG, getenv, colored, fromimport +from tinygrad.helpers import DEBUG, getenv, colored from tinygrad.ops import Compiled, GraphBatchExecutor, ASTRunner from tinygrad.runtime.lib import RawBufferCopyInOut, RawMallocBuffer, LRUAllocator from tinygrad.codegen.kernel import LinearizerOptions -from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage +from tinygrad.renderer.cuda import CUDARenderer def pretty_ptx(s): # all expressions match `` and replace it with `color()` @@ -116,22 +116,9 @@ class CUDAProgram: end.synchronize() return start.time_till(end)*1e-3 -renderer = functools.partial(uops_to_cstyle, CStyleLanguage( - kernel_prefix = "__global__ ", smem_prefix = "__shared__ ", smem_prefix_for_cast=False, arg_int_prefix = "const int", barrier = "__syncthreads();", float4 = "make_float4", - gid = [f'blockIdx.{chr(120+i)}' for i in range(3)], - lid = [f'threadIdx.{chr(120+i)}' for i in range(3)], - xid = [f'(blockIdx.{chr(120+i)}*blockDim.{chr(120+i)}+threadIdx.{chr(120+i)})' for i in range(3)], - half_prekernel = """ - #include - struct __align__(8) half4 { - half2 x, y; - __device__ __forceinline__ explicit half4(const float4& a): x(make_half2(__float2half(a.x), __float2half(a.y))), y(make_half2(__float2half(a.z),__float2half(a.w))) {} - __device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); } - }; - """)) if not getenv("PTX") else fromimport("tinygrad.renderer.assembly_ptx", "uops_to_ptx_asm") if getenv("TRITON") == 1: from tinygrad.renderer.triton import uops_to_triton - renderer = uops_to_triton - CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024], has_shared=False), renderer, CUDAProgram, cuda.Context.synchronize) + TritonRenderer = uops_to_triton + CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024], has_shared=False), TritonRenderer, CUDAProgram, cuda.Context.synchronize) else: - CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False if getenv("PTX") else True, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024]), renderer, CUDAProgram, cuda.Context.synchronize, CUDAGraph) + CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False if getenv("PTX") else True, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024]), CUDARenderer, CUDAProgram, cuda.Context.synchronize, CUDAGraph)