From 902f00b09561d4f9cf39fb42b29f68fddeca4a44 Mon Sep 17 00:00:00 2001
From: Yixiang Gao <yg5d6@mail.missouri.edu>
Date: Fri, 27 Oct 2023 19:25:59 -0500
Subject: [PATCH] adding cuda TC headers (#2165)

* split cuda to renderer and add headers for tc

* fix TritonRenderer

* remove unused import
---
 tinygrad/renderer/cuda.py    | 25 +++++++++++++++++++++++++
 tinygrad/runtime/ops_cuda.py | 25 ++++++-------------------
 2 files changed, 31 insertions(+), 19 deletions(-)
 create mode 100644 tinygrad/renderer/cuda.py
diff --git a/tinygrad/renderer/cuda.py b/tinygrad/renderer/cuda.py
new file mode 100644
index 00000000..9e6d0d60
--- /dev/null
+++ b/tinygrad/renderer/cuda.py
@@ -0,0 +1,25 @@
+import functools
+from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
+
+class CUDALanguage(CStyleLanguage):
+  kernel_prefix = "__global__ "
+  smem_prefix = "__shared__ "
+  smem_prefix_for_cast = False
+  arg_int_prefix = "const int"
+  barrier = "__syncthreads();" 
+  float4 = "make_float4"
+  gid = [f'blockIdx.{chr(120+i)}' for i in range(3)]
+  lid = [f'threadIdx.{chr(120+i)}' for i in range(3)]
+  xid = [f'(blockIdx.{chr(120+i)}*blockDim.{chr(120+i)}+threadIdx.{chr(120+i)})' for i in range(3)]
+  half_prekernel = """
+    #include <cuda_fp16.h>
+    #include <mma.h>
+    using namespace nvcuda;
+    struct __align__(8) half4 {
+      half2 x, y;
+      __device__ __forceinline__ explicit half4(const float4& a): x(make_half2(__float2half(a.x), __float2half(a.y))), y(make_half2(__float2half(a.z),__float2half(a.w))) {}
+      __device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); }
+    };
+    """ # if not getenv("PTX") else fromimport("tinygrad.renderer.assembly_ptx", "uops_to_ptx_asm") # assembly_ptx currently isn't supported
+
+CUDARenderer = functools.partial(uops_to_cstyle, CUDALanguage())
\ No newline at end of file
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index 74c53da6..71dc74c0 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -1,13 +1,13 @@
-import subprocess, time, re, hashlib, tempfile, functools
+import subprocess, time, re, hashlib, tempfile
 from pathlib import Path
 from typing import Optional, List, Any, Tuple
 import numpy as np
 from pycuda.compiler import compile as cuda_compile # type: ignore
-from tinygrad.helpers import DEBUG, getenv, colored, fromimport
+from tinygrad.helpers import DEBUG, getenv, colored
 from tinygrad.ops import Compiled, GraphBatchExecutor, ASTRunner
 from tinygrad.runtime.lib import RawBufferCopyInOut, RawMallocBuffer, LRUAllocator
 from tinygrad.codegen.kernel import LinearizerOptions
-from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
+from tinygrad.renderer.cuda import CUDARenderer
 
 def pretty_ptx(s):
   # all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
@@ -116,22 +116,9 @@ class CUDAProgram:
       end.synchronize()
       return start.time_till(end)*1e-3
 
-renderer = functools.partial(uops_to_cstyle, CStyleLanguage(
-  kernel_prefix = "__global__ ", smem_prefix = "__shared__ ", smem_prefix_for_cast=False, arg_int_prefix = "const int", barrier = "__syncthreads();", float4 = "make_float4",
-  gid = [f'blockIdx.{chr(120+i)}' for i in range(3)],
-  lid = [f'threadIdx.{chr(120+i)}' for i in range(3)],
-  xid = [f'(blockIdx.{chr(120+i)}*blockDim.{chr(120+i)}+threadIdx.{chr(120+i)})' for i in range(3)],
-  half_prekernel = """
-    #include <cuda_fp16.h>
-    struct __align__(8) half4 {
-      half2 x, y;
-      __device__ __forceinline__ explicit half4(const float4& a): x(make_half2(__float2half(a.x), __float2half(a.y))), y(make_half2(__float2half(a.z),__float2half(a.w))) {}
-      __device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); }
-    };
-    """)) if not getenv("PTX") else fromimport("tinygrad.renderer.assembly_ptx", "uops_to_ptx_asm")
 if getenv("TRITON") == 1:
   from tinygrad.renderer.triton import uops_to_triton
-  renderer = uops_to_triton
-  CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024], has_shared=False), renderer, CUDAProgram, cuda.Context.synchronize)
+  TritonRenderer = uops_to_triton
+  CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024], has_shared=False), TritonRenderer, CUDAProgram, cuda.Context.synchronize)
 else:
-  CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False if getenv("PTX") else True, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024]), renderer, CUDAProgram, cuda.Context.synchronize, CUDAGraph)
+  CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False if getenv("PTX") else True, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024]), CUDARenderer, CUDAProgram, cuda.Context.synchronize, CUDAGraph)