add check for global dim limit in linearizer (#1299)

* need a better place for reshape and permute

* add permutation

* cuda fixed

* clean up

* enable nvidia GPU with global max

* fix order

* fix CI

* add check for global dim limit but need refactor

* refactor

* fix ignore
Yixiang Gao 2023-07-31 14:14:54 -04:00, committed by GitHub
parent ce0ab1c14e
commit 6e62dcfbf3
4 changed files with 17 additions and 4 deletions

@@ -20,6 +20,7 @@ class CStyleLanguage(NamedTuple):
   barrier: str = ""
   gid: List[str] = []
   lid: List[str] = []
+  global_max: List[int] = []
   extra_args: List[str] = []
   float4: Optional[str] = None
   half_prekernel: Optional[str] = None
@@ -194,7 +195,7 @@ class CStyleCodegen(Linearizer):
   def codegen(self):
     self.process()
-    #self.limit_global_dims(len(self.lang.gid)) # NOTE: this is optional now
+    if self.lang.global_max: self.limit_global_dims(len(self.lang.gid), self.lang.global_max) # NOTE: this is optional now
     self.linearize()
     prg, global_size, local_size = uops_to_cstyle(self.uops, self.lang)

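The rewritten call above gates the limiting pass on the backend declaring a global_max. Below is a minimal sketch of that dispatch, using a hypothetical stand-in for CStyleLanguage rather than tinygrad's actual class:

from typing import List, NamedTuple

class Lang(NamedTuple):
  # mirrors the two CStyleLanguage fields used here; an empty global_max
  # means the backend declares no per-dimension launch limits
  gid: List[str] = []
  global_max: List[int] = []

def codegen(lang: Lang) -> None:
  # the limiting pass now runs only for backends that provide global_max
  if lang.global_max:
    print(f"limiting to {len(lang.gid)} global dims with caps {lang.global_max}")
  else:
    print("skipping global dim limiting for this backend")

codegen(Lang())  # backend with no declared limits: pass is skipped
codegen(Lang(gid=[f"blockIdx.{c}" for c in "xyz"], global_max=[65535, 65535, 2147483647]))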
@@ -596,14 +596,25 @@ class Linearizer:
   # ******************** GPU simplifiers ********************
-  def limit_global_dims(self, limit):
+  def limit_global_dims(self, limit, global_max):
     # sometimes, there's more dimensions than len(self.lang.gid).
     # compact all the dimensions into the first
     # NOTE: this might make multiview shapetrackers
-    if limit and (self.first_reduce-self.local_dims) > limit:
+    if (self.first_reduce-self.local_dims) > limit:
       num_to_merge = ((self.first_reduce-self.local_dims) - limit)+1
       self.reshape_and_permute(lambda x: (prod(x[0:num_to_merge]),)+x[num_to_merge:], None)
       if DEBUG >= 3: print("reshaped to", self.full_shape, "due to too many global dimensions")
+    # Check the global allocation limit; currently the global_size is flipped during codegen
+    # and then right-padded with 1s if its length < 3, which makes this part a bit awkward to write
+    global_dims = self.first_reduce-self.local_dims
+    if global_dims > 0:
+      assert max(global_max) >= max(self.full_shape[0:global_dims]), f"global dim {max(self.full_shape[0:global_dims])} exceeds device maximum {max(global_max)}"
+      for i in range(global_dims-1):
+        if self.full_shape[i] > global_max[i]:
+          order = list(range(len(self.full_shape)))
+          order[i], order[global_dims-1] = order[global_dims-1], order[i]
+          self.reshape_and_permute(None, order)
+          if DEBUG >= 3: print("permuted global dims", order, "due to a dim exceeding the global limit")

   def alias_buffer(self, i, pattern):
     assert len(pattern) == len(self.sts[i].shape), f"must include a pattern for each shape {pattern} {self.sts[i].shape}"

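Taken together, the method now does two things: merge excess global dims into the first, then permute any dim that overflows its per-dimension cap toward the last global slot. Here is a pure-function sketch of the same logic on a plain tuple, a simplification that treats every dim as global, whereas the real method works on the first_reduce-local_dims prefix and mutates shapetracker state through reshape_and_permute:

from math import prod

def limit_global_dims(shape, limit, global_max):
  # hypothetical standalone helper, not tinygrad's method
  shape = list(shape)
  # step 1: merge leading dims until at most `limit` global dims remain
  if len(shape) > limit:
    num_to_merge = (len(shape) - limit) + 1
    shape = [prod(shape[:num_to_merge])] + shape[num_to_merge:]
  # step 2: swap any oversized dim (except the last) with the last dim; after
  # the flip at codegen time, that position maps to gid[0], which carries the
  # largest cap on CUDA
  for i in range(len(shape) - 1):
    if shape[i] > global_max[i]:
      shape[i], shape[-1] = shape[-1], shape[i]
  return tuple(shape)

print(limit_global_dims((1_000_000, 4, 4), 3, [65535, 65535, 2147483647]))
# -> (4, 4, 1000000): the oversized dim moves to the slot with the 2**31-1 cap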
@@ -82,6 +82,7 @@ class CUDAProgram:
 class CUDACodegen(CStyleCodegen):
   lang = CStyleLanguage(
     kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4",
+    global_max = [65535, 65535, 2147483647],
     gid = [f'blockIdx.{chr(120+i)}' for i in range(3)],
     lid = [f'threadIdx.{chr(120+i)}' for i in range(3)],
     half_prekernel = """
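The caps added above mirror CUDA's grid limits: gridDim.y and gridDim.z max out at 65535, while gridDim.x allows up to 2**31 - 1 blocks. The largest cap is listed last because, per the linearizer comment, global_size is flipped during codegen. A small hypothetical check illustrating that mapping:

# sketch only: shows how a flipped global_size lines up with CUDA's per-axis caps
global_max = [65535, 65535, 2147483647]  # linearizer order; largest cap last
global_size = [4, 4, 1_000_000]          # after limit_global_dims, big dim is last

launch_dims = global_size[::-1]          # the codegen flip -> [1000000, 4, 4]
gid = [f"blockIdx.{c}" for c in "xyz"]
for name, size, cap in zip(gid, launch_dims, global_max[::-1]):
  assert size <= cap, f"{name}={size} exceeds {cap}"
  print(f"{name}: {size} (cap {cap})")  # blockIdx.x gets the 2**31-1 cap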