mirror of https://github.com/commaai/tinygrad.git
add check global dim limit in linearizer (#1299)
* need a better place for reshape and permute * add permutation * cuda fixed * clean up * enable nvidia GPU with global max * fix order * fix CI * add check for global dim limit but need refactor * refactor * fix ignore
This commit is contained in:
parent
ce0ab1c14e
commit
6e62dcfbf3
|
@ -36,4 +36,4 @@ examples/webgpu/net.safetensors
|
|||
node_modules
|
||||
package.json
|
||||
package-lock.json
|
||||
temp
|
||||
temp
|
|
@ -20,6 +20,7 @@ class CStyleLanguage(NamedTuple):
|
|||
barrier: str = ""
|
||||
gid: List[str] = []
|
||||
lid: List[str] = []
|
||||
global_max: List[int] = []
|
||||
extra_args: List[str] = []
|
||||
float4: Optional[str] = None
|
||||
half_prekernel: Optional[str] = None
|
||||
|
@ -194,7 +195,7 @@ class CStyleCodegen(Linearizer):
|
|||
|
||||
def codegen(self):
|
||||
self.process()
|
||||
#self.limit_global_dims(len(self.lang.gid)) # NOTE: this is optional now
|
||||
if self.lang.global_max: self.limit_global_dims(len(self.lang.gid), self.lang.global_max) # NOTE: this is optional now
|
||||
self.linearize()
|
||||
|
||||
prg, global_size, local_size = uops_to_cstyle(self.uops, self.lang)
|
||||
|
|
|
@ -596,14 +596,25 @@ class Linearizer:
|
|||
|
||||
# ******************** GPU simplifiers ********************
|
||||
|
||||
def limit_global_dims(self, limit):
|
||||
def limit_global_dims(self, limit, global_max):
|
||||
# sometimes there are more dimensions than len(self.lang.gid).
|
||||
# compact all the dimensions into the first
|
||||
# NOTE: this might make multiview shapetrackers
|
||||
if limit and (self.first_reduce-self.local_dims) > limit:
|
||||
if (self.first_reduce-self.local_dims) > limit:
|
||||
num_to_merge = ((self.first_reduce-self.local_dims) - limit)+1
|
||||
self.reshape_and_permute(lambda x: (prod(x[0:num_to_merge]),)+x[num_to_merge:], None)
|
||||
if DEBUG >= 3: print("reshaped to", self.full_shape, "due to too many global dimensions")
|
||||
# Check the global allocation limit; currently the global_size will be flipped during codegen
|
||||
# and then padded right with 1s if its length < 3 which makes this part a bit awkward to write
|
||||
global_dims = self.first_reduce-self.local_dims
|
||||
if global_dims > 0:
|
||||
assert max(global_max) >= max(self.full_shape[0:global_dims]), f"device max allocation {max(self.full_shape[0:global_dims])} exceeds global dim maximum {max(global_max)}"
|
||||
for i in range(global_dims-1):
|
||||
if self.full_shape[i] > global_max[i]:
|
||||
order = list(range(len(self.full_shape)))
|
||||
order[i], order[global_dims-1] = order[global_dims-1], order[i]
|
||||
self.reshape_and_permute(None, order)
|
||||
if DEBUG >= 3: print("permuted global dim", order, "due to allocation exceeds global limit")
|
||||
|
||||
def alias_buffer(self, i, pattern):
|
||||
assert len(pattern) == len(self.sts[i].shape), f"must include a pattern for each shape {pattern} {self.sts[i].shape}"
|
||||
|
|
|
@ -82,6 +82,7 @@ class CUDAProgram:
|
|||
class CUDACodegen(CStyleCodegen):
|
||||
lang = CStyleLanguage(
|
||||
kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4",
|
||||
global_max = [65535, 65535, 2147483647],
|
||||
gid = [f'blockIdx.{chr(120+i)}' for i in range(3)],
|
||||
lid = [f'threadIdx.{chr(120+i)}' for i in range(3)],
|
||||
half_prekernel = """
|
||||
|
|
Loading…
Reference in New Issue