mirror of https://github.com/commaai/tinygrad.git
parent
325a3bf2cf
commit
0400315078
|
@ -24,10 +24,8 @@ code = open(pathlib.Path(__file__).parent / "prog.s", "r").read()
|
||||||
|
|
||||||
gen = []
|
gen = []
|
||||||
FLOPS = 0
|
FLOPS = 0
|
||||||
#MAX_REG = 251
|
for j in range(4):
|
||||||
MAX_REG = 32
|
for i in range(0, 251, 6):
|
||||||
for j in range(1):
|
|
||||||
for i in range(0, MAX_REG, 6):
|
|
||||||
#gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
|
#gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
|
||||||
#FLOPS += 4
|
#FLOPS += 4
|
||||||
gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}")
|
gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}")
|
||||||
|
@ -50,10 +48,9 @@ print(colored("creating CLProgram", "green"))
|
||||||
prg = CLProgram("code", asm, binary=True)
|
prg = CLProgram("code", asm, binary=True)
|
||||||
|
|
||||||
print(colored("running program", "green"))
|
print(colored("running program", "green"))
|
||||||
G = 256
|
FLOPS *= 100000*1024*1024 # loop * global_size
|
||||||
FLOPS *= 100000*G*G # loop * global_size
|
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
tm = prg([G, G], [256, 1], buf, wait=True)
|
tm = prg([1024, 1024], [256, 1], buf, wait=True)
|
||||||
print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS")
|
print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS")
|
||||||
|
|
||||||
print(colored("transferring buffer", "green"))
|
print(colored("transferring buffer", "green"))
|
||||||
|
|
|
@ -1,20 +0,0 @@
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from tinygrad.runtime.ops_gpu import CLCodegen
|
|
||||||
from tinygrad.codegen.assembly import AssemblyCodegen
|
|
||||||
|
|
||||||
from tinygrad.helpers import LazyNumpyArray, dtypes
|
|
||||||
from tinygrad.ops import LazyOp, BinaryOps
|
|
||||||
from tinygrad.lazy import LazyBuffer
|
|
||||||
from tinygrad.shape.shapetracker import ShapeTracker
|
|
||||||
|
|
||||||
ones = LazyNumpyArray.from_np(np.ones((3,), np.float32))
|
|
||||||
|
|
||||||
#target = "GPU"
|
|
||||||
target = "RDNA"
|
|
||||||
|
|
||||||
b1 = LazyBuffer.fromCPU(ones, target)
|
|
||||||
b2 = LazyBuffer.fromCPU(ones, target)
|
|
||||||
|
|
||||||
out = LazyBuffer(target, ShapeTracker((3,)), BinaryOps, LazyOp(BinaryOps.ADD, (b1, b2)), dtypes.float32)
|
|
||||||
print(out.toCPU())
|
|
|
@ -21,6 +21,9 @@ code.kd:
|
||||||
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 1
|
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 1
|
||||||
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 1
|
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 1
|
||||||
.text
|
.text
|
||||||
|
.global code
|
||||||
|
.type code,STT_FUNC
|
||||||
|
code:
|
||||||
# https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state
|
# https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state
|
||||||
# s[0:1] contains the kernarg_address
|
# s[0:1] contains the kernarg_address
|
||||||
# TODO: can we use s[2:3] if this was really a wave since we only alloced 2 SGPRs?
|
# TODO: can we use s[2:3] if this was really a wave since we only alloced 2 SGPRs?
|
||||||
|
|
|
@ -1,103 +0,0 @@
|
||||||
from tinygrad.codegen.linearizer import Linearizer
|
|
||||||
from tinygrad.ops import ASTRunner
|
|
||||||
from tinygrad.runtime.ops_gpu import ROCM_LLVM_PATH
|
|
||||||
|
|
||||||
# ugh, is this really needed?
|
|
||||||
from extra.helpers import enable_early_exec
|
|
||||||
early_exec = enable_early_exec()
|
|
||||||
|
|
||||||
# https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md#initial-kernel-register-state
|
|
||||||
# enable_sgpr_kernarg_segment_ptr
|
|
||||||
# enable_sgpr_grid_workgroup_count_X
|
|
||||||
|
|
||||||
# amd_kernel_..., amd_machine_...
|
|
||||||
# kernel_code_entry_byte_offset, kernel_code_prefetch_byte_offset
|
|
||||||
# kernel_code_prefetch_byte_size, max_scratch_backing_memory_byte_size
|
|
||||||
# compute_pgm_rsrc1, compute_pgm_rsrc2, kernel_code_properties, workitem_private_segment_byte_size
|
|
||||||
|
|
||||||
# TODO: generate this struct
|
|
||||||
boilerplate_start = """
|
|
||||||
.global _start
|
|
||||||
_start:
|
|
||||||
.rodata
|
|
||||||
.align 0x10
|
|
||||||
.global code.kd
|
|
||||||
.type code.kd,STT_OBJECT
|
|
||||||
code.kd:
|
|
||||||
.long 0,0,0,0
|
|
||||||
.long 0x00000bc0,0x00000000,0x00000000,0x00000000
|
|
||||||
.long 0,0,0,0
|
|
||||||
.long 0x60af0000,0x0000009e,0x00000408,0x00000000
|
|
||||||
.text
|
|
||||||
"""
|
|
||||||
|
|
||||||
# TODO: generate this yaml
|
|
||||||
boilerplate_end = """
|
|
||||||
.amdgpu_metadata
|
|
||||||
amdhsa.kernels:
|
|
||||||
- .args:
|
|
||||||
- .address_space: global
|
|
||||||
.name: a
|
|
||||||
.offset: 0
|
|
||||||
.size: 8
|
|
||||||
.type_name: 'float*'
|
|
||||||
.value_kind: global_buffer
|
|
||||||
- .address_space: global
|
|
||||||
.name: b
|
|
||||||
.offset: 0
|
|
||||||
.size: 8
|
|
||||||
.type_name: 'float*'
|
|
||||||
.value_kind: global_buffer
|
|
||||||
- .address_space: global
|
|
||||||
.name: c
|
|
||||||
.offset: 0
|
|
||||||
.size: 8
|
|
||||||
.type_name: 'float*'
|
|
||||||
.value_kind: global_buffer
|
|
||||||
.group_segment_fixed_size: 0
|
|
||||||
.kernarg_segment_align: 8
|
|
||||||
.kernarg_segment_size: 8
|
|
||||||
.language: OpenCL C
|
|
||||||
.language_version:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
.max_flat_workgroup_size: 256
|
|
||||||
.name: code
|
|
||||||
.private_segment_fixed_size: 0
|
|
||||||
.sgpr_count: 2
|
|
||||||
.sgpr_spill_count: 0
|
|
||||||
.symbol: code.kd
|
|
||||||
.uses_dynamic_stack: false
|
|
||||||
.vgpr_count: 256
|
|
||||||
.vgpr_spill_count: 0
|
|
||||||
.wavefront_size: 32
|
|
||||||
amdhsa.target: amdgcn-amd-amdhsa--gfx1100
|
|
||||||
amdhsa.version:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
.end_amdgpu_metadata
|
|
||||||
"""
|
|
||||||
|
|
||||||
class AssemblyCodegen(Linearizer):
|
|
||||||
supports_float4: bool = True
|
|
||||||
|
|
||||||
# s registers are the addresses and non local indexes
|
|
||||||
def codegen(self):
|
|
||||||
self.process()
|
|
||||||
self.hand_coded_optimizations()
|
|
||||||
self.linearize()
|
|
||||||
|
|
||||||
instructions = []
|
|
||||||
|
|
||||||
# exit asm
|
|
||||||
instructions += ['s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)', 's_endpgm', 's_code_end']
|
|
||||||
|
|
||||||
code = boilerplate_start + '\n'.join(instructions) + boilerplate_end
|
|
||||||
object = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8")))
|
|
||||||
asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], object))
|
|
||||||
|
|
||||||
global_size = []
|
|
||||||
local_size = []
|
|
||||||
return ASTRunner('code', asm,
|
|
||||||
global_size[::-1] if len(global_size) else [1], local_size[::-1] if len(local_size) else None,
|
|
||||||
op_estimate=self.info.flops, mem_estimate=self.mem_estimate, display_name=self.function_name, runtime_args={"binary": True})
|
|
|
@ -46,8 +46,7 @@ class LazyNumpyArray:
|
||||||
def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape, self.dtype)
|
def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape, self.dtype)
|
||||||
def copy(self): return self if callable(self.fxn) else LazyNumpyArray(self.fxn, self.shape, self.dtype)
|
def copy(self): return self if callable(self.fxn) else LazyNumpyArray(self.fxn, self.shape, self.dtype)
|
||||||
def astype(self, typ): return LazyNumpyArray(self.fxn, self.shape, typ)
|
def astype(self, typ): return LazyNumpyArray(self.fxn, self.shape, typ)
|
||||||
@staticmethod
|
|
||||||
def from_np(data): return LazyNumpyArray(data, data.shape, data.dtype)
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class dtypes:
|
class dtypes:
|
||||||
|
|
|
@ -78,12 +78,12 @@ def get_lazyop_info(ast:LazyOp) -> FlopCounter: return InterpretedFlopCounter.ex
|
||||||
# **************** for Compiled Buffers ****************
|
# **************** for Compiled Buffers ****************
|
||||||
|
|
||||||
class ASTRunner:
|
class ASTRunner:
|
||||||
def __init__(self, name, prg, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, op_estimate=0, mem_estimate=0, display_name:Optional[str]=None, runtime_args={}):
|
def __init__(self, name, prg, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, op_estimate=0, mem_estimate=0, display_name:Optional[str]=None):
|
||||||
if DEBUG >= 4 and 'binary' not in runtime_args: print(prg)
|
if DEBUG >= 4: print(prg)
|
||||||
self.name, self.prg, self.global_size, self.local_size, self.op_estimate, self.mem_estimate, self.display_name, self.runtime_args = name, prg, global_size, local_size, op_estimate, mem_estimate, display_name, runtime_args
|
self.name, self.prg, self.global_size, self.local_size, self.op_estimate, self.mem_estimate, self.display_name = name, prg, global_size, local_size, op_estimate, mem_estimate, display_name
|
||||||
|
|
||||||
def build(self, runtime):
|
def build(self, runtime):
|
||||||
self.clprg = runtime(self.name, self.prg, **self.runtime_args)
|
self.clprg = runtime(self.name, self.prg)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def exec(self, bufs) -> Optional[float]:
|
def exec(self, bufs) -> Optional[float]:
|
||||||
|
|
|
@ -62,8 +62,6 @@ class CLProgram:
|
||||||
from disassemblers.adreno import disasm
|
from disassemblers.adreno import disasm
|
||||||
disasm(self.binary())
|
disasm(self.binary())
|
||||||
elif 'gfx1100' in CL.cl_ctx.devices[0].name:
|
elif 'gfx1100' in CL.cl_ctx.devices[0].name:
|
||||||
# NOTE: this can move, you have to read the ELF
|
|
||||||
#print(','.join([hex(x) for x in struct.unpack("I"*0x10, self.binary()[0x800:0x840])]))
|
|
||||||
asm = early_exec(([ROCM_LLVM_PATH / "llvm-objdump", '-d', '-'], self.binary()))
|
asm = early_exec(([ROCM_LLVM_PATH / "llvm-objdump", '-d', '-'], self.binary()))
|
||||||
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
|
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -1,5 +0,0 @@
|
||||||
from tinygrad.ops import Compiled
|
|
||||||
from tinygrad.codegen.assembly import AssemblyCodegen
|
|
||||||
from tinygrad.runtime.ops_gpu import CLBuffer, CLProgram, CL
|
|
||||||
|
|
||||||
RDNABuffer = Compiled(CLBuffer, AssemblyCodegen, CLProgram, CL.synchronize)
|
|
|
@ -42,7 +42,7 @@ class Tensor:
|
||||||
data = data.realize().toCPU()
|
data = data.realize().toCPU()
|
||||||
|
|
||||||
# all ndarrays are lazy now
|
# all ndarrays are lazy now
|
||||||
if isinstance(data, np.ndarray): data = LazyNumpyArray.from_np(data)
|
if isinstance(data, np.ndarray): data = LazyNumpyArray(data, data.shape, data.dtype)
|
||||||
|
|
||||||
# by here, it's either LazyNumpyArray or LazyBuffer
|
# by here, it's either LazyNumpyArray or LazyBuffer
|
||||||
# TODO: it should all be LazyBuffer I think
|
# TODO: it should all be LazyBuffer I think
|
||||||
|
|
Loading…
Reference in New Issue