mirror of https://github.com/commaai/tinygrad.git
qcom texs refactor (#6613)
* qcom texs refactor * fix * linter * qcombuf * linter
This commit is contained in:
parent
d1bae42d35
commit
8a9195d86e
|
@ -1,7 +1,7 @@
|
|||
from __future__ import annotations
|
||||
import os, ctypes, functools, mmap, struct, array, decimal, math
|
||||
from types import SimpleNamespace
|
||||
from typing import Tuple, List, Dict, Any
|
||||
from typing import Tuple, List, Any, cast
|
||||
from tinygrad.device import BufferOptions, HCQBuffer, HWComputeQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState, hcq_command
|
||||
from tinygrad.runtime.autogen import kgsl, adreno, libc
|
||||
from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
|
||||
|
@ -9,6 +9,8 @@ from tinygrad.renderer.cstyle import QCOMRenderer
|
|||
from tinygrad.helpers import getenv, from_mv, mv_address, to_mv, round_up, data64_le, prod, DEBUG, fromimport
|
||||
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
BUFTYPE_BUF, BUFTYPE_TEX, BUFTYPE_IBO = 0, 1, 2
|
||||
|
||||
def _qreg_exec(reg, __val=0, **kwargs):
|
||||
for k, v in kwargs.items():
|
||||
__val |= (getattr(adreno, f'{reg[4:]}_{k.upper()}') if v else 0) if type(v) is bool else (v << getattr(adreno, f'{reg[4:]}_{k.upper()}__SHIFT'))
|
||||
|
@ -123,39 +125,38 @@ class QCOMComputeQueue(HWComputeQueue):
|
|||
qreg.a6xx_sp_cs_pvt_mem_size(totalpvtmemsize=prg.pvtmem_size_total))
|
||||
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=prg.kernargs_alloc_size // 4),
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=1024 // 4),
|
||||
*data64_le(args_state.ptr))
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=round_up(prg.image_size, 128) // 128),
|
||||
*data64_le(prg.lib_gpu.va_addr))
|
||||
|
||||
self.reg(adreno.REG_A6XX_HLSQ_CONTROL_2_REG, 0xfcfcfcfc, 0xfcfcfcfc, 0xfcfcfcfc, 0xfc,
|
||||
qreg.a6xx_hlsq_cs_cntl(constlen=prg.kernargs_alloc_size // 4, enabled=True))
|
||||
self.reg(adreno.REG_A6XX_HLSQ_CONTROL_2_REG, 0xfcfcfcfc, 0xfcfcfcfc, 0xfcfcfcfc, 0xfc, qreg.a6xx_hlsq_cs_cntl(constlen=1024 // 4, enabled=True))
|
||||
|
||||
self.reg(adreno.REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET, qreg.a6xx_sp_cs_pvt_mem_hw_stack_offset(prg.hw_stack_offset))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_INSTRLEN, qreg.a6xx_sp_cs_instrlen(prg.image_size // 4))
|
||||
|
||||
if hasattr(args_state, 'samplers_ptr'):
|
||||
if args_state.prg.samp_cnt > 0:
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_TEX, num_unit=args_state.samplers_cnt),
|
||||
*data64_le(args_state.samplers_ptr.va_addr))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.samplers_ptr.va_addr))
|
||||
state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt),
|
||||
*data64_le(args_state.ptr + args_state.prg.samp_off))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.ptr + args_state.prg.samp_off))
|
||||
self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.device._border_color_base()))
|
||||
|
||||
if hasattr(args_state, 'descriptors_ptr'):
|
||||
if args_state.prg.tex_cnt > 0:
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_TEX, num_unit=args_state.descriptors_cnt),
|
||||
*data64_le(args_state.descriptors_ptr.va_addr))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.descriptors_ptr.va_addr))
|
||||
state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.tex_cnt),
|
||||
*data64_le(args_state.ptr + args_state.prg.tex_off))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.ptr + args_state.prg.tex_off))
|
||||
|
||||
if hasattr(args_state, 'ibos_ptr'):
|
||||
if args_state.prg.ibo_cnt > 0:
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST6_IBO, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=args_state.ibos_cnt),
|
||||
*data64_le(args_state.ibos_ptr.va_addr))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.ibos_ptr.va_addr))
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=args_state.prg.ibo_cnt),
|
||||
*data64_le(args_state.ptr + args_state.prg.ibo_off))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.ptr + args_state.prg.ibo_off))
|
||||
|
||||
self.reg(adreno.REG_A6XX_SP_CS_CONFIG,
|
||||
qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.samplers_cnt, ntex=args_state.descriptors_cnt, nibo=args_state.ibos_cnt))
|
||||
qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.prg.samp_cnt, ntex=args_state.prg.tex_cnt, nibo=args_state.prg.ibo_cnt))
|
||||
self.cmd(adreno.CP_RUN_OPENCL, 0)
|
||||
self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
|
||||
|
||||
|
@ -175,46 +176,27 @@ class QCOMComputeQueue(HWComputeQueue):
|
|||
class QCOMArgsState(HCQArgsState):
|
||||
def __init__(self, ptr:int, prg:QCOMProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
|
||||
super().__init__(ptr, prg, bufs, vals=vals)
|
||||
self.ibos_cnt, self.descriptors_cnt, self.samplers_cnt = 0, 0, 0
|
||||
ctypes.memset(ptr, 0, 1024)
|
||||
ctypes.memset(self.ptr, 0, 1024)
|
||||
|
||||
if len(bufs) + len(vals) != len(prg.buf_info): raise RuntimeError(f'incorrect args size given={len(bufs)+len(vals)} != want={len(prg.buf_info)}')
|
||||
|
||||
self.buf_info, self.args_info = prg.buf_info[:len(bufs)], prg.buf_info[len(bufs):]
|
||||
|
||||
if len(bufs) + len(vals) != len(prg.buffs_info): raise RuntimeError(f'incorrect args size given={len(bufs)} != want={len(prg.buffs_info)}')
|
||||
self.boffs, self.aoffs = prg.buffs_info[:len(bufs)], prg.buffs_info[len(bufs):]
|
||||
for i, v in enumerate(vals): self.update_var(i, v)
|
||||
for cnst_val, cnst_off, cnst_sz in prg.consts_info:
|
||||
ctypes.memmove(self.ptr + cnst_off, (ctypes.c_int8 * cnst_sz).from_buffer_copy(cnst_val.to_bytes(cnst_sz, byteorder='little')), cnst_sz)
|
||||
|
||||
samplers: List[Any] = []
|
||||
descriptors: List[Any] = []
|
||||
ibos: List[Any] = []
|
||||
self.i2descr: Dict[int, int] = {}
|
||||
self.i2ibo: Dict[int, int] = {}
|
||||
for i, b in enumerate(bufs):
|
||||
if not hasattr(b, 'samplers') and not hasattr(b, 'descriptor') and not hasattr(b, 'ibo'): self.update_buffer(i, b)
|
||||
elif self.boffs[i][1]: ibos, self.i2ibo = [*ibos, *getattr(b, 'ibo')], {**self.i2ibo, i: len(ibos)}
|
||||
else:
|
||||
samplers, descriptors = [*samplers, *getattr(b, 'samplers')], [*descriptors, *getattr(b, 'descriptor')]
|
||||
self.i2descr[i] = len(descriptors) - 1
|
||||
|
||||
def alloc_tex_gpu(data, chunk_size) -> Tuple[HCQBuffer, int]:
|
||||
tex_gpu = self.prg.device.allocator.alloc(len(data) * 4, BufferOptions(nolru=True, cpu_access=True))
|
||||
to_mv(tex_gpu.va_addr, len(data) * 4).cast('I')[:] = array.array('I', data)
|
||||
return tex_gpu, len(data) // chunk_size
|
||||
|
||||
if len(samplers): self.samplers_ptr, self.samplers_cnt = alloc_tex_gpu(samplers, 4)
|
||||
if len(descriptors): self.descriptors_ptr, self.descriptors_cnt = alloc_tex_gpu(descriptors, 16)
|
||||
if len(ibos): self.ibos_ptr, self.ibos_cnt = alloc_tex_gpu(ibos, 16)
|
||||
|
||||
def __del__(self):
|
||||
for ptr in ('samplers_ptr', 'descriptors_ptr', 'ibos_ptr'):
|
||||
if hasattr(self, ptr): self.prg.device.allocator.free((x:=getattr(self, ptr)), x.size, BufferOptions(nolru=True, cpu_access=True))
|
||||
if prg.samp_cnt > 0: to_mv(self.ptr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
|
||||
for i, b in enumerate(cast(List[QCOMBuffer], bufs)):
|
||||
if prg.buf_info[i].type is BUFTYPE_TEX: to_mv(self.ptr + prg.buf_info[i].offset, len(b.desc) * 4).cast('I')[:] = array.array('I', b.desc)
|
||||
elif prg.buf_info[i].type is BUFTYPE_IBO: to_mv(self.ptr + prg.buf_info[i].offset, len(b.ibo) * 4).cast('I')[:] = array.array('I', b.ibo)
|
||||
else: self.update_buffer(i, b)
|
||||
for i, v in enumerate(vals): self.update_var(i, v)
|
||||
|
||||
def update_buffer(self, index:int, buf:HCQBuffer):
|
||||
if (descr:=self.i2descr.get(index, None)) is not None: to_mv(self.descriptors_ptr.va_addr + 16 * descr + 4 * 4, 8).cast('Q')[0] = buf.va_addr
|
||||
elif (ibo:=self.i2ibo.get(index, None)) is not None: to_mv(self.ibos_ptr.va_addr + 16 * ibo + 4 * 4, 8).cast('Q')[0] = buf.va_addr
|
||||
else: to_mv(self.ptr + self.boffs[index][0], 8).cast('Q')[0] = buf.va_addr
|
||||
if self.buf_info[index].type is not BUFTYPE_BUF: to_mv(self.ptr+self.buf_info[index].offset+0x10, 8).cast('Q')[0] = buf.va_addr
|
||||
else: to_mv(self.ptr + self.buf_info[index].offset, 8).cast('Q')[0] = buf.va_addr
|
||||
|
||||
def update_var(self, index:int, val:int): to_mv(self.ptr + self.aoffs[index][0], 8).cast('Q')[0] = val
|
||||
def update_var(self, index:int, val:int): to_mv(self.ptr + self.args_info[index].offset, 8).cast('Q')[0] = val
|
||||
|
||||
class QCOMProgram(HCQProgram):
|
||||
def __init__(self, device: QCOMDevice, name: str, lib: bytes):
|
||||
|
@ -232,7 +214,7 @@ class QCOMProgram(HCQProgram):
|
|||
self.max_threads = min(1024, ((384 * 32) // (max(1, (self.fregs + round_up(self.hregs, 2) // 2)) * 128)) * 128)
|
||||
device._ensure_stack_size(self.hw_stack_offset * 4)
|
||||
|
||||
super().__init__(QCOMArgsState, self.device, self.name, kernargs_alloc_size=1024)
|
||||
super().__init__(QCOMArgsState, self.device, self.name, kernargs_alloc_size=2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10)
|
||||
|
||||
def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
||||
if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requsted for launch")
|
||||
|
@ -253,19 +235,34 @@ class QCOMProgram(HCQProgram):
|
|||
self.pvtmem, self.shmem = _read_lib(image_desc_off+0xc8), _read_lib(image_desc_off+0xd8)
|
||||
|
||||
# Fill up constants and buffers info
|
||||
self.buffs_info, self.consts_info = [], []
|
||||
self.buf_info, self.consts_info = [], []
|
||||
|
||||
samplers_count = _read_lib(image_desc_off + 0xdc)
|
||||
bdoff = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * samplers_count
|
||||
while (bdoff + 16 <= len(self.lib)):
|
||||
length, _, _, offset_words = struct.unpack("I" * 4, self.lib[bdoff:bdoff+16])
|
||||
# Collect sampler info.
|
||||
self.samp_cnt = _read_lib(image_desc_off + 0xdc)
|
||||
assert self.samp_cnt <= 1, "Up to one sampler supported"
|
||||
if self.samp_cnt:
|
||||
self.samplers = [qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=adreno.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode),
|
||||
qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True), 0, 0]
|
||||
|
||||
# Collect kernel arguments (buffers) info.
|
||||
bdoff = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * self.samp_cnt
|
||||
while bdoff + 32 <= len(self.lib):
|
||||
length, _, _, offset_words, _, _, _, typ = struct.unpack("IIIIIIII", self.lib[bdoff:bdoff+32])
|
||||
if length == 0: break
|
||||
self.buffs_info.append((offset_words * 4, struct.unpack("I", self.lib[bdoff+0x3c:bdoff+0x40])[0] == 0x0))
|
||||
self.buf_info.append(SimpleNamespace(offset=offset_words * 4, type=typ))
|
||||
bdoff += length
|
||||
|
||||
# Setting correct offsets to textures/ibos.
|
||||
self.tex_cnt, self.ibo_cnt = sum(x.type is BUFTYPE_TEX for x in self.buf_info), sum(x.type is BUFTYPE_IBO for x in self.buf_info)
|
||||
self.samp_off, self.ibo_off, self.tex_off = 2048, 2048 + 0x10 * self.samp_cnt, 2048 + 0x10 * self.samp_cnt + 0x40 * self.ibo_cnt
|
||||
cur_ibo_off, cur_tex_off = self.ibo_off, self.tex_off
|
||||
for x in self.buf_info:
|
||||
if x.type is BUFTYPE_IBO: x.offset, cur_ibo_off = cur_ibo_off, cur_ibo_off + 0x40
|
||||
elif x.type is BUFTYPE_TEX: x.offset, cur_tex_off = cur_tex_off, cur_tex_off + 0x40
|
||||
|
||||
if _read_lib(0xb0) != 0: # check if we have constants.
|
||||
cdoff = _read_lib(0xac)
|
||||
while (cdoff + 40 <= image_offset):
|
||||
while cdoff + 40 <= image_offset:
|
||||
cnst, offset_words, _, is32 = struct.unpack("I", self.lib[cdoff:cdoff+4])[0], *struct.unpack("III", self.lib[cdoff+16:cdoff+28])
|
||||
self.consts_info.append((cnst, offset_words * (sz_bytes:=(2 << is32)), sz_bytes))
|
||||
cdoff += 40
|
||||
|
@ -277,6 +274,9 @@ class QCOMProgram(HCQProgram):
|
|||
def __del__(self):
|
||||
if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferOptions(cpu_access=True, nolru=True))
|
||||
|
||||
class QCOMBuffer(HCQBuffer):
|
||||
def __init__(self, va_addr:int, size:int, desc=None, ibo=None): self.va_addr, self.size, self.desc, self.ibo = va_addr, size, desc, ibo
|
||||
|
||||
class QCOMAllocator(HCQAllocator):
|
||||
def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
|
||||
if options.image is not None:
|
||||
|
@ -291,18 +291,14 @@ class QCOMAllocator(HCQAllocator):
|
|||
texture = self.device._gpu_alloc(pitch * round_up(imgh, 16), kgsl.KGSL_MEMTYPE_TEXTURE, map_to_cpu=True)
|
||||
|
||||
# Extend HCQBuffer with texture-related info.
|
||||
texture.samplers, texture.descriptor, texture.ibo = [0] * 4, [0] * 16, [0] * 16
|
||||
|
||||
# Compiled sampler (always the same in tinygrad).
|
||||
texture.samplers[0] = qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=adreno.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode)
|
||||
texture.samplers[1] = qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True)
|
||||
texture.desc, texture.ibo = [0] * 16, [0] * 16
|
||||
|
||||
tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT
|
||||
texture.descriptor[0] = qreg.a6xx_tex_const_0(swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt)
|
||||
texture.descriptor[1] = qreg.a6xx_tex_const_1(width=imgw, height=imgh)
|
||||
texture.descriptor[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6)
|
||||
texture.descriptor[4:7] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000)]
|
||||
texture.ibo = [texture.descriptor[0] & (~0xffff), *texture.descriptor[1:len(texture.descriptor)]]
|
||||
texture.desc[0] = qreg.a6xx_tex_const_0(swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt)
|
||||
texture.desc[1] = qreg.a6xx_tex_const_1(width=imgw, height=imgh)
|
||||
texture.desc[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6)
|
||||
texture.desc[4:7] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000)]
|
||||
texture.ibo = [texture.desc[0] & (~0xffff), *texture.desc[1:len(texture.desc)]]
|
||||
|
||||
return texture
|
||||
|
||||
|
|
Loading…
Reference in New Issue