qcom texs refactor (#6613)

* qcom texs refactor

* fix

* linter

* qcombuf

* linter
This commit is contained in:
nimlgen 2024-09-23 09:03:17 +08:00 committed by GitHub
parent d1bae42d35
commit 8a9195d86e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 64 additions and 68 deletions

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import os, ctypes, functools, mmap, struct, array, decimal, math
from types import SimpleNamespace
from typing import Tuple, List, Dict, Any
from typing import Tuple, List, Any, cast
from tinygrad.device import BufferOptions, HCQBuffer, HWComputeQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState, hcq_command
from tinygrad.runtime.autogen import kgsl, adreno, libc
from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
@ -9,6 +9,8 @@ from tinygrad.renderer.cstyle import QCOMRenderer
from tinygrad.helpers import getenv, from_mv, mv_address, to_mv, round_up, data64_le, prod, DEBUG, fromimport
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
BUFTYPE_BUF, BUFTYPE_TEX, BUFTYPE_IBO = 0, 1, 2
def _qreg_exec(reg, __val=0, **kwargs):
for k, v in kwargs.items():
__val |= (getattr(adreno, f'{reg[4:]}_{k.upper()}') if v else 0) if type(v) is bool else (v << getattr(adreno, f'{reg[4:]}_{k.upper()}__SHIFT'))
@ -123,39 +125,38 @@ class QCOMComputeQueue(HWComputeQueue):
qreg.a6xx_sp_cs_pvt_mem_size(totalpvtmemsize=prg.pvtmem_size_total))
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_SHADER, num_unit=prg.kernargs_alloc_size // 4),
state_block=adreno.SB6_CS_SHADER, num_unit=1024 // 4),
*data64_le(args_state.ptr))
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_SHADER, num_unit=round_up(prg.image_size, 128) // 128),
*data64_le(prg.lib_gpu.va_addr))
self.reg(adreno.REG_A6XX_HLSQ_CONTROL_2_REG, 0xfcfcfcfc, 0xfcfcfcfc, 0xfcfcfcfc, 0xfc,
qreg.a6xx_hlsq_cs_cntl(constlen=prg.kernargs_alloc_size // 4, enabled=True))
self.reg(adreno.REG_A6XX_HLSQ_CONTROL_2_REG, 0xfcfcfcfc, 0xfcfcfcfc, 0xfcfcfcfc, 0xfc, qreg.a6xx_hlsq_cs_cntl(constlen=1024 // 4, enabled=True))
self.reg(adreno.REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET, qreg.a6xx_sp_cs_pvt_mem_hw_stack_offset(prg.hw_stack_offset))
self.reg(adreno.REG_A6XX_SP_CS_INSTRLEN, qreg.a6xx_sp_cs_instrlen(prg.image_size // 4))
if hasattr(args_state, 'samplers_ptr'):
if args_state.prg.samp_cnt > 0:
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_TEX, num_unit=args_state.samplers_cnt),
*data64_le(args_state.samplers_ptr.va_addr))
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.samplers_ptr.va_addr))
state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt),
*data64_le(args_state.ptr + args_state.prg.samp_off))
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.ptr + args_state.prg.samp_off))
self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.device._border_color_base()))
if hasattr(args_state, 'descriptors_ptr'):
if args_state.prg.tex_cnt > 0:
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_TEX, num_unit=args_state.descriptors_cnt),
*data64_le(args_state.descriptors_ptr.va_addr))
self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.descriptors_ptr.va_addr))
state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.tex_cnt),
*data64_le(args_state.ptr + args_state.prg.tex_off))
self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.ptr + args_state.prg.tex_off))
if hasattr(args_state, 'ibos_ptr'):
if args_state.prg.ibo_cnt > 0:
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST6_IBO, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_SHADER, num_unit=args_state.ibos_cnt),
*data64_le(args_state.ibos_ptr.va_addr))
self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.ibos_ptr.va_addr))
state_block=adreno.SB6_CS_SHADER, num_unit=args_state.prg.ibo_cnt),
*data64_le(args_state.ptr + args_state.prg.ibo_off))
self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.ptr + args_state.prg.ibo_off))
self.reg(adreno.REG_A6XX_SP_CS_CONFIG,
qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.samplers_cnt, ntex=args_state.descriptors_cnt, nibo=args_state.ibos_cnt))
qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.prg.samp_cnt, ntex=args_state.prg.tex_cnt, nibo=args_state.prg.ibo_cnt))
self.cmd(adreno.CP_RUN_OPENCL, 0)
self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
@ -175,46 +176,27 @@ class QCOMComputeQueue(HWComputeQueue):
class QCOMArgsState(HCQArgsState):
def __init__(self, ptr:int, prg:QCOMProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
super().__init__(ptr, prg, bufs, vals=vals)
self.ibos_cnt, self.descriptors_cnt, self.samplers_cnt = 0, 0, 0
ctypes.memset(ptr, 0, 1024)
ctypes.memset(self.ptr, 0, 1024)
if len(bufs) + len(vals) != len(prg.buf_info): raise RuntimeError(f'incorrect args size given={len(bufs)+len(vals)} != want={len(prg.buf_info)}')
self.buf_info, self.args_info = prg.buf_info[:len(bufs)], prg.buf_info[len(bufs):]
if len(bufs) + len(vals) != len(prg.buffs_info): raise RuntimeError(f'incorrect args size given={len(bufs)} != want={len(prg.buffs_info)}')
self.boffs, self.aoffs = prg.buffs_info[:len(bufs)], prg.buffs_info[len(bufs):]
for i, v in enumerate(vals): self.update_var(i, v)
for cnst_val, cnst_off, cnst_sz in prg.consts_info:
ctypes.memmove(self.ptr + cnst_off, (ctypes.c_int8 * cnst_sz).from_buffer_copy(cnst_val.to_bytes(cnst_sz, byteorder='little')), cnst_sz)
samplers: List[Any] = []
descriptors: List[Any] = []
ibos: List[Any] = []
self.i2descr: Dict[int, int] = {}
self.i2ibo: Dict[int, int] = {}
for i, b in enumerate(bufs):
if not hasattr(b, 'samplers') and not hasattr(b, 'descriptor') and not hasattr(b, 'ibo'): self.update_buffer(i, b)
elif self.boffs[i][1]: ibos, self.i2ibo = [*ibos, *getattr(b, 'ibo')], {**self.i2ibo, i: len(ibos)}
else:
samplers, descriptors = [*samplers, *getattr(b, 'samplers')], [*descriptors, *getattr(b, 'descriptor')]
self.i2descr[i] = len(descriptors) - 1
def alloc_tex_gpu(data, chunk_size) -> Tuple[HCQBuffer, int]:
tex_gpu = self.prg.device.allocator.alloc(len(data) * 4, BufferOptions(nolru=True, cpu_access=True))
to_mv(tex_gpu.va_addr, len(data) * 4).cast('I')[:] = array.array('I', data)
return tex_gpu, len(data) // chunk_size
if len(samplers): self.samplers_ptr, self.samplers_cnt = alloc_tex_gpu(samplers, 4)
if len(descriptors): self.descriptors_ptr, self.descriptors_cnt = alloc_tex_gpu(descriptors, 16)
if len(ibos): self.ibos_ptr, self.ibos_cnt = alloc_tex_gpu(ibos, 16)
def __del__(self):
for ptr in ('samplers_ptr', 'descriptors_ptr', 'ibos_ptr'):
if hasattr(self, ptr): self.prg.device.allocator.free((x:=getattr(self, ptr)), x.size, BufferOptions(nolru=True, cpu_access=True))
if prg.samp_cnt > 0: to_mv(self.ptr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
for i, b in enumerate(cast(List[QCOMBuffer], bufs)):
if prg.buf_info[i].type is BUFTYPE_TEX: to_mv(self.ptr + prg.buf_info[i].offset, len(b.desc) * 4).cast('I')[:] = array.array('I', b.desc)
elif prg.buf_info[i].type is BUFTYPE_IBO: to_mv(self.ptr + prg.buf_info[i].offset, len(b.ibo) * 4).cast('I')[:] = array.array('I', b.ibo)
else: self.update_buffer(i, b)
for i, v in enumerate(vals): self.update_var(i, v)
def update_buffer(self, index:int, buf:HCQBuffer):
if (descr:=self.i2descr.get(index, None)) is not None: to_mv(self.descriptors_ptr.va_addr + 16 * descr + 4 * 4, 8).cast('Q')[0] = buf.va_addr
elif (ibo:=self.i2ibo.get(index, None)) is not None: to_mv(self.ibos_ptr.va_addr + 16 * ibo + 4 * 4, 8).cast('Q')[0] = buf.va_addr
else: to_mv(self.ptr + self.boffs[index][0], 8).cast('Q')[0] = buf.va_addr
if self.buf_info[index].type is not BUFTYPE_BUF: to_mv(self.ptr+self.buf_info[index].offset+0x10, 8).cast('Q')[0] = buf.va_addr
else: to_mv(self.ptr + self.buf_info[index].offset, 8).cast('Q')[0] = buf.va_addr
def update_var(self, index:int, val:int): to_mv(self.ptr + self.aoffs[index][0], 8).cast('Q')[0] = val
def update_var(self, index:int, val:int): to_mv(self.ptr + self.args_info[index].offset, 8).cast('Q')[0] = val
class QCOMProgram(HCQProgram):
def __init__(self, device: QCOMDevice, name: str, lib: bytes):
@ -232,7 +214,7 @@ class QCOMProgram(HCQProgram):
self.max_threads = min(1024, ((384 * 32) // (max(1, (self.fregs + round_up(self.hregs, 2) // 2)) * 128)) * 128)
device._ensure_stack_size(self.hw_stack_offset * 4)
super().__init__(QCOMArgsState, self.device, self.name, kernargs_alloc_size=1024)
super().__init__(QCOMArgsState, self.device, self.name, kernargs_alloc_size=2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10)
def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requsted for launch")
@ -253,19 +235,34 @@ class QCOMProgram(HCQProgram):
self.pvtmem, self.shmem = _read_lib(image_desc_off+0xc8), _read_lib(image_desc_off+0xd8)
# Fill up constants and buffers info
self.buffs_info, self.consts_info = [], []
self.buf_info, self.consts_info = [], []
samplers_count = _read_lib(image_desc_off + 0xdc)
bdoff = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * samplers_count
while (bdoff + 16 <= len(self.lib)):
length, _, _, offset_words = struct.unpack("I" * 4, self.lib[bdoff:bdoff+16])
# Collect sampler info.
self.samp_cnt = _read_lib(image_desc_off + 0xdc)
assert self.samp_cnt <= 1, "Up to one sampler supported"
if self.samp_cnt:
self.samplers = [qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=adreno.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode),
qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True), 0, 0]
# Collect kernel arguments (buffers) info.
bdoff = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * self.samp_cnt
while bdoff + 32 <= len(self.lib):
length, _, _, offset_words, _, _, _, typ = struct.unpack("IIIIIIII", self.lib[bdoff:bdoff+32])
if length == 0: break
self.buffs_info.append((offset_words * 4, struct.unpack("I", self.lib[bdoff+0x3c:bdoff+0x40])[0] == 0x0))
self.buf_info.append(SimpleNamespace(offset=offset_words * 4, type=typ))
bdoff += length
# Setting correct offsets to textures/ibos.
self.tex_cnt, self.ibo_cnt = sum(x.type is BUFTYPE_TEX for x in self.buf_info), sum(x.type is BUFTYPE_IBO for x in self.buf_info)
self.samp_off, self.ibo_off, self.tex_off = 2048, 2048 + 0x10 * self.samp_cnt, 2048 + 0x10 * self.samp_cnt + 0x40 * self.ibo_cnt
cur_ibo_off, cur_tex_off = self.ibo_off, self.tex_off
for x in self.buf_info:
if x.type is BUFTYPE_IBO: x.offset, cur_ibo_off = cur_ibo_off, cur_ibo_off + 0x40
elif x.type is BUFTYPE_TEX: x.offset, cur_tex_off = cur_tex_off, cur_tex_off + 0x40
if _read_lib(0xb0) != 0: # check if we have constants.
cdoff = _read_lib(0xac)
while (cdoff + 40 <= image_offset):
while cdoff + 40 <= image_offset:
cnst, offset_words, _, is32 = struct.unpack("I", self.lib[cdoff:cdoff+4])[0], *struct.unpack("III", self.lib[cdoff+16:cdoff+28])
self.consts_info.append((cnst, offset_words * (sz_bytes:=(2 << is32)), sz_bytes))
cdoff += 40
@ -277,6 +274,9 @@ class QCOMProgram(HCQProgram):
def __del__(self):
if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferOptions(cpu_access=True, nolru=True))
class QCOMBuffer(HCQBuffer):
def __init__(self, va_addr:int, size:int, desc=None, ibo=None): self.va_addr, self.size, self.desc, self.ibo = va_addr, size, desc, ibo
class QCOMAllocator(HCQAllocator):
def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
if options.image is not None:
@ -291,18 +291,14 @@ class QCOMAllocator(HCQAllocator):
texture = self.device._gpu_alloc(pitch * round_up(imgh, 16), kgsl.KGSL_MEMTYPE_TEXTURE, map_to_cpu=True)
# Extend HCQBuffer with texture-related info.
texture.samplers, texture.descriptor, texture.ibo = [0] * 4, [0] * 16, [0] * 16
# Compiled sampler (always the same in tinygrad).
texture.samplers[0] = qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=adreno.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode)
texture.samplers[1] = qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True)
texture.desc, texture.ibo = [0] * 16, [0] * 16
tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT
texture.descriptor[0] = qreg.a6xx_tex_const_0(swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt)
texture.descriptor[1] = qreg.a6xx_tex_const_1(width=imgw, height=imgh)
texture.descriptor[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6)
texture.descriptor[4:7] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000)]
texture.ibo = [texture.descriptor[0] & (~0xffff), *texture.descriptor[1:len(texture.descriptor)]]
texture.desc[0] = qreg.a6xx_tex_const_0(swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt)
texture.desc[1] = qreg.a6xx_tex_const_1(width=imgw, height=imgh)
texture.desc[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6)
texture.desc[4:7] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000)]
texture.ibo = [texture.desc[0] & (~0xffff), *texture.desc[1:len(texture.desc)]]
return texture