qcom match texture/sampler descriptors to OpenCL (#7622)

* qcom ioctl compare more regs

* bug fix
This commit is contained in:
nimlgen 2024-11-11 21:56:51 +03:00 committed by Comma Device
parent 45db7d9045
commit 408c0a5e7f
4 changed files with 138 additions and 23 deletions

View File

@ -1,6 +1,7 @@
# type: ignore # type: ignore
import ctypes, ctypes.util, struct, fcntl, re import ctypes, ctypes.util, struct, fcntl, re
from hexdump import hexdump from hexdump import hexdump
from copy import deepcopy
import pathlib, sys import pathlib, sys
from tinygrad.helpers import to_mv, getenv from tinygrad.helpers import to_mv, getenv
from tinygrad.runtime.autogen import adreno from tinygrad.runtime.autogen import adreno
@ -91,27 +92,36 @@ def parse_cmd_buf(dat):
num_unit = vals[0]>>22 num_unit = vals[0]>>22
if IOCTL > 0: print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}") if IOCTL > 0: print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}")
if state_block == SB6_CS_SHADER and IOCTL > 2: if "LOAD_FRAGS" not in CAPTURED_STATE: CAPTURED_STATE['LOAD_FRAGS'] = []
CAPTURED_STATE['LOAD_FRAGS'].append((state_block, state_type, num_unit, dst_off))
if state_block == SB6_CS_SHADER:
from extra.disassemblers.adreno import disasm_raw from extra.disassemblers.adreno import disasm_raw
if state_type == ST6_SHADER: disasm_raw(get_mem(((vals[2] << 32) | vals[1]), num_unit * 128)) if state_type == ST6_SHADER and IOCTL > 2:
if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4))) disasm_raw(get_mem(((vals[2] << 32) | vals[1]), num_unit * 128))
if state_type == ST6_CONSTANTS:
x = get_mem(((vals[2] << 32) | vals[1]), num_unit*4)
CAPTURED_STATE['constants'] = x[:]
if IOCTL > 2:
print('constants')
hexdump(x)
if state_type == ST6_IBO: if state_type == ST6_IBO:
ibos_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4) ibos_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4)
CAPTURED_STATE['ibos'] = ibos_bytes[:] CAPTURED_STATE['ibos'] = ibos_bytes[:]
if IOCTL > 0: if IOCTL > 1:
print('texture ibos') print('texture ibos')
hexdump(ibos_bytes) hexdump(ibos_bytes)
elif state_block == SB6_CS_TEX and IOCTL > 2: elif state_block == SB6_CS_TEX:
if state_type == ST6_SHADER: if state_type == ST6_SHADER:
samplers_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 4 * 4) samplers_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 4 * 4)
CAPTURED_STATE['samplers'] = ibos_bytes[:] CAPTURED_STATE['samplers'] = samplers_bytes[:]
if IOCTL > 0: if IOCTL > 1:
print('texture samplers') print('texture samplers')
hexdump(samplers_bytes) hexdump(samplers_bytes)
if state_type == ST6_CONSTANTS: if state_type == ST6_CONSTANTS:
descriptors_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4) descriptors_bytes = get_mem((vals[2] << 32) | vals[1], 1600)
CAPTURED_STATE['descriptors'] = ibos_bytes[:] CAPTURED_STATE['descriptors'] = descriptors_bytes[:]
if IOCTL > 0: if IOCTL > 1:
print('texture descriptors') print('texture descriptors')
hexdump(descriptors_bytes) hexdump(descriptors_bytes)
@ -200,10 +210,19 @@ install_hook(libc.ioctl, ioctl)
def before_launch(): def before_launch():
global CAPTURED_STATE global CAPTURED_STATE
CAPTURED_STATE.clear() CAPTURED_STATE.clear()
def collect_last_launch_state(): return CAPTURED_STATE def collect_last_launch_state():
global CAPTURED_STATE
return deepcopy(CAPTURED_STATE)
def compare_launch_state(state, good_state): def compare_launch_state(state, good_state):
cmp = [ cmp = [
(adreno.REG_A6XX_SP_CS_CONFIG, 0xffffffff), (adreno.REG_A6XX_SP_CS_CONFIG, adreno.A6XX_SP_CS_CONFIG_NTEX__MASK),
(adreno.REG_A6XX_SP_CS_CONFIG, adreno.A6XX_SP_CS_CONFIG_NSAMP__MASK),
(adreno.REG_A6XX_SP_CS_CONFIG, adreno.A6XX_SP_CS_CONFIG_NIBO__MASK),
(adreno.REG_A6XX_SP_CS_CONFIG, adreno.A6XX_SP_CS_CONFIG_ENABLED),
(adreno.REG_A6XX_SP_CS_CONFIG, adreno.A6XX_SP_CS_CONFIG_BINDLESS_TEX),
(adreno.REG_A6XX_SP_CS_CONFIG, adreno.A6XX_SP_CS_CONFIG_BINDLESS_SAMP),
(adreno.REG_A6XX_SP_CS_CONFIG, adreno.A6XX_SP_CS_CONFIG_BINDLESS_IBO),
(adreno.REG_A6XX_SP_CS_CONFIG, adreno.A6XX_SP_CS_CONFIG_BINDLESS_UBO),
(adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT__MASK), (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT__MASK),
(adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT__MASK), (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT__MASK),
@ -213,20 +232,53 @@ def compare_launch_state(state, good_state):
(adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_EARLYPREAMBLE), (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_EARLYPREAMBLE),
(adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_MERGEDREGS), (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_MERGEDREGS),
(adreno.REG_A6XX_SP_CS_PVT_MEM_PARAM, adreno.A6XX_SP_CS_PVT_MEM_PARAM_MEMSIZEPERITEM__MASK),
(adreno.REG_A6XX_SP_CS_PVT_MEM_PARAM, adreno.A6XX_SP_CS_PVT_MEM_PARAM_HWSTACKSIZEPERTHREAD__MASK),
(adreno.REG_A6XX_SP_CS_UNKNOWN_A9B1, adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK5), (adreno.REG_A6XX_SP_CS_UNKNOWN_A9B1, adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK5),
(adreno.REG_A6XX_SP_CS_UNKNOWN_A9B1, adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK6), (adreno.REG_A6XX_SP_CS_UNKNOWN_A9B1, adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK6),
(adreno.REG_A6XX_SP_CS_BRANCH_COND, 0xffffffff), (adreno.REG_A6XX_SP_CS_BRANCH_COND, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_NDRANGE_0, adreno.A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM__MASK),
(adreno.REG_A6XX_HLSQ_CS_NDRANGE_0, adreno.A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX__MASK),
(adreno.REG_A6XX_HLSQ_CS_NDRANGE_0, adreno.A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY__MASK),
(adreno.REG_A6XX_HLSQ_CS_NDRANGE_0, adreno.A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ__MASK),
(adreno.REG_A6XX_HLSQ_CS_NDRANGE_1, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_NDRANGE_2, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_NDRANGE_3, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_NDRANGE_4, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_NDRANGE_5, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_NDRANGE_6, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_CNTL_0, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_CNTL_1, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_KERNEL_GROUP_X, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_KERNEL_GROUP_Y, 0xffffffff),
(adreno.REG_A6XX_HLSQ_CS_KERNEL_GROUP_Z, 0xffffffff),
] ]
for x,m in cmp: for x,m in cmp:
print(f"Field {REGS[x]}, mask: 0x{m:X} cmp: {state.get(x, 0) & m} vs {good_state.get(x, 0) & m}")
if state.get(x, 0) & m != good_state.get(x, 0) & m: if state.get(x, 0) & m != good_state.get(x, 0) & m:
return False, f"Field {REGS[x]}, mask: {x:X} mismatch: {state.get(x, 0) & m} vs {good_state.get(x, 0) & m}" return False, f"Field {REGS[x]}, mask: 0x{m:X} mismatch: {state.get(x, 0) & m} vs {good_state.get(x, 0) & m}"
for n in ['ibos', 'samplers', 'descriptors']: for n in ['descriptors', 'ibos']:
if n not in good_state: continue if n not in good_state: continue
mv1, mv2 = state.get(n), good_state.get(n) mv1, mv2 = state.get(n), good_state.get(n)
if len(mv1) != len(mv2): return False, f"{n}: len mismatch"
if len(mv1) != len(mv2): return False, f"{n}: len mismatch {len(mv1)} != {len(mv2)}"
mv1 = memoryview(bytearray(mv1)).cast('I')
mv2 = memoryview(bytearray(mv2)).cast('I')
for i in range(len(mv2)):
if i % 8 == 5 or i % 8 == 4: continue # addresses
if mv1[i]!=mv2[i]: return False, f"{n}: content mismatch {i} {mv1[i]} {mv2[i]}"
for n in ['samplers']:
if n not in good_state: continue
mv1, mv2 = state.get(n), good_state.get(n)
if len(mv1) != len(mv2): return False, f"{n}: len mismatch {len(mv1)} != {len(mv2)}"
if any(mv1[i]!=mv2[i] for i in range(len(mv1))): return False, f"{n}: content mismatch" if any(mv1[i]!=mv2[i] for i in range(len(mv1))): return False, f"{n}: content mismatch"
return True, "PASS" return True, "PASS"

File diff suppressed because one or more lines are too long

View File

@ -30,7 +30,7 @@ from test.helpers import is_dtype_supported
def on_linearizer_will_run(): pass def on_linearizer_will_run(): pass
def on_linearizer_did_run(): pass def on_linearizer_did_run(): pass
def compare_states(x, y): return True def compare_states(x, y): return (True, "")
if getenv("VALIDATE_HCQ"): if getenv("VALIDATE_HCQ"):
if Device.DEFAULT == "NV": if Device.DEFAULT == "NV":

View File

@ -151,7 +151,7 @@ class QCOMComputeQueue(HWComputeQueue):
if args_state.prg.tex_cnt > 0: if args_state.prg.tex_cnt > 0:
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT, self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.tex_cnt), state_block=adreno.SB6_CS_TEX, num_unit=min(16, args_state.prg.tex_cnt)),
*data64_le(args_state.ptr + args_state.prg.tex_off)) *data64_le(args_state.ptr + args_state.prg.tex_off))
self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.ptr + args_state.prg.tex_off)) self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.ptr + args_state.prg.tex_off))
@ -244,14 +244,15 @@ class QCOMProgram(HCQProgram):
self.buf_info, self.consts_info = [], [] self.buf_info, self.consts_info = [], []
# Collect sampler info. # Collect sampler info.
self.samp_cnt = _read_lib(image_desc_off + 0xdc) self.samp_cnt = samp_cnt_in_file = _read_lib(image_desc_off + 0xdc)
assert self.samp_cnt <= 1, "Up to one sampler supported" assert self.samp_cnt <= 1, "Up to one sampler supported"
if self.samp_cnt: if self.samp_cnt:
self.samp_cnt += 1
self.samplers = [qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=adreno.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode), self.samplers = [qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=adreno.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode),
qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True), 0, 0] qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True), 0, 0, 0, 0, 0, 0]
# Collect kernel arguments (buffers) info. # Collect kernel arguments (buffers) info.
bdoff = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * self.samp_cnt bdoff = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * samp_cnt_in_file
while bdoff + 32 <= len(self.lib): while bdoff + 32 <= len(self.lib):
length, _, _, offset_words, _, _, _, typ = struct.unpack("IIIIIIII", self.lib[bdoff:bdoff+32]) length, _, _, offset_words, _, _, _, typ = struct.unpack("IIIIIIII", self.lib[bdoff:bdoff+32])
if length == 0: break if length == 0: break
@ -260,7 +261,7 @@ class QCOMProgram(HCQProgram):
# Setting correct offsets to textures/ibos. # Setting correct offsets to textures/ibos.
self.tex_cnt, self.ibo_cnt = sum(x.type is BUFTYPE_TEX for x in self.buf_info), sum(x.type is BUFTYPE_IBO for x in self.buf_info) self.tex_cnt, self.ibo_cnt = sum(x.type is BUFTYPE_TEX for x in self.buf_info), sum(x.type is BUFTYPE_IBO for x in self.buf_info)
self.samp_off, self.ibo_off, self.tex_off = 2048, 2048 + 0x10 * self.samp_cnt, 2048 + 0x10 * self.samp_cnt + 0x40 * self.ibo_cnt self.ibo_off, self.tex_off, self.samp_off = 2048, 2048 + 0x40 * self.ibo_cnt, 2048 + 0x40 * self.tex_cnt + 0x40 * self.ibo_cnt
cur_ibo_off, cur_tex_off = self.ibo_off, self.tex_off cur_ibo_off, cur_tex_off = self.ibo_off, self.tex_off
for x in self.buf_info: for x in self.buf_info:
if x.type is BUFTYPE_IBO: x.offset, cur_ibo_off = cur_ibo_off, cur_ibo_off + 0x40 if x.type is BUFTYPE_IBO: x.offset, cur_ibo_off = cur_ibo_off, cur_ibo_off + 0x40
@ -304,10 +305,10 @@ class QCOMAllocator(HCQAllocator):
texture.pitch, texture.real_stride = pitch, real_stride texture.pitch, texture.real_stride = pitch, real_stride
tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT
texture.desc[0] = qreg.a6xx_tex_const_0(swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt) texture.desc[0] = qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt)
texture.desc[1] = qreg.a6xx_tex_const_1(width=imgw, height=imgh) texture.desc[1] = qreg.a6xx_tex_const_1(width=imgw, height=imgh)
texture.desc[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=texture.pitch, pitchalign=pitchalign-6) texture.desc[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=texture.pitch, pitchalign=pitchalign-6)
texture.desc[4:7] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000)] texture.desc[4:8] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]
texture.ibo = [texture.desc[0] & (~0xffff), *texture.desc[1:len(texture.desc)]] texture.ibo = [texture.desc[0] & (~0xffff), *texture.desc[1:len(texture.desc)]]
return texture return texture