fuzz qcom vs opencl (#7130)

* fuzz qcom vs opencl

* fix nv

* better?

* typo

* open both devs
This commit is contained in:
nimlgen 2024-10-17 18:49:08 +03:00 committed by GitHub
parent 188eef959d
commit 45db7d9045
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 126 additions and 47 deletions

View File

@ -177,7 +177,7 @@ def _dump_gpfifo(mark):
launches = []
# print("_dump_gpfifo:", mark)
for start,size in gpus_fifo:
for start, size in gpus_fifo:
gpfifo_controls = nv_gpu.AmpereAControlGPFifo.from_address(start+size*8)
gpfifo = to_mv(start, size * 8).cast("Q")
while old_gpputs[start] != gpfifo_controls.GPPut:
@ -235,32 +235,40 @@ def _dump_qmd(address, packets):
def before_launch(): _dump_gpfifo("before launch")
def collect_last_launch_state(): return _dump_gpfifo("after launch")
def compare_launch_state(states1, states2):
states1 = states1 or list()
states2 = states2 or list()
if len(states1) != 1 or len(states2) != 1:
return False, f"Some states not captured. {len(states1)}!=1 || {len(states2)}!=1"
def compare_launch_state(states, good_states):
states = states or list()
good_states = good_states or list()
if len(states) != 1 or len(good_states) != 1:
return False, f"Some states not captured. {len(states)}!=1 || {len(good_states)}!=1"
for i in range(len(states1)):
state1, state2 = states1[i], states2[i]
for i in range(len(states)):
state, good_state = states[i], good_states[i]
for n in ['qmd_major_version', 'invalidate_shader_data_cache', 'invalidate_shader_data_cache',
'sm_global_caching_enable', 'invalidate_texture_header_cache', 'invalidate_texture_sampler_cache',
'barrier_count', 'sampler_index', 'api_visible_call_limit', 'cwd_membar_type', 'sass_version',
'min_sm_config_shared_mem_size', 'max_sm_config_shared_mem_size', 'register_count_v',
'target_sm_config_shared_mem_size', 'shared_memory_size']:
if getattr(state1, n) != getattr(state1, n):
return False, f"Field {n} mismatch: {getattr(state1, n)} vs {getattr(state2, n)}"
'max_sm_config_shared_mem_size', 'register_count_v']:
if getattr(state, n) != getattr(good_state, n):
return False, f"Field {n} mismatch: {getattr(state, n)} vs {getattr(good_state, n)}"
# Allow NV to allocate more, at least this is not exact problem, so ignore it here.
# Hmm, CUDA minimum is 0x640, is this hw-required minimum (will check)?
if state1.shader_local_memory_high_size < state2.shader_local_memory_high_size and state2.shader_local_memory_high_size > 0x640:
return False, f"Field shader_local_memory_high_size mismatch: {state1.shader_local_memory_high_size} vs {state2.shader_local_memory_high_size}"
if state.shader_local_memory_high_size < good_state.shader_local_memory_high_size and good_state.shader_local_memory_high_size > 0x640:
return False, f"Field shader_local_memory_high_size mismatch: {state.shader_local_memory_high_size}vs{good_state.shader_local_memory_high_size}"
# TODO: Can't request more, since it might not be optimal, but need to investigate their formula for this.. #7133
if state.min_sm_config_shared_mem_size > good_state.min_sm_config_shared_mem_size and good_state.min_sm_config_shared_mem_size > 5:
return (False,
f"Field min_sm_config_shared_mem_size mismatch: {state.min_sm_config_shared_mem_size}vs{good_state.min_sm_config_shared_mem_size}")
if state.target_sm_config_shared_mem_size > good_state.target_sm_config_shared_mem_size and good_state.target_sm_config_shared_mem_size > 5:
return (False,
f"Field target_sm_config_shared_mem_size mismatch: {state.target_sm_config_shared_mem_size}vs{good_state.target_sm_config_shared_mem_size}")
for i in range(8):
if i in {1, 7}: continue # shaders don't use that. what's cuda put here?
n = f"constant_buffer_valid_{i}"
if getattr(state1, n) != getattr(state1, n):
return False, f"Field {n} mismatch: {getattr(state1, n)} vs {getattr(state2, n)}"
if getattr(state, n) != getattr(good_state, n):
return False, f"Field {n} mismatch: {getattr(state, n)} vs {getattr(good_state, n)}"
return True, "PASS"

View File

@ -19,6 +19,8 @@ for child in xml.getroot():
#print(ops)
#exit(0)
CAPTURED_STATE = {}
REGS = {}
for k, v in adreno.__dict__.items():
if k.startswith("REG_") and isinstance(v, int) and v > 1024: REGS[v] = k
@ -71,6 +73,8 @@ SB6_CS_TEX = 5
SB6_CS_SHADER = 13
def parse_cmd_buf(dat):
global CAPTURED_STATE
ptr = 0
while ptr < len(dat):
cmd = struct.unpack("I", dat[ptr:ptr+4])[0]
@ -78,14 +82,14 @@ def parse_cmd_buf(dat):
# packet with opcode and opcode specific payload (replace pkt3 starting with a5xx)
opcode, size = ((cmd>>16)&0x7F), cmd&0x3FFF
vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
print(f"{ptr:3X} -- typ 7: {size=:3d}, {opcode=:#x} {ops[opcode]}", hprint(vals))
if IOCTL > 0: print(f"{ptr:3X} -- typ 7: {size=:3d}, {opcode=:#x} {ops[opcode]}", hprint(vals))
if ops[opcode] == "CP_LOAD_STATE6_FRAG": # for compute shaders CP_LOAD_STATE6_FRAG is used
dst_off = vals[0] & 0x3FFF
state_type = (vals[0]>>14) & 0x3
state_src = (vals[0]>>16) & 0x3
state_block = (vals[0]>>18) & 0xF # 13 = SB4_CS_SHADER
num_unit = vals[0]>>22
print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}")
if IOCTL > 0: print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}")
if state_block == SB6_CS_SHADER and IOCTL > 2:
from extra.disassemblers.adreno import disasm_raw
@ -93,48 +97,59 @@ def parse_cmd_buf(dat):
if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4)))
if state_type == ST6_IBO:
ibos_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4)
print('texture ibos')
hexdump(ibos_bytes)
CAPTURED_STATE['ibos'] = ibos_bytes[:]
if IOCTL > 0:
print('texture ibos')
hexdump(ibos_bytes)
elif state_block == SB6_CS_TEX and IOCTL > 2:
if state_type == ST6_SHADER:
samplers_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 4 * 4)
print('texture samplers')
hexdump(samplers_bytes)
CAPTURED_STATE['samplers'] = ibos_bytes[:]
if IOCTL > 0:
print('texture samplers')
hexdump(samplers_bytes)
if state_type == ST6_CONSTANTS:
descriptors_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4)
print('texture descriptors')
hexdump(descriptors_bytes)
CAPTURED_STATE['descriptors'] = ibos_bytes[:]
if IOCTL > 0:
print('texture descriptors')
hexdump(descriptors_bytes)
elif ops[opcode] == "CP_REG_TO_MEM":
reg, cnt, b64, accum = vals[0] & 0x3FFFF, (vals[0] >> 18) & 0xFFF, (vals[0] >> 30) & 0x1, (vals[0] >> 31) & 0x1
dest = vals[1] | (vals[2] << 32)
print(f"{reg=} {cnt=} {b64=} {accum=} {dest=:#x}")
if IOCTL > 0: print(f"{reg=} {cnt=} {b64=} {accum=} {dest=:#x}")
ptr += 4*size
elif (cmd>>28) == 0x4:
# write one or more registers (replace pkt0 starting with a5xx)
offset, size = ((cmd>>8)&0x7FFFF), cmd&0x7F
reg_name = REGS.get(offset, f"reg {offset=:#x}")
vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
print(f"{ptr:3X} -- typ 4: {size=:3d}, {reg_name}", hprint(vals))
if IOCTL > 0: print(f"{ptr:3X} -- typ 4: {size=:3d}, {reg_name}", hprint(vals))
for vi,v in enumerate(vals): CAPTURED_STATE[offset+vi] = v
if offset == adreno.REG_A6XX_SP_CS_CONFIG:
val = vals[0]
print(f"\tBINDLESS_TEX={(val >> 0) & 0b1}")
print(f"\tBINDLESS_SAMP={(val >> 1) & 0b1}")
print(f"\tBINDLESS_IBO={(val >> 2) & 0b1}")
print(f"\tBINDLESS_UBO={(val >> 3) & 0b1}")
print(f"\tEN={(val >> 8) & 0b1}")
print(f"\tNTEX={(val >> 9) & 0b11111111}")
print(f"\tNSAMP={(val >> 17) & 0b11111}")
print(f"\tNIBO={(val >> 22) & 0b1111111}")
if IOCTL > 0:
print(f"\tBINDLESS_TEX={(val >> 0) & 0b1}")
print(f"\tBINDLESS_SAMP={(val >> 1) & 0b1}")
print(f"\tBINDLESS_IBO={(val >> 2) & 0b1}")
print(f"\tBINDLESS_UBO={(val >> 3) & 0b1}")
print(f"\tEN={(val >> 8) & 0b1}")
print(f"\tNTEX={(val >> 9) & 0b11111111}")
print(f"\tNSAMP={(val >> 17) & 0b11111}")
print(f"\tNIBO={(val >> 22) & 0b1111111}")
if offset == 0xa9b0:
print(f'THREADSIZE-{(vals[0] >> 20)&0x1}\nEARLYPREAMBLE-{(vals[0] >> 23) & 0x1}\nMERGEDREGS-{(vals[0] >> 3) & 0x1}\nTHREADMODE-{vals[0] & 0x1}\nHALFREGFOOTPRINT-{(vals[0] >> 1) & 0x3f}\nFULLREGFOOTPRINT-{(vals[0] >> 7) & 0x3f}\nBRANCHSTACK-{(vals[0] >> 14) & 0x3f}\n')
print(f'SP_CS_UNKNOWN_A9B1-{vals[1]}\nSP_CS_BRANCH_COND-{vals[2]}\nSP_CS_OBJ_FIRST_EXEC_OFFSET-{vals[3]}\nSP_CS_OBJ_START-{vals[4] | (vals[5] << 32)}\nSP_CS_PVT_MEM_PARAM-{vals[6]}\nSP_CS_PVT_MEM_ADDR-{vals[7] | (vals[8] << 32)}\nSP_CS_PVT_MEM_SIZE-{vals[9]}')
if IOCTL > 0:
print(f'THREADSIZE-{(vals[0] >> 20)&0x1}\nEARLYPREAMBLE-{(vals[0] >> 23) & 0x1}\nMERGEDREGS-{(vals[0] >> 3) & 0x1}\nTHREADMODE-{vals[0] & 0x1}\nHALFREGFOOTPRINT-{(vals[0] >> 1) & 0x3f}\nFULLREGFOOTPRINT-{(vals[0] >> 7) & 0x3f}\nBRANCHSTACK-{(vals[0] >> 14) & 0x3f}\n')
print(f'SP_CS_UNKNOWN_A9B1-{vals[1]}\nSP_CS_BRANCH_COND-{vals[2]}\nSP_CS_OBJ_FIRST_EXEC_OFFSET-{vals[3]}\nSP_CS_OBJ_START-{vals[4] | (vals[5] << 32)}\nSP_CS_PVT_MEM_PARAM-{vals[6]}\nSP_CS_PVT_MEM_ADDR-{vals[7] | (vals[8] << 32)}\nSP_CS_PVT_MEM_SIZE-{vals[9]}')
if offset == 0xb180:
print('border color offset', hex(vals[1] << 32 | vals[0]))
hexdump(get_mem(vals[1] << 32 | vals[0], 0x200))
if IOCTL > 0:
print('border color offset', hex(vals[1] << 32 | vals[0]))
hexdump(get_mem(vals[1] << 32 | vals[0], 0x200))
ptr += 4*size
else:
print("unk", hex(cmd))
if IOCTL > 0:
print("unk", hex(cmd))
ptr += 4
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
@ -145,19 +160,19 @@ def ioctl(fd, request, argp):
if nr in nrs and itype == 9:
name, stype = nrs[nr]
s = get_struct(argp, stype)
print(f"{ret:2d} = {name:40s}", ' '.join(format_struct(s)))
if IOCTL > 0: print(f"{ret:2d} = {name:40s}", ' '.join(format_struct(s)))
if name == "IOCTL_KGSL_GPUOBJ_INFO": pass
# mmaped[s.gpuaddr] = mmap.mmap(fd, s.size, offset=s.id*0x1000)
if name == "IOCTL_KGSL_GPU_COMMAND":
for i in range(s.numcmds):
cmd = get_struct(s.cmdlist+ctypes.sizeof(msm_kgsl.struct_kgsl_command_object)*i, msm_kgsl.struct_kgsl_command_object)
print(f"cmd {i}:", format_struct(cmd))
#hexdump(get_mem(cmd.gpuaddr, cmd.size))
if IOCTL > 1: parse_cmd_buf(get_mem(cmd.gpuaddr, cmd.size))
if IOCTL > 0: print(f"cmd {i}:", format_struct(cmd))
parse_cmd_buf(get_mem(cmd.gpuaddr, cmd.size))
for i in range(s.numobjs):
obj = get_struct(s.objlist+s.objsize*i, msm_kgsl.struct_kgsl_command_object)
print(f"obj {i}:", format_struct(obj))
print(format_struct(msm_kgsl.struct_kgsl_cmdbatch_profiling_buffer.from_buffer_copy(get_mem(obj.gpuaddr, obj.size))))
if IOCTL > 0:
print(f"obj {i}:", format_struct(obj))
print(format_struct(msm_kgsl.struct_kgsl_cmdbatch_profiling_buffer.from_buffer_copy(get_mem(obj.gpuaddr, obj.size))))
#hexdump(get_mem(obj.gpuaddr, obj.size))
else:
#print(f"ioctl({fd=}, (dir:{idir}, size:0x{size:3X}, type:{itype:d}, nr:0x{nr:2X}), {argp=:X}) = {ret=}")
@ -181,3 +196,37 @@ def install_hook(c_function, python_function):
libc = ctypes.CDLL(ctypes.util.find_library("libc"))
install_hook(libc.ioctl, ioctl)
def before_launch():
  """Empty the module-level CAPTURED_STATE dict in place."""
  # Only mutation happens here (no rebinding), so no `global` declaration is needed.
  CAPTURED_STATE.clear()
def collect_last_launch_state():
  """Return a snapshot of everything currently stored in CAPTURED_STATE.

  A shallow copy is returned instead of the live dict: `before_launch()` clears
  CAPTURED_STATE in place, so handing out the dict itself would make every
  previously collected state alias the same object, and two collected states
  would always compare equal.
  """
  return dict(CAPTURED_STATE)
def compare_launch_state(state, good_state):
  """Compare a captured launch state against a reference launch state.

  Args:
    state: dict under test; looked up by adreno register offset, plus the
      optional 'ibos' / 'samplers' / 'descriptors' entries.
    good_state: reference dict with the same layout.

  Returns:
    (True, "PASS") when every checked field agrees, otherwise (False, reason).
  """
  # Tolerate a missing capture (None) the same way an empty capture is handled.
  state, good_state = state or {}, good_state or {}

  # (register, mask) pairs: only the masked bits of each register are compared.
  checks = [
    (adreno.REG_A6XX_SP_CS_CONFIG, 0xffffffff),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT__MASK),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT__MASK),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_BRANCHSTACK__MASK),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_THREADMODE__MASK),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_EARLYPREAMBLE),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_MERGEDREGS),
    (adreno.REG_A6XX_SP_CS_UNKNOWN_A9B1, adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK5),
    (adreno.REG_A6XX_SP_CS_UNKNOWN_A9B1, adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK6),
    (adreno.REG_A6XX_SP_CS_BRANCH_COND, 0xffffffff),
  ]
  for reg, mask in checks:
    got, want = state.get(reg, 0) & mask, good_state.get(reg, 0) & mask
    if got != want:
      # Report the mask that was applied, not the register offset.
      return False, f"Field {REGS[reg]}, mask: {mask:X} mismatch: {got} vs {want}"

  # Blob entries are only checked when the reference state captured them.
  for name in ('ibos', 'samplers', 'descriptors'):
    if name not in good_state: continue
    got, want = state.get(name), good_state.get(name)
    # A blob present in the reference but absent here is a mismatch, not a crash.
    if got is None: return False, f"{name}: missing"
    if len(got) != len(want): return False, f"{name}: len mismatch"
    if any(a != b for a, b in zip(got, want)): return False, f"{name}: content mismatch"
  return True, "PASS"

View File

@ -1,9 +1,23 @@
import random, traceback, ctypes, argparse
import random, traceback, ctypes, argparse, os
from typing import List, Tuple, DefaultDict, Any
import numpy as np
from collections import defaultdict
from extra.optimization.helpers import load_worlds, ast_str_to_lin, kern_str_to_lin
# The ioctl hooks must be installed before any device is opened, so the hook
# modules are imported (and both devices of each pair opened) up front.
# os.getenv returns a string when the variable is set, so it is parsed as an int
# here; comparing the raw string against 0 would treat VALIDATE_HCQ=0 as enabled.
if int(os.getenv("VALIDATE_HCQ", "0")) != 0:
  # Best effort: a backend that is unavailable on this machine is skipped silently.
  try:
    import extra.nv_gpu_driver.nv_ioctl
    from tinygrad import Device
    _, _ = Device["NV"], Device["CUDA"]
  except Exception: pass

  try:
    import extra.qcom_gpu_driver.opencl_ioctl
    from tinygrad import Device
    _, _ = Device["QCOM"], Device["GPU"]
  except Exception: pass
from tinygrad import Tensor, Device, dtypes
from tinygrad.tensor import _to_np_dtype
from tinygrad.codegen.kernel import Kernel
@ -26,6 +40,13 @@ if getenv("VALIDATE_HCQ"):
on_linearizer_will_run = extra.nv_gpu_driver.nv_ioctl.before_launch
on_linearizer_did_run = extra.nv_gpu_driver.nv_ioctl.collect_last_launch_state
compare_states = extra.nv_gpu_driver.nv_ioctl.compare_launch_state
elif Device.DEFAULT == "QCOM":
print("VALIDATE_HCQ: Comparing QCOM to GPU")
import extra.qcom_gpu_driver.opencl_ioctl
validate_device = Device["GPU"]
on_linearizer_will_run = extra.qcom_gpu_driver.opencl_ioctl.before_launch
on_linearizer_did_run = extra.qcom_gpu_driver.opencl_ioctl.collect_last_launch_state
compare_states = extra.qcom_gpu_driver.opencl_ioctl.compare_launch_state
else:
print(colored("VALIDATE_HCQ options is ignored", 'red'))
@ -282,7 +303,8 @@ if __name__ == "__main__":
try:
for i, ast in enumerate(ast_strs[:getenv("FUZZ_N", len(ast_strs))]):
if (nth := getenv("FUZZ_NTH", -1)) != -1 and i != nth: continue
if "dtypes.image" in ast and Device.DEFAULT != "GPU": continue # IMAGE is only for GPU
if getenv("FUZZ_IMAGEONLY") and "dtypes.image" not in ast: continue
if "dtypes.image" in ast and Device.DEFAULT not in {"GPU", "QCOM"}: continue # IMAGE is only for GPU
if ast in seen_ast_strs: continue
seen_ast_strs.add(ast)