mirror of https://github.com/commaai/tinygrad.git
fuzz qcom vs opencl (#7130)
* fuzz qcom vs opencl * fix nv * bettre? * typo * open both devs
This commit is contained in:
parent
188eef959d
commit
45db7d9045
|
@ -177,7 +177,7 @@ def _dump_gpfifo(mark):
|
|||
launches = []
|
||||
|
||||
# print("_dump_gpfifo:", mark)
|
||||
for start,size in gpus_fifo:
|
||||
for start, size in gpus_fifo:
|
||||
gpfifo_controls = nv_gpu.AmpereAControlGPFifo.from_address(start+size*8)
|
||||
gpfifo = to_mv(start, size * 8).cast("Q")
|
||||
while old_gpputs[start] != gpfifo_controls.GPPut:
|
||||
|
@ -235,32 +235,40 @@ def _dump_qmd(address, packets):
|
|||
def before_launch():
  """Pre-launch hook: run _dump_gpfifo with the "before launch" mark and discard its result."""
  _dump_gpfifo("before launch")
|
||||
def collect_last_launch_state():
  """Post-launch hook: return whatever _dump_gpfifo yields for the "after launch" mark."""
  captured = _dump_gpfifo("after launch")
  return captured
|
||||
|
||||
def compare_launch_state(states, good_states):
  """Check one captured launch state against a reference ("good") capture.

  Args:
    states: captured states of the launch under test (``None`` counts as empty).
    good_states: captured states of the reference launch (``None`` counts as empty).

  Returns:
    ``(True, "PASS")`` when every checked field is acceptable, otherwise
    ``(False, reason)`` describing the first failed check. Exactly one state
    must be present on each side, otherwise the comparison fails immediately.
  """
  states = states or []
  good_states = good_states or []
  if len(states) != 1 or len(good_states) != 1:
    return False, f"Some states not captured. {len(states)}!=1 || {len(good_states)}!=1"

  # Fields that have to be identical on both sides.
  exact_fields = ('qmd_major_version', 'invalidate_shader_data_cache',
                  'sm_global_caching_enable', 'invalidate_texture_header_cache', 'invalidate_texture_sampler_cache',
                  'barrier_count', 'sampler_index', 'api_visible_call_limit', 'cwd_membar_type', 'sass_version',
                  'max_sm_config_shared_mem_size', 'register_count_v')

  for state, good_state in zip(states, good_states):
    for n in exact_fields:
      # Compare the tested state against the reference one (never a state against itself).
      if getattr(state, n) != getattr(good_state, n):
        return False, f"Field {n} mismatch: {getattr(state, n)} vs {getattr(good_state, n)}"

    # Allow NV to allocate more, at least this is not the exact problem, so ignore it here.
    # Hmm, CUDA minimum is 0x640, is this the hw-required minimum (will check)?
    if state.shader_local_memory_high_size < good_state.shader_local_memory_high_size and good_state.shader_local_memory_high_size > 0x640:
      return False, f"Field shader_local_memory_high_size mismatch: {state.shader_local_memory_high_size}vs{good_state.shader_local_memory_high_size}"

    # TODO: Can't request more, since it might not be optimal, but need to investigate their formula for this.. #7133
    if state.min_sm_config_shared_mem_size > good_state.min_sm_config_shared_mem_size and good_state.min_sm_config_shared_mem_size > 5:
      return (False,
              f"Field min_sm_config_shared_mem_size mismatch: {state.min_sm_config_shared_mem_size}vs{good_state.min_sm_config_shared_mem_size}")
    if state.target_sm_config_shared_mem_size > good_state.target_sm_config_shared_mem_size and good_state.target_sm_config_shared_mem_size > 5:
      return (False,
              f"Field target_sm_config_shared_mem_size mismatch: {state.target_sm_config_shared_mem_size}vs{good_state.target_sm_config_shared_mem_size}")

    for i in range(8):
      if i in {1, 7}: continue # shaders don't use that. what's cuda put here?
      n = f"constant_buffer_valid_{i}"
      if getattr(state, n) != getattr(good_state, n):
        return False, f"Field {n} mismatch: {getattr(state, n)} vs {getattr(good_state, n)}"

  return True, "PASS"
|
||||
|
||||
|
|
|
@ -19,6 +19,8 @@ for child in xml.getroot():
|
|||
#print(ops)
|
||||
#exit(0)
|
||||
|
||||
CAPTURED_STATE = {}
|
||||
|
||||
REGS = {}
|
||||
for k, v in adreno.__dict__.items():
|
||||
if k.startswith("REG_") and isinstance(v, int) and v > 1024: REGS[v] = k
|
||||
|
@ -71,6 +73,8 @@ SB6_CS_TEX = 5
|
|||
SB6_CS_SHADER = 13
|
||||
|
||||
def parse_cmd_buf(dat):
|
||||
global CAPTURED_STATE
|
||||
|
||||
ptr = 0
|
||||
while ptr < len(dat):
|
||||
cmd = struct.unpack("I", dat[ptr:ptr+4])[0]
|
||||
|
@ -78,14 +82,14 @@ def parse_cmd_buf(dat):
|
|||
# packet with opcode and opcode specific payload (replace pkt3 starting with a5xx)
|
||||
opcode, size = ((cmd>>16)&0x7F), cmd&0x3FFF
|
||||
vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
|
||||
print(f"{ptr:3X} -- typ 7: {size=:3d}, {opcode=:#x} {ops[opcode]}", hprint(vals))
|
||||
if IOCTL > 0: print(f"{ptr:3X} -- typ 7: {size=:3d}, {opcode=:#x} {ops[opcode]}", hprint(vals))
|
||||
if ops[opcode] == "CP_LOAD_STATE6_FRAG": # for compute shaders CP_LOAD_STATE6_FRAG is used
|
||||
dst_off = vals[0] & 0x3FFF
|
||||
state_type = (vals[0]>>14) & 0x3
|
||||
state_src = (vals[0]>>16) & 0x3
|
||||
state_block = (vals[0]>>18) & 0xF # 13 = SB4_CS_SHADER
|
||||
num_unit = vals[0]>>22
|
||||
print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}")
|
||||
if IOCTL > 0: print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}")
|
||||
|
||||
if state_block == SB6_CS_SHADER and IOCTL > 2:
|
||||
from extra.disassemblers.adreno import disasm_raw
|
||||
|
@ -93,48 +97,59 @@ def parse_cmd_buf(dat):
|
|||
if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4)))
|
||||
if state_type == ST6_IBO:
|
||||
ibos_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4)
|
||||
print('texture ibos')
|
||||
hexdump(ibos_bytes)
|
||||
CAPTURED_STATE['ibos'] = ibos_bytes[:]
|
||||
if IOCTL > 0:
|
||||
print('texture ibos')
|
||||
hexdump(ibos_bytes)
|
||||
elif state_block == SB6_CS_TEX and IOCTL > 2:
|
||||
if state_type == ST6_SHADER:
|
||||
samplers_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 4 * 4)
|
||||
print('texture samplers')
|
||||
hexdump(samplers_bytes)
|
||||
CAPTURED_STATE['samplers'] = ibos_bytes[:]
|
||||
if IOCTL > 0:
|
||||
print('texture samplers')
|
||||
hexdump(samplers_bytes)
|
||||
if state_type == ST6_CONSTANTS:
|
||||
descriptors_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4)
|
||||
print('texture descriptors')
|
||||
hexdump(descriptors_bytes)
|
||||
CAPTURED_STATE['descriptors'] = ibos_bytes[:]
|
||||
if IOCTL > 0:
|
||||
print('texture descriptors')
|
||||
hexdump(descriptors_bytes)
|
||||
|
||||
elif ops[opcode] == "CP_REG_TO_MEM":
|
||||
reg, cnt, b64, accum = vals[0] & 0x3FFFF, (vals[0] >> 18) & 0xFFF, (vals[0] >> 30) & 0x1, (vals[0] >> 31) & 0x1
|
||||
dest = vals[1] | (vals[2] << 32)
|
||||
print(f"{reg=} {cnt=} {b64=} {accum=} {dest=:#x}")
|
||||
if IOCTL > 0: print(f"{reg=} {cnt=} {b64=} {accum=} {dest=:#x}")
|
||||
ptr += 4*size
|
||||
elif (cmd>>28) == 0x4:
|
||||
# write one or more registers (replace pkt0 starting with a5xx)
|
||||
offset, size = ((cmd>>8)&0x7FFFF), cmd&0x7F
|
||||
reg_name = REGS.get(offset, f"reg {offset=:#x}")
|
||||
vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
|
||||
print(f"{ptr:3X} -- typ 4: {size=:3d}, {reg_name}", hprint(vals))
|
||||
if IOCTL > 0: print(f"{ptr:3X} -- typ 4: {size=:3d}, {reg_name}", hprint(vals))
|
||||
for vi,v in enumerate(vals): CAPTURED_STATE[offset+vi] = v
|
||||
if offset == adreno.REG_A6XX_SP_CS_CONFIG:
|
||||
val = vals[0]
|
||||
print(f"\tBINDLESS_TEX={(val >> 0) & 0b1}")
|
||||
print(f"\tBINDLESS_SAMP={(val >> 1) & 0b1}")
|
||||
print(f"\tBINDLESS_IBO={(val >> 2) & 0b1}")
|
||||
print(f"\tBINDLESS_UBO={(val >> 3) & 0b1}")
|
||||
print(f"\tEN={(val >> 8) & 0b1}")
|
||||
print(f"\tNTEX={(val >> 9) & 0b11111111}")
|
||||
print(f"\tNSAMP={(val >> 17) & 0b11111}")
|
||||
print(f"\tNIBO={(val >> 22) & 0b1111111}")
|
||||
if IOCTL > 0:
|
||||
print(f"\tBINDLESS_TEX={(val >> 0) & 0b1}")
|
||||
print(f"\tBINDLESS_SAMP={(val >> 1) & 0b1}")
|
||||
print(f"\tBINDLESS_IBO={(val >> 2) & 0b1}")
|
||||
print(f"\tBINDLESS_UBO={(val >> 3) & 0b1}")
|
||||
print(f"\tEN={(val >> 8) & 0b1}")
|
||||
print(f"\tNTEX={(val >> 9) & 0b11111111}")
|
||||
print(f"\tNSAMP={(val >> 17) & 0b11111}")
|
||||
print(f"\tNIBO={(val >> 22) & 0b1111111}")
|
||||
if offset == 0xa9b0:
|
||||
print(f'THREADSIZE-{(vals[0] >> 20)&0x1}\nEARLYPREAMBLE-{(vals[0] >> 23) & 0x1}\nMERGEDREGS-{(vals[0] >> 3) & 0x1}\nTHREADMODE-{vals[0] & 0x1}\nHALFREGFOOTPRINT-{(vals[0] >> 1) & 0x3f}\nFULLREGFOOTPRINT-{(vals[0] >> 7) & 0x3f}\nBRANCHSTACK-{(vals[0] >> 14) & 0x3f}\n')
|
||||
print(f'SP_CS_UNKNOWN_A9B1-{vals[1]}\nSP_CS_BRANCH_COND-{vals[2]}\nSP_CS_OBJ_FIRST_EXEC_OFFSET-{vals[3]}\nSP_CS_OBJ_START-{vals[4] | (vals[5] << 32)}\nSP_CS_PVT_MEM_PARAM-{vals[6]}\nSP_CS_PVT_MEM_ADDR-{vals[7] | (vals[8] << 32)}\nSP_CS_PVT_MEM_SIZE-{vals[9]}')
|
||||
if IOCTL > 0:
|
||||
print(f'THREADSIZE-{(vals[0] >> 20)&0x1}\nEARLYPREAMBLE-{(vals[0] >> 23) & 0x1}\nMERGEDREGS-{(vals[0] >> 3) & 0x1}\nTHREADMODE-{vals[0] & 0x1}\nHALFREGFOOTPRINT-{(vals[0] >> 1) & 0x3f}\nFULLREGFOOTPRINT-{(vals[0] >> 7) & 0x3f}\nBRANCHSTACK-{(vals[0] >> 14) & 0x3f}\n')
|
||||
print(f'SP_CS_UNKNOWN_A9B1-{vals[1]}\nSP_CS_BRANCH_COND-{vals[2]}\nSP_CS_OBJ_FIRST_EXEC_OFFSET-{vals[3]}\nSP_CS_OBJ_START-{vals[4] | (vals[5] << 32)}\nSP_CS_PVT_MEM_PARAM-{vals[6]}\nSP_CS_PVT_MEM_ADDR-{vals[7] | (vals[8] << 32)}\nSP_CS_PVT_MEM_SIZE-{vals[9]}')
|
||||
if offset == 0xb180:
|
||||
print('border color offset', hex(vals[1] << 32 | vals[0]))
|
||||
hexdump(get_mem(vals[1] << 32 | vals[0], 0x200))
|
||||
if IOCTL > 0:
|
||||
print('border color offset', hex(vals[1] << 32 | vals[0]))
|
||||
hexdump(get_mem(vals[1] << 32 | vals[0], 0x200))
|
||||
ptr += 4*size
|
||||
else:
|
||||
print("unk", hex(cmd))
|
||||
if IOCTL > 0:
|
||||
print("unk", hex(cmd))
|
||||
ptr += 4
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
|
||||
|
@ -145,19 +160,19 @@ def ioctl(fd, request, argp):
|
|||
if nr in nrs and itype == 9:
|
||||
name, stype = nrs[nr]
|
||||
s = get_struct(argp, stype)
|
||||
print(f"{ret:2d} = {name:40s}", ' '.join(format_struct(s)))
|
||||
if IOCTL > 0: print(f"{ret:2d} = {name:40s}", ' '.join(format_struct(s)))
|
||||
if name == "IOCTL_KGSL_GPUOBJ_INFO": pass
|
||||
# mmaped[s.gpuaddr] = mmap.mmap(fd, s.size, offset=s.id*0x1000)
|
||||
if name == "IOCTL_KGSL_GPU_COMMAND":
|
||||
for i in range(s.numcmds):
|
||||
cmd = get_struct(s.cmdlist+ctypes.sizeof(msm_kgsl.struct_kgsl_command_object)*i, msm_kgsl.struct_kgsl_command_object)
|
||||
print(f"cmd {i}:", format_struct(cmd))
|
||||
#hexdump(get_mem(cmd.gpuaddr, cmd.size))
|
||||
if IOCTL > 1: parse_cmd_buf(get_mem(cmd.gpuaddr, cmd.size))
|
||||
if IOCTL > 0: print(f"cmd {i}:", format_struct(cmd))
|
||||
parse_cmd_buf(get_mem(cmd.gpuaddr, cmd.size))
|
||||
for i in range(s.numobjs):
|
||||
obj = get_struct(s.objlist+s.objsize*i, msm_kgsl.struct_kgsl_command_object)
|
||||
print(f"obj {i}:", format_struct(obj))
|
||||
print(format_struct(msm_kgsl.struct_kgsl_cmdbatch_profiling_buffer.from_buffer_copy(get_mem(obj.gpuaddr, obj.size))))
|
||||
if IOCTL > 0:
|
||||
print(f"obj {i}:", format_struct(obj))
|
||||
print(format_struct(msm_kgsl.struct_kgsl_cmdbatch_profiling_buffer.from_buffer_copy(get_mem(obj.gpuaddr, obj.size))))
|
||||
#hexdump(get_mem(obj.gpuaddr, obj.size))
|
||||
else:
|
||||
#print(f"ioctl({fd=}, (dir:{idir}, size:0x{size:3X}, type:{itype:d}, nr:0x{nr:2X}), {argp=:X}) = {ret=}")
|
||||
|
@ -181,3 +196,37 @@ def install_hook(c_function, python_function):
|
|||
|
||||
libc = ctypes.CDLL(ctypes.util.find_library("libc"))
|
||||
install_hook(libc.ioctl, ioctl)
|
||||
|
||||
def before_launch():
  """Pre-launch hook: empty the module-level CAPTURED_STATE dict in place."""
  # No `global` statement is needed: the dict is mutated, not rebound.
  CAPTURED_STATE.clear()
|
||||
def collect_last_launch_state():
  """Post-launch hook: return a snapshot of the state captured since the last before_launch().

  A shallow copy is returned rather than the live CAPTURED_STATE dict: before_launch()
  clears that dict in place, so handing out the live object would make every previously
  collected state alias the same (later emptied and refilled) dict.
  """
  return dict(CAPTURED_STATE)
|
||||
def compare_launch_state(state, good_state):
  """Compare a captured launch state against a reference ("good") capture.

  Args:
    state: mapping of captured values for the launch under test. Integer keys are
      register offsets; the string keys 'ibos', 'samplers' and 'descriptors' hold
      captured byte buffers.
    good_state: mapping with the same layout for the reference launch.

  Returns:
    ``(True, "PASS")`` when all checked register bits and buffers match, otherwise
    ``(False, reason)`` describing the first mismatch.
  """
  ctrl_reg0 = adreno.REG_A6XX_SP_CS_CTRL_REG0
  unknown_a9b1 = adreno.REG_A6XX_SP_CS_UNKNOWN_A9B1

  # (register offset, bit mask) pairs; only the masked bits are compared.
  # NOTE(review): the original list carried the FULLREGFOOTPRINT mask twice; the
  # duplicate was dropped. If another field was intended there, add it here.
  checks = [
    (adreno.REG_A6XX_SP_CS_CONFIG, 0xffffffff),

    (ctrl_reg0, adreno.A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT__MASK),
    (ctrl_reg0, adreno.A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT__MASK),
    (ctrl_reg0, adreno.A6XX_SP_CS_CTRL_REG0_BRANCHSTACK__MASK),
    (ctrl_reg0, adreno.A6XX_SP_CS_CTRL_REG0_THREADMODE__MASK),
    (ctrl_reg0, adreno.A6XX_SP_CS_CTRL_REG0_EARLYPREAMBLE),
    (ctrl_reg0, adreno.A6XX_SP_CS_CTRL_REG0_MERGEDREGS),

    (unknown_a9b1, adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK5),
    (unknown_a9b1, adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK6),

    (adreno.REG_A6XX_SP_CS_BRANCH_COND, 0xffffffff),
  ]

  for x, m in checks:
    # Registers never written during the launch are treated as 0.
    got, want = state.get(x, 0) & m, good_state.get(x, 0) & m
    if got != want:
      # Report the mask itself (the original printed the register offset under the "mask" label).
      return False, f"Field {REGS[x]}, mask: {m:X} mismatch: {got} vs {want}"

  for n in ['ibos', 'samplers', 'descriptors']:
    # Buffers the reference launch did not capture are not checked.
    if n not in good_state: continue
    mv1, mv2 = state.get(n), good_state.get(n)
    # The reference has this buffer but the tested launch never captured it.
    if mv1 is None: return False, f"{n}: missing in state"
    if len(mv1) != len(mv2): return False, f"{n}: len mismatch"
    if any(a != b for a, b in zip(mv1, mv2)): return False, f"{n}: content mismatch"

  return True, "PASS"
|
||||
|
|
|
@ -1,9 +1,23 @@
|
|||
import random, traceback, ctypes, argparse
|
||||
import random, traceback, ctypes, argparse, os
|
||||
from typing import List, Tuple, DefaultDict, Any
|
||||
import numpy as np
|
||||
from collections import defaultdict
|
||||
from extra.optimization.helpers import load_worlds, ast_str_to_lin, kern_str_to_lin
|
||||
|
||||
# We need to insert the ioctl hooks before opening devices.
# os.getenv returns a string when the variable is set, so compare against "0" rather than
# the int 0: otherwise VALIDATE_HCQ=0 would still install the hooks and open the devices.
if os.getenv("VALIDATE_HCQ", "0") != "0":
  # Best effort: a backend that is missing on this machine is simply skipped.
  try:
    import extra.nv_gpu_driver.nv_ioctl
    from tinygrad import Device
    _, _ = Device["NV"], Device["CUDA"]
  except Exception: pass

  try:
    import extra.qcom_gpu_driver.opencl_ioctl
    from tinygrad import Device
    _, _ = Device["QCOM"], Device["GPU"]
  except Exception: pass
|
||||
|
||||
from tinygrad import Tensor, Device, dtypes
|
||||
from tinygrad.tensor import _to_np_dtype
|
||||
from tinygrad.codegen.kernel import Kernel
|
||||
|
@ -26,6 +40,13 @@ if getenv("VALIDATE_HCQ"):
|
|||
on_linearizer_will_run = extra.nv_gpu_driver.nv_ioctl.before_launch
|
||||
on_linearizer_did_run = extra.nv_gpu_driver.nv_ioctl.collect_last_launch_state
|
||||
compare_states = extra.nv_gpu_driver.nv_ioctl.compare_launch_state
|
||||
elif Device.DEFAULT == "QCOM":
|
||||
print("VALIDATE_HCQ: Comparing QCOM to GPU")
|
||||
import extra.qcom_gpu_driver.opencl_ioctl
|
||||
validate_device = Device["GPU"]
|
||||
on_linearizer_will_run = extra.qcom_gpu_driver.opencl_ioctl.before_launch
|
||||
on_linearizer_did_run = extra.qcom_gpu_driver.opencl_ioctl.collect_last_launch_state
|
||||
compare_states = extra.qcom_gpu_driver.opencl_ioctl.compare_launch_state
|
||||
else:
|
||||
print(colored("VALIDATE_HCQ options is ignored", 'red'))
|
||||
|
||||
|
@ -282,7 +303,8 @@ if __name__ == "__main__":
|
|||
try:
|
||||
for i, ast in enumerate(ast_strs[:getenv("FUZZ_N", len(ast_strs))]):
|
||||
if (nth := getenv("FUZZ_NTH", -1)) != -1 and i != nth: continue
|
||||
if "dtypes.image" in ast and Device.DEFAULT != "GPU": continue # IMAGE is only for GPU
|
||||
if getenv("FUZZ_IMAGEONLY") and "dtypes.image" not in ast: continue
|
||||
if "dtypes.image" in ast and Device.DEFAULT not in {"GPU", "QCOM"}: continue # IMAGE is only for GPU
|
||||
if ast in seen_ast_strs: continue
|
||||
seen_ast_strs.add(ast)
|
||||
|
||||
|
|
Loading…
Reference in New Issue