def before_launch():
  """Run `_dump_gpfifo` ahead of a launch and discard whatever it returns.

  NOTE(review): presumably this consumes GPFIFO entries queued so far so that the
  following collection only reports the upcoming launch - confirm against `_dump_gpfifo`.
  """
  _dump_gpfifo("before launch")

def collect_last_launch_state():
  """Return whatever `_dump_gpfifo` reports after a launch (handed to `compare_launch_state`)."""
  return _dump_gpfifo("after launch")

def compare_launch_state(states, good_states):
  """Compare the launch state captured for the device under test against a reference capture.

  Args:
    states: captured launch states of the device under test (list or None).
    good_states: captured launch states of the reference device (list or None).

  Returns:
    `(True, "PASS")` when the states agree, otherwise `(False, reason)`.
    Exactly one state is expected on each side; anything else is reported as a failure.
  """
  states = states or []
  good_states = good_states or []
  if len(states) != 1 or len(good_states) != 1:
    return False, f"Some states not captured. {len(states)}!=1 || {len(good_states)}!=1"

  # Fields that must be identical on both sides.
  # NOTE(review): the original list named 'invalidate_shader_data_cache' twice; the duplicate is dropped
  # here. If a different field was intended in its place, add it.
  exact_fields = ('qmd_major_version', 'invalidate_shader_data_cache', 'sm_global_caching_enable',
                  'invalidate_texture_header_cache', 'invalidate_texture_sampler_cache', 'barrier_count',
                  'sampler_index', 'api_visible_call_limit', 'cwd_membar_type', 'sass_version',
                  'max_sm_config_shared_mem_size', 'register_count_v')

  for state, good_state in zip(states, good_states):
    for n in exact_fields:
      if getattr(state, n) != getattr(good_state, n):
        return False, f"Field {n} mismatch: {getattr(state, n)} vs {getattr(good_state, n)}"

    # Allow NV to allocate more, at least this is not exact problem, so ignore it here.
    # Hmm, CUDA minimum is 0x640, is this hw-required minimum (will check)?
    if state.shader_local_memory_high_size < good_state.shader_local_memory_high_size and good_state.shader_local_memory_high_size > 0x640:
      return False, f"Field shader_local_memory_high_size mismatch: {state.shader_local_memory_high_size}vs{good_state.shader_local_memory_high_size}"

    # TODO: Can't request more, since it might not be optimal, but need to investigate their formula for this.. #7133
    if state.min_sm_config_shared_mem_size > good_state.min_sm_config_shared_mem_size and good_state.min_sm_config_shared_mem_size > 5:
      return (False,
        f"Field min_sm_config_shared_mem_size mismatch: {state.min_sm_config_shared_mem_size}vs{good_state.min_sm_config_shared_mem_size}")
    if state.target_sm_config_shared_mem_size > good_state.target_sm_config_shared_mem_size and good_state.target_sm_config_shared_mem_size > 5:
      return (False,
        f"Field target_sm_config_shared_mem_size mismatch: {state.target_sm_config_shared_mem_size}vs{good_state.target_sm_config_shared_mem_size}")

    # A separate index name keeps this loop from shadowing anything in the enclosing loop.
    for cb_idx in range(8):
      if cb_idx in {1, 7}: continue # shaders don't use that. what's cuda put here?
      n = f"constant_buffer_valid_{cb_idx}"
      if getattr(state, n) != getattr(good_state, n):
        return False, f"Field {n} mismatch: {getattr(state, n)} vs {getattr(good_state, n)}"

  return True, "PASS"
{ops[opcode]}", hprint(vals)) if ops[opcode] == "CP_LOAD_STATE6_FRAG": # for compute shaders CP_LOAD_STATE6_FRAG is used dst_off = vals[0] & 0x3FFF state_type = (vals[0]>>14) & 0x3 state_src = (vals[0]>>16) & 0x3 state_block = (vals[0]>>18) & 0xF # 13 = SB4_CS_SHADER num_unit = vals[0]>>22 - print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}") + if IOCTL > 0: print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}") if state_block == SB6_CS_SHADER and IOCTL > 2: from extra.disassemblers.adreno import disasm_raw @@ -93,48 +97,59 @@ def parse_cmd_buf(dat): if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4))) if state_type == ST6_IBO: ibos_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4) - print('texture ibos') - hexdump(ibos_bytes) + CAPTURED_STATE['ibos'] = ibos_bytes[:] + if IOCTL > 0: + print('texture ibos') + hexdump(ibos_bytes) elif state_block == SB6_CS_TEX and IOCTL > 2: if state_type == ST6_SHADER: samplers_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 4 * 4) - print('texture samplers') - hexdump(samplers_bytes) + CAPTURED_STATE['samplers'] = ibos_bytes[:] + if IOCTL > 0: + print('texture samplers') + hexdump(samplers_bytes) if state_type == ST6_CONSTANTS: descriptors_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4) - print('texture descriptors') - hexdump(descriptors_bytes) + CAPTURED_STATE['descriptors'] = ibos_bytes[:] + if IOCTL > 0: + print('texture descriptors') + hexdump(descriptors_bytes) elif ops[opcode] == "CP_REG_TO_MEM": reg, cnt, b64, accum = vals[0] & 0x3FFFF, (vals[0] >> 18) & 0xFFF, (vals[0] >> 30) & 0x1, (vals[0] >> 31) & 0x1 dest = vals[1] | (vals[2] << 32) - print(f"{reg=} {cnt=} {b64=} {accum=} {dest=:#x}") + if IOCTL > 0: print(f"{reg=} {cnt=} {b64=} {accum=} {dest=:#x}") ptr += 4*size elif (cmd>>28) == 0x4: # write one or more registers (replace pkt0 starting with a5xx) offset, size = ((cmd>>8)&0x7FFFF), cmd&0x7F 
reg_name = REGS.get(offset, f"reg {offset=:#x}") vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size]) - print(f"{ptr:3X} -- typ 4: {size=:3d}, {reg_name}", hprint(vals)) + if IOCTL > 0: print(f"{ptr:3X} -- typ 4: {size=:3d}, {reg_name}", hprint(vals)) + for vi,v in enumerate(vals): CAPTURED_STATE[offset+vi] = v if offset == adreno.REG_A6XX_SP_CS_CONFIG: val = vals[0] - print(f"\tBINDLESS_TEX={(val >> 0) & 0b1}") - print(f"\tBINDLESS_SAMP={(val >> 1) & 0b1}") - print(f"\tBINDLESS_IBO={(val >> 2) & 0b1}") - print(f"\tBINDLESS_UBO={(val >> 3) & 0b1}") - print(f"\tEN={(val >> 8) & 0b1}") - print(f"\tNTEX={(val >> 9) & 0b11111111}") - print(f"\tNSAMP={(val >> 17) & 0b11111}") - print(f"\tNIBO={(val >> 22) & 0b1111111}") + if IOCTL > 0: + print(f"\tBINDLESS_TEX={(val >> 0) & 0b1}") + print(f"\tBINDLESS_SAMP={(val >> 1) & 0b1}") + print(f"\tBINDLESS_IBO={(val >> 2) & 0b1}") + print(f"\tBINDLESS_UBO={(val >> 3) & 0b1}") + print(f"\tEN={(val >> 8) & 0b1}") + print(f"\tNTEX={(val >> 9) & 0b11111111}") + print(f"\tNSAMP={(val >> 17) & 0b11111}") + print(f"\tNIBO={(val >> 22) & 0b1111111}") if offset == 0xa9b0: - print(f'THREADSIZE-{(vals[0] >> 20)&0x1}\nEARLYPREAMBLE-{(vals[0] >> 23) & 0x1}\nMERGEDREGS-{(vals[0] >> 3) & 0x1}\nTHREADMODE-{vals[0] & 0x1}\nHALFREGFOOTPRINT-{(vals[0] >> 1) & 0x3f}\nFULLREGFOOTPRINT-{(vals[0] >> 7) & 0x3f}\nBRANCHSTACK-{(vals[0] >> 14) & 0x3f}\n') - print(f'SP_CS_UNKNOWN_A9B1-{vals[1]}\nSP_CS_BRANCH_COND-{vals[2]}\nSP_CS_OBJ_FIRST_EXEC_OFFSET-{vals[3]}\nSP_CS_OBJ_START-{vals[4] | (vals[5] << 32)}\nSP_CS_PVT_MEM_PARAM-{vals[6]}\nSP_CS_PVT_MEM_ADDR-{vals[7] | (vals[8] << 32)}\nSP_CS_PVT_MEM_SIZE-{vals[9]}') + if IOCTL > 0: + print(f'THREADSIZE-{(vals[0] >> 20)&0x1}\nEARLYPREAMBLE-{(vals[0] >> 23) & 0x1}\nMERGEDREGS-{(vals[0] >> 3) & 0x1}\nTHREADMODE-{vals[0] & 0x1}\nHALFREGFOOTPRINT-{(vals[0] >> 1) & 0x3f}\nFULLREGFOOTPRINT-{(vals[0] >> 7) & 0x3f}\nBRANCHSTACK-{(vals[0] >> 14) & 0x3f}\n') + 
print(f'SP_CS_UNKNOWN_A9B1-{vals[1]}\nSP_CS_BRANCH_COND-{vals[2]}\nSP_CS_OBJ_FIRST_EXEC_OFFSET-{vals[3]}\nSP_CS_OBJ_START-{vals[4] | (vals[5] << 32)}\nSP_CS_PVT_MEM_PARAM-{vals[6]}\nSP_CS_PVT_MEM_ADDR-{vals[7] | (vals[8] << 32)}\nSP_CS_PVT_MEM_SIZE-{vals[9]}') if offset == 0xb180: - print('border color offset', hex(vals[1] << 32 | vals[0])) - hexdump(get_mem(vals[1] << 32 | vals[0], 0x200)) + if IOCTL > 0: + print('border color offset', hex(vals[1] << 32 | vals[0])) + hexdump(get_mem(vals[1] << 32 | vals[0], 0x200)) ptr += 4*size else: - print("unk", hex(cmd)) + if IOCTL > 0: + print("unk", hex(cmd)) ptr += 4 @ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p) @@ -145,19 +160,19 @@ def ioctl(fd, request, argp): if nr in nrs and itype == 9: name, stype = nrs[nr] s = get_struct(argp, stype) - print(f"{ret:2d} = {name:40s}", ' '.join(format_struct(s))) + if IOCTL > 0: print(f"{ret:2d} = {name:40s}", ' '.join(format_struct(s))) if name == "IOCTL_KGSL_GPUOBJ_INFO": pass # mmaped[s.gpuaddr] = mmap.mmap(fd, s.size, offset=s.id*0x1000) if name == "IOCTL_KGSL_GPU_COMMAND": for i in range(s.numcmds): cmd = get_struct(s.cmdlist+ctypes.sizeof(msm_kgsl.struct_kgsl_command_object)*i, msm_kgsl.struct_kgsl_command_object) - print(f"cmd {i}:", format_struct(cmd)) - #hexdump(get_mem(cmd.gpuaddr, cmd.size)) - if IOCTL > 1: parse_cmd_buf(get_mem(cmd.gpuaddr, cmd.size)) + if IOCTL > 0: print(f"cmd {i}:", format_struct(cmd)) + parse_cmd_buf(get_mem(cmd.gpuaddr, cmd.size)) for i in range(s.numobjs): obj = get_struct(s.objlist+s.objsize*i, msm_kgsl.struct_kgsl_command_object) - print(f"obj {i}:", format_struct(obj)) - print(format_struct(msm_kgsl.struct_kgsl_cmdbatch_profiling_buffer.from_buffer_copy(get_mem(obj.gpuaddr, obj.size)))) + if IOCTL > 0: + print(f"obj {i}:", format_struct(obj)) + print(format_struct(msm_kgsl.struct_kgsl_cmdbatch_profiling_buffer.from_buffer_copy(get_mem(obj.gpuaddr, obj.size)))) #hexdump(get_mem(obj.gpuaddr, obj.size)) else: 
def before_launch():
  """Empty the module-level `CAPTURED_STATE` dict in place before a launch is traced."""
  CAPTURED_STATE.clear()

def collect_last_launch_state():
  """Return a snapshot of the state captured since the last `before_launch()`.

  A shallow copy is returned on purpose: `before_launch()` clears `CAPTURED_STATE` in place, so handing
  out the live dict would wipe a previously collected state as soon as the next launch is prepared, and
  two collected states would be the very same object (and therefore always compare equal).
  """
  return dict(CAPTURED_STATE)

def compare_launch_state(state, good_state):
  """Compare two captured launch states (dicts as returned by `collect_last_launch_state`).

  Args:
    state: capture of the device under test; maps register offsets to values and the keys
      'ibos' / 'samplers' / 'descriptors' to captured byte buffers.
    good_state: capture of the reference device, same layout.

  Returns:
    `(True, "PASS")` when all checked register bits and buffers agree, otherwise `(False, reason)`.
  """
  # (register offset, bit mask) pairs; only the masked bits have to agree. Missing registers read as 0.
  # NOTE(review): the original table listed FULLREGFOOTPRINT twice; the duplicate is dropped. If another
  # CTRL_REG0 field was meant in its place, add it here.
  masked_regs = [
    (adreno.REG_A6XX_SP_CS_CONFIG, 0xffffffff),

    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT__MASK),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT__MASK),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_BRANCHSTACK__MASK),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_THREADMODE__MASK),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_EARLYPREAMBLE),
    (adreno.REG_A6XX_SP_CS_CTRL_REG0, adreno.A6XX_SP_CS_CTRL_REG0_MERGEDREGS),

    (adreno.REG_A6XX_SP_CS_UNKNOWN_A9B1, adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK5),
    (adreno.REG_A6XX_SP_CS_UNKNOWN_A9B1, adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK6),

    (adreno.REG_A6XX_SP_CS_BRANCH_COND, 0xffffffff),
  ]

  for reg, mask in masked_regs:
    got, want = state.get(reg, 0) & mask, good_state.get(reg, 0) & mask
    if got != want:
      # Report the mask (the original printed the register offset under the "mask:" label) and fall back
      # to the hex offset when the register has no entry in REGS.
      return False, f"Field {REGS.get(reg, hex(reg))}, mask: {mask:X} mismatch: {got} vs {want}"

  # Captured buffers are only checked when the reference capture contains them.
  for name in ('ibos', 'samplers', 'descriptors'):
    if name not in good_state: continue
    if name not in state: return False, f"{name}: missing in captured state"
    got, want = state[name], good_state[name]
    if len(got) != len(want): return False, f"{name}: len mismatch"
    if any(a != b for a, b in zip(got, want)): return False, f"{name}: content mismatch"

  return True, "PASS"
def hcq_validation_requested(environ=os.environ):
  """Tell whether the VALIDATE_HCQ environment switch is turned on.

  `os.getenv` returns a string whenever the variable is set, so comparing it against the integer 0
  (as the original guard did) treated `VALIDATE_HCQ=0` as enabled. The value is parsed as an integer
  here; unset, empty and "0" mean off, and a non-numeric value counts as on (as it did before).

  Args:
    environ: mapping to read the variable from (defaults to the process environment).
  """
  raw = environ.get("VALIDATE_HCQ", "")
  try: return int(raw or "0") != 0
  except ValueError: return True

# We need to insert ioctl before opening devices.
if hcq_validation_requested():
  # Both attempts are best effort: a failure to import a hook module or open a device is ignored.
  # NOTE(review): presumably because only one of the two backends exists on a given machine - confirm.
  try:
    import extra.nv_gpu_driver.nv_ioctl
    from tinygrad import Device
    _, _ = Device["NV"], Device["CUDA"]
  except Exception: pass

  try:
    import extra.qcom_gpu_driver.opencl_ioctl
    from tinygrad import Device
    _, _ = Device["QCOM"], Device["GPU"]
  except Exception: pass
seen_ast_strs.add(ast)