rocm: disassembler for shader

2023-05-06 18:56:09 +00:00 · 2023-05-06 18:56:09 +00:00 · 5190037cbc
parent 7fbf96b992
commit 5190037cbc
3 changed files with 32 additions and 3 deletions
--- a/extra/helpers.py
+++ b/extra/helpers.py
@ -6,3 +6,18 @@ class Timing(object):
  def __exit__(self, exc_type, exc_val, exc_tb):
    self.et = time.perf_counter_ns() - self.st
    if self.enabled: print(f"{self.prefix}{self.et*1e-6:.2f} ms"+(self.on_exit(self.et) if self.on_exit else ""))
+
+def enable_early_exec():
+  import subprocess, multiprocessing
+  qin, qout = multiprocessing.Queue(), multiprocessing.Queue()
+  def _early_exec_process(qin, qout):
+    while 1:
+      path, inp = qin.get()
+      qout.put(subprocess.check_output(path, input=inp))
+  p = multiprocessing.Process(target=_early_exec_process, args=(qin, qout))
+  p.daemon = True
+  p.start()
+  def early_exec(x):
+    qin.put(x)
+    return qout.get()
+  return early_exec
--- a/extra/rocm/sniffer/sniff.cc
+++ b/extra/rocm/sniffer/sniff.cc
@ -72,8 +72,14 @@ static void handler(int sig, siginfo_t *si, void *unused) {
      D("HSA_PACKET_TYPE_KERNEL_DISPATCH -- setup:%d workgroup[%d, %d, %d] grid[%d, %d, %d] kernel_object:0x%lx kernarg_address:%p\n", pkt->setup, pkt->workgroup_size_x, pkt->workgroup_size_y, pkt->workgroup_size_z, pkt->grid_size_x, pkt->grid_size_y, pkt->grid_size_z, pkt->kernel_object, pkt->kernarg_address);
      amd_kernel_code_t *code = (amd_kernel_code_t *)pkt->kernel_object;
      D("kernel_code_entry_byte_offset:%lx\n", code->kernel_code_entry_byte_offset);
-      hexdump((void*)(pkt->kernel_object + code->kernel_code_entry_byte_offset), 0x200);
-      //hexdump((void*)pkt->kernel_object, sizeof(amd_kernel_code_t));
+      uint32_t *kernel_code = (uint32_t*)(pkt->kernel_object + code->kernel_code_entry_byte_offset);
+      int code_len = 0;
+      while (kernel_code[code_len] != 0xbf9f0000 && kernel_code[code_len] != 0) code_len++;
+      hexdump(kernel_code, code_len*4);
+      /*FILE *f = fopen("/tmp/kernel_code", "wb");
+      fwrite(kernel_code, 4, code_len, f);
+      fclose(f);
+      system("python -c 'print(\" \".join([(\"0x%02X\"%x) for x in open(\"/tmp/kernel_code\", \"rb\").read()]))' | ../build/llvm-project/bin/llvm-mc --disassemble --arch=amdgcn --mcpu=gfx1100 --show-encoding");*/
    } else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_BARRIER_AND) {
      D("HSA_PACKET_TYPE_BARRIER_AND\n");
    }
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@ -1,5 +1,5 @@
 from __future__ import annotations
-import platform
+import platform, pathlib
 import numpy as np
 import pyopencl as cl  # type: ignore
 from typing import Optional, List
@ -12,6 +12,11 @@ OSX = platform.system() == "Darwin"
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0   # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 FLOAT16 = getenv("FLOAT16", 0)

+# TODO: if you fork and exit the child process after creating anything with cl on AMD, it hangs on e.wait()
+if DEBUG >= 5:
+  from extra.helpers import enable_early_exec
+  early_exec = enable_early_exec()
+
 class _CL:
  def __init__(self):
    platforms: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()]) if len(y)]
@ -54,6 +59,9 @@ class CLProgram:
      if 'Adreno' in CL.cl_ctx.devices[0].name:
        from disassemblers.adreno import disasm
        disasm(self.binary())
+      elif 'gfx1100' in CL.cl_ctx.devices[0].name:
+        asm = early_exec(([pathlib.Path(__file__).parent.parent.parent / "extra/rocm/build/llvm-project/bin/llvm-objdump", '-d', '-'], self.binary()))
+        print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
      else:
        # print the PTX for NVIDIA. TODO: probably broken for everything else
        print(self.binary().decode('utf-8'))