From bad0ff60b72e995f11e4f0cc9aa4a230a57b858d Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Sat, 16 Dec 2023 23:10:50 -0800 Subject: [PATCH] start Qualcomm GPU driver (#2804) * hooking works * working * qcom work * parsing command buffers * proper parse --- disassemblers/adreno/__init__.py | 8 +- extra/qcom_gpu_driver/adreno_pm4.xml | 2265 +++++++++++++++++++++++++ extra/qcom_gpu_driver/gen.sh | 2 + extra/qcom_gpu_driver/msm_kgsl.h | 1451 ++++++++++++++++ extra/qcom_gpu_driver/msm_kgsl.py | 1034 +++++++++++ extra/qcom_gpu_driver/opencl_ioctl.py | 171 ++ 6 files changed, 4928 insertions(+), 3 deletions(-) create mode 100644 extra/qcom_gpu_driver/adreno_pm4.xml create mode 100755 extra/qcom_gpu_driver/gen.sh create mode 100644 extra/qcom_gpu_driver/msm_kgsl.h create mode 100644 extra/qcom_gpu_driver/msm_kgsl.py create mode 100644 extra/qcom_gpu_driver/opencl_ioctl.py diff --git a/disassemblers/adreno/__init__.py b/disassemblers/adreno/__init__.py index ac16b381..b0b3e422 100644 --- a/disassemblers/adreno/__init__.py +++ b/disassemblers/adreno/__init__.py @@ -4,15 +4,17 @@ import pathlib from hexdump import hexdump fxn = None -def disasm(buf): +def disasm_raw(buf): global fxn if fxn is None: shared = pathlib.Path(__file__).parent / "disasm.so" if not shared.is_file(): os.system(f'cd {pathlib.Path(__file__).parent} && gcc -shared disasm-a3xx.c -o disasm.so') fxn = ctypes.CDLL(shared.as_posix())['disasm'] - #hexdump(buf) + fxn(buf, len(buf)) + +def disasm(buf): END = b"\x00\x00\x00\x00\x00\x00\x00\x03" buf = buf[0x510:] # this right? buf = buf.split(END)[0] + END - fxn(buf, len(buf)) + disasm_raw(buf) diff --git a/extra/qcom_gpu_driver/adreno_pm4.xml b/extra/qcom_gpu_driver/adreno_pm4.xml new file mode 100644 index 00000000..1b687eed --- /dev/null +++ b/extra/qcom_gpu_driver/adreno_pm4.xml @@ -0,0 +1,2265 @@ + + + + + + + + + + + Flushes dirty data from UCHE, and also writes a GPU timestamp to + the address if one is provided. + + + + + + + + + + + + + + + + + + + If A6XX_RB_SAMPLE_COUNT_CONTROL.copy is true, writes OQ Z passed + sample counts to RB_SAMPLE_COUNT_ADDR. This writes to main + memory, skipping UCHE. + + + + + + Writes the GPU timestamp to the address that follows, once RB + access and flushes are complete. + + + + + + + + + + + + + + + + + + + + + Invalidates depth attachment data from the CCU. We assume this + happens in the last stage. + + + + + Invalidates color attachment data from the CCU. We assume this + happens in the last stage. + + + + + Flushes the small cache used by CP_EVENT_WRITE::BLIT (which, + along with its registers, would be better named RESOLVE). + + + + + Flushes depth attachment data from the CCU. We assume this + happens in the last stage. + + + + + Flushes color attachment data from the CCU. We assume this + happens in the last stage. + + + + + 2D blit to resolve GMEM to system memory (skipping CCU) at the + end of a render pass. Compare to CP_BLIT's BLIT_OP_SCALE for + more general blitting. + + + + + Clears based on GRAS_LRZ_CNTL configuration, could clear + fast-clear buffer or LRZ direction. + LRZ direction is stored at lrz_fc_offset + 0x200, has 1 byte which + could be expressed by enum: + CUR_DIR_DISABLED = 0x0 + CUR_DIR_GE = 0x1 + CUR_DIR_LE = 0x2 + CUR_DIR_UNSET = 0x3 + Clear of direction means setting the direction to CUR_DIR_UNSET. + + + + + + + + + + + + + + + Invalidates UCHE. 
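The disassembler change above splits header handling out of the raw entry point. A minimal usage sketch, assuming a captured kernel image on disk (the filename and the capture are hypothetical; the 0x510 header offset is the one the code itself questions):

```python
# Hypothetical: disassemble a dumped OpenCL kernel image with the new split API.
from disassemblers.adreno import disasm, disasm_raw

with open("kernel.bin", "rb") as f:  # hypothetical dump of a compiled kernel
    buf = f.read()
disasm(buf)              # header-aware: slices off 0x510 bytes, stops at the END marker
disasm_raw(buf[0x510:])  # or hand the raw instruction stream straight to disasm.so
```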
+ + + + + + + + Doesn't seem to do anything + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + initialize CP's micro-engine + + skip N 32-bit words to get to the next packet + + + indirect buffer dispatch. prefetch parser uses this packet + type to determine whether to pre-fetch the IB + + + + + + Takes the same arguments as CP_INDIRECT_BUFFER, but jumps to + another buffer at the same level. Must be at the end of IB, and + doesn't work with draw state IB's. + + + indirect buffer dispatch. same as IB, but init is pipelined + + + Waits for the IDLE state of the engine before further drawing. + This is pipelined, so the CP may continue. + + + wait until a register or memory location is a specific value + + wait until a register location is equal to a specific value + + wait until a register location is >= a specific value + + wait until a read completes + + wait until all base/size writes from an IB_PFD packet have completed + + + register read/modify/write + + Set binning configuration registers + + + reads register in chip and writes to memory + + write N 32-bit words to memory + + write CP_PROG_COUNTER value to memory + + conditional execution of a sequence of packets + + conditional write to memory or register + + + generate an event that creates a write to memory when completed + + + generate a VS|PS_done event + + generate a cache flush done event + + generate a z_pass done event + + + not sure the real name, but this seems to be what is used for + opencl, instead of CP_DRAW_INDX.. + + + initiate fetch of index buffer and draw + + draw using supplied indices in packet + + initiate fetch of index buffer and binIDs and draw + + initiate fetch of bin IDs and draw using supplied indices + + begin/end initiator for viz query extent processing + + fetch state sub-blocks and initiate shader code DMAs + + load constant into chip and to memory + + load sequencer instruction memory (pointer-based) + + load sequencer instruction memory (code embedded in packet) + + load constants from a location in memory + + selective invalidation of state pointers + + dynamically changes shader instruction memory partition + + sets the 64-bit BIN_MASK register in the PFP + + sets the 64-bit BIN_SELECT register in the PFP + + updates the current context, if needed + + generate interrupt from the command stream + + copy sequencer instruction memory to system memory + + + + + + + + sets draw initiator flags register in PFP, gets bitwise-ORed into + every draw initiator + + + sets the register protection mode + + + + + + load high level sequencer command + + + Conditionally load a IB based on a flag, prefetch enabled + + Conditionally load a IB based on a flag, prefetch disabled + + Load a buffer with pre-fetch enabled + + Set bin (?) + + + test 2 memory locations to dword values specified + + + Write register, ignoring context state for context sensitive registers + + + Record the real-time when this packet is processed by PFP + + + + + + PFP waits until the FIFO between the PFP and the ME is empty + + + + + Used a bit like CP_SET_CONSTANT on a2xx, but can write multiple + groups of registers. 
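The CP_* opcodes listed above travel in PM4 type-7 packets, with type-4 packets carrying register writes. A sketch of the header encoding, following the freedreno definitions this XML mirrors (the helper names are mine, not part of the patch):

```python
def odd_parity(val: int) -> int:
    # parity bit is chosen so the total number of set bits, including it, is odd
    return (bin(val).count("1") + 1) & 1

def pkt7_hdr(opcode: int, cnt: int) -> int:
    # type-7 packet: a CP_* opcode followed by cnt payload dwords
    return (0x70000000 | (cnt & 0x3fff) | (odd_parity(cnt) << 15)
            | ((opcode & 0x7f) << 16) | (odd_parity(opcode) << 23))

def pkt4_hdr(reg: int, cnt: int) -> int:
    # type-4 packet: write cnt dwords starting at register offset reg
    return (0x40000000 | (cnt & 0x7f) | (odd_parity(cnt) << 7)
            | ((reg & 0x3ffff) << 8) | (odd_parity(reg) << 27))
```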
Looks like it can be used to create state + objects in GPU memory, and on state change only emit pointer + (via CP_SET_DRAW_STATE), which should be nice for reducing CPU + overhead: + + (A4x) save PM4 stream pointers to execute upon a visible draw + + + + + + + + + + Enable or disable predication globally. Also resets the + predicate to "passing" and the local bit to enabled when + enabling global predication. + + + + + Enable or disable predication locally. Unlike globally enabling + predication, this packet doesn't touch any other state. + Predication only happens when enabled globally and locally and a + predicate has been set. This should be used for internal draws + which aren't supposed to use the predication state: + + CP_DRAW_PRED_ENABLE_LOCAL(0) + ... do draw... + CP_DRAW_PRED_ENABLE_LOCAL(1) + + + + + Latch a draw predicate into the internal register. + + + + + for A4xx + Write to register with address that does not fit into type-0 pkt + + + + copy from ME scratch RAM to a register + + + Copy from REG to ME scratch RAM + + + Wait for memory writes to complete + + + Conditional execution based on register comparison + + + Memory to REG copy + + + + + + + for a5xx + + + + + + Tells CP the current mode of GPU operation + + Instruct CP to set a few internal CP registers + + + + + + + Enables IB2 skipping. If both GLOBAL and LOCAL are 1 and + nothing is left in the visibility stream, then + CP_INDIRECT_BUFFER will be skipped, and draws will early return + from their IB. + + + + + + + + + + + + + + + + + + + + General purpose 2D blit engine for image transfers and mipmap + generation. Reads through UCHE, writes through the CCU cache in + the PS stage. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Write CP_CONTEXT_SWITCH_*_INFO from CP to the following dwords, + and forcibly switch to the indicated context. + + + + + + + + + + These first appear in a650_sqe.bin. They can in theory be used + to loop any sequence of IB1 commands, but in practice they are + used to loop over bins. There is a fixed-size per-iteration + prefix, used to set per-bin state, and then the following IB1 + commands are executed until CP_END_BIN which are always the same + for each iteration and usually contain a list of + CP_INDIRECT_BUFFER calls to IB2 commands which setup state and + execute restore/draw/save commands. This replaces the previous + technique of just repeating the CP_INDIRECT_BUFFER calls and + "unrolling" the loop. + + + + + Make next dword 1 to disable preemption, 0 to re-enable it. + + + + + + + + + Can clear BV/BR counters, or wait until one catches up to another + + Clears, adds to local, or adds to global timestamp + + + + + Write to a scratch memory that is read by CP_REG_TEST with + SOURCE_SCRATCH_MEM set. It's not the same scratch as scratch registers. + However it uses the same memory space. + + + + + Executes an array of fixed-size command buffers where each + buffer is assumed to have one draw call, skipping buffers with + non-visible draw calls. + + + + Reset various on-chip state used for synchronization + + + + + + Load state, a3xx (and later?) 
+ + + + + + + + + + + + + + + + + inline with the CP_LOAD_STATE packet + + + + + in buffer pointed to by EXT_SRC_ADDR + + + + + + + + + + + + + + + + + + Load state, a4xx+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Load state, a6xx+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + SS6_UBO used by the a6xx vulkan blob with tesselation constants + in this case, EXT_SRC_ADDR is (ubo_id shl 16 | offset) + to load constants from a UBO loaded with DST_OFF = 14 and offset 0, + EXT_SRC_ADDR = 0xe0000 + (offset is a guess, should be in bytes given that maxUniformBufferRange=64k) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DST_OFF same as in CP_LOAD_STATE6 - vec4 VS const at this offset will + be updated for each draw to {draw_id, first_vertex, first_instance, 0} + value of 0 disables it + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Read a 64-bit value at the given address and + test if it equals/doesn't equal 0. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + value at offset 0 always seems to be 0x00000000.. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Like CP_SET_BIN_DATA5, but set the pointers as offsets from the + pointers stored in VSC_PIPE_{DATA,DATA2,SIZE}_ADDRESS. Useful + for Vulkan where these values aren't known when the command + stream is recorded. + + + + + + + + + + + + + + + + + + + + + + + + Modifies DST_REG using two sources that can either be registers + or immediates. If SRC1_ADD is set, then do the following: + + $dst = (($dst & $src0) rot $rotate) + $src1 + + Otherwise: + + $dst = (($dst & $src0) rot $rotate) | $src1 + + Here "rot" means rotate left. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Like CP_REG_TO_MEM, but the memory address to write to can be + offsetted using either one or two registers or scratch + registers. + + + + + + + + + + + + + + + + + + + + + + + + Like CP_REG_TO_MEM, but the memory address to write to can be + offsetted using a DWORD in memory. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Wait until a memory value is greater than or equal to the + reference, using signed comparison. + + + + + + + + + + + + + + + + + + + This uses the same internal comparison as CP_COND_WRITE, + but waits until the comparison is true instead. It busy-loops in + the CP for the given number of cycles before trying again. 
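To make the CP_REG_RMW arithmetic above concrete, a small model of the dword update ("rot" is a 32-bit rotate-left; a sketch, not driver code):

```python
def cp_reg_rmw(dst: int, src0: int, src1: int, rotate: int = 0, src1_add: bool = False) -> int:
    # $dst = (($dst & $src0) rot $rotate) + $src1   if SRC1_ADD is set
    # $dst = (($dst & $src0) rot $rotate) | $src1   otherwise
    v = dst & src0 & 0xffffffff
    v = ((v << rotate) | (v >> (32 - rotate))) & 0xffffffff
    return (v + src1) & 0xffffffff if src1_add else v | src1
```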
+ + + + + + + + + + + + + + + + + + + + + + + + + + + Waits for REG0 to not be 0 or REG1 to not equal REF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tell CP the current operation mode, indicates save and restore procedure + + + + + + + + + + + + + + + + + + + + + + + + + + Set internal CP registers, used to indicate context save data addresses + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tests bit in specified register and sets predicate for CP_COND_REG_EXEC. + So: + + opcode: CP_REG_TEST (39) (2 dwords) + { REG = 0xc10 | BIT = 0 } + 0000: 70b90001 00000c10 + opcode: CP_COND_REG_EXEC (47) (3 dwords) + 0000: 70c70002 10000000 00000004 + opcode: CP_INDIRECT_BUFFER (3f) (4 dwords) + + Will execute the CP_INDIRECT_BUFFER only if b0 in the register at + offset 0x0c10 is 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Executes the following DWORDs of commands if the dword at ADDR0 + is not equal to 0 and the dword at ADDR1 is less than REF + (signed comparison). + + + + + + + + + + + + + + + + + + + + + + + + Used by the userspace driver to set various IB's which are + executed during context save/restore for handling + state that isn't restored by the + context switch routine itself. + + + + Executed unconditionally when switching back to the context. + + + + Executed when switching back after switching + away during execution of + a CP_SET_MARKER packet with RM6_YIELD as the + payload *and* the normal save routine was + bypassed for a shorter one. I think this is + connected to the "skipsaverestore" bit set by + the kernel when preempting. + + + + + Executed when switching away from the context, + except for context switches initiated via + CP_YIELD. + + + + + This can only be set by the RB (i.e. the kernel) + and executes with protected mode off, but + is otherwise similar to SAVE_IB. + + Note, kgsl calls this CP_KMD_AMBLE_TYPE + + + + + + + + + + + + + + + + + + + Keep shadow copies of these registers and only set them + when drawing, avoiding redundant writes: + - VPC_CNTL_0 + - HLSQ_CONTROL_1_REG + - HLSQ_UNKNOWN_B980 + + + + Track RB_RENDER_CNTL, and insert a WFI in the following + situation: + - There is a write that disables binning + - There was a draw with binning left enabled, but in + BYPASS mode + Presumably this is a hang workaround? + + + + Do a mysterious CP_EVENT_WRITE 0x3f when the low bit of + the data to write is 0. Used by the Vulkan blob with + PC_MULTIVIEW_CNTL, but this isn't predicated on particular + register(s) like the others. 
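The CP_REG_TEST dump above doubles as a check on the type-7 header encoding sketched earlier: opcode 0x39 with one payload dword encodes to exactly the 70b90001 in the listing.

```python
assert pkt7_hdr(0x39, 1) == 0x70b90001  # CP_REG_TEST header from the example dump
# the payload dword 0x00000c10 selects register offset 0xc10, bit 0
```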
+ + + + Tracks GRAS_LRZ_CNTL::GREATER, GRAS_LRZ_CNTL::DIR, and + GRAS_LRZ_DEPTH_VIEW with previous values, and if one of + the following is true: + - GRAS_LRZ_CNTL::GREATER has changed + - GRAS_LRZ_CNTL::DIR has changed, the old value is not + CUR_DIR_GE, and the new value is not CUR_DIR_DISABLED + - GRAS_LRZ_DEPTH_VIEW has changed + then it does a LRZ_FLUSH with GRAS_LRZ_CNTL::ENABLE + forced to 1. + Only exists in a650_sqe.fw. + + + + + + + + + + + + + Note that the SMMU's definition of TTBRn can take different forms + depending on the pgtable format. But a5xx+ only uses aarch64 + format. + + + + + + + + + + Unused, does not apply to aarch64 pgtable format + + + + + + + + + + + + + Size of prefix for each bin. For each bin index i, the + prefix commands at PREFIX_ADDR + i * PREFIX_DWORDS are + executed in an IB2 before the IB1 commands following + this packet. + + + + Number of dwords after this packet until CP_END_BIN + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Best guess is that it is a faster way to fetch all the VSC_STATE registers + and keep them in a local scratch memory instead of fetching every time + when skipping IBs. + + + + + + Scratch memory size is 48 dwords` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/extra/qcom_gpu_driver/gen.sh b/extra/qcom_gpu_driver/gen.sh new file mode 100755 index 00000000..ad5d8b2f --- /dev/null +++ b/extra/qcom_gpu_driver/gen.sh @@ -0,0 +1,2 @@ +#!/usr/bin/sh +clang2py msm_kgsl.h -o msm_kgsl.py \ No newline at end of file diff --git a/extra/qcom_gpu_driver/msm_kgsl.h b/extra/qcom_gpu_driver/msm_kgsl.h new file mode 100644 index 00000000..cae035d3 --- /dev/null +++ b/extra/qcom_gpu_driver/msm_kgsl.h @@ -0,0 +1,1451 @@ +#ifndef _UAPI_MSM_KGSL_H +#define _UAPI_MSM_KGSL_H + +#include +#include +#define size_t unsigned long +#define uint64_t unsigned long + +/* + * The KGSL version has proven not to be very useful in userspace if features + * are cherry picked into other trees out of order so it is frozen as of 3.14. + * It is left here for backwards compatabilty and as a reminder that + * software releases are never linear. Also, I like pie. + */ + +#define KGSL_VERSION_MAJOR 3 +#define KGSL_VERSION_MINOR 14 + +/* + * We have traditionally mixed context and issueibcmds / command batch flags + * together into a big flag stew. This worked fine until we started adding a + * lot more command batch flags and we started running out of bits. Turns out + * we have a bit of room in the context type / priority mask that we could use + * for command batches, but that means we need to split out the flags into two + * coherent sets. + * + * If any future definitions are for both context and cmdbatch add both defines + * and link the cmdbatch to the context define as we do below. Otherwise feel + * free to add exclusive bits to either set. 
+ */ + +/* --- context flags --- */ +#define KGSL_CONTEXT_SAVE_GMEM 0x00000001 +#define KGSL_CONTEXT_NO_GMEM_ALLOC 0x00000002 +/* This is a cmdbatch exclusive flag - use the CMDBATCH equivalent instead */ +#define KGSL_CONTEXT_SUBMIT_IB_LIST 0x00000004 +#define KGSL_CONTEXT_CTX_SWITCH 0x00000008 +#define KGSL_CONTEXT_PREAMBLE 0x00000010 +#define KGSL_CONTEXT_TRASH_STATE 0x00000020 +#define KGSL_CONTEXT_PER_CONTEXT_TS 0x00000040 +#define KGSL_CONTEXT_USER_GENERATED_TS 0x00000080 +/* This is a cmdbatch exclusive flag - use the CMDBATCH equivalent instead */ +#define KGSL_CONTEXT_END_OF_FRAME 0x00000100 +#define KGSL_CONTEXT_NO_FAULT_TOLERANCE 0x00000200 +/* This is a cmdbatch exclusive flag - use the CMDBATCH equivalent instead */ +#define KGSL_CONTEXT_SYNC 0x00000400 +#define KGSL_CONTEXT_PWR_CONSTRAINT 0x00000800 + +#define KGSL_CONTEXT_PRIORITY_MASK 0x0000F000 +#define KGSL_CONTEXT_PRIORITY_SHIFT 12 +#define KGSL_CONTEXT_PRIORITY_UNDEF 0 + +#define KGSL_CONTEXT_IFH_NOP 0x00010000 +#define KGSL_CONTEXT_SECURE 0x00020000 + +#define KGSL_CONTEXT_PREEMPT_STYLE_MASK 0x0E000000 +#define KGSL_CONTEXT_PREEMPT_STYLE_SHIFT 25 +#define KGSL_CONTEXT_PREEMPT_STYLE_DEFAULT 0x0 +#define KGSL_CONTEXT_PREEMPT_STYLE_RINGBUFFER 0x1 +#define KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN 0x2 + +#define KGSL_CONTEXT_TYPE_MASK 0x01F00000 +#define KGSL_CONTEXT_TYPE_SHIFT 20 +#define KGSL_CONTEXT_TYPE_ANY 0 +#define KGSL_CONTEXT_TYPE_GL 1 +#define KGSL_CONTEXT_TYPE_CL 2 +#define KGSL_CONTEXT_TYPE_C2D 3 +#define KGSL_CONTEXT_TYPE_RS 4 +#define KGSL_CONTEXT_TYPE_UNKNOWN 0x1E + +#define KGSL_CONTEXT_INVALID 0xffffffff + +/* + * --- command batch flags --- + * The bits that are linked to a KGSL_CONTEXT equivalent are either legacy + * definitions or bits that are valid for both contexts and cmdbatches. To be + * safe the other 8 bits that are still available in the context field should be + * omitted here in case we need to share - the other bits are available for + * cmdbatch only flags as needed + */ +#define KGSL_CMDBATCH_MEMLIST 0x00000001 +#define KGSL_CMDBATCH_MARKER 0x00000002 +#define KGSL_CMDBATCH_SUBMIT_IB_LIST KGSL_CONTEXT_SUBMIT_IB_LIST /* 0x004 */ +#define KGSL_CMDBATCH_CTX_SWITCH KGSL_CONTEXT_CTX_SWITCH /* 0x008 */ +#define KGSL_CMDBATCH_PROFILING 0x00000010 +#define KGSL_CMDBATCH_PROFILING_KTIME 0x00000020 +#define KGSL_CMDBATCH_END_OF_FRAME KGSL_CONTEXT_END_OF_FRAME /* 0x100 */ +#define KGSL_CMDBATCH_SYNC KGSL_CONTEXT_SYNC /* 0x400 */ +#define KGSL_CMDBATCH_PWR_CONSTRAINT KGSL_CONTEXT_PWR_CONSTRAINT /* 0x800 */ + +/* + * Reserve bits [16:19] and bits [28:31] for possible bits shared between + * contexts and command batches. Update this comment as new flags are added. 
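As a sketch of how these masks compose (constants assumed to come from the clang2py-generated msm_kgsl.py; the specific combination is illustrative):

```python
from msm_kgsl import *  # generated by gen.sh (assumed import path)

# a CL-type context using per-context, user-generated timestamps
flags = (KGSL_CONTEXT_PER_CONTEXT_TS | KGSL_CONTEXT_USER_GENERATED_TS
         | (KGSL_CONTEXT_TYPE_CL << KGSL_CONTEXT_TYPE_SHIFT))
assert (flags & KGSL_CONTEXT_TYPE_MASK) >> KGSL_CONTEXT_TYPE_SHIFT == KGSL_CONTEXT_TYPE_CL
```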
+ */ + +/* + * gpu_command_object flags - these flags communicate the type of command or + * memory object being submitted for a GPU command + */ + +/* Flags for GPU command objects */ +#define KGSL_CMDLIST_IB 0x00000001U +#define KGSL_CMDLIST_CTXTSWITCH_PREAMBLE 0x00000002U +#define KGSL_CMDLIST_IB_PREAMBLE 0x00000004U + +/* Flags for GPU command memory objects */ +#define KGSL_OBJLIST_MEMOBJ 0x00000008U +#define KGSL_OBJLIST_PROFILE 0x00000010U + +/* Flags for GPU command sync points */ +#define KGSL_CMD_SYNCPOINT_TYPE_TIMESTAMP 0 +#define KGSL_CMD_SYNCPOINT_TYPE_FENCE 1 + +/* --- Memory allocation flags --- */ + +/* General allocation hints */ +#define KGSL_MEMFLAGS_SECURE 0x00000008ULL +#define KGSL_MEMFLAGS_GPUREADONLY 0x01000000U +#define KGSL_MEMFLAGS_GPUWRITEONLY 0x02000000U +#define KGSL_MEMFLAGS_FORCE_32BIT 0x100000000ULL + +/* Memory caching hints */ +#define KGSL_CACHEMODE_MASK 0x0C000000U +#define KGSL_CACHEMODE_SHIFT 26 + +#define KGSL_CACHEMODE_WRITECOMBINE 0 +#define KGSL_CACHEMODE_UNCACHED 1 +#define KGSL_CACHEMODE_WRITETHROUGH 2 +#define KGSL_CACHEMODE_WRITEBACK 3 + +#define KGSL_MEMFLAGS_USE_CPU_MAP 0x10000000ULL + +/* Memory types for which allocations are made */ +#define KGSL_MEMTYPE_MASK 0x0000FF00 +#define KGSL_MEMTYPE_SHIFT 8 + +#define KGSL_MEMTYPE_OBJECTANY 0 +#define KGSL_MEMTYPE_FRAMEBUFFER 1 +#define KGSL_MEMTYPE_RENDERBUFFER 2 +#define KGSL_MEMTYPE_ARRAYBUFFER 3 +#define KGSL_MEMTYPE_ELEMENTARRAYBUFFER 4 +#define KGSL_MEMTYPE_VERTEXARRAYBUFFER 5 +#define KGSL_MEMTYPE_TEXTURE 6 +#define KGSL_MEMTYPE_SURFACE 7 +#define KGSL_MEMTYPE_EGL_SURFACE 8 +#define KGSL_MEMTYPE_GL 9 +#define KGSL_MEMTYPE_CL 10 +#define KGSL_MEMTYPE_CL_BUFFER_MAP 11 +#define KGSL_MEMTYPE_CL_BUFFER_NOMAP 12 +#define KGSL_MEMTYPE_CL_IMAGE_MAP 13 +#define KGSL_MEMTYPE_CL_IMAGE_NOMAP 14 +#define KGSL_MEMTYPE_CL_KERNEL_STACK 15 +#define KGSL_MEMTYPE_COMMAND 16 +#define KGSL_MEMTYPE_2D 17 +#define KGSL_MEMTYPE_EGL_IMAGE 18 +#define KGSL_MEMTYPE_EGL_SHADOW 19 +#define KGSL_MEMTYPE_MULTISAMPLE 20 +#define KGSL_MEMTYPE_KERNEL 255 + +/* + * Alignment hint, passed as the power of 2 exponent. + * i.e 4k (2^12) would be 12, 64k (2^16)would be 16. + */ +#define KGSL_MEMALIGN_MASK 0x00FF0000 +#define KGSL_MEMALIGN_SHIFT 16 + +enum kgsl_user_mem_type { + KGSL_USER_MEM_TYPE_PMEM = 0x00000000, + KGSL_USER_MEM_TYPE_ASHMEM = 0x00000001, + KGSL_USER_MEM_TYPE_ADDR = 0x00000002, + KGSL_USER_MEM_TYPE_ION = 0x00000003, + /* + * ION type is retained for backwards compatibilty but Ion buffers are + * dma-bufs so try to use that naming if we can + */ + KGSL_USER_MEM_TYPE_DMABUF = 0x00000003, + KGSL_USER_MEM_TYPE_MAX = 0x00000007, +}; +#define KGSL_MEMFLAGS_USERMEM_MASK 0x000000e0 +#define KGSL_MEMFLAGS_USERMEM_SHIFT 5 + +/* + * Unfortunately, enum kgsl_user_mem_type starts at 0 which does not + * leave a good value for allocated memory. In the flags we use + * 0 to indicate allocated memory and thus need to add 1 to the enum + * values. 
+ */ +#define KGSL_USERMEM_FLAG(x) (((x) + 1) << KGSL_MEMFLAGS_USERMEM_SHIFT) + +#define KGSL_MEMFLAGS_NOT_USERMEM 0 +#define KGSL_MEMFLAGS_USERMEM_PMEM KGSL_USERMEM_FLAG(KGSL_USER_MEM_TYPE_PMEM) +#define KGSL_MEMFLAGS_USERMEM_ASHMEM \ + KGSL_USERMEM_FLAG(KGSL_USER_MEM_TYPE_ASHMEM) +#define KGSL_MEMFLAGS_USERMEM_ADDR KGSL_USERMEM_FLAG(KGSL_USER_MEM_TYPE_ADDR) +#define KGSL_MEMFLAGS_USERMEM_ION KGSL_USERMEM_FLAG(KGSL_USER_MEM_TYPE_ION) + +/* --- generic KGSL flag values --- */ + +#define KGSL_FLAGS_NORMALMODE 0x00000000 +#define KGSL_FLAGS_SAFEMODE 0x00000001 +#define KGSL_FLAGS_INITIALIZED0 0x00000002 +#define KGSL_FLAGS_INITIALIZED 0x00000004 +#define KGSL_FLAGS_STARTED 0x00000008 +#define KGSL_FLAGS_ACTIVE 0x00000010 +#define KGSL_FLAGS_RESERVED0 0x00000020 +#define KGSL_FLAGS_RESERVED1 0x00000040 +#define KGSL_FLAGS_RESERVED2 0x00000080 +#define KGSL_FLAGS_SOFT_RESET 0x00000100 +#define KGSL_FLAGS_PER_CONTEXT_TIMESTAMPS 0x00000200 + +/* Server Side Sync Timeout in milliseconds */ +#define KGSL_SYNCOBJ_SERVER_TIMEOUT 2000 + +/* + * Reset status values for context + */ +enum kgsl_ctx_reset_stat { + KGSL_CTX_STAT_NO_ERROR = 0x00000000, + KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT = 0x00000001, + KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT = 0x00000002, + KGSL_CTX_STAT_UNKNOWN_CONTEXT_RESET_EXT = 0x00000003 +}; + +#define KGSL_CONVERT_TO_MBPS(val) \ + (val*1000*1000U) + +/* device id */ +enum kgsl_deviceid { + KGSL_DEVICE_3D0 = 0x00000000, + KGSL_DEVICE_MAX +}; + +struct kgsl_devinfo { + + unsigned int device_id; + /* chip revision id + * coreid:8 majorrev:8 minorrev:8 patch:8 + */ + unsigned int chip_id; + unsigned int mmu_enabled; + unsigned long gmem_gpubaseaddr; + /* + * This field contains the adreno revision + * number 200, 205, 220, etc... + */ + unsigned int gpu_id; + size_t gmem_sizebytes; +}; + +/* + * struct kgsl_devmemstore - this structure defines the region of memory + * that can be mmap()ed from this driver. The timestamp fields are volatile + * because they are written by the GPU + * @soptimestamp: Start of pipeline timestamp written by GPU before the + * commands in concern are processed + * @sbz: Unused, kept for 8 byte alignment + * @eoptimestamp: End of pipeline timestamp written by GPU after the + * commands in concern are processed + * @sbz2: Unused, kept for 8 byte alignment + * @preempted: Indicates if the context was preempted + * @sbz3: Unused, kept for 8 byte alignment + * @ref_wait_ts: Timestamp on which to generate interrupt, unused now. 
+ * @sbz4: Unused, kept for 8 byte alignment + * @current_context: The current context the GPU is working on + * @sbz5: Unused, kept for 8 byte alignment + */ +struct kgsl_devmemstore { + volatile unsigned int soptimestamp; + unsigned int sbz; + volatile unsigned int eoptimestamp; + unsigned int sbz2; + volatile unsigned int preempted; + unsigned int sbz3; + volatile unsigned int ref_wait_ts; + unsigned int sbz4; + unsigned int current_context; + unsigned int sbz5; +}; + +#define KGSL_MEMSTORE_OFFSET(ctxt_id, field) \ + ((ctxt_id)*sizeof(struct kgsl_devmemstore) + \ + offsetof(struct kgsl_devmemstore, field)) + +/* timestamp id*/ +enum kgsl_timestamp_type { + KGSL_TIMESTAMP_CONSUMED = 0x00000001, /* start-of-pipeline timestamp */ + KGSL_TIMESTAMP_RETIRED = 0x00000002, /* end-of-pipeline timestamp*/ + KGSL_TIMESTAMP_QUEUED = 0x00000003, +}; + +/* property types - used with kgsl_device_getproperty */ +#define KGSL_PROP_DEVICE_INFO 0x1 +#define KGSL_PROP_DEVICE_SHADOW 0x2 +#define KGSL_PROP_DEVICE_POWER 0x3 +#define KGSL_PROP_SHMEM 0x4 +#define KGSL_PROP_SHMEM_APERTURES 0x5 +#define KGSL_PROP_MMU_ENABLE 0x6 +#define KGSL_PROP_INTERRUPT_WAITS 0x7 +#define KGSL_PROP_VERSION 0x8 +#define KGSL_PROP_GPU_RESET_STAT 0x9 +#define KGSL_PROP_PWRCTRL 0xE +#define KGSL_PROP_PWR_CONSTRAINT 0x12 +#define KGSL_PROP_UCHE_GMEM_VADDR 0x13 +#define KGSL_PROP_SP_GENERIC_MEM 0x14 +#define KGSL_PROP_UCODE_VERSION 0x15 +#define KGSL_PROP_GPMU_VERSION 0x16 +#define KGSL_PROP_DEVICE_BITNESS 0x18 + +struct kgsl_shadowprop { + unsigned long gpuaddr; + size_t size; + unsigned int flags; /* contains KGSL_FLAGS_ values */ +}; + +struct kgsl_version { + unsigned int drv_major; + unsigned int drv_minor; + unsigned int dev_major; + unsigned int dev_minor; +}; + +struct kgsl_sp_generic_mem { + uint64_t local; + uint64_t pvt; +}; + +struct kgsl_ucode_version { + unsigned int pfp; + unsigned int pm4; +}; + +struct kgsl_gpmu_version { + unsigned int major; + unsigned int minor; + unsigned int features; +}; + +/* Performance counter groups */ + +#define KGSL_PERFCOUNTER_GROUP_CP 0x0 +#define KGSL_PERFCOUNTER_GROUP_RBBM 0x1 +#define KGSL_PERFCOUNTER_GROUP_PC 0x2 +#define KGSL_PERFCOUNTER_GROUP_VFD 0x3 +#define KGSL_PERFCOUNTER_GROUP_HLSQ 0x4 +#define KGSL_PERFCOUNTER_GROUP_VPC 0x5 +#define KGSL_PERFCOUNTER_GROUP_TSE 0x6 +#define KGSL_PERFCOUNTER_GROUP_RAS 0x7 +#define KGSL_PERFCOUNTER_GROUP_UCHE 0x8 +#define KGSL_PERFCOUNTER_GROUP_TP 0x9 +#define KGSL_PERFCOUNTER_GROUP_SP 0xA +#define KGSL_PERFCOUNTER_GROUP_RB 0xB +#define KGSL_PERFCOUNTER_GROUP_PWR 0xC +#define KGSL_PERFCOUNTER_GROUP_VBIF 0xD +#define KGSL_PERFCOUNTER_GROUP_VBIF_PWR 0xE +#define KGSL_PERFCOUNTER_GROUP_MH 0xF +#define KGSL_PERFCOUNTER_GROUP_PA_SU 0x10 +#define KGSL_PERFCOUNTER_GROUP_SQ 0x11 +#define KGSL_PERFCOUNTER_GROUP_SX 0x12 +#define KGSL_PERFCOUNTER_GROUP_TCF 0x13 +#define KGSL_PERFCOUNTER_GROUP_TCM 0x14 +#define KGSL_PERFCOUNTER_GROUP_TCR 0x15 +#define KGSL_PERFCOUNTER_GROUP_L2 0x16 +#define KGSL_PERFCOUNTER_GROUP_VSC 0x17 +#define KGSL_PERFCOUNTER_GROUP_CCU 0x18 +#define KGSL_PERFCOUNTER_GROUP_LRZ 0x19 +#define KGSL_PERFCOUNTER_GROUP_CMP 0x1A +#define KGSL_PERFCOUNTER_GROUP_ALWAYSON 0x1B +#define KGSL_PERFCOUNTER_GROUP_SP_PWR 0x1C +#define KGSL_PERFCOUNTER_GROUP_TP_PWR 0x1D +#define KGSL_PERFCOUNTER_GROUP_RB_PWR 0x1E +#define KGSL_PERFCOUNTER_GROUP_CCU_PWR 0x1F +#define KGSL_PERFCOUNTER_GROUP_UCHE_PWR 0x20 +#define KGSL_PERFCOUNTER_GROUP_CP_PWR 0x21 +#define KGSL_PERFCOUNTER_GROUP_GPMU_PWR 0x22 +#define KGSL_PERFCOUNTER_GROUP_ALWAYSON_PWR 0x23 +#define 
KGSL_PERFCOUNTER_GROUP_MAX 0x24 + +#define KGSL_PERFCOUNTER_NOT_USED 0xFFFFFFFF +#define KGSL_PERFCOUNTER_BROKEN 0xFFFFFFFE + +/* structure holds list of ibs */ +struct kgsl_ibdesc { + unsigned long gpuaddr; + unsigned long __pad; + size_t sizedwords; + unsigned int ctrl; +}; + +/** + * struct kgsl_cmdbatch_profiling_buffer + * @wall_clock_s: Ringbuffer submission time (seconds). + * If KGSL_CMDBATCH_PROFILING_KTIME is set, time is provided + * in kernel clocks, otherwise wall clock time is used. + * @wall_clock_ns: Ringbuffer submission time (nanoseconds). + * If KGSL_CMDBATCH_PROFILING_KTIME is set time is provided + * in kernel clocks, otherwise wall clock time is used. + * @gpu_ticks_queued: GPU ticks at ringbuffer submission + * @gpu_ticks_submitted: GPU ticks when starting cmdbatch execution + * @gpu_ticks_retired: GPU ticks when finishing cmdbatch execution + * + * This structure defines the profiling buffer used to measure cmdbatch + * execution time + */ +struct kgsl_cmdbatch_profiling_buffer { + uint64_t wall_clock_s; + uint64_t wall_clock_ns; + uint64_t gpu_ticks_queued; + uint64_t gpu_ticks_submitted; + uint64_t gpu_ticks_retired; +}; + +/* ioctls */ +#define KGSL_IOC_TYPE 0x09 + +/* get misc info about the GPU + type should be a value from enum kgsl_property_type + value points to a structure that varies based on type + sizebytes is sizeof() that structure + for KGSL_PROP_DEVICE_INFO, use struct kgsl_devinfo + this structure contaings hardware versioning info. + for KGSL_PROP_DEVICE_SHADOW, use struct kgsl_shadowprop + this is used to find mmap() offset and sizes for mapping + struct kgsl_memstore into userspace. +*/ +struct kgsl_device_getproperty { + unsigned int type; + void __user *value; + size_t sizebytes; +}; + +#define IOCTL_KGSL_DEVICE_GETPROPERTY \ + _IOWR(KGSL_IOC_TYPE, 0x2, struct kgsl_device_getproperty) + +/* IOCTL_KGSL_DEVICE_READ (0x3) - removed 03/2012 + */ + +/* block until the GPU has executed past a given timestamp + * timeout is in milliseconds. + */ +struct kgsl_device_waittimestamp { + unsigned int timestamp; + unsigned int timeout; +}; + +#define IOCTL_KGSL_DEVICE_WAITTIMESTAMP \ + _IOW(KGSL_IOC_TYPE, 0x6, struct kgsl_device_waittimestamp) + +struct kgsl_device_waittimestamp_ctxtid { + unsigned int context_id; + unsigned int timestamp; + unsigned int timeout; +}; + +#define IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID \ + _IOW(KGSL_IOC_TYPE, 0x7, struct kgsl_device_waittimestamp_ctxtid) + +/* DEPRECATED: issue indirect commands to the GPU. + * drawctxt_id must have been created with IOCTL_KGSL_DRAWCTXT_CREATE + * ibaddr and sizedwords must specify a subset of a buffer created + * with IOCTL_KGSL_SHAREDMEM_FROM_PMEM + * flags may be a mask of KGSL_CONTEXT_ values + * timestamp is a returned counter value which can be passed to + * other ioctls to determine when the commands have been executed by + * the GPU. 
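A sketch of driving IOCTL_KGSL_DEVICE_GETPROPERTY from Python with the generated bindings (the struct_* class names follow clang2py's convention; the ioctl number is computed by hand because function-like macros such as _IOWR don't survive clang2py):

```python
import ctypes, fcntl, os
from msm_kgsl import struct_kgsl_devinfo, struct_kgsl_device_getproperty, KGSL_PROP_DEVICE_INFO

def _IOWR(typ, nr, size):
    # standard Linux ioctl encoding: dir(2 bits) | size(14) | type(8) | nr(8)
    return (3 << 30) | (size << 16) | (typ << 8) | nr

fd = os.open("/dev/kgsl-3d0", os.O_RDWR)  # assumed device node
info = struct_kgsl_devinfo()
arg = struct_kgsl_device_getproperty(type=KGSL_PROP_DEVICE_INFO,
    value=ctypes.cast(ctypes.pointer(info), ctypes.c_void_p),
    sizebytes=ctypes.sizeof(info))
fcntl.ioctl(fd, _IOWR(0x09, 0x2, ctypes.sizeof(arg)), arg)
print(f"chip_id={info.chip_id:#x} gpu_id={info.gpu_id}")
```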
+ * + * This function is deprecated - consider using IOCTL_KGSL_SUBMIT_COMMANDS + * instead + */ +struct kgsl_ringbuffer_issueibcmds { + unsigned int drawctxt_id; + unsigned long ibdesc_addr; + unsigned int numibs; + unsigned int timestamp; /*output param */ + unsigned int flags; +}; + +#define IOCTL_KGSL_RINGBUFFER_ISSUEIBCMDS \ + _IOWR(KGSL_IOC_TYPE, 0x10, struct kgsl_ringbuffer_issueibcmds) + +/* read the most recently executed timestamp value + * type should be a value from enum kgsl_timestamp_type + */ +struct kgsl_cmdstream_readtimestamp { + unsigned int type; + unsigned int timestamp; /*output param */ +}; + +#define IOCTL_KGSL_CMDSTREAM_READTIMESTAMP_OLD \ + _IOR(KGSL_IOC_TYPE, 0x11, struct kgsl_cmdstream_readtimestamp) + +#define IOCTL_KGSL_CMDSTREAM_READTIMESTAMP \ + _IOWR(KGSL_IOC_TYPE, 0x11, struct kgsl_cmdstream_readtimestamp) + +/* free memory when the GPU reaches a given timestamp. + * gpuaddr specifies a memory region created by a + * IOCTL_KGSL_SHAREDMEM_FROM_PMEM call + * type should be a value from enum kgsl_timestamp_type + */ +struct kgsl_cmdstream_freememontimestamp { + unsigned long gpuaddr; + unsigned int type; + unsigned int timestamp; +}; + +#define IOCTL_KGSL_CMDSTREAM_FREEMEMONTIMESTAMP \ + _IOW(KGSL_IOC_TYPE, 0x12, struct kgsl_cmdstream_freememontimestamp) + +/* Previous versions of this header had incorrectly defined + IOCTL_KGSL_CMDSTREAM_FREEMEMONTIMESTAMP as a read-only ioctl instead + of a write only ioctl. To ensure binary compatibility, the following + #define will be used to intercept the incorrect ioctl +*/ + +#define IOCTL_KGSL_CMDSTREAM_FREEMEMONTIMESTAMP_OLD \ + _IOR(KGSL_IOC_TYPE, 0x12, struct kgsl_cmdstream_freememontimestamp) + +/* create a draw context, which is used to preserve GPU state. + * The flags field may contain a mask of KGSL_CONTEXT_* values + */ +struct kgsl_drawctxt_create { + unsigned int flags; + unsigned int drawctxt_id; /*output param */ +}; + +#define IOCTL_KGSL_DRAWCTXT_CREATE \ + _IOWR(KGSL_IOC_TYPE, 0x13, struct kgsl_drawctxt_create) + +/* destroy a draw context */ +struct kgsl_drawctxt_destroy { + unsigned int drawctxt_id; +}; + +#define IOCTL_KGSL_DRAWCTXT_DESTROY \ + _IOW(KGSL_IOC_TYPE, 0x14, struct kgsl_drawctxt_destroy) + +/* add a block of pmem, fb, ashmem or user allocated address + * into the GPU address space */ +struct kgsl_map_user_mem { + int fd; + unsigned long gpuaddr; /*output param */ + size_t len; + size_t offset; + unsigned long hostptr; /*input param */ + enum kgsl_user_mem_type memtype; + unsigned int flags; +}; + +#define IOCTL_KGSL_MAP_USER_MEM \ + _IOWR(KGSL_IOC_TYPE, 0x15, struct kgsl_map_user_mem) + +struct kgsl_cmdstream_readtimestamp_ctxtid { + unsigned int context_id; + unsigned int type; + unsigned int timestamp; /*output param */ +}; + +#define IOCTL_KGSL_CMDSTREAM_READTIMESTAMP_CTXTID \ + _IOWR(KGSL_IOC_TYPE, 0x16, struct kgsl_cmdstream_readtimestamp_ctxtid) + +struct kgsl_cmdstream_freememontimestamp_ctxtid { + unsigned int context_id; + unsigned long gpuaddr; + unsigned int type; + unsigned int timestamp; +}; + +#define IOCTL_KGSL_CMDSTREAM_FREEMEMONTIMESTAMP_CTXTID \ + _IOW(KGSL_IOC_TYPE, 0x17, \ + struct kgsl_cmdstream_freememontimestamp_ctxtid) + +/* add a block of pmem or fb into the GPU address space */ +struct kgsl_sharedmem_from_pmem { + int pmem_fd; + unsigned long gpuaddr; /*output param */ + unsigned int len; + unsigned int offset; +}; + +#define IOCTL_KGSL_SHAREDMEM_FROM_PMEM \ + _IOWR(KGSL_IOC_TYPE, 0x20, struct kgsl_sharedmem_from_pmem) + +/* remove memory from the GPU's address space
*/ +struct kgsl_sharedmem_free { + unsigned long gpuaddr; +}; + +#define IOCTL_KGSL_SHAREDMEM_FREE \ + _IOW(KGSL_IOC_TYPE, 0x21, struct kgsl_sharedmem_free) + +struct kgsl_cff_user_event { + unsigned char cff_opcode; + unsigned int op1; + unsigned int op2; + unsigned int op3; + unsigned int op4; + unsigned int op5; + unsigned int __pad[2]; +}; + +#define IOCTL_KGSL_CFF_USER_EVENT \ + _IOW(KGSL_IOC_TYPE, 0x31, struct kgsl_cff_user_event) + +struct kgsl_gmem_desc { + unsigned int x; + unsigned int y; + unsigned int width; + unsigned int height; + unsigned int pitch; +}; + +struct kgsl_buffer_desc { + void *hostptr; + unsigned long gpuaddr; + int size; + unsigned int format; + unsigned int pitch; + unsigned int enabled; +}; + +struct kgsl_bind_gmem_shadow { + unsigned int drawctxt_id; + struct kgsl_gmem_desc gmem_desc; + unsigned int shadow_x; + unsigned int shadow_y; + struct kgsl_buffer_desc shadow_buffer; + unsigned int buffer_id; +}; + +#define IOCTL_KGSL_DRAWCTXT_BIND_GMEM_SHADOW \ + _IOW(KGSL_IOC_TYPE, 0x22, struct kgsl_bind_gmem_shadow) + +/* add a block of memory into the GPU address space */ + +/* + * IOCTL_KGSL_SHAREDMEM_FROM_VMALLOC deprecated 09/2012 + * use IOCTL_KGSL_GPUMEM_ALLOC instead + */ + +struct kgsl_sharedmem_from_vmalloc { + unsigned long gpuaddr; /*output param */ + unsigned int hostptr; + unsigned int flags; +}; + +#define IOCTL_KGSL_SHAREDMEM_FROM_VMALLOC \ + _IOWR(KGSL_IOC_TYPE, 0x23, struct kgsl_sharedmem_from_vmalloc) + +/* + * This is being deprecated in favor of IOCTL_KGSL_GPUMEM_CACHE_SYNC which + * supports both directions (flush and invalidate). This code will still + * work, but by definition it will do a flush of the cache which might not be + * what you want to have happen on a buffer following a GPU operation. It is + * safer to go with IOCTL_KGSL_GPUMEM_CACHE_SYNC + */ + +#define IOCTL_KGSL_SHAREDMEM_FLUSH_CACHE \ + _IOW(KGSL_IOC_TYPE, 0x24, struct kgsl_sharedmem_free) + +struct kgsl_drawctxt_set_bin_base_offset { + unsigned int drawctxt_id; + unsigned int offset; +}; + +#define IOCTL_KGSL_DRAWCTXT_SET_BIN_BASE_OFFSET \ + _IOW(KGSL_IOC_TYPE, 0x25, struct kgsl_drawctxt_set_bin_base_offset) + +enum kgsl_cmdwindow_type { + KGSL_CMDWINDOW_MIN = 0x00000000, + KGSL_CMDWINDOW_2D = 0x00000000, + KGSL_CMDWINDOW_3D = 0x00000001, /* legacy */ + KGSL_CMDWINDOW_MMU = 0x00000002, + KGSL_CMDWINDOW_ARBITER = 0x000000FF, + KGSL_CMDWINDOW_MAX = 0x000000FF, +}; + +/* write to the command window */ +struct kgsl_cmdwindow_write { + enum kgsl_cmdwindow_type target; + unsigned int addr; + unsigned int data; +}; + +#define IOCTL_KGSL_CMDWINDOW_WRITE \ + _IOW(KGSL_IOC_TYPE, 0x2e, struct kgsl_cmdwindow_write) + +struct kgsl_gpumem_alloc { + unsigned long gpuaddr; /* output param */ + size_t size; + unsigned int flags; +}; + +#define IOCTL_KGSL_GPUMEM_ALLOC \ + _IOWR(KGSL_IOC_TYPE, 0x2f, struct kgsl_gpumem_alloc) + +struct kgsl_cff_syncmem { + unsigned long gpuaddr; + size_t len; + unsigned int __pad[2]; /* For future binary compatibility */ +}; + +#define IOCTL_KGSL_CFF_SYNCMEM \ + _IOW(KGSL_IOC_TYPE, 0x30, struct kgsl_cff_syncmem) + +/* + * A timestamp event allows the user space to register an action following an + * expired timestamp. Note IOCTL_KGSL_TIMESTAMP_EVENT has been redefined to + * _IOWR to support fences which need to return a fd for the priv parameter. 
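Continuing that sketch, IOCTL_KGSL_GPUMEM_ALLOC (0x2f above) hands back a GPU address for a fresh allocation (fd and the _IOWR helper are reused from the earlier snippet):

```python
from msm_kgsl import struct_kgsl_gpumem_alloc  # clang2py class name assumed

alloc = struct_kgsl_gpumem_alloc(size=0x1000, flags=0)  # one page, no special flags
fcntl.ioctl(fd, _IOWR(0x09, 0x2f, ctypes.sizeof(alloc)), alloc)
print(f"gpuaddr={alloc.gpuaddr:#x}")  # filled in by the kernel
```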
+ */ + +struct kgsl_timestamp_event { + int type; /* Type of event (see list below) */ + unsigned int timestamp; /* Timestamp to trigger event on */ + unsigned int context_id; /* Context for the timestamp */ + void __user *priv; /* Pointer to the event specific blob */ + size_t len; /* Size of the event specific blob */ +}; + +#define IOCTL_KGSL_TIMESTAMP_EVENT_OLD \ + _IOW(KGSL_IOC_TYPE, 0x31, struct kgsl_timestamp_event) + +/* A genlock timestamp event releases an existing lock on timestamp expire */ + +#define KGSL_TIMESTAMP_EVENT_GENLOCK 1 + +struct kgsl_timestamp_event_genlock { + int handle; /* Handle of the genlock lock to release */ +}; + +/* A fence timestamp event releases an existing lock on timestamp expire */ + +#define KGSL_TIMESTAMP_EVENT_FENCE 2 + +struct kgsl_timestamp_event_fence { + int fence_fd; /* Fence to signal */ +}; + +/* + * Set a property within the kernel. Uses the same structure as + * IOCTL_KGSL_GETPROPERTY + */ + +#define IOCTL_KGSL_SETPROPERTY \ + _IOW(KGSL_IOC_TYPE, 0x32, struct kgsl_device_getproperty) + +#define IOCTL_KGSL_TIMESTAMP_EVENT \ + _IOWR(KGSL_IOC_TYPE, 0x33, struct kgsl_timestamp_event) + +/** + * struct kgsl_gpumem_alloc_id - argument to IOCTL_KGSL_GPUMEM_ALLOC_ID + * @id: returned id value for this allocation. + * @flags: mask of KGSL_MEM* values requested and actual flags on return. + * @size: requested size of the allocation and actual size on return. + * @mmapsize: returned size to pass to mmap() which may be larger than 'size' + * @gpuaddr: returned GPU address for the allocation + * + * Allocate memory for access by the GPU. The flags and size fields are echoed + * back by the kernel, so that the caller can know if the request was + * adjusted. + * + * Supported flags: + * KGSL_MEMFLAGS_GPUREADONLY: the GPU will be unable to write to the buffer + * KGSL_MEMTYPE*: usage hint for debugging aid + * KGSL_MEMALIGN*: alignment hint, may be ignored or adjusted by the kernel. + * KGSL_MEMFLAGS_USE_CPU_MAP: If set on call and return, the returned GPU + * address will be 0. Calling mmap() will set the GPU address. + */ +struct kgsl_gpumem_alloc_id { + unsigned int id; + unsigned int flags; + size_t size; + size_t mmapsize; + unsigned long gpuaddr; +/* private: reserved for future use*/ + unsigned long __pad[2]; +}; + +#define IOCTL_KGSL_GPUMEM_ALLOC_ID \ + _IOWR(KGSL_IOC_TYPE, 0x34, struct kgsl_gpumem_alloc_id) + +/** + * struct kgsl_gpumem_free_id - argument to IOCTL_KGSL_GPUMEM_FREE_ID + * @id: GPU allocation id to free + * + * Free an allocation by id, in case a GPU address has not been assigned or + * is unknown. Freeing an allocation by id with this ioctl or by GPU address + * with IOCTL_KGSL_SHAREDMEM_FREE are equivalent. + */ +struct kgsl_gpumem_free_id { + unsigned int id; +/* private: reserved for future use*/ + unsigned int __pad; +}; + +#define IOCTL_KGSL_GPUMEM_FREE_ID \ + _IOWR(KGSL_IOC_TYPE, 0x35, struct kgsl_gpumem_free_id) + +/** + * struct kgsl_gpumem_get_info - argument to IOCTL_KGSL_GPUMEM_GET_INFO + * @gpuaddr: GPU address to query. Also set on return. + * @id: GPU allocation id to query. Also set on return. + * @flags: returned mask of KGSL_MEM* values. + * @size: returned size of the allocation. 
+ * @mmapsize: returned size to pass mmap(), which may be larger than 'size' + * @useraddr: returned address of the userspace mapping for this buffer + * + * This ioctl allows querying of all user visible attributes of an existing + * allocation, by either the GPU address or the id returned by a previous + * call to IOCTL_KGSL_GPUMEM_ALLOC_ID. Legacy allocation ioctls may not + * return all attributes so this ioctl can be used to look them up if needed. + * + */ +struct kgsl_gpumem_get_info { + unsigned long gpuaddr; + unsigned int id; + unsigned int flags; + size_t size; + size_t mmapsize; + unsigned long useraddr; +/* private: reserved for future use*/ + unsigned long __pad[4]; +}; + +#define IOCTL_KGSL_GPUMEM_GET_INFO\ + _IOWR(KGSL_IOC_TYPE, 0x36, struct kgsl_gpumem_get_info) + +/** + * struct kgsl_gpumem_sync_cache - argument to IOCTL_KGSL_GPUMEM_SYNC_CACHE + * @gpuaddr: GPU address of the buffer to sync. + * @id: id of the buffer to sync. Either gpuaddr or id is sufficient. + * @op: a mask of KGSL_GPUMEM_CACHE_* values + * @offset: offset into the buffer + * @length: number of bytes starting from offset to perform + * the cache operation on + * + * Sync the L2 cache for memory headed to and from the GPU - this replaces + * KGSL_SHAREDMEM_FLUSH_CACHE since it can handle cache management for both + * directions + * + */ +struct kgsl_gpumem_sync_cache { + unsigned long gpuaddr; + unsigned int id; + unsigned int op; + size_t offset; + size_t length; +}; + +#define KGSL_GPUMEM_CACHE_CLEAN (1 << 0) +#define KGSL_GPUMEM_CACHE_TO_GPU KGSL_GPUMEM_CACHE_CLEAN + +#define KGSL_GPUMEM_CACHE_INV (1 << 1) +#define KGSL_GPUMEM_CACHE_FROM_GPU KGSL_GPUMEM_CACHE_INV + +#define KGSL_GPUMEM_CACHE_FLUSH \ + (KGSL_GPUMEM_CACHE_CLEAN | KGSL_GPUMEM_CACHE_INV) + +/* Flag to ensure backwards compatibility of kgsl_gpumem_sync_cache struct */ +#define KGSL_GPUMEM_CACHE_RANGE (1 << 31U) + +#define IOCTL_KGSL_GPUMEM_SYNC_CACHE \ + _IOW(KGSL_IOC_TYPE, 0x37, struct kgsl_gpumem_sync_cache) + +/** + * struct kgsl_perfcounter_get - argument to IOCTL_KGSL_PERFCOUNTER_GET + * @groupid: Performance counter group ID + * @countable: Countable to select within the group + * @offset: Return offset of the reserved LO counter + * @offset_hi: Return offset of the reserved HI counter + * + * Get an available performance counter from a specified groupid. The offset + * of the performance counter will be returned after successfully assigning + * the countable to the counter for the specified group. An error will be + * returned and an offset of 0 if the groupid is invalid or there are no + * more counters left. After successfully getting a perfcounter, the user + * must call kgsl_perfcounter_put(groupid, contable) when finished with + * the perfcounter to clear up perfcounter resources. + * + */ +struct kgsl_perfcounter_get { + unsigned int groupid; + unsigned int countable; + unsigned int offset; + unsigned int offset_hi; +/* private: reserved for future use */ + unsigned int __pad; /* For future binary compatibility */ +}; + +#define IOCTL_KGSL_PERFCOUNTER_GET \ + _IOWR(KGSL_IOC_TYPE, 0x38, struct kgsl_perfcounter_get) + +/** + * struct kgsl_perfcounter_put - argument to IOCTL_KGSL_PERFCOUNTER_PUT + * @groupid: Performance counter group ID + * @countable: Countable to release within the group + * + * Put an allocated performance counter to allow others to have access to the + * resource that was previously taken. This is only to be called after + * successfully getting a performance counter from kgsl_perfcounter_get(). 
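The get/put pairing this comment insists on, as a sketch (group 0x0 is KGSL_PERFCOUNTER_GROUP_CP from the list earlier; _IOW differs from _IOWR only in the direction bits; fd as before):

```python
from msm_kgsl import struct_kgsl_perfcounter_get, struct_kgsl_perfcounter_put

def _IOW(typ, nr, size):
    return (1 << 30) | (size << 16) | (typ << 8) | nr

get = struct_kgsl_perfcounter_get(groupid=0x0, countable=0)
fcntl.ioctl(fd, _IOWR(0x09, 0x38, ctypes.sizeof(get)), get)
print(f"CP counter reserved at LO={get.offset:#x} HI={get.offset_hi:#x}")
# ... sample the counter registers ... then release it:
put = struct_kgsl_perfcounter_put(groupid=0x0, countable=0)
fcntl.ioctl(fd, _IOW(0x09, 0x39, ctypes.sizeof(put)), put)
```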
+ * + */ +struct kgsl_perfcounter_put { + unsigned int groupid; + unsigned int countable; +/* private: reserved for future use */ + unsigned int __pad[2]; /* For future binary compatibility */ +}; + +#define IOCTL_KGSL_PERFCOUNTER_PUT \ + _IOW(KGSL_IOC_TYPE, 0x39, struct kgsl_perfcounter_put) + +/** + * struct kgsl_perfcounter_query - argument to IOCTL_KGSL_PERFCOUNTER_QUERY + * @groupid: Performance counter group ID + * @countable: Return active countables array + * @size: Size of active countables array + * @max_counters: Return total number counters for the group ID + * + * Query the available performance counters given a groupid. The array + * *countables is used to return the current active countables in counters. + * The size of the array is passed in so the kernel will only write at most + * size or counter->size for the group id. The total number of available + * counters for the group ID is returned in max_counters. + * If the array or size passed in are invalid, then only the maximum number + * of counters will be returned, no data will be written to *countables. + * If the groupid is invalid an error code will be returned. + * + */ +struct kgsl_perfcounter_query { + unsigned int groupid; + /* Array to return the current countable for up to size counters */ + unsigned int __user *countables; + unsigned int count; + unsigned int max_counters; +/* private: reserved for future use */ + unsigned int __pad[2]; /* For future binary compatibility */ +}; + +#define IOCTL_KGSL_PERFCOUNTER_QUERY \ + _IOWR(KGSL_IOC_TYPE, 0x3A, struct kgsl_perfcounter_query) + +/** + * struct kgsl_perfcounter_query - argument to IOCTL_KGSL_PERFCOUNTER_QUERY + * @groupid: Performance counter group IDs + * @countable: Performance counter countable IDs + * @value: Return performance counter reads + * @size: Size of all arrays (groupid/countable pair and return value) + * + * Read in the current value of a performance counter given by the groupid + * and countable. + * + */ + +struct kgsl_perfcounter_read_group { + unsigned int groupid; + unsigned int countable; + unsigned long long value; +}; + +struct kgsl_perfcounter_read { + struct kgsl_perfcounter_read_group __user *reads; + unsigned int count; +/* private: reserved for future use */ + unsigned int __pad[2]; /* For future binary compatibility */ +}; + +#define IOCTL_KGSL_PERFCOUNTER_READ \ + _IOWR(KGSL_IOC_TYPE, 0x3B, struct kgsl_perfcounter_read) +/* + * struct kgsl_gpumem_sync_cache_bulk - argument to + * IOCTL_KGSL_GPUMEM_SYNC_CACHE_BULK + * @id_list: list of GPU buffer ids of the buffers to sync + * @count: number of GPU buffer ids in id_list + * @op: a mask of KGSL_GPUMEM_CACHE_* values + * + * Sync the cache for memory headed to and from the GPU. Certain + * optimizations can be made on the cache operation based on the total + * size of the working set of memory to be managed. + */ +struct kgsl_gpumem_sync_cache_bulk { + unsigned int __user *id_list; + unsigned int count; + unsigned int op; +/* private: reserved for future use */ + unsigned int __pad[2]; /* For future binary compatibility */ +}; + +#define IOCTL_KGSL_GPUMEM_SYNC_CACHE_BULK \ + _IOWR(KGSL_IOC_TYPE, 0x3C, struct kgsl_gpumem_sync_cache_bulk) + +/* + * struct kgsl_cmd_syncpoint_timestamp + * @context_id: ID of a KGSL context + * @timestamp: GPU timestamp + * + * This structure defines a syncpoint comprising a context/timestamp pair. 
A + * list of these may be passed by IOCTL_KGSL_SUBMIT_COMMANDS to define + * dependencies that must be met before the command can be submitted to the + * hardware + */ +struct kgsl_cmd_syncpoint_timestamp { + unsigned int context_id; + unsigned int timestamp; +}; + +struct kgsl_cmd_syncpoint_fence { + int fd; +}; + +/** + * struct kgsl_cmd_syncpoint - Define a sync point for a command batch + * @type: type of sync point defined here + * @priv: Pointer to the type specific buffer + * @size: Size of the type specific buffer + * + * This structure contains pointers defining a specific command sync point. + * The pointer and size should point to a type appropriate structure. + */ +struct kgsl_cmd_syncpoint { + int type; + void __user *priv; + size_t size; +}; + +/* Flag to indicate that the cmdlist may contain memlists */ +#define KGSL_IBDESC_MEMLIST 0x1 + +/* Flag to point out the cmdbatch profiling buffer in the memlist */ +#define KGSL_IBDESC_PROFILING_BUFFER 0x2 + +/** + * struct kgsl_submit_commands - Argument to IOCTL_KGSL_SUBMIT_COMMANDS + * @context_id: KGSL context ID that owns the commands + * @flags: + * @cmdlist: User pointer to a list of kgsl_ibdesc structures + * @numcmds: Number of commands listed in cmdlist + * @synclist: User pointer to a list of kgsl_cmd_syncpoint structures + * @numsyncs: Number of sync points listed in synclist + * @timestamp: On entry, a user defined timestamp; on exit, the timestamp + * assigned to the command batch + * + * This structure specifies a command to send to the GPU hardware. This is + * similar to kgsl_issueibcmds except that it doesn't support the legacy way to + * submit IB lists and it adds sync points to block the IB until the + * dependencies are satisfied. This entry point is the new and preferred way + * to submit commands to the GPU. The memory list can be used to specify all + * memory that is referenced in the current set of commands. + */ + +struct kgsl_submit_commands { + unsigned int context_id; + unsigned int flags; + struct kgsl_ibdesc __user *cmdlist; + unsigned int numcmds; + struct kgsl_cmd_syncpoint __user *synclist; + unsigned int numsyncs; + unsigned int timestamp; +/* private: reserved for future use */ + unsigned int __pad[4]; +}; + +#define IOCTL_KGSL_SUBMIT_COMMANDS \ + _IOWR(KGSL_IOC_TYPE, 0x3D, struct kgsl_submit_commands) + +/** + * struct kgsl_device_constraint - device constraint argument + * @context_id: KGSL context ID + * @type: type of constraint, i.e. pwrlevel/none + * @data: constraint data + * @size: size of the constraint data + */ +struct kgsl_device_constraint { + unsigned int type; + unsigned int context_id; + void __user *data; + size_t size; +}; + +/* Constraint Type*/ +#define KGSL_CONSTRAINT_NONE 0 +#define KGSL_CONSTRAINT_PWRLEVEL 1 + +/* PWRLEVEL constraint level*/ +/* set to min frequency */ +#define KGSL_CONSTRAINT_PWR_MIN 0 +/* set to max frequency */ +#define KGSL_CONSTRAINT_PWR_MAX 1 + +struct kgsl_device_constraint_pwrlevel { + unsigned int level; +}; + +/** + * struct kgsl_syncsource_create - Argument to IOCTL_KGSL_SYNCSOURCE_CREATE + * @id: returned id for the syncsource that was created. + * + * This ioctl creates a userspace sync timeline.
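A sketch of the preferred submission path just described, for one IB assumed to already sit in GPU-visible memory (ib_gpuaddr, ib_dwords and ctx_id are placeholders; fd and _IOWR as above):

```python
from msm_kgsl import struct_kgsl_ibdesc, struct_kgsl_submit_commands

ib = struct_kgsl_ibdesc(gpuaddr=ib_gpuaddr, sizedwords=ib_dwords, ctrl=0)
sub = struct_kgsl_submit_commands(context_id=ctx_id, flags=0,
    cmdlist=ctypes.pointer(ib), numcmds=1, numsyncs=0, timestamp=0)
fcntl.ioctl(fd, _IOWR(0x09, 0x3D, ctypes.sizeof(sub)), sub)
print(f"queued as timestamp {sub.timestamp}")  # assigned by the kernel
```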
+ */ + +struct kgsl_syncsource_create { + unsigned int id; +/* private: reserved for future use */ + unsigned int __pad[3]; +}; + +#define IOCTL_KGSL_SYNCSOURCE_CREATE \ + _IOWR(KGSL_IOC_TYPE, 0x40, struct kgsl_syncsource_create) + +/** + * struct kgsl_syncsource_destroy - Argument to IOCTL_KGSL_SYNCSOURCE_DESTROY + * @id: syncsource id to destroy + * + * This ioctl destroys a userspace sync timeline. + */ + +struct kgsl_syncsource_destroy { + unsigned int id; +/* private: reserved for future use */ + unsigned int __pad[3]; +}; + +#define IOCTL_KGSL_SYNCSOURCE_DESTROY \ + _IOWR(KGSL_IOC_TYPE, 0x41, struct kgsl_syncsource_destroy) + +/** + * struct kgsl_syncsource_create_fence - Argument to + * IOCTL_KGSL_SYNCSOURCE_CREATE_FENCE + * @id: syncsource id + * @fence_fd: returned sync_fence fd + * + * Create a fence that may be signaled by userspace by calling + * IOCTL_KGSL_SYNCSOURCE_SIGNAL_FENCE. There are no order dependencies between + * these fences. + */ +struct kgsl_syncsource_create_fence { + unsigned int id; + int fence_fd; +/* private: reserved for future use */ + unsigned int __pad[4]; +}; + +/** + * struct kgsl_syncsource_signal_fence - Argument to + * IOCTL_KGSL_SYNCSOURCE_SIGNAL_FENCE + * @id: syncsource id + * @fence_fd: sync_fence fd to signal + * + * Signal a fence that was created by an IOCTL_KGSL_SYNCSOURCE_CREATE_FENCE + * call using the same syncsource id. This allows a fence to be shared + * with other processes but only signaled by the process owning the fd + * used to create the fence. + */ +#define IOCTL_KGSL_SYNCSOURCE_CREATE_FENCE \ + _IOWR(KGSL_IOC_TYPE, 0x42, struct kgsl_syncsource_create_fence) + +struct kgsl_syncsource_signal_fence { + unsigned int id; + int fence_fd; +/* private: reserved for future use */ + unsigned int __pad[4]; +}; + +#define IOCTL_KGSL_SYNCSOURCE_SIGNAL_FENCE \ + _IOWR(KGSL_IOC_TYPE, 0x43, struct kgsl_syncsource_signal_fence) + +/** + * struct kgsl_cff_sync_gpuobj - Argument to IOCTL_KGSL_CFF_SYNC_GPUOBJ + * @offset: Offset into the GPU object to sync + * @length: Number of bytes to sync + * @id: ID of the GPU object to sync + */ +struct kgsl_cff_sync_gpuobj { + uint64_t offset; + uint64_t length; + unsigned int id; +}; + +#define IOCTL_KGSL_CFF_SYNC_GPUOBJ \ + _IOW(KGSL_IOC_TYPE, 0x44, struct kgsl_cff_sync_gpuobj) + +/** + * struct kgsl_gpuobj_alloc - Argument to IOCTL_KGSL_GPUOBJ_ALLOC + * @size: Size in bytes of the object to allocate + * @flags: mask of KGSL_MEMFLAG_* bits + * @va_len: Size in bytes of the virtual region to allocate + * @mmapsize: Returns the mmap() size of the object + * @id: Returns the GPU object ID of the new object + * @metadata_len: Length of the metadata to copy from the user + * @metadata: Pointer to the user specified metadata to store for the object + */ +struct kgsl_gpuobj_alloc { + uint64_t size; + uint64_t flags; + uint64_t va_len; + uint64_t mmapsize; + unsigned int id; + unsigned int metadata_len; + uint64_t metadata; +}; + +/* Let the user know that this header supports the gpuobj metadata */ +#define KGSL_GPUOBJ_ALLOC_METADATA_MAX 64 + +#define IOCTL_KGSL_GPUOBJ_ALLOC \ + _IOWR(KGSL_IOC_TYPE, 0x45, struct kgsl_gpuobj_alloc) + +/** + * struct kgsl_gpuobj_free - Argument to IOCTL_KGSL_GPUOBJ_FREE + * @flags: Mask of: KGSL_GPUOBJ_FREE_ON_EVENT + * @priv: Pointer to the private object if KGSL_GPUOBJ_FREE_ON_EVENT is + * specified + * @id: ID of the GPU object to free + * @type: If KGSL_GPUOBJ_FREE_ON_EVENT is specified, the type of asynchronous + * event to free on + * @len: Length of the data passed in priv + */
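The 64-bit object API above supersedes the older gpumem calls; allocating through it looks like this (a sketch; fd and _IOWR as in the earlier snippets):

```python
from msm_kgsl import struct_kgsl_gpuobj_alloc

obj = struct_kgsl_gpuobj_alloc(size=0x1000, flags=0)
fcntl.ioctl(fd, _IOWR(0x09, 0x45, ctypes.sizeof(obj)), obj)
print(f"object id={obj.id}, mmapsize={obj.mmapsize:#x}")  # id-based: no gpuaddr here
```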
+struct kgsl_gpuobj_free {
+	uint64_t flags;
+	uint64_t __user priv;
+	unsigned int id;
+	unsigned int type;
+	unsigned int len;
+};
+
+#define KGSL_GPUOBJ_FREE_ON_EVENT 1
+
+#define KGSL_GPU_EVENT_TIMESTAMP 1
+#define KGSL_GPU_EVENT_FENCE 2
+
+/**
+ * struct kgsl_gpu_event_timestamp - Specifies a timestamp event to free a GPU
+ * object on
+ * @context_id: ID of the timestamp event to wait for
+ * @timestamp: Timestamp of the timestamp event to wait for
+ */
+struct kgsl_gpu_event_timestamp {
+	unsigned int context_id;
+	unsigned int timestamp;
+};
+
+/**
+ * struct kgsl_gpu_event_fence - Specifies a fence ID to free a GPU object on
+ * @fd: File descriptor for the fence
+ */
+struct kgsl_gpu_event_fence {
+	int fd;
+};
+
+#define IOCTL_KGSL_GPUOBJ_FREE \
+	_IOW(KGSL_IOC_TYPE, 0x46, struct kgsl_gpuobj_free)
+
+/**
+ * struct kgsl_gpuobj_info - argument to IOCTL_KGSL_GPUOBJ_INFO
+ * @gpuaddr: GPU address of the object
+ * @flags: Current flags for the object
+ * @size: Size of the object
+ * @va_len: VA size of the object
+ * @va_addr: Virtual address of the object (if it is mapped)
+ * @id: GPU object ID of the object to query
+ */
+struct kgsl_gpuobj_info {
+	uint64_t gpuaddr;
+	uint64_t flags;
+	uint64_t size;
+	uint64_t va_len;
+	uint64_t va_addr;
+	unsigned int id;
+};
+
+#define IOCTL_KGSL_GPUOBJ_INFO \
+	_IOWR(KGSL_IOC_TYPE, 0x47, struct kgsl_gpuobj_info)
+
+/**
+ * struct kgsl_gpuobj_import - argument to IOCTL_KGSL_GPUOBJ_IMPORT
+ * @priv: Pointer to the private data for the import type
+ * @priv_len: Length of the private data
+ * @flags: Mask of KGSL_MEMFLAG_ flags
+ * @type: Type of the import (KGSL_USER_MEM_TYPE_*)
+ * @id: Returns the ID of the new GPU object
+ */
+struct kgsl_gpuobj_import {
+	uint64_t __user priv;
+	uint64_t priv_len;
+	uint64_t flags;
+	unsigned int type;
+	unsigned int id;
+};
+
+/**
+ * struct kgsl_gpuobj_import_dma_buf - import a dmabuf object
+ * @fd: File descriptor for the dma-buf object
+ */
+struct kgsl_gpuobj_import_dma_buf {
+	int fd;
+};
+
+/**
+ * struct kgsl_gpuobj_import_useraddr - import an object based on a useraddr
+ * @virtaddr: Virtual address of the object to import
+ */
+struct kgsl_gpuobj_import_useraddr {
+	uint64_t virtaddr;
+};
+
+#define IOCTL_KGSL_GPUOBJ_IMPORT \
+	_IOWR(KGSL_IOC_TYPE, 0x48, struct kgsl_gpuobj_import)
+
+/**
+ * struct kgsl_gpuobj_sync_obj - Individual GPU object to sync
+ * @offset: Offset within the GPU object to sync
+ * @length: Number of bytes to sync
+ * @id: ID of the GPU object to sync
+ * @op: Cache operation to execute
+ */
+
+struct kgsl_gpuobj_sync_obj {
+	uint64_t offset;
+	uint64_t length;
+	unsigned int id;
+	unsigned int op;
+};
+
+/**
+ * struct kgsl_gpuobj_sync - Argument for IOCTL_KGSL_GPUOBJ_SYNC
+ * @objs: Pointer to an array of kgsl_gpuobj_sync_obj structs
+ * @obj_len: Size of each item in the array
+ * @count: Number of items in the array
+ */
+
+struct kgsl_gpuobj_sync {
+	uint64_t __user objs;
+	unsigned int obj_len;
+	unsigned int count;
+};
+
+#define IOCTL_KGSL_GPUOBJ_SYNC \
+	_IOW(KGSL_IOC_TYPE, 0x49, struct kgsl_gpuobj_sync)
+
+/**
+ * struct kgsl_command_object - GPU command object
+ * @offset: GPU address offset of the object
+ * @gpuaddr: GPU address of the object
+ * @size: Size of the object
+ * @flags: Current flags for the object
+ * @id: GPU command object ID
+ */
+struct kgsl_command_object {
+	uint64_t offset;
+	uint64_t gpuaddr;
+	uint64_t size;
+	unsigned int flags;
+	unsigned int id;
+};
+
+/**
+ * struct kgsl_command_syncpoint - GPU syncpoint object
+ * @priv: Pointer to the type specific buffer
+ * @size: Size of the type specific buffer
+ * @type: type of sync point defined here
+ */
+struct kgsl_command_syncpoint {
+	uint64_t __user priv;
+	uint64_t size;
+	unsigned int type;
+};
+
+/**
+ * struct kgsl_gpu_command - Argument for IOCTL_KGSL_GPU_COMMAND
+ * @flags: Current flags for the object
+ * @cmdlist: List of kgsl_command_objects for submission
+ * @cmdsize: Size of kgsl_command_objects structure
+ * @numcmds: Number of kgsl_command_objects in command list
+ * @objlist: List of kgsl_command_objects for tracking
+ * @objsize: Size of kgsl_command_objects structure
+ * @numobjs: Number of kgsl_command_objects in object list
+ * @synclist: List of kgsl_command_syncpoints
+ * @syncsize: Size of kgsl_command_syncpoint structure
+ * @numsyncs: Number of kgsl_command_syncpoints in syncpoint list
+ * @context_id: Context ID submitting the kgsl_gpu_command
+ * @timestamp: Timestamp for the submitted commands
+ */
+struct kgsl_gpu_command {
+	uint64_t flags;
+	uint64_t __user cmdlist;
+	unsigned int cmdsize;
+	unsigned int numcmds;
+	uint64_t __user objlist;
+	unsigned int objsize;
+	unsigned int numobjs;
+	uint64_t __user synclist;
+	unsigned int syncsize;
+	unsigned int numsyncs;
+	unsigned int context_id;
+	unsigned int timestamp;
+};
+
+#define IOCTL_KGSL_GPU_COMMAND \
+	_IOWR(KGSL_IOC_TYPE, 0x4A, struct kgsl_gpu_command)
+
+/**
+ * struct kgsl_preemption_counters_query - argument to
+ * IOCTL_KGSL_PREEMPTIONCOUNTER_QUERY
+ * @counters: Return preemption counters array
+ * @size_user: Size allocated by userspace
+ * @size_priority_level: Size of preemption counters for each
+ * priority level
+ * @max_priority_level: Return max number of priority levels
+ *
+ * Query the available preemption counters. The array counters
+ * is used to return preemption counters. The size of the array
+ * is passed in so the kernel will only write at most size_user
+ * or max available preemption counters. The total number of
+ * preemption counters is returned in max_priority_level. If the
+ * array or size passed in are invalid, then an error is
+ * returned.
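+ *
+ * A rough userspace sketch (assuming `fd` is an open /dev/kgsl-3d0 handle;
+ * error handling omitted):
+ *
+ *	unsigned int buf[256];
+ *	struct kgsl_preemption_counters_query q = {
+ *		.counters = (uint64_t)(uintptr_t)buf,
+ *		.size_user = sizeof(buf),
+ *	};
+ *	ioctl(fd, IOCTL_KGSL_PREEMPTIONCOUNTER_QUERY, &q);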
+ */
+struct kgsl_preemption_counters_query {
+	uint64_t __user counters;
+	unsigned int size_user;
+	unsigned int size_priority_level;
+	unsigned int max_priority_level;
+};
+
+#define IOCTL_KGSL_PREEMPTIONCOUNTER_QUERY \
+	_IOWR(KGSL_IOC_TYPE, 0x4B, struct kgsl_preemption_counters_query)
+
+/**
+ * struct kgsl_gpuobj_set_info - argument for IOCTL_KGSL_GPUOBJ_SET_INFO
+ * @flags: Flags to indicate which parameters to change
+ * @metadata: If KGSL_GPUOBJ_SET_INFO_METADATA is set, a pointer to the new
+ * metadata
+ * @id: GPU memory object ID to change
+ * @metadata_len: If KGSL_GPUOBJ_SET_INFO_METADATA is set, the length of the
+ * new metadata string
+ * @type: If KGSL_GPUOBJ_SET_INFO_TYPE is set, the new type of the memory object
+ */
+
+#define KGSL_GPUOBJ_SET_INFO_METADATA (1 << 0)
+#define KGSL_GPUOBJ_SET_INFO_TYPE (1 << 1)
+
+struct kgsl_gpuobj_set_info {
+	uint64_t flags;
+	uint64_t metadata;
+	unsigned int id;
+	unsigned int metadata_len;
+	unsigned int type;
+};
+
+#define IOCTL_KGSL_GPUOBJ_SET_INFO \
+	_IOW(KGSL_IOC_TYPE, 0x4C, struct kgsl_gpuobj_set_info)
+
+#endif /* _UAPI_MSM_KGSL_H */
diff --git a/extra/qcom_gpu_driver/msm_kgsl.py b/extra/qcom_gpu_driver/msm_kgsl.py
new file mode 100644
index 00000000..1c5bca85
--- /dev/null
+++ b/extra/qcom_gpu_driver/msm_kgsl.py
@@ -0,0 +1,1034 @@
+# -*- coding: utf-8 -*-
+#
+# TARGET arch is: []
+# WORD_SIZE is: 8
+# POINTER_SIZE is: 8
+# LONGDOUBLE_SIZE is: 16
+#
+import ctypes
+
+
+class AsDictMixin:
+    @classmethod
+    def as_dict(cls, self):
+        result = {}
+        if not isinstance(self, AsDictMixin):
+            # not a structure, assume it's already a python object
+            return self
+        if not hasattr(cls, "_fields_"):
+            return result
+        # sys.version_info >= (3, 5)
+        # for (field, *_) in cls._fields_: # noqa
+        for field_tuple in cls._fields_: # noqa
+            field = field_tuple[0]
+            if field.startswith('PADDING_'):
+                continue
+            value = getattr(self, field)
+            type_ = type(value)
+            if hasattr(value, "_length_") and hasattr(value, "_type_"):
+                # array
+                if not hasattr(type_, "as_dict"):
+                    value = [v for v in value]
+                else:
+                    type_ = type_._type_
+                    value = [type_.as_dict(v) for v in value]
+            elif hasattr(value, "contents") and hasattr(value, "_type_"):
+                # pointer
+                try:
+                    if not hasattr(type_, "as_dict"):
+                        value = value.contents
+                    else:
+                        type_ = type_._type_
+                        value = type_.as_dict(value.contents)
+                except ValueError:
+                    # nullptr
+                    value = None
+            elif isinstance(value, AsDictMixin):
+                # other structure
+                value = type_.as_dict(value)
+            result[field] = value
+        return result
+
+
+class Structure(ctypes.Structure, AsDictMixin):
+
+    def __init__(self, *args, **kwds):
+        # We don't want to use positional arguments to fill PADDING_* fields
+
+        args = dict(zip(self.__class__._field_names_(), args))
+        args.update(kwds)
+        super(Structure, self).__init__(**args)
+
+    @classmethod
+    def _field_names_(cls):
+        if hasattr(cls, '_fields_'):
+            return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
+        else:
+            return ()
+
+    @classmethod
+    def get_type(cls, field):
+        for f in cls._fields_:
+            if f[0] == field:
+                return f[1]
+        return None
+
+    @classmethod
+    def bind(cls, bound_fields):
+        fields = {}
+        for name, type_ in cls._fields_:
+            if hasattr(type_, "restype"):
+                if name in bound_fields:
+                    if bound_fields[name] is None:
+                        fields[name] = type_()
+                    else:
+                        # use a closure to capture the callback from the loop scope
+                        fields[name] = (
+                            type_((lambda callback: lambda *args: callback(*args))(
+                                bound_fields[name]))
+                        )
+                    del bound_fields[name]
+                else:
+                    # default callback
implementation (does nothing) + try: + default_ = type_(0).restype().value + except TypeError: + default_ = None + fields[name] = type_(( + lambda default_: lambda *args: default_)(default_)) + else: + # not a callback function, use default initialization + if name in bound_fields: + fields[name] = bound_fields[name] + del bound_fields[name] + else: + fields[name] = type_() + if len(bound_fields) != 0: + raise ValueError( + "Cannot bind the following unknown callback(s) {}.{}".format( + cls.__name__, bound_fields.keys() + )) + return cls(**fields) + + +class Union(ctypes.Union, AsDictMixin): + pass + + + +c_int128 = ctypes.c_ubyte*16 +c_uint128 = c_int128 +void = None +if ctypes.sizeof(ctypes.c_longdouble) == 16: + c_long_double_t = ctypes.c_longdouble +else: + c_long_double_t = ctypes.c_ubyte*16 + + + + +# values for enumeration 'kgsl_user_mem_type' +kgsl_user_mem_type__enumvalues = { + 0: 'KGSL_USER_MEM_TYPE_PMEM', + 1: 'KGSL_USER_MEM_TYPE_ASHMEM', + 2: 'KGSL_USER_MEM_TYPE_ADDR', + 3: 'KGSL_USER_MEM_TYPE_ION', + 3: 'KGSL_USER_MEM_TYPE_DMABUF', + 7: 'KGSL_USER_MEM_TYPE_MAX', +} +KGSL_USER_MEM_TYPE_PMEM = 0 +KGSL_USER_MEM_TYPE_ASHMEM = 1 +KGSL_USER_MEM_TYPE_ADDR = 2 +KGSL_USER_MEM_TYPE_ION = 3 +KGSL_USER_MEM_TYPE_DMABUF = 3 +KGSL_USER_MEM_TYPE_MAX = 7 +kgsl_user_mem_type = ctypes.c_uint32 # enum + +# values for enumeration 'kgsl_ctx_reset_stat' +kgsl_ctx_reset_stat__enumvalues = { + 0: 'KGSL_CTX_STAT_NO_ERROR', + 1: 'KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT', + 2: 'KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT', + 3: 'KGSL_CTX_STAT_UNKNOWN_CONTEXT_RESET_EXT', +} +KGSL_CTX_STAT_NO_ERROR = 0 +KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT = 1 +KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT = 2 +KGSL_CTX_STAT_UNKNOWN_CONTEXT_RESET_EXT = 3 +kgsl_ctx_reset_stat = ctypes.c_uint32 # enum + +# values for enumeration 'kgsl_deviceid' +kgsl_deviceid__enumvalues = { + 0: 'KGSL_DEVICE_3D0', + 1: 'KGSL_DEVICE_MAX', +} +KGSL_DEVICE_3D0 = 0 +KGSL_DEVICE_MAX = 1 +kgsl_deviceid = ctypes.c_uint32 # enum +class struct_kgsl_devinfo(Structure): + pass + +struct_kgsl_devinfo._pack_ = 1 # source:False +struct_kgsl_devinfo._fields_ = [ + ('device_id', ctypes.c_uint32), + ('chip_id', ctypes.c_uint32), + ('mmu_enabled', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('gmem_gpubaseaddr', ctypes.c_uint64), + ('gpu_id', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), + ('gmem_sizebytes', ctypes.c_uint64), +] + +class struct_kgsl_devmemstore(Structure): + pass + +struct_kgsl_devmemstore._pack_ = 1 # source:False +struct_kgsl_devmemstore._fields_ = [ + ('soptimestamp', ctypes.c_uint32), + ('sbz', ctypes.c_uint32), + ('eoptimestamp', ctypes.c_uint32), + ('sbz2', ctypes.c_uint32), + ('preempted', ctypes.c_uint32), + ('sbz3', ctypes.c_uint32), + ('ref_wait_ts', ctypes.c_uint32), + ('sbz4', ctypes.c_uint32), + ('current_context', ctypes.c_uint32), + ('sbz5', ctypes.c_uint32), +] + + +# values for enumeration 'kgsl_timestamp_type' +kgsl_timestamp_type__enumvalues = { + 1: 'KGSL_TIMESTAMP_CONSUMED', + 2: 'KGSL_TIMESTAMP_RETIRED', + 3: 'KGSL_TIMESTAMP_QUEUED', +} +KGSL_TIMESTAMP_CONSUMED = 1 +KGSL_TIMESTAMP_RETIRED = 2 +KGSL_TIMESTAMP_QUEUED = 3 +kgsl_timestamp_type = ctypes.c_uint32 # enum +class struct_kgsl_shadowprop(Structure): + pass + +struct_kgsl_shadowprop._pack_ = 1 # source:False +struct_kgsl_shadowprop._fields_ = [ + ('gpuaddr', ctypes.c_uint64), + ('size', ctypes.c_uint64), + ('flags', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_kgsl_version(Structure): + pass + +struct_kgsl_version._pack_ 
= 1 # source:False +struct_kgsl_version._fields_ = [ + ('drv_major', ctypes.c_uint32), + ('drv_minor', ctypes.c_uint32), + ('dev_major', ctypes.c_uint32), + ('dev_minor', ctypes.c_uint32), +] + +class struct_kgsl_sp_generic_mem(Structure): + pass + +struct_kgsl_sp_generic_mem._pack_ = 1 # source:False +struct_kgsl_sp_generic_mem._fields_ = [ + ('local', ctypes.c_uint64), + ('pvt', ctypes.c_uint64), +] + +class struct_kgsl_ucode_version(Structure): + pass + +struct_kgsl_ucode_version._pack_ = 1 # source:False +struct_kgsl_ucode_version._fields_ = [ + ('pfp', ctypes.c_uint32), + ('pm4', ctypes.c_uint32), +] + +class struct_kgsl_gpmu_version(Structure): + pass + +struct_kgsl_gpmu_version._pack_ = 1 # source:False +struct_kgsl_gpmu_version._fields_ = [ + ('major', ctypes.c_uint32), + ('minor', ctypes.c_uint32), + ('features', ctypes.c_uint32), +] + +class struct_kgsl_ibdesc(Structure): + pass + +struct_kgsl_ibdesc._pack_ = 1 # source:False +struct_kgsl_ibdesc._fields_ = [ + ('gpuaddr', ctypes.c_uint64), + ('__pad', ctypes.c_uint64), + ('sizedwords', ctypes.c_uint64), + ('ctrl', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_kgsl_cmdbatch_profiling_buffer(Structure): + pass + +struct_kgsl_cmdbatch_profiling_buffer._pack_ = 1 # source:False +struct_kgsl_cmdbatch_profiling_buffer._fields_ = [ + ('wall_clock_s', ctypes.c_uint64), + ('wall_clock_ns', ctypes.c_uint64), + ('gpu_ticks_queued', ctypes.c_uint64), + ('gpu_ticks_submitted', ctypes.c_uint64), + ('gpu_ticks_retired', ctypes.c_uint64), +] + +class struct_kgsl_device_getproperty(Structure): + pass + +struct_kgsl_device_getproperty._pack_ = 1 # source:False +struct_kgsl_device_getproperty._fields_ = [ + ('type', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('value', ctypes.POINTER(None)), + ('sizebytes', ctypes.c_uint64), +] + +class struct_kgsl_device_waittimestamp(Structure): + pass + +struct_kgsl_device_waittimestamp._pack_ = 1 # source:False +struct_kgsl_device_waittimestamp._fields_ = [ + ('timestamp', ctypes.c_uint32), + ('timeout', ctypes.c_uint32), +] + +class struct_kgsl_device_waittimestamp_ctxtid(Structure): + pass + +struct_kgsl_device_waittimestamp_ctxtid._pack_ = 1 # source:False +struct_kgsl_device_waittimestamp_ctxtid._fields_ = [ + ('context_id', ctypes.c_uint32), + ('timestamp', ctypes.c_uint32), + ('timeout', ctypes.c_uint32), +] + +class struct_kgsl_ringbuffer_issueibcmds(Structure): + pass + +struct_kgsl_ringbuffer_issueibcmds._pack_ = 1 # source:False +struct_kgsl_ringbuffer_issueibcmds._fields_ = [ + ('drawctxt_id', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('ibdesc_addr', ctypes.c_uint64), + ('numibs', ctypes.c_uint32), + ('timestamp', ctypes.c_uint32), + ('flags', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), +] + +class struct_kgsl_cmdstream_readtimestamp(Structure): + pass + +struct_kgsl_cmdstream_readtimestamp._pack_ = 1 # source:False +struct_kgsl_cmdstream_readtimestamp._fields_ = [ + ('type', ctypes.c_uint32), + ('timestamp', ctypes.c_uint32), +] + +class struct_kgsl_cmdstream_freememontimestamp(Structure): + pass + +struct_kgsl_cmdstream_freememontimestamp._pack_ = 1 # source:False +struct_kgsl_cmdstream_freememontimestamp._fields_ = [ + ('gpuaddr', ctypes.c_uint64), + ('type', ctypes.c_uint32), + ('timestamp', ctypes.c_uint32), +] + +class struct_kgsl_drawctxt_create(Structure): + pass + +struct_kgsl_drawctxt_create._pack_ = 1 # source:False +struct_kgsl_drawctxt_create._fields_ = [ + ('flags', ctypes.c_uint32), + ('drawctxt_id', ctypes.c_uint32), 
+] + +class struct_kgsl_drawctxt_destroy(Structure): + pass + +struct_kgsl_drawctxt_destroy._pack_ = 1 # source:False +struct_kgsl_drawctxt_destroy._fields_ = [ + ('drawctxt_id', ctypes.c_uint32), +] + +class struct_kgsl_map_user_mem(Structure): + pass + +struct_kgsl_map_user_mem._pack_ = 1 # source:False +struct_kgsl_map_user_mem._fields_ = [ + ('fd', ctypes.c_int32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('gpuaddr', ctypes.c_uint64), + ('len', ctypes.c_uint64), + ('offset', ctypes.c_uint64), + ('hostptr', ctypes.c_uint64), + ('memtype', kgsl_user_mem_type), + ('flags', ctypes.c_uint32), +] + +class struct_kgsl_cmdstream_readtimestamp_ctxtid(Structure): + pass + +struct_kgsl_cmdstream_readtimestamp_ctxtid._pack_ = 1 # source:False +struct_kgsl_cmdstream_readtimestamp_ctxtid._fields_ = [ + ('context_id', ctypes.c_uint32), + ('type', ctypes.c_uint32), + ('timestamp', ctypes.c_uint32), +] + +class struct_kgsl_cmdstream_freememontimestamp_ctxtid(Structure): + pass + +struct_kgsl_cmdstream_freememontimestamp_ctxtid._pack_ = 1 # source:False +struct_kgsl_cmdstream_freememontimestamp_ctxtid._fields_ = [ + ('context_id', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('gpuaddr', ctypes.c_uint64), + ('type', ctypes.c_uint32), + ('timestamp', ctypes.c_uint32), +] + +class struct_kgsl_sharedmem_from_pmem(Structure): + pass + +struct_kgsl_sharedmem_from_pmem._pack_ = 1 # source:False +struct_kgsl_sharedmem_from_pmem._fields_ = [ + ('pmem_fd', ctypes.c_int32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('gpuaddr', ctypes.c_uint64), + ('len', ctypes.c_uint32), + ('offset', ctypes.c_uint32), +] + +class struct_kgsl_sharedmem_free(Structure): + pass + +struct_kgsl_sharedmem_free._pack_ = 1 # source:False +struct_kgsl_sharedmem_free._fields_ = [ + ('gpuaddr', ctypes.c_uint64), +] + +class struct_kgsl_cff_user_event(Structure): + pass + +struct_kgsl_cff_user_event._pack_ = 1 # source:False +struct_kgsl_cff_user_event._fields_ = [ + ('cff_opcode', ctypes.c_ubyte), + ('PADDING_0', ctypes.c_ubyte * 3), + ('op1', ctypes.c_uint32), + ('op2', ctypes.c_uint32), + ('op3', ctypes.c_uint32), + ('op4', ctypes.c_uint32), + ('op5', ctypes.c_uint32), + ('__pad', ctypes.c_uint32 * 2), +] + +class struct_kgsl_gmem_desc(Structure): + pass + +struct_kgsl_gmem_desc._pack_ = 1 # source:False +struct_kgsl_gmem_desc._fields_ = [ + ('x', ctypes.c_uint32), + ('y', ctypes.c_uint32), + ('width', ctypes.c_uint32), + ('height', ctypes.c_uint32), + ('pitch', ctypes.c_uint32), +] + +class struct_kgsl_buffer_desc(Structure): + pass + +struct_kgsl_buffer_desc._pack_ = 1 # source:False +struct_kgsl_buffer_desc._fields_ = [ + ('hostptr', ctypes.POINTER(None)), + ('gpuaddr', ctypes.c_uint64), + ('size', ctypes.c_int32), + ('format', ctypes.c_uint32), + ('pitch', ctypes.c_uint32), + ('enabled', ctypes.c_uint32), +] + +class struct_kgsl_bind_gmem_shadow(Structure): + pass + +struct_kgsl_bind_gmem_shadow._pack_ = 1 # source:False +struct_kgsl_bind_gmem_shadow._fields_ = [ + ('drawctxt_id', ctypes.c_uint32), + ('gmem_desc', struct_kgsl_gmem_desc), + ('shadow_x', ctypes.c_uint32), + ('shadow_y', ctypes.c_uint32), + ('shadow_buffer', struct_kgsl_buffer_desc), + ('buffer_id', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_kgsl_sharedmem_from_vmalloc(Structure): + pass + +struct_kgsl_sharedmem_from_vmalloc._pack_ = 1 # source:False +struct_kgsl_sharedmem_from_vmalloc._fields_ = [ + ('gpuaddr', ctypes.c_uint64), + ('hostptr', ctypes.c_uint32), + ('flags', ctypes.c_uint32), +] + +class 
struct_kgsl_drawctxt_set_bin_base_offset(Structure): + pass + +struct_kgsl_drawctxt_set_bin_base_offset._pack_ = 1 # source:False +struct_kgsl_drawctxt_set_bin_base_offset._fields_ = [ + ('drawctxt_id', ctypes.c_uint32), + ('offset', ctypes.c_uint32), +] + + +# values for enumeration 'kgsl_cmdwindow_type' +kgsl_cmdwindow_type__enumvalues = { + 0: 'KGSL_CMDWINDOW_MIN', + 0: 'KGSL_CMDWINDOW_2D', + 1: 'KGSL_CMDWINDOW_3D', + 2: 'KGSL_CMDWINDOW_MMU', + 255: 'KGSL_CMDWINDOW_ARBITER', + 255: 'KGSL_CMDWINDOW_MAX', +} +KGSL_CMDWINDOW_MIN = 0 +KGSL_CMDWINDOW_2D = 0 +KGSL_CMDWINDOW_3D = 1 +KGSL_CMDWINDOW_MMU = 2 +KGSL_CMDWINDOW_ARBITER = 255 +KGSL_CMDWINDOW_MAX = 255 +kgsl_cmdwindow_type = ctypes.c_uint32 # enum +class struct_kgsl_cmdwindow_write(Structure): + pass + +struct_kgsl_cmdwindow_write._pack_ = 1 # source:False +struct_kgsl_cmdwindow_write._fields_ = [ + ('target', kgsl_cmdwindow_type), + ('addr', ctypes.c_uint32), + ('data', ctypes.c_uint32), +] + +class struct_kgsl_gpumem_alloc(Structure): + pass + +struct_kgsl_gpumem_alloc._pack_ = 1 # source:False +struct_kgsl_gpumem_alloc._fields_ = [ + ('gpuaddr', ctypes.c_uint64), + ('size', ctypes.c_uint64), + ('flags', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_kgsl_cff_syncmem(Structure): + pass + +struct_kgsl_cff_syncmem._pack_ = 1 # source:False +struct_kgsl_cff_syncmem._fields_ = [ + ('gpuaddr', ctypes.c_uint64), + ('len', ctypes.c_uint64), + ('__pad', ctypes.c_uint32 * 2), +] + +class struct_kgsl_timestamp_event(Structure): + pass + +struct_kgsl_timestamp_event._pack_ = 1 # source:False +struct_kgsl_timestamp_event._fields_ = [ + ('type', ctypes.c_int32), + ('timestamp', ctypes.c_uint32), + ('context_id', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('priv', ctypes.POINTER(None)), + ('len', ctypes.c_uint64), +] + +class struct_kgsl_timestamp_event_genlock(Structure): + pass + +struct_kgsl_timestamp_event_genlock._pack_ = 1 # source:False +struct_kgsl_timestamp_event_genlock._fields_ = [ + ('handle', ctypes.c_int32), +] + +class struct_kgsl_timestamp_event_fence(Structure): + pass + +struct_kgsl_timestamp_event_fence._pack_ = 1 # source:False +struct_kgsl_timestamp_event_fence._fields_ = [ + ('fence_fd', ctypes.c_int32), +] + +class struct_kgsl_gpumem_alloc_id(Structure): + pass + +struct_kgsl_gpumem_alloc_id._pack_ = 1 # source:False +struct_kgsl_gpumem_alloc_id._fields_ = [ + ('id', ctypes.c_uint32), + ('flags', ctypes.c_uint32), + ('size', ctypes.c_uint64), + ('mmapsize', ctypes.c_uint64), + ('gpuaddr', ctypes.c_uint64), + ('__pad', ctypes.c_uint64 * 2), +] + +class struct_kgsl_gpumem_free_id(Structure): + pass + +struct_kgsl_gpumem_free_id._pack_ = 1 # source:False +struct_kgsl_gpumem_free_id._fields_ = [ + ('id', ctypes.c_uint32), + ('__pad', ctypes.c_uint32), +] + +class struct_kgsl_gpumem_get_info(Structure): + pass + +struct_kgsl_gpumem_get_info._pack_ = 1 # source:False +struct_kgsl_gpumem_get_info._fields_ = [ + ('gpuaddr', ctypes.c_uint64), + ('id', ctypes.c_uint32), + ('flags', ctypes.c_uint32), + ('size', ctypes.c_uint64), + ('mmapsize', ctypes.c_uint64), + ('useraddr', ctypes.c_uint64), + ('__pad', ctypes.c_uint64 * 4), +] + +class struct_kgsl_gpumem_sync_cache(Structure): + pass + +struct_kgsl_gpumem_sync_cache._pack_ = 1 # source:False +struct_kgsl_gpumem_sync_cache._fields_ = [ + ('gpuaddr', ctypes.c_uint64), + ('id', ctypes.c_uint32), + ('op', ctypes.c_uint32), + ('offset', ctypes.c_uint64), + ('length', ctypes.c_uint64), +] + +class struct_kgsl_perfcounter_get(Structure): + pass + 
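+# These generated Structure subclasses match the kernel's binary layout, so
+# they can be passed directly to fcntl.ioctl against /dev/kgsl-3d0. A rough
+# sketch (assumptions: `fd` is an open device file object, and the request
+# number IOCTL_KGSL_PERFCOUNTER_GET comes from msm_kgsl.h, since this module
+# does not define the ioctl numbers):
+#
+#   import fcntl
+#   get = struct_kgsl_perfcounter_get(groupid=1, countable=0)
+#   fcntl.ioctl(fd, IOCTL_KGSL_PERFCOUNTER_GET, get)  # kernel fills offset
+#   print(hex(get.offset))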
+struct_kgsl_perfcounter_get._pack_ = 1 # source:False +struct_kgsl_perfcounter_get._fields_ = [ + ('groupid', ctypes.c_uint32), + ('countable', ctypes.c_uint32), + ('offset', ctypes.c_uint32), + ('offset_hi', ctypes.c_uint32), + ('__pad', ctypes.c_uint32), +] + +class struct_kgsl_perfcounter_put(Structure): + pass + +struct_kgsl_perfcounter_put._pack_ = 1 # source:False +struct_kgsl_perfcounter_put._fields_ = [ + ('groupid', ctypes.c_uint32), + ('countable', ctypes.c_uint32), + ('__pad', ctypes.c_uint32 * 2), +] + +class struct_kgsl_perfcounter_query(Structure): + pass + +struct_kgsl_perfcounter_query._pack_ = 1 # source:False +struct_kgsl_perfcounter_query._fields_ = [ + ('groupid', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('countables', ctypes.POINTER(ctypes.c_uint32)), + ('count', ctypes.c_uint32), + ('max_counters', ctypes.c_uint32), + ('__pad', ctypes.c_uint32 * 2), +] + +class struct_kgsl_perfcounter_read_group(Structure): + pass + +struct_kgsl_perfcounter_read_group._pack_ = 1 # source:False +struct_kgsl_perfcounter_read_group._fields_ = [ + ('groupid', ctypes.c_uint32), + ('countable', ctypes.c_uint32), + ('value', ctypes.c_uint64), +] + +class struct_kgsl_perfcounter_read(Structure): + pass + +struct_kgsl_perfcounter_read._pack_ = 1 # source:False +struct_kgsl_perfcounter_read._fields_ = [ + ('reads', ctypes.POINTER(struct_kgsl_perfcounter_read_group)), + ('count', ctypes.c_uint32), + ('__pad', ctypes.c_uint32 * 2), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_kgsl_gpumem_sync_cache_bulk(Structure): + pass + +struct_kgsl_gpumem_sync_cache_bulk._pack_ = 1 # source:False +struct_kgsl_gpumem_sync_cache_bulk._fields_ = [ + ('id_list', ctypes.POINTER(ctypes.c_uint32)), + ('count', ctypes.c_uint32), + ('op', ctypes.c_uint32), + ('__pad', ctypes.c_uint32 * 2), +] + +class struct_kgsl_cmd_syncpoint_timestamp(Structure): + pass + +struct_kgsl_cmd_syncpoint_timestamp._pack_ = 1 # source:False +struct_kgsl_cmd_syncpoint_timestamp._fields_ = [ + ('context_id', ctypes.c_uint32), + ('timestamp', ctypes.c_uint32), +] + +class struct_kgsl_cmd_syncpoint_fence(Structure): + pass + +struct_kgsl_cmd_syncpoint_fence._pack_ = 1 # source:False +struct_kgsl_cmd_syncpoint_fence._fields_ = [ + ('fd', ctypes.c_int32), +] + +class struct_kgsl_cmd_syncpoint(Structure): + pass + +struct_kgsl_cmd_syncpoint._pack_ = 1 # source:False +struct_kgsl_cmd_syncpoint._fields_ = [ + ('type', ctypes.c_int32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('priv', ctypes.POINTER(None)), + ('size', ctypes.c_uint64), +] + +class struct_kgsl_submit_commands(Structure): + pass + +struct_kgsl_submit_commands._pack_ = 1 # source:False +struct_kgsl_submit_commands._fields_ = [ + ('context_id', ctypes.c_uint32), + ('flags', ctypes.c_uint32), + ('cmdlist', ctypes.POINTER(struct_kgsl_ibdesc)), + ('numcmds', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('synclist', ctypes.POINTER(struct_kgsl_cmd_syncpoint)), + ('numsyncs', ctypes.c_uint32), + ('timestamp', ctypes.c_uint32), + ('__pad', ctypes.c_uint32 * 4), +] + +class struct_kgsl_device_constraint(Structure): + pass + +struct_kgsl_device_constraint._pack_ = 1 # source:False +struct_kgsl_device_constraint._fields_ = [ + ('type', ctypes.c_uint32), + ('context_id', ctypes.c_uint32), + ('data', ctypes.POINTER(None)), + ('size', ctypes.c_uint64), +] + +class struct_kgsl_device_constraint_pwrlevel(Structure): + pass + +struct_kgsl_device_constraint_pwrlevel._pack_ = 1 # source:False +struct_kgsl_device_constraint_pwrlevel._fields_ = [ + ('level', 
ctypes.c_uint32), +] + +class struct_kgsl_syncsource_create(Structure): + pass + +struct_kgsl_syncsource_create._pack_ = 1 # source:False +struct_kgsl_syncsource_create._fields_ = [ + ('id', ctypes.c_uint32), + ('__pad', ctypes.c_uint32 * 3), +] + +class struct_kgsl_syncsource_destroy(Structure): + pass + +struct_kgsl_syncsource_destroy._pack_ = 1 # source:False +struct_kgsl_syncsource_destroy._fields_ = [ + ('id', ctypes.c_uint32), + ('__pad', ctypes.c_uint32 * 3), +] + +class struct_kgsl_syncsource_create_fence(Structure): + pass + +struct_kgsl_syncsource_create_fence._pack_ = 1 # source:False +struct_kgsl_syncsource_create_fence._fields_ = [ + ('id', ctypes.c_uint32), + ('fence_fd', ctypes.c_int32), + ('__pad', ctypes.c_uint32 * 4), +] + +class struct_kgsl_syncsource_signal_fence(Structure): + pass + +struct_kgsl_syncsource_signal_fence._pack_ = 1 # source:False +struct_kgsl_syncsource_signal_fence._fields_ = [ + ('id', ctypes.c_uint32), + ('fence_fd', ctypes.c_int32), + ('__pad', ctypes.c_uint32 * 4), +] + +class struct_kgsl_cff_sync_gpuobj(Structure): + pass + +struct_kgsl_cff_sync_gpuobj._pack_ = 1 # source:False +struct_kgsl_cff_sync_gpuobj._fields_ = [ + ('offset', ctypes.c_uint64), + ('length', ctypes.c_uint64), + ('id', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_kgsl_gpuobj_alloc(Structure): + pass + +struct_kgsl_gpuobj_alloc._pack_ = 1 # source:False +struct_kgsl_gpuobj_alloc._fields_ = [ + ('size', ctypes.c_uint64), + ('flags', ctypes.c_uint64), + ('va_len', ctypes.c_uint64), + ('mmapsize', ctypes.c_uint64), + ('id', ctypes.c_uint32), + ('metadata_len', ctypes.c_uint32), + ('metadata', ctypes.c_uint64), +] + +class struct_kgsl_gpuobj_free(Structure): + pass + +struct_kgsl_gpuobj_free._pack_ = 1 # source:False +struct_kgsl_gpuobj_free._fields_ = [ + ('flags', ctypes.c_uint64), + ('priv', ctypes.c_uint64), + ('id', ctypes.c_uint32), + ('type', ctypes.c_uint32), + ('len', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_kgsl_gpu_event_timestamp(Structure): + pass + +struct_kgsl_gpu_event_timestamp._pack_ = 1 # source:False +struct_kgsl_gpu_event_timestamp._fields_ = [ + ('context_id', ctypes.c_uint32), + ('timestamp', ctypes.c_uint32), +] + +class struct_kgsl_gpu_event_fence(Structure): + pass + +struct_kgsl_gpu_event_fence._pack_ = 1 # source:False +struct_kgsl_gpu_event_fence._fields_ = [ + ('fd', ctypes.c_int32), +] + +class struct_kgsl_gpuobj_info(Structure): + pass + +struct_kgsl_gpuobj_info._pack_ = 1 # source:False +struct_kgsl_gpuobj_info._fields_ = [ + ('gpuaddr', ctypes.c_uint64), + ('flags', ctypes.c_uint64), + ('size', ctypes.c_uint64), + ('va_len', ctypes.c_uint64), + ('va_addr', ctypes.c_uint64), + ('id', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_kgsl_gpuobj_import(Structure): + pass + +struct_kgsl_gpuobj_import._pack_ = 1 # source:False +struct_kgsl_gpuobj_import._fields_ = [ + ('priv', ctypes.c_uint64), + ('priv_len', ctypes.c_uint64), + ('flags', ctypes.c_uint64), + ('type', ctypes.c_uint32), + ('id', ctypes.c_uint32), +] + +class struct_kgsl_gpuobj_import_dma_buf(Structure): + pass + +struct_kgsl_gpuobj_import_dma_buf._pack_ = 1 # source:False +struct_kgsl_gpuobj_import_dma_buf._fields_ = [ + ('fd', ctypes.c_int32), +] + +class struct_kgsl_gpuobj_import_useraddr(Structure): + pass + +struct_kgsl_gpuobj_import_useraddr._pack_ = 1 # source:False +struct_kgsl_gpuobj_import_useraddr._fields_ = [ + ('virtaddr', ctypes.c_uint64), +] + +class struct_kgsl_gpuobj_sync_obj(Structure): + 
pass + +struct_kgsl_gpuobj_sync_obj._pack_ = 1 # source:False +struct_kgsl_gpuobj_sync_obj._fields_ = [ + ('offset', ctypes.c_uint64), + ('length', ctypes.c_uint64), + ('id', ctypes.c_uint32), + ('op', ctypes.c_uint32), +] + +class struct_kgsl_gpuobj_sync(Structure): + pass + +struct_kgsl_gpuobj_sync._pack_ = 1 # source:False +struct_kgsl_gpuobj_sync._fields_ = [ + ('objs', ctypes.c_uint64), + ('obj_len', ctypes.c_uint32), + ('count', ctypes.c_uint32), +] + +class struct_kgsl_command_object(Structure): + pass + +struct_kgsl_command_object._pack_ = 1 # source:False +struct_kgsl_command_object._fields_ = [ + ('offset', ctypes.c_uint64), + ('gpuaddr', ctypes.c_uint64), + ('size', ctypes.c_uint64), + ('flags', ctypes.c_uint32), + ('id', ctypes.c_uint32), +] + +class struct_kgsl_command_syncpoint(Structure): + pass + +struct_kgsl_command_syncpoint._pack_ = 1 # source:False +struct_kgsl_command_syncpoint._fields_ = [ + ('priv', ctypes.c_uint64), + ('size', ctypes.c_uint64), + ('type', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_kgsl_gpu_command(Structure): + pass + +struct_kgsl_gpu_command._pack_ = 1 # source:False +struct_kgsl_gpu_command._fields_ = [ + ('flags', ctypes.c_uint64), + ('cmdlist', ctypes.c_uint64), + ('cmdsize', ctypes.c_uint32), + ('numcmds', ctypes.c_uint32), + ('objlist', ctypes.c_uint64), + ('objsize', ctypes.c_uint32), + ('numobjs', ctypes.c_uint32), + ('synclist', ctypes.c_uint64), + ('syncsize', ctypes.c_uint32), + ('numsyncs', ctypes.c_uint32), + ('context_id', ctypes.c_uint32), + ('timestamp', ctypes.c_uint32), +] + +class struct_kgsl_preemption_counters_query(Structure): + pass + +struct_kgsl_preemption_counters_query._pack_ = 1 # source:False +struct_kgsl_preemption_counters_query._fields_ = [ + ('counters', ctypes.c_uint64), + ('size_user', ctypes.c_uint32), + ('size_priority_level', ctypes.c_uint32), + ('max_priority_level', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_kgsl_gpuobj_set_info(Structure): + pass + +struct_kgsl_gpuobj_set_info._pack_ = 1 # source:False +struct_kgsl_gpuobj_set_info._fields_ = [ + ('flags', ctypes.c_uint64), + ('metadata', ctypes.c_uint64), + ('id', ctypes.c_uint32), + ('metadata_len', ctypes.c_uint32), + ('type', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +__all__ = \ + ['KGSL_CMDWINDOW_2D', 'KGSL_CMDWINDOW_3D', + 'KGSL_CMDWINDOW_ARBITER', 'KGSL_CMDWINDOW_MAX', + 'KGSL_CMDWINDOW_MIN', 'KGSL_CMDWINDOW_MMU', + 'KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT', + 'KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT', + 'KGSL_CTX_STAT_NO_ERROR', + 'KGSL_CTX_STAT_UNKNOWN_CONTEXT_RESET_EXT', 'KGSL_DEVICE_3D0', + 'KGSL_DEVICE_MAX', 'KGSL_TIMESTAMP_CONSUMED', + 'KGSL_TIMESTAMP_QUEUED', 'KGSL_TIMESTAMP_RETIRED', + 'KGSL_USER_MEM_TYPE_ADDR', 'KGSL_USER_MEM_TYPE_ASHMEM', + 'KGSL_USER_MEM_TYPE_DMABUF', 'KGSL_USER_MEM_TYPE_ION', + 'KGSL_USER_MEM_TYPE_MAX', 'KGSL_USER_MEM_TYPE_PMEM', + 'kgsl_cmdwindow_type', 'kgsl_ctx_reset_stat', 'kgsl_deviceid', + 'kgsl_timestamp_type', 'kgsl_user_mem_type', + 'struct_kgsl_bind_gmem_shadow', 'struct_kgsl_buffer_desc', + 'struct_kgsl_cff_sync_gpuobj', 'struct_kgsl_cff_syncmem', + 'struct_kgsl_cff_user_event', 'struct_kgsl_cmd_syncpoint', + 'struct_kgsl_cmd_syncpoint_fence', + 'struct_kgsl_cmd_syncpoint_timestamp', + 'struct_kgsl_cmdbatch_profiling_buffer', + 'struct_kgsl_cmdstream_freememontimestamp', + 'struct_kgsl_cmdstream_freememontimestamp_ctxtid', + 'struct_kgsl_cmdstream_readtimestamp', + 'struct_kgsl_cmdstream_readtimestamp_ctxtid', + 
'struct_kgsl_cmdwindow_write', 'struct_kgsl_command_object', + 'struct_kgsl_command_syncpoint', 'struct_kgsl_device_constraint', + 'struct_kgsl_device_constraint_pwrlevel', + 'struct_kgsl_device_getproperty', + 'struct_kgsl_device_waittimestamp', + 'struct_kgsl_device_waittimestamp_ctxtid', 'struct_kgsl_devinfo', + 'struct_kgsl_devmemstore', 'struct_kgsl_drawctxt_create', + 'struct_kgsl_drawctxt_destroy', + 'struct_kgsl_drawctxt_set_bin_base_offset', + 'struct_kgsl_gmem_desc', 'struct_kgsl_gpmu_version', + 'struct_kgsl_gpu_command', 'struct_kgsl_gpu_event_fence', + 'struct_kgsl_gpu_event_timestamp', 'struct_kgsl_gpumem_alloc', + 'struct_kgsl_gpumem_alloc_id', 'struct_kgsl_gpumem_free_id', + 'struct_kgsl_gpumem_get_info', 'struct_kgsl_gpumem_sync_cache', + 'struct_kgsl_gpumem_sync_cache_bulk', 'struct_kgsl_gpuobj_alloc', + 'struct_kgsl_gpuobj_free', 'struct_kgsl_gpuobj_import', + 'struct_kgsl_gpuobj_import_dma_buf', + 'struct_kgsl_gpuobj_import_useraddr', 'struct_kgsl_gpuobj_info', + 'struct_kgsl_gpuobj_set_info', 'struct_kgsl_gpuobj_sync', + 'struct_kgsl_gpuobj_sync_obj', 'struct_kgsl_ibdesc', + 'struct_kgsl_map_user_mem', 'struct_kgsl_perfcounter_get', + 'struct_kgsl_perfcounter_put', 'struct_kgsl_perfcounter_query', + 'struct_kgsl_perfcounter_read', + 'struct_kgsl_perfcounter_read_group', + 'struct_kgsl_preemption_counters_query', + 'struct_kgsl_ringbuffer_issueibcmds', 'struct_kgsl_shadowprop', + 'struct_kgsl_sharedmem_free', 'struct_kgsl_sharedmem_from_pmem', + 'struct_kgsl_sharedmem_from_vmalloc', + 'struct_kgsl_sp_generic_mem', 'struct_kgsl_submit_commands', + 'struct_kgsl_syncsource_create', + 'struct_kgsl_syncsource_create_fence', + 'struct_kgsl_syncsource_destroy', + 'struct_kgsl_syncsource_signal_fence', + 'struct_kgsl_timestamp_event', + 'struct_kgsl_timestamp_event_fence', + 'struct_kgsl_timestamp_event_genlock', + 'struct_kgsl_ucode_version', 'struct_kgsl_version'] diff --git a/extra/qcom_gpu_driver/opencl_ioctl.py b/extra/qcom_gpu_driver/opencl_ioctl.py new file mode 100644 index 00000000..32255f22 --- /dev/null +++ b/extra/qcom_gpu_driver/opencl_ioctl.py @@ -0,0 +1,171 @@ +import ctypes, ctypes.util, struct, fcntl, re +from hexdump import hexdump +from tinygrad.runtime.ops_gpu import CLDevice, CLAllocator +import pathlib, sys +sys.path.append(pathlib.Path(__file__).parent.parent.parent.as_posix()) + +ops = {} +import xml.etree.ElementTree as ET +xml = ET.parse(pathlib.Path(__file__).parent / "adreno_pm4.xml") +for child in xml.getroot(): + if 'name' in child.attrib and child.attrib['name'] == "adreno_pm4_type3_packets": + for sc in child: + if 'name' in sc.attrib and ('variants' not in sc.attrib or sc.attrib['variants'] != "A2XX"): + ops[int(sc.attrib['value'], 0x10)] = sc.attrib['name'] +#print(ops) +#exit(0) + +from extra.qcom_gpu_driver import msm_kgsl +def ioctls_from_header(): + hdr = (pathlib.Path(__file__).parent.parent.parent / "extra/qcom_gpu_driver/msm_kgsl.h").read_text().replace("\\\n", "") + pattern = r'#define\s+(IOCTL_KGSL_[A-Z0-9_]+)\s+_IOWR?\(KGSL_IOC_TYPE,\s+(0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)' + matches = re.findall(pattern, hdr, re.MULTILINE) + return {int(nr, 0x10):(name, getattr(msm_kgsl, "struct_"+sname)) for name, nr, sname in matches} + +nrs = ioctls_from_header() + +# https://github.com/ensc/dietlibc/blob/master/include/sys/aarch64-ioctl.h + +def get_struct(argp, stype): + return ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype)).contents + +def format_struct(s): + sdats = [] + for field_name, field_type in s._fields_: + if 
field_name in {"__pad", "PADDING_0"}: continue
+    dat = getattr(s, field_name)
+    if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
+    else: sdats.append(f"{field_name}:{dat}")
+  return sdats
+
+import mmap
+mmaped = {}
+def get_mem(addr, vlen):
+  # find the mapping that contains addr and slice vlen bytes out of it
+  for k,v in mmaped.items():
+    if k <= addr and addr < k+len(v):
+      return v[addr-k:addr-k+vlen]
+
+def hprint(vals):
+  ret = []
+  for v in vals:
+    if v > 31: ret.append(f"{v:#x}")
+    else: ret.append(f"{v}")
+  return f"({','.join(ret)})"
+
+# state_type values for CP_LOAD_STATE6
+ST6_SHADER = 0
+ST6_CONSTANTS = 1
+
+def parse_cmd_buf(dat):
+  ptr = 0
+  while ptr < len(dat):
+    cmd = struct.unpack("I", dat[ptr:ptr+4])[0]
+    if (cmd>>24) == 0x70:
+      # type 7 packet: opcode with opcode specific payload (replaces pkt3)
+      opcode, size = ((cmd>>16)&0x7F), cmd&0x3FFF
+      vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
+      print(f"{ptr:3X} -- typ 7: {size=:3d}, {opcode=:#x} {ops[opcode]}", hprint(vals))
+      if ops[opcode] == "CP_LOAD_STATE6_FRAG":
+        # first dword packs destination offset, state type/source/block and unit count
+        dst_off = vals[0] & 0x3FFF
+        state_type = (vals[0]>>14) & 0x3
+        state_src = (vals[0]>>16) & 0x3
+        state_block = (vals[0]>>18) & 0xF # 13 = SB4_CS_SHADER
+        num_unit = vals[0]>>22
+        print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}")
+
+        # vals[1]/vals[2] are the low/high halves of the source GPU address
+        from disassemblers.adreno import disasm_raw
+        if state_type == ST6_SHADER: disasm_raw(get_mem(((vals[2] << 32) | vals[1]), 0x180))
+        if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4)))
+      ptr += 4*size
+    elif (cmd>>28) == 0x4:
+      # type 4 packet: write one or more registers (replaces pkt0)
+      offset, size = ((cmd>>8)&0x7FFFF), cmd&0x7F
+      vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
+      print(f"{ptr:3X} -- typ 4: {size=:3d}, {offset=:#x}", hprint(vals))
+      ptr += 4*size
+    else:
+      print("unk", hex(cmd))
+    # consume the packet header dword
+    ptr += 4
+
+@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
+def ioctl(fd, request, argp):
+  # forward to the real syscall first (0x1d is __NR_ioctl on arm64)
+  ret = libc.syscall(0x1d, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
+
+  idir, size, itype, nr = (request>>30), (request>>16)&0x3FFF, (request>>8)&0xFF, request&0xFF
+  if nr in nrs and itype == 9: # KGSL_IOC_TYPE is 0x09
+    name, stype = nrs[nr]
+    s = get_struct(argp, stype)
+    print(f"{ret:2d} = {name:40s}", ' '.join(format_struct(s)))
+    if name == "IOCTL_KGSL_GPUOBJ_INFO":
+      # map the GPU object into our address space so its command buffers can be read later
+      mmaped[s.gpuaddr] = mmap.mmap(fd, s.size, offset=s.id*0x1000)
+    if name == "IOCTL_KGSL_GPU_COMMAND":
+      for i in range(s.numcmds):
+        cmd = get_struct(s.cmdlist+s.cmdsize*i, msm_kgsl.struct_kgsl_command_object)
+        print(f"cmd {i}:", format_struct(cmd))
+        #hexdump(get_mem(cmd.gpuaddr, cmd.size))
+        parse_cmd_buf(get_mem(cmd.gpuaddr, cmd.size))
+      for i in range(s.numobjs):
+        obj = get_struct(s.objlist+s.objsize*i, msm_kgsl.struct_kgsl_command_object)
+        print(f"obj {i}:", format_struct(obj))
+        print(format_struct(msm_kgsl.struct_kgsl_cmdbatch_profiling_buffer.from_buffer_copy(get_mem(obj.gpuaddr, obj.size))))
+        #hexdump(get_mem(obj.gpuaddr, obj.size))
+  else:
+    #print(f"ioctl({fd=}, (dir:{idir}, size:0x{size:3X}, type:{itype:d}, nr:0x{nr:2X}), {argp=:X}) = {ret=}")
+    pass
+
+  return ret
+
+def install_hook(c_function, python_function):
+  # AARCH64 trampoline to ioctl: adr x16, #12; ldr x16, [x16]; br x16
+  tramp = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6"
+  # followed by the absolute address of the python callback to jump through
+  tramp += struct.pack("Q", ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value)
+
+  # get real ioctl address
+  ioctl_address = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong))
+
+  # hook ioctl: make the pages RWX, then overwrite the function prologue with the trampoline
+  libc = ctypes.CDLL(ctypes.util.find_library("libc"))
+  ret = libc.mprotect(ctypes.c_ulong((ioctl_address.contents.value//0x1000)*0x1000), 0x2000, 7)
+  assert ret == 0
+  libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), len(tramp))
+
+libc = ctypes.CDLL(ctypes.util.find_library("libc"))
+install_hook(libc.ioctl, ioctl)
+
+"""
+print("***** init device")
+dev = CLDevice()
+print("***** alloc")
+alloc = CLAllocator(dev)
+a = alloc._alloc(16)
+#alloc._alloc(0x2000)
+ba = bytearray(b"hello")
+print(f"***** copyin {ctypes.addressof((ctypes.c_char * len(ba)).from_buffer(ba)):#x}")
+alloc.copyin(a, memoryview(ba))
+dev.synchronize()
+print("***** copyout")
+mv2 = memoryview(bytearray(b"nopeo"))
+alloc.copyout(mv2, a)
+dev.synchronize()
+print("***** done", bytes(mv2))
+exit(0)
+"""
+
+print("***** import tinygrad")
+from tinygrad import Tensor, Device, TinyJit
+print("***** access GPU")
+dev = Device["GPU"]
+print("***** create tensor a")
+a = Tensor([1.,2.]*200).realize()
+print("***** create tensor b")
+b = Tensor([3.,4.]*200).realize()
+@TinyJit
+def add(a, b): return (a+b).realize()
+for i in range(4):
+  print(f"***** add tensors {i}")
+  c = add(a, b)
+  #dev.synchronize()
+  c = add(b, a)
+  dev.synchronize()
+#print("***** copy out")
+#print(c.numpy())
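+
+# Rough usage sketch: run this file directly on a Qualcomm Adreno device
+# (aarch64 Linux/Android with /dev/kgsl-3d0). install_hook() patches libc's
+# ioctl in-process, so the tinygrad OpenCL workload above exercises the
+# userspace driver while every KGSL ioctl, and each submitted PM4 command
+# buffer, is decoded to stdout. Any other OpenCL workload started after the
+# hook should be traceable the same way (run_my_cl_program is a hypothetical
+# placeholder):
+#
+#   import extra.qcom_gpu_driver.opencl_ioctl  # install the hook first
+#   run_my_cl_program()                        # then run the workload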