Thneed refactors for future functions (#2673)

* delete debug

* thneed updates, but it seems slower

* thneed refactor

* refactor touchups

* add back asserts

* fix uaf

* track the size for local args

* final thneed refactor

* switch kgsl_command_object to avoid memory leak

* comments

* unused includes

Co-authored-by: Comma Device <device@comma.ai>
This commit is contained in:
George Hotz
2020-12-14 15:46:49 -08:00
committed by GitHub
parent 554ea8f54a
commit 5fdda8dbd8
22 changed files with 349 additions and 14693 deletions

View File

@@ -3,8 +3,6 @@ lenv = env.Clone()
libs = [cereal, messaging, common, 'OpenCL', 'SNPE', 'symphony-cpu', 'capnp', 'zmq', 'kj', 'yuv', gpucommon, visionipc]
TEST_THNEED = False
common_src = [
"models/commonmodel.cc",
"runners/snpemodel.cc",
@@ -14,16 +12,14 @@ common_src = [
if arch == "aarch64":
libs += ['gsl', 'CB', 'gnustl_shared']
if not TEST_THNEED:
common_src += ["thneed/thneed.cc"]
lenv['CFLAGS'].append("-DUSE_THNEED")
lenv['CXXFLAGS'].append("-DUSE_THNEED")
common_src += ["thneed/thneed.cc"]
lenv['CFLAGS'].append("-DUSE_THNEED")
lenv['CXXFLAGS'].append("-DUSE_THNEED")
elif arch == "larch64":
libs += ['gsl', 'CB', 'pthread', 'dl']
if not TEST_THNEED:
common_src += ["thneed/thneed.cc"]
lenv['CFLAGS'].append("-DUSE_THNEED")
lenv['CXXFLAGS'].append("-DUSE_THNEED")
common_src += ["thneed/thneed.cc"]
lenv['CFLAGS'].append("-DUSE_THNEED")
lenv['CXXFLAGS'].append("-DUSE_THNEED")
else:
libs += ['pthread']
@@ -56,8 +52,3 @@ lenv.Program('_modeld', [
"models/driving.cc",
]+common, LIBS=libs)
if TEST_THNEED:
lenv.Program('thneed/debug/_thneed', [
"thneed/thneed.cc", "thneed/debug/test.cc"
]+common, LIBS=libs)

View File

@@ -1 +0,0 @@
_thneed

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,906 +0,0 @@
/*
* Mesa 3-D graphics library
*
* Copyright (C) 1999-2008 Brian Paul All Rights Reserved.
* Copyright (C) 2009 VMware, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef SHADER_ENUMS_H
#define SHADER_ENUMS_H
#include <stdbool.h>
/* Project-wide (GL and Vulkan) maximum. */
#define MAX_DRAW_BUFFERS 8
#ifdef __cplusplus
extern "C" {
#endif
/**
* Shader stages.
*
* The order must match how shaders are ordered in the pipeline.
* The GLSL linker assumes that if i<j, then the j-th shader is
* executed later than the i-th shader.
*/
typedef enum
{
MESA_SHADER_NONE = -1,
MESA_SHADER_VERTEX = 0,
MESA_SHADER_TESS_CTRL = 1,
MESA_SHADER_TESS_EVAL = 2,
MESA_SHADER_GEOMETRY = 3,
MESA_SHADER_FRAGMENT = 4,
MESA_SHADER_COMPUTE = 5,
/* must be last so it doesn't affect the GL pipeline */
MESA_SHADER_KERNEL = 6,
} gl_shader_stage;
static inline bool
gl_shader_stage_is_compute(gl_shader_stage stage)
{
return stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL;
}
/**
* Number of STATE_* values we need to address any GL state.
* Used to dimension arrays.
*/
#define STATE_LENGTH 5
typedef short gl_state_index16; /* see enum gl_state_index */
const char *gl_shader_stage_name(gl_shader_stage stage);
/**
* Translate a gl_shader_stage to a short shader stage name for debug
* printouts and error messages.
*/
const char *_mesa_shader_stage_to_string(unsigned stage);
/**
* Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
* for debug printouts and error messages.
*/
const char *_mesa_shader_stage_to_abbrev(unsigned stage);
/**
* GL related stages (not including CL)
*/
#define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
/**
* All stages
*/
#define MESA_ALL_SHADER_STAGES (MESA_SHADER_KERNEL + 1)
/**
* Indexes for vertex program attributes.
* GL_NV_vertex_program aliases generic attributes over the conventional
* attributes. In GL_ARB_vertex_program shader the aliasing is optional.
* In GL_ARB_vertex_shader / OpenGL 2.0 the aliasing is disallowed (the
* generic attributes are distinct/separate).
*/
typedef enum
{
VERT_ATTRIB_POS,
VERT_ATTRIB_NORMAL,
VERT_ATTRIB_COLOR0,
VERT_ATTRIB_COLOR1,
VERT_ATTRIB_FOG,
VERT_ATTRIB_COLOR_INDEX,
VERT_ATTRIB_EDGEFLAG,
VERT_ATTRIB_TEX0,
VERT_ATTRIB_TEX1,
VERT_ATTRIB_TEX2,
VERT_ATTRIB_TEX3,
VERT_ATTRIB_TEX4,
VERT_ATTRIB_TEX5,
VERT_ATTRIB_TEX6,
VERT_ATTRIB_TEX7,
VERT_ATTRIB_POINT_SIZE,
VERT_ATTRIB_GENERIC0,
VERT_ATTRIB_GENERIC1,
VERT_ATTRIB_GENERIC2,
VERT_ATTRIB_GENERIC3,
VERT_ATTRIB_GENERIC4,
VERT_ATTRIB_GENERIC5,
VERT_ATTRIB_GENERIC6,
VERT_ATTRIB_GENERIC7,
VERT_ATTRIB_GENERIC8,
VERT_ATTRIB_GENERIC9,
VERT_ATTRIB_GENERIC10,
VERT_ATTRIB_GENERIC11,
VERT_ATTRIB_GENERIC12,
VERT_ATTRIB_GENERIC13,
VERT_ATTRIB_GENERIC14,
VERT_ATTRIB_GENERIC15,
VERT_ATTRIB_MAX
} gl_vert_attrib;
const char *gl_vert_attrib_name(gl_vert_attrib attrib);
/**
* Symbolic constats to help iterating over
* specific blocks of vertex attributes.
*
* VERT_ATTRIB_FF
* includes all fixed function attributes as well as
* the aliased GL_NV_vertex_program shader attributes.
* VERT_ATTRIB_TEX
* include the classic texture coordinate attributes.
* Is a subset of VERT_ATTRIB_FF.
* VERT_ATTRIB_GENERIC
* include the OpenGL 2.0+ GLSL generic shader attributes.
* These alias the generic GL_ARB_vertex_shader attributes.
* VERT_ATTRIB_MAT
* include the generic shader attributes used to alias
* varying material values for the TNL shader programs.
* They are located at the end of the generic attribute
* block not to overlap with the generic 0 attribute.
*/
#define VERT_ATTRIB_FF(i) (VERT_ATTRIB_POS + (i))
#define VERT_ATTRIB_FF_MAX VERT_ATTRIB_GENERIC0
#define VERT_ATTRIB_TEX(i) (VERT_ATTRIB_TEX0 + (i))
#define VERT_ATTRIB_TEX_MAX MAX_TEXTURE_COORD_UNITS
#define VERT_ATTRIB_GENERIC(i) (VERT_ATTRIB_GENERIC0 + (i))
#define VERT_ATTRIB_GENERIC_MAX MAX_VERTEX_GENERIC_ATTRIBS
#define VERT_ATTRIB_MAT0 \
(VERT_ATTRIB_GENERIC_MAX - VERT_ATTRIB_MAT_MAX)
#define VERT_ATTRIB_MAT(i) \
VERT_ATTRIB_GENERIC((i) + VERT_ATTRIB_MAT0)
#define VERT_ATTRIB_MAT_MAX MAT_ATTRIB_MAX
/**
* Bitflags for vertex attributes.
* These are used in bitfields in many places.
*/
/*@{*/
#define VERT_BIT_POS BITFIELD_BIT(VERT_ATTRIB_POS)
#define VERT_BIT_NORMAL BITFIELD_BIT(VERT_ATTRIB_NORMAL)
#define VERT_BIT_COLOR0 BITFIELD_BIT(VERT_ATTRIB_COLOR0)
#define VERT_BIT_COLOR1 BITFIELD_BIT(VERT_ATTRIB_COLOR1)
#define VERT_BIT_FOG BITFIELD_BIT(VERT_ATTRIB_FOG)
#define VERT_BIT_COLOR_INDEX BITFIELD_BIT(VERT_ATTRIB_COLOR_INDEX)
#define VERT_BIT_EDGEFLAG BITFIELD_BIT(VERT_ATTRIB_EDGEFLAG)
#define VERT_BIT_TEX0 BITFIELD_BIT(VERT_ATTRIB_TEX0)
#define VERT_BIT_TEX1 BITFIELD_BIT(VERT_ATTRIB_TEX1)
#define VERT_BIT_TEX2 BITFIELD_BIT(VERT_ATTRIB_TEX2)
#define VERT_BIT_TEX3 BITFIELD_BIT(VERT_ATTRIB_TEX3)
#define VERT_BIT_TEX4 BITFIELD_BIT(VERT_ATTRIB_TEX4)
#define VERT_BIT_TEX5 BITFIELD_BIT(VERT_ATTRIB_TEX5)
#define VERT_BIT_TEX6 BITFIELD_BIT(VERT_ATTRIB_TEX6)
#define VERT_BIT_TEX7 BITFIELD_BIT(VERT_ATTRIB_TEX7)
#define VERT_BIT_POINT_SIZE BITFIELD_BIT(VERT_ATTRIB_POINT_SIZE)
#define VERT_BIT_GENERIC0 BITFIELD_BIT(VERT_ATTRIB_GENERIC0)
#define VERT_BIT(i) BITFIELD_BIT(i)
#define VERT_BIT_ALL BITFIELD_RANGE(0, VERT_ATTRIB_MAX)
#define VERT_BIT_FF(i) VERT_BIT(i)
#define VERT_BIT_FF_ALL BITFIELD_RANGE(0, VERT_ATTRIB_FF_MAX)
#define VERT_BIT_TEX(i) VERT_BIT(VERT_ATTRIB_TEX(i))
#define VERT_BIT_TEX_ALL \
BITFIELD_RANGE(VERT_ATTRIB_TEX(0), VERT_ATTRIB_TEX_MAX)
#define VERT_BIT_GENERIC(i) VERT_BIT(VERT_ATTRIB_GENERIC(i))
#define VERT_BIT_GENERIC_ALL \
BITFIELD_RANGE(VERT_ATTRIB_GENERIC(0), VERT_ATTRIB_GENERIC_MAX)
#define VERT_BIT_MAT(i) VERT_BIT(VERT_ATTRIB_MAT(i))
#define VERT_BIT_MAT_ALL \
BITFIELD_RANGE(VERT_ATTRIB_MAT(0), VERT_ATTRIB_MAT_MAX)
/*@}*/
#define MAX_VARYING 32 /**< number of float[4] vectors */
/**
* Indexes for vertex shader outputs, geometry shader inputs/outputs, and
* fragment shader inputs.
*
* Note that some of these values are not available to all pipeline stages.
*
* When this enum is updated, the following code must be updated too:
* - vertResults (in prog_print.c's arb_output_attrib_string())
* - fragAttribs (in prog_print.c's arb_input_attrib_string())
* - _mesa_varying_slot_in_fs()
*/
typedef enum
{
VARYING_SLOT_POS,
VARYING_SLOT_COL0, /* COL0 and COL1 must be contiguous */
VARYING_SLOT_COL1,
VARYING_SLOT_FOGC,
VARYING_SLOT_TEX0, /* TEX0-TEX7 must be contiguous */
VARYING_SLOT_TEX1,
VARYING_SLOT_TEX2,
VARYING_SLOT_TEX3,
VARYING_SLOT_TEX4,
VARYING_SLOT_TEX5,
VARYING_SLOT_TEX6,
VARYING_SLOT_TEX7,
VARYING_SLOT_PSIZ, /* Does not appear in FS */
VARYING_SLOT_BFC0, /* Does not appear in FS */
VARYING_SLOT_BFC1, /* Does not appear in FS */
VARYING_SLOT_EDGE, /* Does not appear in FS */
VARYING_SLOT_CLIP_VERTEX, /* Does not appear in FS */
VARYING_SLOT_CLIP_DIST0,
VARYING_SLOT_CLIP_DIST1,
VARYING_SLOT_CULL_DIST0,
VARYING_SLOT_CULL_DIST1,
VARYING_SLOT_PRIMITIVE_ID, /* Does not appear in VS */
VARYING_SLOT_LAYER, /* Appears as VS or GS output */
VARYING_SLOT_VIEWPORT, /* Appears as VS or GS output */
VARYING_SLOT_FACE, /* FS only */
VARYING_SLOT_PNTC, /* FS only */
VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears as TCS output. */
VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
VARYING_SLOT_BOUNDING_BOX0, /* Only appears as TCS output. */
VARYING_SLOT_BOUNDING_BOX1, /* Only appears as TCS output. */
VARYING_SLOT_VIEW_INDEX,
VARYING_SLOT_VIEWPORT_MASK, /* Does not appear in FS */
VARYING_SLOT_VAR0, /* First generic varying slot */
/* the remaining are simply for the benefit of gl_varying_slot_name()
* and not to be construed as an upper bound:
*/
VARYING_SLOT_VAR1,
VARYING_SLOT_VAR2,
VARYING_SLOT_VAR3,
VARYING_SLOT_VAR4,
VARYING_SLOT_VAR5,
VARYING_SLOT_VAR6,
VARYING_SLOT_VAR7,
VARYING_SLOT_VAR8,
VARYING_SLOT_VAR9,
VARYING_SLOT_VAR10,
VARYING_SLOT_VAR11,
VARYING_SLOT_VAR12,
VARYING_SLOT_VAR13,
VARYING_SLOT_VAR14,
VARYING_SLOT_VAR15,
VARYING_SLOT_VAR16,
VARYING_SLOT_VAR17,
VARYING_SLOT_VAR18,
VARYING_SLOT_VAR19,
VARYING_SLOT_VAR20,
VARYING_SLOT_VAR21,
VARYING_SLOT_VAR22,
VARYING_SLOT_VAR23,
VARYING_SLOT_VAR24,
VARYING_SLOT_VAR25,
VARYING_SLOT_VAR26,
VARYING_SLOT_VAR27,
VARYING_SLOT_VAR28,
VARYING_SLOT_VAR29,
VARYING_SLOT_VAR30,
VARYING_SLOT_VAR31,
} gl_varying_slot;
#define VARYING_SLOT_MAX (VARYING_SLOT_VAR0 + MAX_VARYING)
#define VARYING_SLOT_PATCH0 (VARYING_SLOT_MAX)
#define VARYING_SLOT_TESS_MAX (VARYING_SLOT_PATCH0 + MAX_VARYING)
#define MAX_VARYINGS_INCL_PATCH (VARYING_SLOT_TESS_MAX - VARYING_SLOT_VAR0)
const char *gl_varying_slot_name(gl_varying_slot slot);
/**
* Bitflags for varying slots.
*/
/*@{*/
#define VARYING_BIT_POS BITFIELD64_BIT(VARYING_SLOT_POS)
#define VARYING_BIT_COL0 BITFIELD64_BIT(VARYING_SLOT_COL0)
#define VARYING_BIT_COL1 BITFIELD64_BIT(VARYING_SLOT_COL1)
#define VARYING_BIT_FOGC BITFIELD64_BIT(VARYING_SLOT_FOGC)
#define VARYING_BIT_TEX0 BITFIELD64_BIT(VARYING_SLOT_TEX0)
#define VARYING_BIT_TEX1 BITFIELD64_BIT(VARYING_SLOT_TEX1)
#define VARYING_BIT_TEX2 BITFIELD64_BIT(VARYING_SLOT_TEX2)
#define VARYING_BIT_TEX3 BITFIELD64_BIT(VARYING_SLOT_TEX3)
#define VARYING_BIT_TEX4 BITFIELD64_BIT(VARYING_SLOT_TEX4)
#define VARYING_BIT_TEX5 BITFIELD64_BIT(VARYING_SLOT_TEX5)
#define VARYING_BIT_TEX6 BITFIELD64_BIT(VARYING_SLOT_TEX6)
#define VARYING_BIT_TEX7 BITFIELD64_BIT(VARYING_SLOT_TEX7)
#define VARYING_BIT_TEX(U) BITFIELD64_BIT(VARYING_SLOT_TEX0 + (U))
#define VARYING_BITS_TEX_ANY BITFIELD64_RANGE(VARYING_SLOT_TEX0, \
MAX_TEXTURE_COORD_UNITS)
#define VARYING_BIT_PSIZ BITFIELD64_BIT(VARYING_SLOT_PSIZ)
#define VARYING_BIT_BFC0 BITFIELD64_BIT(VARYING_SLOT_BFC0)
#define VARYING_BIT_BFC1 BITFIELD64_BIT(VARYING_SLOT_BFC1)
#define VARYING_BITS_COLOR (VARYING_BIT_COL0 | \
VARYING_BIT_COL1 | \
VARYING_BIT_BFC0 | \
VARYING_BIT_BFC1)
#define VARYING_BIT_EDGE BITFIELD64_BIT(VARYING_SLOT_EDGE)
#define VARYING_BIT_CLIP_VERTEX BITFIELD64_BIT(VARYING_SLOT_CLIP_VERTEX)
#define VARYING_BIT_CLIP_DIST0 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)
#define VARYING_BIT_CLIP_DIST1 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)
#define VARYING_BIT_CULL_DIST0 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)
#define VARYING_BIT_CULL_DIST1 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)
#define VARYING_BIT_PRIMITIVE_ID BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_ID)
#define VARYING_BIT_LAYER BITFIELD64_BIT(VARYING_SLOT_LAYER)
#define VARYING_BIT_VIEWPORT BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)
#define VARYING_BIT_FACE BITFIELD64_BIT(VARYING_SLOT_FACE)
#define VARYING_BIT_PNTC BITFIELD64_BIT(VARYING_SLOT_PNTC)
#define VARYING_BIT_TESS_LEVEL_OUTER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_OUTER)
#define VARYING_BIT_TESS_LEVEL_INNER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_INNER)
#define VARYING_BIT_BOUNDING_BOX0 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX0)
#define VARYING_BIT_BOUNDING_BOX1 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX1)
#define VARYING_BIT_VIEWPORT_MASK BITFIELD64_BIT(VARYING_SLOT_VIEWPORT_MASK)
#define VARYING_BIT_VAR(V) BITFIELD64_BIT(VARYING_SLOT_VAR0 + (V))
/*@}*/
/**
* Bitflags for system values.
*/
#define SYSTEM_BIT_SAMPLE_ID ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_ID)
#define SYSTEM_BIT_SAMPLE_POS ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_POS)
#define SYSTEM_BIT_SAMPLE_MASK_IN ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_MASK_IN)
#define SYSTEM_BIT_LOCAL_INVOCATION_ID ((uint64_t)1 << SYSTEM_VALUE_LOCAL_INVOCATION_ID)
/**
* If the gl_register_file is PROGRAM_SYSTEM_VALUE, the register index will be
* one of these values. If a NIR variable's mode is nir_var_system_value, it
* will be one of these values.
*/
typedef enum
{
/**
* \name System values applicable to all shaders
*/
/*@{*/
/**
* Builtin variables added by GL_ARB_shader_ballot.
*/
/*@{*/
/**
* From the GL_ARB_shader-ballot spec:
*
* "A sub-group is a collection of invocations which execute in lockstep.
* The variable <gl_SubGroupSizeARB> is the maximum number of
* invocations in a sub-group. The maximum <gl_SubGroupSizeARB>
* supported in this extension is 64."
*
* The spec defines this as a uniform. However, it's highly unlikely that
* implementations actually treat it as a uniform (which is loaded from a
* constant buffer). Most likely, this is an implementation-wide constant,
* or perhaps something that depends on the shader stage.
*/
SYSTEM_VALUE_SUBGROUP_SIZE,
/**
* From the GL_ARB_shader_ballot spec:
*
* "The variable <gl_SubGroupInvocationARB> holds the index of the
* invocation within sub-group. This variable is in the range 0 to
* <gl_SubGroupSizeARB>-1, where <gl_SubGroupSizeARB> is the total
* number of invocations in a sub-group."
*/
SYSTEM_VALUE_SUBGROUP_INVOCATION,
/**
* From the GL_ARB_shader_ballot spec:
*
* "The <gl_SubGroup??MaskARB> variables provide a bitmask for all
* invocations, with one bit per invocation starting with the least
* significant bit, according to the following table,
*
* variable equation for bit values
* -------------------- ------------------------------------
* gl_SubGroupEqMaskARB bit index == gl_SubGroupInvocationARB
* gl_SubGroupGeMaskARB bit index >= gl_SubGroupInvocationARB
* gl_SubGroupGtMaskARB bit index > gl_SubGroupInvocationARB
* gl_SubGroupLeMaskARB bit index <= gl_SubGroupInvocationARB
* gl_SubGroupLtMaskARB bit index < gl_SubGroupInvocationARB
*/
SYSTEM_VALUE_SUBGROUP_EQ_MASK,
SYSTEM_VALUE_SUBGROUP_GE_MASK,
SYSTEM_VALUE_SUBGROUP_GT_MASK,
SYSTEM_VALUE_SUBGROUP_LE_MASK,
SYSTEM_VALUE_SUBGROUP_LT_MASK,
/*@}*/
/**
* Builtin variables added by VK_KHR_subgroups
*/
/*@{*/
SYSTEM_VALUE_NUM_SUBGROUPS,
SYSTEM_VALUE_SUBGROUP_ID,
/*@}*/
/*@}*/
/**
* \name Vertex shader system values
*/
/*@{*/
/**
* OpenGL-style vertex ID.
*
* Section 2.11.7 (Shader Execution), subsection Shader Inputs, of the
* OpenGL 3.3 core profile spec says:
*
* "gl_VertexID holds the integer index i implicitly passed by
* DrawArrays or one of the other drawing commands defined in section
* 2.8.3."
*
* Section 2.8.3 (Drawing Commands) of the same spec says:
*
* "The commands....are equivalent to the commands with the same base
* name (without the BaseVertex suffix), except that the ith element
* transferred by the corresponding draw call will be taken from
* element indices[i] + basevertex of each enabled array."
*
* Additionally, the overview in the GL_ARB_shader_draw_parameters spec
* says:
*
* "In unextended GL, vertex shaders have inputs named gl_VertexID and
* gl_InstanceID, which contain, respectively the index of the vertex
* and instance. The value of gl_VertexID is the implicitly passed
* index of the vertex being processed, which includes the value of
* baseVertex, for those commands that accept it."
*
* gl_VertexID gets basevertex added in. This differs from DirectX where
* SV_VertexID does \b not get basevertex added in.
*
* \note
* If all system values are available, \c SYSTEM_VALUE_VERTEX_ID will be
* equal to \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus
* \c SYSTEM_VALUE_BASE_VERTEX.
*
* \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_BASE_VERTEX
*/
SYSTEM_VALUE_VERTEX_ID,
/**
* Instanced ID as supplied to gl_InstanceID
*
* Values assigned to gl_InstanceID always begin with zero, regardless of
* the value of baseinstance.
*
* Section 11.1.3.9 (Shader Inputs) of the OpenGL 4.4 core profile spec
* says:
*
* "gl_InstanceID holds the integer instance number of the current
* primitive in an instanced draw call (see section 10.5)."
*
* Through a big chain of pseudocode, section 10.5 describes that
* baseinstance is not counted by gl_InstanceID. In that section, notice
*
* "If an enabled vertex attribute array is instanced (it has a
* non-zero divisor as specified by VertexAttribDivisor), the element
* index that is transferred to the GL, for all vertices, is given by
*
* floor(instance/divisor) + baseinstance
*
* If an array corresponding to an attribute required by a vertex
* shader is not enabled, then the corresponding element is taken from
* the current attribute state (see section 10.2)."
*
* Note that baseinstance is \b not included in the value of instance.
*/
SYSTEM_VALUE_INSTANCE_ID,
/**
* Vulkan InstanceIndex.
*
* InstanceIndex = gl_InstanceID + gl_BaseInstance
*/
SYSTEM_VALUE_INSTANCE_INDEX,
/**
* DirectX-style vertex ID.
*
* Unlike \c SYSTEM_VALUE_VERTEX_ID, this system value does \b not include
* the value of basevertex.
*
* \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_BASE_VERTEX
*/
SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
/**
* Value of \c basevertex passed to \c glDrawElementsBaseVertex and similar
* functions.
*
* \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE
*/
SYSTEM_VALUE_BASE_VERTEX,
/**
* Depending on the type of the draw call (indexed or non-indexed),
* is the value of \c basevertex passed to \c glDrawElementsBaseVertex and
* similar, or is the value of \c first passed to \c glDrawArrays and
* similar.
*
* \note
* It can be used to calculate the \c SYSTEM_VALUE_VERTEX_ID as
* \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus \c SYSTEM_VALUE_FIRST_VERTEX.
*
* \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_VERTEX_ID
*/
SYSTEM_VALUE_FIRST_VERTEX,
/**
* If the Draw command used to start the rendering was an indexed draw
* or not (~0/0). Useful to calculate \c SYSTEM_VALUE_BASE_VERTEX as
* \c SYSTEM_VALUE_IS_INDEXED_DRAW & \c SYSTEM_VALUE_FIRST_VERTEX.
*/
SYSTEM_VALUE_IS_INDEXED_DRAW,
/**
* Value of \c baseinstance passed to instanced draw entry points
*
* \sa SYSTEM_VALUE_INSTANCE_ID
*/
SYSTEM_VALUE_BASE_INSTANCE,
/**
* From _ARB_shader_draw_parameters:
*
* "Additionally, this extension adds a further built-in variable,
* gl_DrawID to the shading language. This variable contains the index
* of the draw currently being processed by a Multi* variant of a
* drawing command (such as MultiDrawElements or
* MultiDrawArraysIndirect)."
*
* If GL_ARB_multi_draw_indirect is not supported, this is always 0.
*/
SYSTEM_VALUE_DRAW_ID,
/*@}*/
/**
* \name Geometry shader system values
*/
/*@{*/
SYSTEM_VALUE_INVOCATION_ID, /**< (Also in Tessellation Control shader) */
/*@}*/
/**
* \name Fragment shader system values
*/
/*@{*/
SYSTEM_VALUE_FRAG_COORD,
SYSTEM_VALUE_POINT_COORD,
SYSTEM_VALUE_FRONT_FACE,
SYSTEM_VALUE_SAMPLE_ID,
SYSTEM_VALUE_SAMPLE_POS,
SYSTEM_VALUE_SAMPLE_MASK_IN,
SYSTEM_VALUE_HELPER_INVOCATION,
SYSTEM_VALUE_COLOR0,
SYSTEM_VALUE_COLOR1,
/*@}*/
/**
* \name Tessellation Evaluation shader system values
*/
/*@{*/
SYSTEM_VALUE_TESS_COORD,
SYSTEM_VALUE_VERTICES_IN, /**< Tessellation vertices in input patch */
SYSTEM_VALUE_PRIMITIVE_ID,
SYSTEM_VALUE_TESS_LEVEL_OUTER, /**< TES input */
SYSTEM_VALUE_TESS_LEVEL_INNER, /**< TES input */
SYSTEM_VALUE_TESS_LEVEL_OUTER_DEFAULT, /**< TCS input for passthru TCS */
SYSTEM_VALUE_TESS_LEVEL_INNER_DEFAULT, /**< TCS input for passthru TCS */
/*@}*/
/**
* \name Compute shader system values
*/
/*@{*/
SYSTEM_VALUE_LOCAL_INVOCATION_ID,
SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
SYSTEM_VALUE_GLOBAL_INVOCATION_ID,
SYSTEM_VALUE_GLOBAL_INVOCATION_INDEX,
SYSTEM_VALUE_WORK_GROUP_ID,
SYSTEM_VALUE_NUM_WORK_GROUPS,
SYSTEM_VALUE_LOCAL_GROUP_SIZE,
SYSTEM_VALUE_GLOBAL_GROUP_SIZE,
SYSTEM_VALUE_WORK_DIM,
SYSTEM_VALUE_USER_DATA_AMD,
/*@}*/
/** Required for VK_KHR_device_group */
SYSTEM_VALUE_DEVICE_INDEX,
/** Required for VK_KHX_multiview */
SYSTEM_VALUE_VIEW_INDEX,
/**
* Driver internal vertex-count, used (for example) for drivers to
* calculate stride for stream-out outputs. Not externally visible.
*/
SYSTEM_VALUE_VERTEX_CNT,
/**
* Required for AMD_shader_explicit_vertex_parameter and also used for
* varying-fetch instructions.
*
* The _SIZE value is "primitive size", used to scale i/j in primitive
* space to pixel space.
*/
SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE,
SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID,
SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE,
SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL,
SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID,
SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE,
SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL,
/**
* IR3 specific geometry shader and tesselation control shader system
* values that packs invocation id, thread id and vertex id. Having this
* as a nir level system value lets us do the unpacking in nir.
*/
SYSTEM_VALUE_GS_HEADER_IR3,
SYSTEM_VALUE_TCS_HEADER_IR3,
SYSTEM_VALUE_MAX /**< Number of values */
} gl_system_value;
const char *gl_system_value_name(gl_system_value sysval);
/**
* The possible interpolation qualifiers that can be applied to a fragment
* shader input in GLSL.
*
* Note: INTERP_MODE_NONE must be 0 so that memsetting the
* ir_variable data structure to 0 causes the default behavior.
*/
enum glsl_interp_mode
{
INTERP_MODE_NONE = 0,
INTERP_MODE_SMOOTH,
INTERP_MODE_FLAT,
INTERP_MODE_NOPERSPECTIVE,
INTERP_MODE_EXPLICIT,
INTERP_MODE_COUNT /**< Number of interpolation qualifiers */
};
enum glsl_interface_packing {
GLSL_INTERFACE_PACKING_STD140,
GLSL_INTERFACE_PACKING_SHARED,
GLSL_INTERFACE_PACKING_PACKED,
GLSL_INTERFACE_PACKING_STD430
};
const char *glsl_interp_mode_name(enum glsl_interp_mode qual);
/**
* Fragment program results
*/
typedef enum
{
FRAG_RESULT_DEPTH = 0,
FRAG_RESULT_STENCIL = 1,
/* If a single color should be written to all render targets, this
* register is written. No FRAG_RESULT_DATAn will be written.
*/
FRAG_RESULT_COLOR = 2,
FRAG_RESULT_SAMPLE_MASK = 3,
/* FRAG_RESULT_DATAn are the per-render-target (GLSL gl_FragData[n]
* or ARB_fragment_program fragment.color[n]) color results. If
* any are written, FRAG_RESULT_COLOR will not be written.
* FRAG_RESULT_DATA1 and up are simply for the benefit of
* gl_frag_result_name() and not to be construed as an upper bound
*/
FRAG_RESULT_DATA0 = 4,
FRAG_RESULT_DATA1,
FRAG_RESULT_DATA2,
FRAG_RESULT_DATA3,
FRAG_RESULT_DATA4,
FRAG_RESULT_DATA5,
FRAG_RESULT_DATA6,
FRAG_RESULT_DATA7,
} gl_frag_result;
const char *gl_frag_result_name(gl_frag_result result);
#define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)
/**
* \brief Layout qualifiers for gl_FragDepth.
*
* Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with
* a layout qualifier.
*
* \see enum ir_depth_layout
*/
enum gl_frag_depth_layout
{
FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */
FRAG_DEPTH_LAYOUT_ANY,
FRAG_DEPTH_LAYOUT_GREATER,
FRAG_DEPTH_LAYOUT_LESS,
FRAG_DEPTH_LAYOUT_UNCHANGED
};
/**
* \brief Buffer access qualifiers
*/
enum gl_access_qualifier
{
ACCESS_COHERENT = (1 << 0),
ACCESS_RESTRICT = (1 << 1),
ACCESS_VOLATILE = (1 << 2),
ACCESS_NON_READABLE = (1 << 3),
ACCESS_NON_WRITEABLE = (1 << 4),
/** The access may use a non-uniform buffer or image index */
ACCESS_NON_UNIFORM = (1 << 5),
/* This has the same semantics as NIR_INTRINSIC_CAN_REORDER, only to be
* used with loads. In other words, it means that the load can be
* arbitrarily reordered, or combined with other loads to the same address.
* It is implied by ACCESS_NON_WRITEABLE together with ACCESS_RESTRICT, and
* a lack of ACCESS_COHERENT and ACCESS_VOLATILE.
*/
ACCESS_CAN_REORDER = (1 << 6),
/** Use as little cache space as possible. */
ACCESS_STREAM_CACHE_POLICY = (1 << 7),
};
/**
* \brief Blend support qualifiers
*/
enum gl_advanced_blend_mode
{
BLEND_NONE = 0x0000,
BLEND_MULTIPLY = 0x0001,
BLEND_SCREEN = 0x0002,
BLEND_OVERLAY = 0x0004,
BLEND_DARKEN = 0x0008,
BLEND_LIGHTEN = 0x0010,
BLEND_COLORDODGE = 0x0020,
BLEND_COLORBURN = 0x0040,
BLEND_HARDLIGHT = 0x0080,
BLEND_SOFTLIGHT = 0x0100,
BLEND_DIFFERENCE = 0x0200,
BLEND_EXCLUSION = 0x0400,
BLEND_HSL_HUE = 0x0800,
BLEND_HSL_SATURATION = 0x1000,
BLEND_HSL_COLOR = 0x2000,
BLEND_HSL_LUMINOSITY = 0x4000,
BLEND_ALL = 0x7fff,
};
enum blend_func
{
BLEND_FUNC_ADD,
BLEND_FUNC_SUBTRACT,
BLEND_FUNC_REVERSE_SUBTRACT,
BLEND_FUNC_MIN,
BLEND_FUNC_MAX,
};
enum blend_factor
{
BLEND_FACTOR_ZERO,
BLEND_FACTOR_SRC_COLOR,
BLEND_FACTOR_DST_COLOR,
BLEND_FACTOR_SRC_ALPHA,
BLEND_FACTOR_DST_ALPHA,
BLEND_FACTOR_CONSTANT_COLOR,
BLEND_FACTOR_CONSTANT_ALPHA,
BLEND_FACTOR_SRC_ALPHA_SATURATE,
};
enum gl_tess_spacing
{
TESS_SPACING_UNSPECIFIED,
TESS_SPACING_EQUAL,
TESS_SPACING_FRACTIONAL_ODD,
TESS_SPACING_FRACTIONAL_EVEN,
};
/**
* A compare function enum for use in compiler lowering passes. This is in
* the same order as GL's compare functions (shifted down by GL_NEVER), and is
* exactly the same as gallium's PIPE_FUNC_*.
*/
enum compare_func
{
COMPARE_FUNC_NEVER,
COMPARE_FUNC_LESS,
COMPARE_FUNC_EQUAL,
COMPARE_FUNC_LEQUAL,
COMPARE_FUNC_GREATER,
COMPARE_FUNC_NOTEQUAL,
COMPARE_FUNC_GEQUAL,
COMPARE_FUNC_ALWAYS,
};
/**
* Arrangements for grouping invocations from NV_compute_shader_derivatives.
*
* The extension provides new layout qualifiers that support two different
* arrangements of compute shader invocations for the purpose of derivative
* computation. When specifying
*
* layout(derivative_group_quadsNV) in;
*
* compute shader invocations are grouped into 2x2x1 arrays whose four local
* invocation ID values follow the pattern:
*
* +-----------------+------------------+
* | (2x+0, 2y+0, z) | (2x+1, 2y+0, z) |
* +-----------------+------------------+
* | (2x+0, 2y+1, z) | (2x+1, 2y+1, z) |
* +-----------------+------------------+
*
* where Y increases from bottom to top. When specifying
*
* layout(derivative_group_linearNV) in;
*
* compute shader invocations are grouped into 2x2x1 arrays whose four local
* invocation index values follow the pattern:
*
* +------+------+
* | 4n+0 | 4n+1 |
* +------+------+
* | 4n+2 | 4n+3 |
* +------+------+
*
* If neither layout qualifier is specified, derivatives in compute shaders
* return zero, which is consistent with the handling of built-in texture
* functions like texture() in GLSL 4.50 compute shaders.
*/
enum gl_derivative_group {
DERIVATIVE_GROUP_NONE = 0,
DERIVATIVE_GROUP_QUADS,
DERIVATIVE_GROUP_LINEAR,
};
enum float_controls
{
FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE = 0x0000,
FLOAT_CONTROLS_DENORM_PRESERVE_FP16 = 0x0001,
FLOAT_CONTROLS_DENORM_PRESERVE_FP32 = 0x0002,
FLOAT_CONTROLS_DENORM_PRESERVE_FP64 = 0x0004,
FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 = 0x0008,
FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32 = 0x0010,
FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64 = 0x0020,
FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 = 0x0040,
FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32 = 0x0080,
FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64 = 0x0100,
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 = 0x0200,
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 = 0x0400,
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64 = 0x0800,
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 = 0x1000,
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 = 0x2000,
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 = 0x4000,
};
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* SHADER_ENUMS_H */

View File

@@ -1,261 +0,0 @@
/*
* Mesa 3-D graphics library
*
* Copyright (C) 2006 Brian Paul All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* \file bitset.h
* \brief Bitset of arbitrary size definitions.
* \author Michal Krol
*/
#ifndef BITSET_H
#define BITSET_H
//#include "util/bitscan.h"
//#include "util/macros.h"
/****************************************************************************
* generic bitset implementation
*/
#define BITSET_WORD unsigned int
#define BITSET_WORDBITS (sizeof (BITSET_WORD) * 8)
/* bitset declarations
*/
#define BITSET_WORDS(bits) (((bits) + BITSET_WORDBITS - 1) / BITSET_WORDBITS)
#define BITSET_DECLARE(name, bits) BITSET_WORD name[BITSET_WORDS(bits)]
/* bitset operations
*/
#define BITSET_COPY(x, y) memcpy( (x), (y), sizeof (x) )
#define BITSET_EQUAL(x, y) (memcmp( (x), (y), sizeof (x) ) == 0)
#define BITSET_ZERO(x) memset( (x), 0, sizeof (x) )
#define BITSET_ONES(x) memset( (x), 0xff, sizeof (x) )
#define BITSET_BITWORD(b) ((b) / BITSET_WORDBITS)
#define BITSET_BIT(b) (1u << ((b) % BITSET_WORDBITS))
/* single bit operations
*/
#define BITSET_TEST(x, b) (((x)[BITSET_BITWORD(b)] & BITSET_BIT(b)) != 0)
#define BITSET_SET(x, b) ((x)[BITSET_BITWORD(b)] |= BITSET_BIT(b))
#define BITSET_CLEAR(x, b) ((x)[BITSET_BITWORD(b)] &= ~BITSET_BIT(b))
#define BITSET_MASK(b) (((b) % BITSET_WORDBITS == 0) ? ~0 : BITSET_BIT(b) - 1)
#define BITSET_RANGE(b, e) ((BITSET_MASK((e) + 1)) & ~(BITSET_BIT(b) - 1))
/* bit range operations
*/
#define BITSET_TEST_RANGE(x, b, e) \
(BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
(((x)[BITSET_BITWORD(b)] & BITSET_RANGE(b, e)) != 0) : \
(assert (!"BITSET_TEST_RANGE: bit range crosses word boundary"), 0))
#define BITSET_SET_RANGE(x, b, e) \
(BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
((x)[BITSET_BITWORD(b)] |= BITSET_RANGE(b, e)) : \
(assert (!"BITSET_SET_RANGE: bit range crosses word boundary"), 0))
#define BITSET_CLEAR_RANGE(x, b, e) \
(BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
((x)[BITSET_BITWORD(b)] &= ~BITSET_RANGE(b, e)) : \
(assert (!"BITSET_CLEAR_RANGE: bit range crosses word boundary"), 0))
/* Get first bit set in a bitset.
*/
static inline int
__bitset_ffs(const BITSET_WORD *x, int n)
{
int i;
for (i = 0; i < n; i++) {
if (x[i])
return ffs(x[i]) + BITSET_WORDBITS * i;
}
return 0;
}
#define BITSET_FFS(x) __bitset_ffs(x, ARRAY_SIZE(x))
static inline unsigned
__bitset_next_set(unsigned i, BITSET_WORD *tmp,
const BITSET_WORD *set, unsigned size)
{
unsigned bit, word;
/* NOTE: The initial conditions for this function are very specific. At
* the start of the loop, the tmp variable must be set to *set and the
* initial i value set to 0. This way, if there is a bit set in the first
* word, we ignore the i-value and just grab that bit (so 0 is ok, even
* though 0 may be returned). If the first word is 0, then the value of
* `word` will be 0 and we will go on to look at the second word.
*/
word = BITSET_BITWORD(i);
while (*tmp == 0) {
word++;
if (word >= BITSET_WORDS(size))
return size;
*tmp = set[word];
}
/* Find the next set bit in the non-zero word */
bit = ffs(*tmp) - 1;
/* Unset the bit */
*tmp &= ~(1ull << bit);
return word * BITSET_WORDBITS + bit;
}
/**
* Iterates over each set bit in a set
*
* @param __i iteration variable, bit number
* @param __set the bitset to iterate (will not be modified)
* @param __size number of bits in the set to consider
*/
#define BITSET_FOREACH_SET(__i, __set, __size) \
for (BITSET_WORD __tmp = *(__set), *__foo = &__tmp; __foo != NULL; __foo = NULL) \
for (__i = 0; \
(__i = __bitset_next_set(__i, &__tmp, __set, __size)) < __size;)
#ifdef __cplusplus
/**
* Simple C++ wrapper of a bitset type of static size, with value semantics
* and basic bitwise arithmetic operators. The operators defined below are
* expected to have the same semantics as the same operator applied to other
* fundamental integer types. T is the name of the struct to instantiate
* it as, and N is the number of bits in the bitset.
*/
#define DECLARE_BITSET_T(T, N) struct T { \
EXPLICIT_CONVERSION \
operator bool() const \
{ \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
if (words[i]) \
return true; \
return false; \
} \
\
T & \
operator=(int x) \
{ \
const T c = {{ (BITSET_WORD)x }}; \
return *this = c; \
} \
\
friend bool \
operator==(const T &b, const T &c) \
{ \
return BITSET_EQUAL(b.words, c.words); \
} \
\
friend bool \
operator!=(const T &b, const T &c) \
{ \
return !(b == c); \
} \
\
friend bool \
operator==(const T &b, int x) \
{ \
const T c = {{ (BITSET_WORD)x }}; \
return b == c; \
} \
\
friend bool \
operator!=(const T &b, int x) \
{ \
return !(b == x); \
} \
\
friend T \
operator~(const T &b) \
{ \
T c; \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
c.words[i] = ~b.words[i]; \
return c; \
} \
\
T & \
operator|=(const T &b) \
{ \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
words[i] |= b.words[i]; \
return *this; \
} \
\
friend T \
operator|(const T &b, const T &c) \
{ \
T d = b; \
d |= c; \
return d; \
} \
\
T & \
operator&=(const T &b) \
{ \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
words[i] &= b.words[i]; \
return *this; \
} \
\
friend T \
operator&(const T &b, const T &c) \
{ \
T d = b; \
d &= c; \
return d; \
} \
\
bool \
test(unsigned i) const \
{ \
return BITSET_TEST(words, i); \
} \
\
T & \
set(unsigned i) \
{ \
BITSET_SET(words, i); \
return *this; \
} \
\
T & \
clear(unsigned i) \
{ \
BITSET_CLEAR(words, i); \
return *this; \
} \
\
BITSET_WORD words[BITSET_WORDS(N)]; \
}
#endif
#endif

View File

@@ -1,262 +0,0 @@
/**************************************************************************
*
* Copyright 2006 VMware, Inc., Bismarck, ND. USA.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
**************************************************************************/
/**
* \file
* List macros heavily inspired by the Linux kernel
* list handling. No list looping yet.
*
* Is not threadsafe, so common operations need to
* be protected using an external mutex.
*/
#ifndef _UTIL_LIST_H_
#define _UTIL_LIST_H_
#include <stdbool.h>
#include <stddef.h>
#include <assert.h>
#ifdef DEBUG
# define list_assert(cond, msg) assert(cond && msg)
#else
# define list_assert(cond, msg) (void)(0 && (cond))
#endif
struct list_head
{
struct list_head *prev;
struct list_head *next;
};
static inline void list_inithead(struct list_head *item)
{
item->prev = item;
item->next = item;
}
static inline void list_add(struct list_head *item, struct list_head *list)
{
item->prev = list;
item->next = list->next;
list->next->prev = item;
list->next = item;
}
static inline void list_addtail(struct list_head *item, struct list_head *list)
{
item->next = list;
item->prev = list->prev;
list->prev->next = item;
list->prev = item;
}
static inline bool list_is_empty(const struct list_head *list);
static inline void list_replace(struct list_head *from, struct list_head *to)
{
if (list_is_empty(from)) {
list_inithead(to);
} else {
to->prev = from->prev;
to->next = from->next;
from->next->prev = to;
from->prev->next = to;
}
}
static inline void list_del(struct list_head *item)
{
item->prev->next = item->next;
item->next->prev = item->prev;
item->prev = item->next = NULL;
}
static inline void list_delinit(struct list_head *item)
{
item->prev->next = item->next;
item->next->prev = item->prev;
item->next = item;
item->prev = item;
}
static inline bool list_is_empty(const struct list_head *list)
{
return list->next == list;
}
/**
* Returns whether the list has exactly one element.
*/
static inline bool list_is_singular(const struct list_head *list)
{
return list->next != NULL && list->next != list && list->next->next == list;
}
static inline unsigned list_length(const struct list_head *list)
{
struct list_head *node;
unsigned length = 0;
for (node = list->next; node != list; node = node->next)
length++;
return length;
}
static inline void list_splice(struct list_head *src, struct list_head *dst)
{
if (list_is_empty(src))
return;
src->next->prev = dst;
src->prev->next = dst->next;
dst->next->prev = src->prev;
dst->next = src->next;
}
static inline void list_splicetail(struct list_head *src, struct list_head *dst)
{
if (list_is_empty(src))
return;
src->prev->next = dst;
src->next->prev = dst->prev;
dst->prev->next = src->next;
dst->prev = src->prev;
}
static inline void list_validate(const struct list_head *list)
{
struct list_head *node;
assert(list->next->prev == list && list->prev->next == list);
for (node = list->next; node != list; node = node->next)
assert(node->next->prev == node && node->prev->next == node);
}
#define LIST_ENTRY(__type, __item, __field) \
((__type *)(((char *)(__item)) - offsetof(__type, __field)))
/**
* Cast from a pointer to a member of a struct back to the containing struct.
*
* 'sample' MUST be initialized, or else the result is undefined!
*/
#ifndef container_of
#define container_of(ptr, sample, member) \
(void *)((char *)(ptr) \
- ((char *)&(sample)->member - (char *)(sample)))
#endif
#define list_first_entry(ptr, type, member) \
LIST_ENTRY(type, (ptr)->next, member)
#define list_last_entry(ptr, type, member) \
LIST_ENTRY(type, (ptr)->prev, member)
#define LIST_FOR_EACH_ENTRY(pos, head, member) \
for (pos = NULL, pos = container_of((head)->next, pos, member); \
&pos->member != (head); \
pos = container_of(pos->member.next, pos, member))
#define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member) \
for (pos = NULL, pos = container_of((head)->next, pos, member), \
storage = container_of(pos->member.next, pos, member); \
&pos->member != (head); \
pos = storage, storage = container_of(storage->member.next, storage, member))
#define LIST_FOR_EACH_ENTRY_SAFE_REV(pos, storage, head, member) \
for (pos = NULL, pos = container_of((head)->prev, pos, member), \
storage = container_of(pos->member.prev, pos, member); \
&pos->member != (head); \
pos = storage, storage = container_of(storage->member.prev, storage, member))
#define LIST_FOR_EACH_ENTRY_FROM(pos, start, head, member) \
for (pos = NULL, pos = container_of((start), pos, member); \
&pos->member != (head); \
pos = container_of(pos->member.next, pos, member))
#define LIST_FOR_EACH_ENTRY_FROM_REV(pos, start, head, member) \
for (pos = NULL, pos = container_of((start), pos, member); \
&pos->member != (head); \
pos = container_of(pos->member.prev, pos, member))
#define list_for_each_entry(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->next, member), \
*__next = LIST_ENTRY(type, pos->member.next, member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.next, member), \
list_assert(pos == __next, "use _safe iterator"), \
__next = LIST_ENTRY(type, __next->member.next, member))
#define list_for_each_entry_safe(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->next, member), \
*__next = LIST_ENTRY(type, pos->member.next, member); \
&pos->member != (head); \
pos = __next, \
__next = LIST_ENTRY(type, __next->member.next, member))
#define list_for_each_entry_rev(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->prev, member), \
*__prev = LIST_ENTRY(type, pos->member.prev, member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.prev, member), \
list_assert(pos == __prev, "use _safe iterator"), \
__prev = LIST_ENTRY(type, __prev->member.prev, member))
#define list_for_each_entry_safe_rev(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->prev, member), \
*__prev = LIST_ENTRY(type, pos->member.prev, member); \
&pos->member != (head); \
pos = __prev, \
__prev = LIST_ENTRY(type, __prev->member.prev, member))
#define list_for_each_entry_from(type, pos, start, head, member) \
for (type *pos = LIST_ENTRY(type, (start), member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.next, member))
#define list_for_each_entry_from_safe(type, pos, start, head, member) \
for (type *pos = LIST_ENTRY(type, (start), member), \
*__next = LIST_ENTRY(type, pos->member.next, member); \
&pos->member != (head); \
pos = __next, \
__next = LIST_ENTRY(type, __next->member.next, member))
#define list_for_each_entry_from_rev(type, pos, start, head, member) \
for (type *pos = LIST_ENTRY(type, (start), member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.prev, member))
#define list_pair_for_each_entry(type, pos1, pos2, head1, head2, member) \
for (type *pos1 = LIST_ENTRY(type, (head1)->next, member), \
*pos2 = LIST_ENTRY(type, (head2)->next, member); \
&pos1->member != (head1) && &pos2->member != (head2); \
pos1 = LIST_ENTRY(type, pos1->member.next, member), \
pos2 = LIST_ENTRY(type, pos2->member.next, member))
#endif /*_UTIL_LIST_H_*/

View File

@@ -1,346 +0,0 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef UTIL_MACROS_H
#define UTIL_MACROS_H
#include <assert.h>
/* Compute the size of an array */
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
/* For compatibility with Clang's __has_builtin() */
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif
/**
* __builtin_expect macros
*/
#if !defined(HAVE___BUILTIN_EXPECT)
# define __builtin_expect(x, y) (x)
#endif
#ifndef likely
# ifdef HAVE___BUILTIN_EXPECT
# define likely(x) __builtin_expect(!!(x), 1)
# define unlikely(x) __builtin_expect(!!(x), 0)
# else
# define likely(x) (x)
# define unlikely(x) (x)
# endif
#endif
/**
* Static (compile-time) assertion.
* Basically, use COND to dimension an array. If COND is false/zero the
* array size will be -1 and we'll get a compilation error.
*/
#define STATIC_ASSERT(COND) \
do { \
(void) sizeof(char [1 - 2*!(COND)]); \
} while (0)
/**
* Unreachable macro. Useful for suppressing "control reaches end of non-void
* function" warnings.
*/
#if defined(HAVE___BUILTIN_UNREACHABLE) || __has_builtin(__builtin_unreachable)
#define unreachable(str) \
do { \
assert(!str); \
__builtin_unreachable(); \
} while (0)
#elif defined (_MSC_VER)
#define unreachable(str) \
do { \
assert(!str); \
__assume(0); \
} while (0)
#else
#define unreachable(str) assert(!str)
#endif
/**
* Assume macro. Useful for expressing our assumptions to the compiler,
* typically for purposes of silencing warnings.
*/
#if __has_builtin(__builtin_assume)
#define assume(expr) \
do { \
assert(expr); \
__builtin_assume(expr); \
} while (0)
#elif defined HAVE___BUILTIN_UNREACHABLE
#define assume(expr) ((expr) ? ((void) 0) \
: (assert(!"assumption failed"), \
__builtin_unreachable()))
#elif defined (_MSC_VER)
#define assume(expr) __assume(expr)
#else
#define assume(expr) assert(expr)
#endif
/* Attribute const is used for functions that have no effects other than their
* return value, and only rely on the argument values to compute the return
* value. As a result, calls to it can be CSEed. Note that using memory
* pointed to by the arguments is not allowed for const functions.
*/
#ifdef HAVE_FUNC_ATTRIBUTE_CONST
#define ATTRIBUTE_CONST __attribute__((__const__))
#else
#define ATTRIBUTE_CONST
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_FLATTEN
#define FLATTEN __attribute__((__flatten__))
#else
#define FLATTEN
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_FORMAT
#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
#else
#define PRINTFLIKE(f, a)
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_MALLOC
#define MALLOCLIKE __attribute__((__malloc__))
#else
#define MALLOCLIKE
#endif
/* Forced function inlining */
/* Note: Clang also sets __GNUC__ (see other cases below) */
#ifndef ALWAYS_INLINE
# if defined(__GNUC__)
# define ALWAYS_INLINE inline __attribute__((always_inline))
# elif defined(_MSC_VER)
# define ALWAYS_INLINE __forceinline
# else
# define ALWAYS_INLINE inline
# endif
#endif
/* Used to optionally mark structures with misaligned elements or size as
* packed, to trade off performance for space.
*/
#ifdef HAVE_FUNC_ATTRIBUTE_PACKED
#define PACKED __attribute__((__packed__))
#else
#define PACKED
#endif
/* Attribute pure is used for functions that have no effects other than their
* return value. As a result, calls to it can be dead code eliminated.
*/
#ifdef HAVE_FUNC_ATTRIBUTE_PURE
#define ATTRIBUTE_PURE __attribute__((__pure__))
#else
#define ATTRIBUTE_PURE
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL
#define ATTRIBUTE_RETURNS_NONNULL __attribute__((__returns_nonnull__))
#else
#define ATTRIBUTE_RETURNS_NONNULL
#endif
#ifndef NORETURN
# ifdef _MSC_VER
# define NORETURN __declspec(noreturn)
# elif defined HAVE_FUNC_ATTRIBUTE_NORETURN
# define NORETURN __attribute__((__noreturn__))
# else
# define NORETURN
# endif
#endif
#ifdef __cplusplus
/**
* Macro function that evaluates to true if T is a trivially
* destructible type -- that is, if its (non-virtual) destructor
* performs no action and all member variables and base classes are
* trivially destructible themselves.
*/
# if (defined(__clang__) && defined(__has_feature))
# if __has_feature(has_trivial_destructor)
# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
# endif
# elif defined(__GNUC__)
# if ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)))
# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
# endif
# elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
# endif
# ifndef HAS_TRIVIAL_DESTRUCTOR
/* It's always safe (if inefficient) to assume that a
* destructor is non-trivial.
*/
# define HAS_TRIVIAL_DESTRUCTOR(T) (false)
# endif
#endif
/**
* PUBLIC/USED macros
*
* If we build the library with gcc's -fvisibility=hidden flag, we'll
* use the PUBLIC macro to mark functions that are to be exported.
*
* We also need to define a USED attribute, so the optimizer doesn't
* inline a static function that we later use in an alias. - ajax
*/
#ifndef PUBLIC
# if defined(__GNUC__)
# define PUBLIC __attribute__((visibility("default")))
# define USED __attribute__((used))
# elif defined(_MSC_VER)
# define PUBLIC __declspec(dllexport)
# define USED
# else
# define PUBLIC
# define USED
# endif
#endif
/**
* UNUSED marks variables (or sometimes functions) that have to be defined,
* but are sometimes (or always) unused beyond that. A common case is for
* a function parameter to be used in some build configurations but not others.
* Another case is fallback vfuncs that don't do anything with their params.
*
* Note that this should not be used for identifiers used in `assert()`;
* see ASSERTED below.
*/
#ifdef HAVE_FUNC_ATTRIBUTE_UNUSED
#define UNUSED __attribute__((unused))
#else
#define UNUSED
#endif
/**
* Use ASSERTED to indicate that an identifier is unused outside of an `assert()`,
* so that assert-free builds don't get "unused variable" warnings.
*/
#ifdef NDEBUG
#define ASSERTED UNUSED
#else
#define ASSERTED
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT
#define MUST_CHECK __attribute__((warn_unused_result))
#else
#define MUST_CHECK
#endif
#if defined(__GNUC__)
#define ATTRIBUTE_NOINLINE __attribute__((noinline))
#else
#define ATTRIBUTE_NOINLINE
#endif
/**
* Check that STRUCT::FIELD can hold MAXVAL. We use a lot of bitfields
* in Mesa/gallium. We have to be sure they're of sufficient size to
* hold the largest expected value.
* Note that with MSVC, enums are signed and enum bitfields need one extra
* high bit (always zero) to ensure the max value is handled correctly.
* This macro will detect that with MSVC, but not GCC.
*/
#define ASSERT_BITFIELD_SIZE(STRUCT, FIELD, MAXVAL) \
do { \
ASSERTED STRUCT s; \
s.FIELD = (MAXVAL); \
assert((int) s.FIELD == (MAXVAL) && "Insufficient bitfield size!"); \
} while (0)
/** Compute ceiling of integer quotient of A divided by B. */
#define DIV_ROUND_UP( A, B ) ( ((A) + (B) - 1) / (B) )
/** Clamp X to [MIN,MAX]. Turn NaN into MIN, arbitrarily. */
#define CLAMP( X, MIN, MAX ) ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
/** Minimum of two values: */
#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) )
/** Maximum of two values: */
#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) )
/** Minimum and maximum of three values: */
#define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C))
#define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C))
/** Align a value to a power of two */
#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))
/**
* Macro for declaring an explicit conversion operator. Defaults to an
* implicit conversion if C++11 is not supported.
*/
#if __cplusplus >= 201103L
#define EXPLICIT_CONVERSION explicit
#elif defined(__cplusplus)
#define EXPLICIT_CONVERSION
#endif
/** Set a single bit */
#define BITFIELD_BIT(b) (1u << (b))
/** Set all bits up to excluding bit b */
#define BITFIELD_MASK(b) \
((b) == 32 ? (~0u) : BITFIELD_BIT((b) % 32) - 1)
/** Set count bits starting from bit b */
#define BITFIELD_RANGE(b, count) \
(BITFIELD_MASK((b) + (count)) & ~BITFIELD_MASK(b))
/** Set a single bit */
#define BITFIELD64_BIT(b) (1ull << (b))
/** Set all bits up to excluding bit b */
#define BITFIELD64_MASK(b) \
((b) == 64 ? (~0ull) : BITFIELD64_BIT(b) - 1)
/** Set count bits starting from bit b */
#define BITFIELD64_RANGE(b, count) \
(BITFIELD64_MASK((b) + (count)) & ~BITFIELD64_MASK(b))
/* TODO: In future we should try to move this to u_debug.h once header
* dependencies are reorganised to allow this.
*/
enum pipe_debug_type
{
PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1,
PIPE_DEBUG_TYPE_ERROR,
PIPE_DEBUG_TYPE_SHADER_INFO,
PIPE_DEBUG_TYPE_PERF_INFO,
PIPE_DEBUG_TYPE_INFO,
PIPE_DEBUG_TYPE_FALLBACK,
PIPE_DEBUG_TYPE_CONFORMANCE,
};
#endif /* UTIL_MACROS_H */

View File

@@ -1,132 +0,0 @@
#include "debug/include/adreno_pm4types.h"
#define REG_A5XX_TPL1_CS_TEX_CONST_LO 0x0000e760
#define REG_A5XX_TPL1_CS_TEX_SAMP_LO 0x0000e75c
#define REG_A5XX_SP_CS_CTRL_REG0 0x0000e5f0
std::map<int, std::string> regs = {
{0x0000e760, "REG_A5XX_TPL1_CS_TEX_CONST_LO"},
{0x0000e75c, "REG_A5XX_TPL1_CS_TEX_SAMP_LO"},
{0x00000e06, "REG_A5XX_HLSQ_MODE_CNTL"},
{0x00000e91, "REG_A5XX_UCHE_CACHE_INVALIDATE_MIN_LO"},
{0x00000ec2, "REG_A5XX_SP_MODE_CNTL"},
{0x0000e580, "REG_A5XX_SP_SP_CNTL"},
{0x0000e5f0, "REG_A5XX_SP_CS_CTRL_REG0"},
{0x0000e796, "REG_A5XX_HLSQ_CS_CNTL"},
{0x0000e784, "REG_A5XX_HLSQ_CONTROL_0_REG"},
{0x0000e7b0, "REG_A5XX_HLSQ_CS_NDRANGE_0"},
{0x0000e7b9, "REG_A5XX_HLSQ_CS_KERNEL_GROUP_X"},
{0x00000cdd, "REG_A5XX_VSC_RESOLVE_CNTL"},
};
std::map<int, std::string> ops = {
{33, "CP_REG_RMW"},
{62, "CP_REG_TO_MEM"},
{49, "CP_RUN_OPENCL"},
{16, "CP_NOP"},
{38, "CP_WAIT_FOR_IDLE"},
{110, "CP_COMPUTE_CHECKPOINT"},
{48, "CP_LOAD_STATE"},
};
void CachedCommand::disassemble() {
uint32_t *src = (uint32_t *)cmds[1].gpuaddr;
int len = cmds[1].size/4;
printf("disassemble %p %d\n", src, len);
int i = 0;
while (i < len) {
int pktsize;
int pkttype = -1;
if (pkt_is_type0(src[i])) {
pkttype = 0;
pktsize = type0_pkt_size(src[i]);
} else if (pkt_is_type3(src[i])) {
pkttype = 3;
pktsize = type3_pkt_size(src[i]);
} else if (pkt_is_type4(src[i])) {
pkttype = 4;
pktsize = type4_pkt_size(src[i]);
} else if (pkt_is_type7(src[i])) {
pkttype = 7;
pktsize = type7_pkt_size(src[i]);
}
printf("%3d: type:%d size:%d ", i, pkttype, pktsize);
if (pkttype == 7) {
int op = cp_type7_opcode(src[i]);
if (ops.find(op) != ops.end()) {
printf("%-40s ", ops[op].c_str());
} else {
printf("op: %4d ", op);
}
}
if (pkttype == 4) {
int reg = cp_type4_base_index_one_reg_wr(src[i]);
if (regs.find(reg) != regs.end()) {
printf("%-40s ", regs[reg].c_str());
} else {
printf("reg: %4x ", reg);
}
}
for (int j = 0; j < pktsize+1; j++) {
printf("%8.8X ", src[i+j]);
}
printf("\n");
uint64_t addr;
if (pkttype == 7) {
switch (cp_type7_opcode(src[i])) {
case CP_LOAD_STATE:
int dst_off = src[i+1] & 0x1FFF;
int state_src = (src[i+1] >> 16) & 3;
int state_block = (src[i+1] >> 18) & 7;
int state_type = src[i+2] & 3;
int num_unit = (src[i+1] & 0xffc00000) >> 22;
printf(" dst_off: %x state_src: %d state_block: %d state_type: %d num_unit: %d\n",
dst_off, state_src, state_block, state_type, num_unit);
addr = (uint64_t)(src[i+2] & 0xfffffffc) | ((uint64_t)(src[i+3]) << 32);
if (state_block == 5 && state_type == 0) {
if (!(addr&0xFFF)) {
int len = 0x1000;
if (num_unit >= 32) len += 0x1000;
//hexdump((uint32_t *)addr, len);
char fn[0x100];
snprintf(fn, sizeof(fn), "/tmp/0x%lx.shader", addr);
printf("dumping %s\n", fn);
FILE *f = fopen(fn, "wb");
// groups of 16 instructions
fwrite((void*)addr, 1, len, f);
fclose(f);
}
}
break;
}
}
/*if (pkttype == 4) {
switch (cp_type4_base_index_one_reg_wr(src[i])) {
case REG_A5XX_SP_CS_CTRL_REG0:
addr = (uint64_t)(src[i+4] & 0xfffffffc) | ((uint64_t)(src[i+5]) << 32);
hexdump((uint32_t *)addr, 0x1000);
break;
}
}*/
/*if (pkttype == 4 && cp_type4_base_index_one_reg_wr(src[i]) == REG_A5XX_TPL1_CS_TEX_CONST_LO) {
uint64_t addr = (uint64_t)(src[i+1] & 0xffffffff) | ((uint64_t)(src[i+2]) << 32);
hexdump((uint32_t *)addr, 0x40);
}
if (pkttype == 4 && cp_type4_base_index_one_reg_wr(src[i]) == REG_A5XX_TPL1_CS_TEX_SAMP_LO) {
uint64_t addr = (uint64_t)(src[i+1] & 0xffffffff) | ((uint64_t)(src[i+2]) << 32);
hexdump((uint32_t *)addr, 0x40);
}*/
if (pkttype == -1) break;
i += (1+pktsize);
}
assert(i == len);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,473 +0,0 @@
/* Copyright (c) 2002,2007-2015, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#ifndef __ADRENO_PM4TYPES_H
#define __ADRENO_PM4TYPES_H
//#include "adreno.h"
#define CP_PKT_MASK 0xc0000000
#define CP_TYPE0_PKT ((unsigned int)0 << 30)
#define CP_TYPE3_PKT ((unsigned int)3 << 30)
#define CP_TYPE4_PKT ((unsigned int)4 << 28)
#define CP_TYPE7_PKT ((unsigned int)7 << 28)
#define PM4_TYPE4_PKT_SIZE_MAX 128
/* type3 packets */
/* Enable preemption flag */
#define CP_PREEMPT_ENABLE 0x1C
/* Preemption token command on which preemption occurs */
#define CP_PREEMPT_TOKEN 0x1E
/* Bit to set in CP_PREEMPT_TOKEN ordinal for interrupt on preemption */
#define CP_PREEMPT_ORDINAL_INTERRUPT 24
/* copy from ME scratch RAM to a register */
#define CP_SCRATCH_TO_REG 0x4d
/* Copy from REG to ME scratch RAM */
#define CP_REG_TO_SCRATCH 0x4a
/* Wait for memory writes to complete */
#define CP_WAIT_MEM_WRITES 0x12
/* Conditional execution based on register comparison */
#define CP_COND_REG_EXEC 0x47
/* Memory to REG copy */
#define CP_MEM_TO_REG 0x42
/* initialize CP's micro-engine */
#define CP_ME_INIT 0x48
/* skip N 32-bit words to get to the next packet */
#define CP_NOP 0x10
/* indirect buffer dispatch. same as IB, but init is pipelined */
#define CP_INDIRECT_BUFFER_PFD 0x37
/* wait for the IDLE state of the engine */
#define CP_WAIT_FOR_IDLE 0x26
/* wait until a register or memory location is a specific value */
#define CP_WAIT_REG_MEM 0x3c
/* wait until a register location is equal to a specific value */
#define CP_WAIT_REG_EQ 0x52
/* switches SMMU pagetable, used on a5xx only */
#define CP_SMMU_TABLE_UPDATE 0x53
/* wait until a read completes */
#define CP_WAIT_UNTIL_READ 0x5c
/* wait until all base/size writes from an IB_PFD packet have completed */
#define CP_WAIT_IB_PFD_COMPLETE 0x5d
/* register read/modify/write */
#define CP_REG_RMW 0x21
/* Set binning configuration registers */
#define CP_SET_BIN_DATA 0x2f
/* reads register in chip and writes to memory */
#define CP_REG_TO_MEM 0x3e
/* write N 32-bit words to memory */
#define CP_MEM_WRITE 0x3d
/* write CP_PROG_COUNTER value to memory */
#define CP_MEM_WRITE_CNTR 0x4f
/* conditional execution of a sequence of packets */
#define CP_COND_EXEC 0x44
/* conditional write to memory or register */
#define CP_COND_WRITE 0x45
/* generate an event that creates a write to memory when completed */
#define CP_EVENT_WRITE 0x46
/* generate a VS|PS_done event */
#define CP_EVENT_WRITE_SHD 0x58
/* generate a cache flush done event */
#define CP_EVENT_WRITE_CFL 0x59
/* generate a z_pass done event */
#define CP_EVENT_WRITE_ZPD 0x5b
/* initiate fetch of index buffer and draw */
#define CP_DRAW_INDX 0x22
/* draw using supplied indices in packet */
#define CP_DRAW_INDX_2 0x36
/* initiate fetch of index buffer and binIDs and draw */
#define CP_DRAW_INDX_BIN 0x34
/* initiate fetch of bin IDs and draw using supplied indices */
#define CP_DRAW_INDX_2_BIN 0x35
/* New draw packets defined for A4XX */
#define CP_DRAW_INDX_OFFSET 0x38
#define CP_DRAW_INDIRECT 0x28
#define CP_DRAW_INDX_INDIRECT 0x29
#define CP_DRAW_AUTO 0x24
/* begin/end initiator for viz query extent processing */
#define CP_VIZ_QUERY 0x23
/* fetch state sub-blocks and initiate shader code DMAs */
#define CP_SET_STATE 0x25
/* load constant into chip and to memory */
#define CP_SET_CONSTANT 0x2d
/* load sequencer instruction memory (pointer-based) */
#define CP_IM_LOAD 0x27
/* load sequencer instruction memory (code embedded in packet) */
#define CP_IM_LOAD_IMMEDIATE 0x2b
/* load constants from a location in memory */
#define CP_LOAD_CONSTANT_CONTEXT 0x2e
/* selective invalidation of state pointers */
#define CP_INVALIDATE_STATE 0x3b
/* dynamically changes shader instruction memory partition */
#define CP_SET_SHADER_BASES 0x4A
/* sets the 64-bit BIN_MASK register in the PFP */
#define CP_SET_BIN_MASK 0x50
/* sets the 64-bit BIN_SELECT register in the PFP */
#define CP_SET_BIN_SELECT 0x51
/* updates the current context, if needed */
#define CP_CONTEXT_UPDATE 0x5e
/* generate interrupt from the command stream */
#define CP_INTERRUPT 0x40
/* A5XX Enable yield in RB only */
#define CP_YIELD_ENABLE 0x1C
/* Enable/Disable/Defer A5x global preemption model */
#define CP_PREEMPT_ENABLE_GLOBAL 0x69
/* Enable/Disable A5x local preemption model */
#define CP_PREEMPT_ENABLE_LOCAL 0x6A
/* Yeild token on a5xx similar to CP_PREEMPT on a4xx */
#define CP_CONTEXT_SWITCH_YIELD 0x6B
/* Inform CP about current render mode (needed for a5xx preemption) */
#define CP_SET_RENDER_MODE 0x6C
/* copy sequencer instruction memory to system memory */
#define CP_IM_STORE 0x2c
/* test 2 memory locations to dword values specified */
#define CP_TEST_TWO_MEMS 0x71
/* Write register, ignoring context state for context sensitive registers */
#define CP_REG_WR_NO_CTXT 0x78
/*
* for A4xx
* Write to register with address that does not fit into type-0 pkt
*/
#define CP_WIDE_REG_WRITE 0x74
/* PFP waits until the FIFO between the PFP and the ME is empty */
#define CP_WAIT_FOR_ME 0x13
/* Record the real-time when this packet is processed by PFP */
#define CP_RECORD_PFP_TIMESTAMP 0x11
#define CP_SET_PROTECTED_MODE 0x5f /* sets the register protection mode */
/* Used to switch GPU between secure and non-secure modes */
#define CP_SET_SECURE_MODE 0x66
#define CP_BOOTSTRAP_UCODE 0x6f /* bootstraps microcode */
/*
* for a3xx
*/
#define CP_LOAD_STATE 0x30 /* load high level sequencer command */
/* Conditionally load a IB based on a flag */
#define CP_COND_INDIRECT_BUFFER_PFE 0x3A /* prefetch enabled */
#define CP_COND_INDIRECT_BUFFER_PFD 0x32 /* prefetch disabled */
/* Load a buffer with pre-fetch enabled */
#define CP_INDIRECT_BUFFER_PFE 0x3F
#define CP_EXEC_CL 0x31
/* (A4x) save PM4 stream pointers to execute upon a visible draw */
#define CP_SET_DRAW_STATE 0x43
#define CP_LOADSTATE_DSTOFFSET_SHIFT 0x00000000
#define CP_LOADSTATE_STATESRC_SHIFT 0x00000010
#define CP_LOADSTATE_STATEBLOCKID_SHIFT 0x00000013
#define CP_LOADSTATE_NUMOFUNITS_SHIFT 0x00000016
#define CP_LOADSTATE_STATETYPE_SHIFT 0x00000000
#define CP_LOADSTATE_EXTSRCADDR_SHIFT 0x00000002
static inline uint pm4_calc_odd_parity_bit(uint val)
{
return (0x9669 >> (0xf & ((val) ^
((val) >> 4) ^ ((val) >> 8) ^ ((val) >> 12) ^
((val) >> 16) ^ ((val) >> 20) ^ ((val) >> 24) ^
((val) >> 28)))) & 1;
}
/*
* PM4 packet header functions
* For all the packet functions the passed in count should be the size of the
* payload excluding the header
*/
static inline uint cp_type0_packet(uint regindx, uint cnt)
{
return CP_TYPE0_PKT | ((cnt-1) << 16) | ((regindx) & 0x7FFF);
}
static inline uint cp_type3_packet(uint opcode, uint cnt)
{
return CP_TYPE3_PKT | ((cnt-1) << 16) | (((opcode) & 0xFF) << 8);
}
static inline uint cp_type4_packet(uint opcode, uint cnt)
{
return CP_TYPE4_PKT | ((cnt) << 0) |
(pm4_calc_odd_parity_bit(cnt) << 7) |
(((opcode) & 0x3FFFF) << 8) |
((pm4_calc_odd_parity_bit(opcode) << 27));
}
static inline uint cp_type7_packet(uint opcode, uint cnt)
{
return CP_TYPE7_PKT | ((cnt) << 0) |
(pm4_calc_odd_parity_bit(cnt) << 15) |
(((opcode) & 0x7F) << 16) |
((pm4_calc_odd_parity_bit(opcode) << 23));
}
#define pkt_is_type0(pkt) (((pkt) & 0XC0000000) == CP_TYPE0_PKT)
#define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1)
#define type0_pkt_offset(pkt) ((pkt) & 0x7FFF)
/*
* Check both for the type3 opcode and make sure that the reserved bits [1:7]
* and 15 are 0
*/
#define pkt_is_type3(pkt) \
((((pkt) & 0xC0000000) == CP_TYPE3_PKT) && \
(((pkt) & 0x80FE) == 0))
#define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF)
#define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1)
#define pkt_is_type4(pkt) \
((((pkt) & 0xF0000000) == CP_TYPE4_PKT) && \
((((pkt) >> 27) & 0x1) == \
pm4_calc_odd_parity_bit(cp_type4_base_index_one_reg_wr(pkt))) \
&& ((((pkt) >> 7) & 0x1) == \
pm4_calc_odd_parity_bit(type4_pkt_size(pkt))))
#define cp_type4_base_index_one_reg_wr(pkt) (((pkt) >> 8) & 0x7FFFF)
#define type4_pkt_size(pkt) ((pkt) & 0x7F)
#define pkt_is_type7(pkt) \
((((pkt) & 0xF0000000) == CP_TYPE7_PKT) && \
(((pkt) & 0x0F000000) == 0) && \
((((pkt) >> 23) & 0x1) == \
pm4_calc_odd_parity_bit(cp_type7_opcode(pkt))) \
&& ((((pkt) >> 15) & 0x1) == \
pm4_calc_odd_parity_bit(type7_pkt_size(pkt))))
#define cp_type7_opcode(pkt) (((pkt) >> 16) & 0x7F)
#define type7_pkt_size(pkt) ((pkt) & 0x3FFF)
/* dword base address of the GFX decode space */
#define SUBBLOCK_OFFSET(reg) ((unsigned int)((reg) - (0x2000)))
/* gmem command buffer length */
#define CP_REG(reg) ((0x4 << 16) | (SUBBLOCK_OFFSET(reg)))
// add these
#define ADRENO_GPUREV(x) 530
#define lower_32_bits(n) ((uint32_t)(n))
#define upper_32_bits(n) ((uint32_t)(((n) >> 16) >> 16))
/* Return true if the hardware uses the legacy (A4XX and older) PM4 format */
#define ADRENO_LEGACY_PM4(_d) (ADRENO_GPUREV(_d) < 500)
/**
* cp_packet - Generic CP packet to support different opcodes on
* different GPU cores.
* @adreno_dev: The adreno device
* @opcode: Operation for cp packet
* @size: size for cp packet
*/
static inline uint cp_packet(struct adreno_device *adreno_dev,
int opcode, uint size)
{
if (ADRENO_LEGACY_PM4(adreno_dev))
return cp_type3_packet(opcode, size);
return cp_type7_packet(opcode, size);
}
/**
* cp_mem_packet - Generic CP memory packet to support different
* opcodes on different GPU cores.
* @adreno_dev: The adreno device
* @opcode: mem operation for cp packet
* @size: size for cp packet
* @num_mem: num of mem access
*/
static inline uint cp_mem_packet(struct adreno_device *adreno_dev,
int opcode, uint size, uint num_mem)
{
if (ADRENO_LEGACY_PM4(adreno_dev))
return cp_type3_packet(opcode, size);
return cp_type7_packet(opcode, size + num_mem);
}
/* Return 1 if the command is an indirect buffer of any kind */
static inline int adreno_cmd_is_ib(struct adreno_device *adreno_dev,
unsigned int cmd)
{
return cmd == cp_mem_packet(adreno_dev,
CP_INDIRECT_BUFFER_PFE, 2, 1) ||
cmd == cp_mem_packet(adreno_dev,
CP_INDIRECT_BUFFER_PFD, 2, 1) ||
cmd == cp_mem_packet(adreno_dev,
CP_COND_INDIRECT_BUFFER_PFE, 2, 1) ||
cmd == cp_mem_packet(adreno_dev,
CP_COND_INDIRECT_BUFFER_PFD, 2, 1);
}
/**
* cp_gpuaddr - Generic function to add 64bit and 32bit gpuaddr
* to pm4 commands
* @adreno_dev: The adreno device
* @cmds: command pointer to add gpuaddr
* @gpuaddr: gpuaddr to add
*/
static inline uint cp_gpuaddr(struct adreno_device *adreno_dev,
uint *cmds, uint64_t gpuaddr)
{
uint *start = cmds;
if (ADRENO_LEGACY_PM4(adreno_dev))
*cmds++ = (uint)gpuaddr;
else {
*cmds++ = lower_32_bits(gpuaddr);
*cmds++ = upper_32_bits(gpuaddr);
}
return cmds - start;
}
/**
* cp_register - Generic function for gpu register operation
* @adreno_dev: The adreno device
* @reg: GPU register
* @size: count for PM4 operation
*/
static inline uint cp_register(struct adreno_device *adreno_dev,
unsigned int reg, unsigned int size)
{
if (ADRENO_LEGACY_PM4(adreno_dev))
return cp_type0_packet(reg, size);
return cp_type4_packet(reg, size);
}
/**
* cp_wait_for_me - common function for WAIT_FOR_ME
* @adreno_dev: The adreno device
* @cmds: command pointer to add gpuaddr
*/
static inline uint cp_wait_for_me(struct adreno_device *adreno_dev,
uint *cmds)
{
uint *start = cmds;
if (ADRENO_LEGACY_PM4(adreno_dev)) {
*cmds++ = cp_type3_packet(CP_WAIT_FOR_ME, 1);
*cmds++ = 0;
} else
*cmds++ = cp_type7_packet(CP_WAIT_FOR_ME, 0);
return cmds - start;
}
/**
* cp_wait_for_idle - common function for WAIT_FOR_IDLE
* @adreno_dev: The adreno device
* @cmds: command pointer to add gpuaddr
*/
static inline uint cp_wait_for_idle(struct adreno_device *adreno_dev,
uint *cmds)
{
uint *start = cmds;
if (ADRENO_LEGACY_PM4(adreno_dev)) {
*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
*cmds++ = 0;
} else
*cmds++ = cp_type7_packet(CP_WAIT_FOR_IDLE, 0);
return cmds - start;
}
/**
* cp_invalidate_state - common function for invalidating cp
* state
* @adreno_dev: The adreno device
* @cmds: command pointer to add gpuaddr
*/
static inline uint cp_invalidate_state(struct adreno_device *adreno_dev,
uint *cmds)
{
uint *start = cmds;
if (ADRENO_GPUREV(adreno_dev) < 500) {
*cmds++ = cp_type3_packet(CP_INVALIDATE_STATE, 1);
*cmds++ = 0x7fff;
} else {
*cmds++ = cp_type7_packet(CP_SET_DRAW_STATE, 3);
*cmds++ = 0x40000;
*cmds++ = 0;
*cmds++ = 0;
}
return cmds - start;
}
#endif /* __ADRENO_PM4TYPES_H */

View File

@@ -1,724 +0,0 @@
#include <sys/types.h>
#include "include/msm_kgsl.h"
#include "common/clutil.h"
#include <stdio.h>
#include <stdlib.h>
#include <dlfcn.h>
#include <cassert>
#include <sys/mman.h>
int run_num = 0;
int ioctl_num = 0;
void hexdump(uint32_t *d, int len) {
assert((len%4) == 0);
printf(" dumping %p len 0x%x\n", d, len);
for (int i = 0; i < len/4; i++) {
if (i != 0 && (i%0x10) == 0) printf("\n");
printf("%8x ", d[i]);
}
printf("\n");
}
void hexdump8(uint8_t *d, int len) {
printf(" dumping %p len 0x%x\n", d, len);
for (int i = 0; i < len; i++) {
if (i != 0 && (i%0x10) == 0) printf("\n");
printf("%02x ", d[i]);
}
printf("\n");
}
#include <string>
#include <vector>
#include <map>
using namespace std;
#include "disasm/include/adreno_pm4types.h"
#define REG_A5XX_TPL1_CS_TEX_CONST_LO 0x0000e760
#define REG_A5XX_TPL1_CS_TEX_SAMP_LO 0x0000e75c
class CachedCommand {
public:
CachedCommand(struct kgsl_gpu_command *cmd, int lfd);
void exec(bool wait);
private:
string cmd_0, cmd_1;
int obj_len;
int fd;
struct kgsl_gpu_command cache;
struct kgsl_command_object cmds[2];
struct kgsl_command_object objs[1];
};
vector<CachedCommand *> queue_cmds;
void disassemble(uint32_t *src, int len) {
int i = 0;
while (i < len) {
int pktsize;
int pkttype = -1;
if (pkt_is_type0(src[i])) {
pkttype = 0;
pktsize = type0_pkt_size(src[i]);
} else if (pkt_is_type3(src[i])) {
pkttype = 3;
pktsize = type3_pkt_size(src[i]);
} else if (pkt_is_type4(src[i])) {
pkttype = 4;
pktsize = type4_pkt_size(src[i]);
} else if (pkt_is_type7(src[i])) {
pkttype = 7;
pktsize = type7_pkt_size(src[i]);
}
printf("%3d: type:%d size:%d ", i, pkttype, pktsize);
if (pkttype == 7) {
printf("op: %4x ", cp_type7_opcode(src[i]));
}
if (pkttype == 4) {
printf("reg: %4x ", cp_type4_base_index_one_reg_wr(src[i]));
}
for (int j = 0; j < pktsize+1; j++) {
printf("%8.8X ", src[i+j]);
}
printf("\n");
if (pkttype == 7 && cp_type7_opcode(src[i]) == CP_LOAD_STATE) {
// CP_LOAD_STATE4
int sz = (src[i+1] & 0xffc00000) >> 22;
uint64_t addr = (uint64_t)(src[i+2] & 0xfffffffc) | ((uint64_t)(src[i+3]) << 32);
hexdump((uint32_t *)addr, sz*4);
}
if (pkttype == 4 && cp_type4_base_index_one_reg_wr(src[i]) == REG_A5XX_TPL1_CS_TEX_CONST_LO) {
uint64_t addr = (uint64_t)(src[i+1] & 0xffffffff) | ((uint64_t)(src[i+2]) << 32);
hexdump((uint32_t *)addr, 0x40);
}
if (pkttype == 4 && cp_type4_base_index_one_reg_wr(src[i]) == REG_A5XX_TPL1_CS_TEX_SAMP_LO) {
uint64_t addr = (uint64_t)(src[i+1] & 0xffffffff) | ((uint64_t)(src[i+2]) << 32);
hexdump((uint32_t *)addr, 0x40);
}
if (pkttype == -1) break;
i += (1+pktsize);
}
assert(i == len);
}
int intercept = 1;
int prop_num = 0;
extern "C" {
/*void *gsl_memory_alloc_pure(long param_1, long param_2, long *param_3) {
void *(*my_gsl_memory_alloc_pure)(long param_1, long param_2, long *param_3);
my_gsl_memory_alloc_pure = reinterpret_cast<decltype(my_gsl_memory_alloc_pure)>(dlsym(RTLD_NEXT, "gsl_memory_alloc_pure"));
void *ret = my_gsl_memory_alloc_pure(param_1, param_2, param_3);
printf("gsl_memory_alloc_pure: 0x%lx 0x%lx %p = %p\n", param_1, param_2, param_3, ret);
return ret;
}*/
void *mmap64(void *addr, size_t len, int prot, int flags, int fildes, off64_t off) {
void *(*my_mmap64)(void *addr, size_t len, int prot, int flags, int fildes, off64_t off);
my_mmap64 = reinterpret_cast<decltype(my_mmap64)>(dlsym(RTLD_NEXT, "mmap64"));
void *ret = my_mmap64(addr, len, prot, flags, fildes, off);
if (fildes == 3) {
printf("mmap64(addr=%p, len=0x%zx, prot=0x%x, flags=0x%x, fildes=%d, off=0x%lx) = %p\n", addr, len, prot, flags, fildes, off, ret);
}
return ret;
}
pid_t gettid(void);
#undef ioctl
int ioctl(int filedes, unsigned long request, void *argp) {
int (*my_ioctl)(int filedes, unsigned long request, void *argp);
my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
int skip = 0;
if (intercept) {
int tid = gettid();
if (request == IOCTL_KGSL_GPU_COMMAND) {
struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
printf("IOCTL_KGSL_GPU_COMMAND(%d): flags: 0x%lx numcmds: %u numobjs: %u numsyncs: %u context_id: %u timestamp: %u\n",
tid,
cmd->flags,
cmd->numcmds, cmd->numobjs, cmd->numsyncs,
cmd->context_id, cmd->timestamp);
assert(cmd->numcmds == 2);
assert(cmd->numobjs == 1);
assert(cmd->numsyncs == 0);
//struct kgsl_command_object *obj = (struct kgsl_command_object *)cmd->cmdlist;
//assert(obj[0].size == sizeof(queue_init));
//memcpy(queue_init, (void*)obj[0].gpuaddr, sizeof(queue_init));
//string qcmd((char*)obj[1].gpuaddr, obj[1].size);
if (run_num == 3) {
CachedCommand *ccmd = new CachedCommand(cmd, filedes);
queue_cmds.push_back(ccmd);
//ccmd->exec();
//skip = 0;
//printf("command 0x%lx\n", obj[1].gpuaddr);
//disassemble((uint32_t *)qcmd.data(), qcmd.size()/4);
//queue_cmds.push_back(qcmd);
}
#ifdef DUMP
char tmp[0x100];
snprintf(tmp, sizeof(tmp), "/tmp/thneed/run_%d_%d", run_num, ioctl_num++);
FILE *f = fopen(tmp, "wb");
#endif
// kgsl_cmdbatch_add_cmdlist
for (int i = 0; i < cmd->numcmds; i++) {
struct kgsl_command_object *obj = (struct kgsl_command_object *)cmd->cmdlist;
printf(" cmd: %lx %5lx %5lx flags:%3x %d\n",
obj[i].offset, obj[i].gpuaddr, obj[i].size, obj[i].flags, obj[i].id);
//hexdump((uint32_t *)obj[i].gpuaddr, obj[i].size);
#ifdef DUMP
fwrite(&obj[i].size, sizeof(obj[i].size), 1, f);
fwrite((void*)obj[i].gpuaddr, obj[i].size, 1, f);
#endif
}
// kgsl_cmdbatch_add_memlist
for (int i = 0; i < cmd->numobjs; i++) {
struct kgsl_command_object *obj = (struct kgsl_command_object *)cmd->objlist;
printf(" obj: %lx %5lx %5lx flags:%3x %d\n",
obj[i].offset, obj[i].gpuaddr, obj[i].size, obj[i].flags, obj[i].id);
//hexdump((uint32_t *)obj[i].gpuaddr, obj[i].size);
#ifdef DUMP
fwrite(&obj[i].size, sizeof(obj[i].size), 1, f);
fwrite((void*)obj[i].gpuaddr, obj[i].size, 1, f);
#endif
}
#ifdef DUMP
fclose(f);
#endif
} else if (request == IOCTL_KGSL_SETPROPERTY) {
struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
printf("IOCTL_KGSL_SETPROPERTY(%d): 0x%x\n", tid, prop->type);
hexdump8((uint8_t*)prop->value, prop->sizebytes);
if (prop_num == 1) { printf("SKIPPING\n"); skip = 1; }
if (run_num == 3) prop_num++;
//hexdump((unsigned char*)prop->value, prop->sizebytes);
} else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
printf("IOCTL_KGSL_GPUOBJ_SYNC(%d) count:%d ", tid, cmd->count);
for (int i = 0; i < cmd->count; i++) {
printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
}
printf("\n");
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(%d): context_id: %d timestamp: %d timeout: %d\n",
tid, cmd->context_id, cmd->timestamp, cmd->timeout);
} else if (request == IOCTL_KGSL_GPUOBJ_ALLOC) {
struct kgsl_gpuobj_alloc *cmd = (struct kgsl_gpuobj_alloc *)argp;
printf("IOCTL_KGSL_GPUOBJ_ALLOC: size:0x%lx flags:0x%lx va_len:0x%lx ", cmd->size, cmd->flags, cmd->va_len);
} else if (request == IOCTL_KGSL_GPUOBJ_FREE) {
//printf("IOCTL_KGSL_GPUOBJ_FREE\n");
} else if (filedes == 3) {
printf("ioctl(%d) %lx\n", tid, request);
}
}
int ret;
if (skip) {
ret = 0;
} else {
ret = my_ioctl(filedes, request, argp);
}
if (request == IOCTL_KGSL_GPUOBJ_ALLOC) {
struct kgsl_gpuobj_alloc *cmd = (struct kgsl_gpuobj_alloc *)argp;
printf("mmapsize:0x%lx id:%d metadata_len:%x metadata:0x%lx = %d\n", cmd->mmapsize, cmd->id, cmd->metadata_len, cmd->metadata, ret);
}
return ret;
}
}
#include <CL/cl.h>
#include "../runners/snpemodel.h"
#include <sys/types.h>
#include <time.h>
static inline uint64_t nanos_since_boot() {
struct timespec t;
clock_gettime(CLOCK_BOOTTIME, &t);
return t.tv_sec * 1000000000ULL + t.tv_nsec;
}
int global_timestamp = -1;
CachedCommand::CachedCommand(struct kgsl_gpu_command *cmd, int lfd) {
fd = lfd;
assert(cmd->numcmds == 2);
assert(cmd->numobjs == 1);
assert(cmd->numsyncs == 0);
global_timestamp = cmd->timestamp;
printf("%p %p %p\n", cmd, (void*)cmd->cmdlist, (void*)cmd->objlist);
memcpy(cmds, (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*2);
memcpy(objs, (void *)cmd->objlist, sizeof(struct kgsl_command_object)*1);
cmd_0.assign((char*)cmds[0].gpuaddr, cmds[0].size);
cmd_1.assign((char*)cmds[1].gpuaddr, cmds[1].size);
memcpy(&cache, cmd, sizeof(cache));
}
// i think you get these with cl_a5x_ringbuffer_alloc
uint64_t base = 0;
void CachedCommand::exec(bool wait) {
printf("old addr 0x%lx ", cmds[1].gpuaddr);
cmds[1].gpuaddr = base;
printf("using addr 0x%lx with size 0x%4lx ", cmds[1].gpuaddr, cmd_1.size());
base += (cmd_1.size()+0xff) & (~0xFF);
memcpy((void*)cmds[1].gpuaddr, cmd_1.data(), cmd_1.size());
// set up other buffers
memcpy((void*)cmds[0].gpuaddr, cmd_0.data(), cmd_0.size());
memset((void*)objs[0].gpuaddr, 0, objs[0].size);
cache.timestamp = ++global_timestamp;
cache.cmdlist = (uint64_t)cmds;
cache.objlist = (uint64_t)objs;
// run
int ret = ioctl(fd, IOCTL_KGSL_GPU_COMMAND, &cache);
if (wait) {
struct kgsl_device_waittimestamp_ctxtid wait;
wait.context_id = cache.context_id;
wait.timestamp = cache.timestamp;
wait.timeout = -1;
uint64_t tb = nanos_since_boot();
int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot();
printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000);
} else {
printf("CachedCommand::exec got %d\n", ret);
}
}
int do_print = 0;
#define TEMPORAL_SIZE 512
#define DESIRE_LEN 8
#define TRAFFIC_CONVENTION_LEN 2
FILE *f = NULL;
cl_program clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) {
cl_program (*my_clCreateProgramWithSource)(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) = NULL;
my_clCreateProgramWithSource = reinterpret_cast<decltype(my_clCreateProgramWithSource)>(dlsym(RTLD_NEXT, "REAL_clCreateProgramWithSource"));
//printf("clCreateProgramWithSource: %d\n", count);
if (f == NULL) {
f = fopen("/tmp/kernels.cl", "w");
}
fprintf(f, "/* ************************ PROGRAM BREAK ****************************/\n");
for (int i = 0; i < count; i++) {
fprintf(f, "%s\n", strings[i]);
if (i != 0) fprintf(f, "/* ************************ SECTION BREAK ****************************/\n");
}
fflush(f);
return my_clCreateProgramWithSource(context, count, strings, lengths, errcode_ret);
}
map<cl_kernel, string> kernels;
map<cl_kernel, cl_mem> kernel_inputs;
map<cl_kernel, cl_mem> kernel_outputs;
cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret) {
cl_kernel (*my_clCreateKernel)(cl_program program, const char *kernel_name, cl_int *errcode_ret) = NULL;
my_clCreateKernel = reinterpret_cast<decltype(my_clCreateKernel)>(dlsym(RTLD_NEXT, "REAL_clCreateKernel"));
cl_kernel ret = my_clCreateKernel(program, kernel_name, errcode_ret);
printf("clCreateKernel: %s -> %p\n", kernel_name, ret);
kernels.insert(make_pair(ret, kernel_name));
return ret;
}
typedef struct image {
size_t image_width;
size_t image_height;
size_t image_row_pitch;
cl_mem buffer;
} image;
map<cl_mem, size_t> buffers;
map<cl_mem, image> images;
cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
cl_int (*my_clSetKernelArg)(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) = NULL;
my_clSetKernelArg = reinterpret_cast<decltype(my_clSetKernelArg)>(dlsym(RTLD_NEXT, "REAL_clSetKernelArg"));
char arg_type[0x100];
char arg_name[0x100];
clGetKernelArgInfo(kernel, arg_index, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL);
clGetKernelArgInfo(kernel, arg_index, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
printf(" %s %s", arg_type, arg_name);
if (arg_size == 1) {
printf(" = %d", *((char*)arg_value));
} else if (arg_size == 2) {
printf(" = %d", *((short*)arg_value));
} else if (arg_size == 4) {
if (strcmp(arg_type, "float") == 0) {
printf(" = %f", *((float*)arg_value));
} else {
printf(" = %d", *((int*)arg_value));
}
} else if (arg_size == 8) {
cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
printf(" = %p", val);
if (strcmp(arg_name, "input") == 0) kernel_inputs[kernel] = val;
if (strcmp(arg_name, "output") == 0) kernel_outputs[kernel] = val;
if (strcmp(arg_name, "accumulator") == 0) assert(kernel_inputs[kernel] = val);
if (buffers.find(val) != buffers.end()) {
printf(" buffer %zu", buffers[val]);
}
if (images.find(val) != images.end()) {
printf(" image %zu x %zu rp %zu @ %p", images[val].image_width, images[val].image_height, images[val].image_row_pitch, images[val].buffer);
}
} else {
printf(" %zu", arg_size);
}
printf("\n");
cl_int ret = my_clSetKernelArg(kernel, arg_index, arg_size, arg_value);
return ret;
}
uint64_t start_time = 0;
uint64_t tns = 0;
int cnt = 0;
cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint work_dim,
const size_t *global_work_offset,
const size_t *global_work_size,
const size_t *local_work_size,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) {
// SNPE doesn't use these
assert(num_events_in_wait_list == 0);
assert(global_work_offset == NULL);
cl_int (*my_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) = NULL;
my_clEnqueueNDRangeKernel = reinterpret_cast<decltype(my_clEnqueueNDRangeKernel)>(dlsym(RTLD_NEXT, "REAL_clEnqueueNDRangeKernel"));
uint64_t tb = nanos_since_boot();
cl_int ret = my_clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
global_work_offset, global_work_size, local_work_size,
num_events_in_wait_list, event_wait_list, event);
uint64_t te = nanos_since_boot();
/*ret = clWaitForEvents(1, event);
assert(ret == CL_SUCCESS);
uint64_t tq = nanos_since_boot();*/
if (do_print) {
tns += te-tb;
}
printf("%10lu %10lu running(%3d) -- %p -- %56s -- %p -> %p %s ", (tb-start_time)/1000, (tns/1000), cnt++, kernel, kernels[kernel].c_str(), kernel_inputs[kernel], kernel_outputs[kernel],
(buffers[kernel_outputs[kernel]] != 0) ? "B" : "I");
printf("global -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]);
}
printf("local -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", local_work_size[i]);
}
printf("\n");
return ret;
}
cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret) {
cl_mem (*my_clCreateBuffer)(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret) = NULL;
my_clCreateBuffer = reinterpret_cast<decltype(my_clCreateBuffer)>(dlsym(RTLD_NEXT, "REAL_clCreateBuffer"));
cl_mem ret = my_clCreateBuffer(context, flags, size, host_ptr, errcode_ret);
buffers[ret] = size;
printf("%p = clCreateBuffer %zu\n", ret, size);
return ret;
}
cl_mem clCreateImage(cl_context context, cl_mem_flags flags, const cl_image_format *image_format, const cl_image_desc *image_desc, void *host_ptr, cl_int *errcode_ret) {
cl_mem (*my_clCreateImage)(cl_context context, cl_mem_flags flags, const cl_image_format *image_format, const cl_image_desc *image_desc, void *host_ptr, cl_int *errcode_ret) = NULL;
my_clCreateImage = reinterpret_cast<decltype(my_clCreateImage)>(dlsym(RTLD_NEXT, "REAL_clCreateImage"));
// SNPE only uses this
assert(CL_MEM_OBJECT_IMAGE2D == image_desc->image_type);
// RGBA, HALF FLOAT
assert(CL_RGBA == image_format->image_channel_order);
assert(CL_HALF_FLOAT == image_format->image_channel_data_type);
map<cl_mem_object_type, string> lc = {
{CL_MEM_OBJECT_BUFFER, "CL_MEM_OBJECT_BUFFER"},
{CL_MEM_OBJECT_IMAGE2D, "CL_MEM_OBJECT_IMAGE2D"}, // all this one
{CL_MEM_OBJECT_IMAGE3D, "CL_MEM_OBJECT_IMAGE3D"},
{CL_MEM_OBJECT_IMAGE2D_ARRAY, "CL_MEM_OBJECT_IMAGE2D_ARRAY"},
{CL_MEM_OBJECT_IMAGE1D, "CL_MEM_OBJECT_IMAGE1D"},
{CL_MEM_OBJECT_IMAGE1D_ARRAY, "CL_MEM_OBJECT_IMAGE1D_ARRAY"},
{CL_MEM_OBJECT_IMAGE1D_BUFFER, "CL_MEM_OBJECT_IMAGE1D_BUFFER"}};
assert(image_desc->image_depth == 0);
assert(image_desc->image_array_size == 0);
assert(image_desc->image_slice_pitch == 0);
//assert(image_desc->image_width * image_desc->image_height * 2 == image_desc->image_row_pitch);
image img;
img.image_width = image_desc->image_width;
img.image_height = image_desc->image_height;
img.image_row_pitch = image_desc->image_row_pitch;
img.buffer = image_desc->buffer;
cl_mem ret = my_clCreateImage(context, flags, image_format, image_desc, host_ptr, errcode_ret);
printf("%p = clCreateImage %s -- %p -- %d %d -- %4zu x %4zu x %4zu -- %4zu %4zu %4zu\n", ret, lc[image_desc->image_type].c_str(),
image_desc->buffer,
image_format->image_channel_order, image_format->image_channel_data_type,
image_desc->image_width, image_desc->image_height, image_desc->image_depth,
image_desc->image_array_size, image_desc->image_row_pitch, image_desc->image_slice_pitch
);
images[ret] = img;
return ret;
}
cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list) {
cl_int (*my_clWaitForEvents)(cl_uint num_events, const cl_event *event_list);
my_clWaitForEvents = reinterpret_cast<decltype(my_clWaitForEvents)>(dlsym(RTLD_NEXT, "REAL_clWaitForEvents"));
printf("clWaitForEvents\n");
return my_clWaitForEvents(num_events, event_list);
}
cl_int clReleaseEvent(cl_event event) {
cl_int (*my_clReleaseEvent)(cl_event event);
my_clReleaseEvent = reinterpret_cast<decltype(my_clReleaseEvent)>(dlsym(RTLD_NEXT, "REAL_clReleaseEvent"));
printf("clReleaseEvent: %p\n", event);
return my_clReleaseEvent(event);
}
/*size_t total = 0;
void *calloc(size_t num, size_t size) {
void *(*my_calloc)(size_t num, size_t size);
my_calloc = reinterpret_cast<decltype(my_calloc)>(dlsym(RTLD_NEXT, "REAL_calloc"));
void *ret = my_calloc(num, size);
if (do_print) {
total += num*size;
printf("calloc %p -- total:0x%zx -- num:0x%zx size:0x%zx\n", ret, total, num, size);
}
return ret;
}
void free(void *ptr) {
void (*my_free)(void *ptr);
my_free = reinterpret_cast<decltype(my_free)>(dlsym(RTLD_NEXT, "REAL_free"));
if (do_print) {
//printf("free: %p\n", ptr);
} else {
my_free(ptr);
}
}*/
void *dlsym(void *handle, const char *symbol) {
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4);
if (memcmp("REAL_", symbol, 5) == 0) {
return my_dlsym(handle, symbol+5);
} else if (strcmp("clCreateProgramWithSource", symbol) == 0) {
return (void*)clCreateProgramWithSource;
} else if (strcmp("clCreateKernel", symbol) == 0) {
return (void*)clCreateKernel;
} else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) {
return (void*)clEnqueueNDRangeKernel;
} else if (strcmp("clSetKernelArg", symbol) == 0) {
return (void*)clSetKernelArg;
} else if (strcmp("clCreateBuffer", symbol) == 0) {
return (void*)clCreateBuffer;
} else if (strcmp("clCreateImage", symbol) == 0) {
return (void*)clCreateImage;
/*} else if (strcmp("clReleaseEvent", symbol) == 0) {
return (void*)clReleaseEvent;
} else if (strcmp("clWaitForEvents", symbol) == 0) {
return (void*)clWaitForEvents;*/
} else {
//printf("dlsym %s\n", symbol);
return my_dlsym(handle, symbol);
}
}
int main(int argc, char* argv[]) {
int err;
start_time = nanos_since_boot();
cl_device_id device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT);
cl_uint tmp;
// sweet this is 64!
err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(tmp), &tmp, NULL);
assert(err == 0);
printf("CL_DEVICE_MAX_WRITE_IMAGE_ARGS: %u\n", tmp);
err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(tmp), &tmp, NULL);
assert(err == 0);
printf("CL_DEVICE_MAX_READ_IMAGE_ARGS: %u\n", tmp);
float *output = (float*)calloc(0x10000, sizeof(float));
SNPEModel mdl(argv[1], output, 0, USE_GPU_RUNTIME);
float state[TEMPORAL_SIZE];
mdl.addRecurrent(state, TEMPORAL_SIZE);
float desire[DESIRE_LEN];
mdl.addDesire(desire, DESIRE_LEN);
float traffic_convention[TRAFFIC_CONVENTION_LEN];
mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN);
float *input = (float*)calloc(0x1000000, sizeof(float));;
printf("************** execute 1 **************\n");
printf("%p %p %p %p -> %p\n", input, state, desire, traffic_convention, output);
run_num = 1; ioctl_num = 0;
do_print = 0;
start_time = nanos_since_boot();
mdl.execute(input, 0);
printf("************** execute 2 **************\n");
run_num = 2; ioctl_num = 0;
do_print = 0;
mdl.execute(input, 0);
printf("************** execute 3 **************\n");
run_num = 3; ioctl_num = 0;
do_print = 1;
start_time = nanos_since_boot();
mdl.execute(input, 0);
do_print = 0;
struct kgsl_gpuobj_alloc alloc;
memset(&alloc, 0, sizeof(alloc));
alloc.size = 0x40000;
alloc.flags = 0x10000a00;
int fd = 3;
int ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);
void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000);
assert(addr != MAP_FAILED);
intercept = 0;
while (1) {
printf("************** execute 4 **************\n");
run_num = 4;
base = (uint64_t)addr;
uint64_t tb = nanos_since_boot();
int i = 0;
for (auto it = queue_cmds.begin(); it != queue_cmds.end(); ++it) {
printf("run %2d: ", i++);
//(*it)->exec(i == queue_cmds.size());
(*it)->exec(true);
}
uint64_t te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
break;
}
/*FILE *f = fopen("/proc/self/maps", "rb");
char maps[0x100000];
int len = fread(maps, 1, sizeof(maps), f);
maps[len] = '\0';
fclose(f);
printf("%s\n", maps);*/
printf("buffers: %lu images: %lu\n", buffers.size(), images.size());
printf("queues: %lu\n", queue_cmds.size());
// IOCTL_KGSL_GPU_COMMAND: flags: 0x11 numcmds: 2 numobjs: 1 numsyncs: 0 context_id: 7 timestamp: 77
/*int ts = 100;
for (auto it = queue_cmds.begin(); it != queue_cmds.end(); ++it) {
auto qcmd = *it;
//disassemble((uint32_t *)qcmd.data(), qcmd.size()/4);
struct kgsl_command_object cmdlists[2];
struct kgsl_command_object objlists;
struct kgsl_gpu_command cmd;
uint8_t objs[0xc0];
memset(objs, 0, 0xc0);
memset(&cmd, 0, sizeof(cmd));
memset(&cmdlists, 0, sizeof(struct kgsl_command_object)*2);
memset(&objlists, 0, sizeof(objlists));
cmd.flags = 0x11;
cmd.cmdlist = (uint64_t)cmdlists;
cmd.numcmds = 2;
cmd.objlist = (uint64_t)objlists;
cmd.numobjs = 1;
cmd.numsyncs = 0;
cmd.context_id = 7;
cmd.timestamp = ts++;
cmdlists[0].gpuaddr = (uint64_t)queue_init;
cmdlists[0].size = 0xbc;
cmdlists[0].flags = 1;
cmdlists[1].gpuaddr = (uint64_t)qcmd.data();
cmdlists[1].size = qcmd.size();
cmdlists[1].flags = 1;
objlists.gpuaddr = (uint64_t)objs;
objlists.size = 0xc0;
objlists.flags = 0x18;
}*/
}

View File

@@ -1,51 +0,0 @@
// https://github.com/moskewcz/boda/issues/13
#define USE_FP16
#ifdef USE_FP16
#define up(x) x
#define down(x) x
#define xtype half8
#define skip 128
#else
#define up(x) convert_float8(x)
#define down(x) convert_half8(x)
#define xtype float8
#define skip 128
#endif
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void gemm(const int M, const int N, const int K,
global const half8* a, global const half8* b, global half8* c )
{
xtype c_r[8] = {0,0,0,0,0,0,0,0};
int const a_off_thr = get_global_id(0);
int const b_off_thr = get_global_id(1);
int a_off = a_off_thr;
int b_off = b_off_thr;
for( int k = 0; k < 1024; k += 1 ) {
xtype a_r = up(a[a_off]);
xtype b_r = up(b[b_off]);
c_r[0] += a_r.s0*b_r;
c_r[1] += a_r.s1*b_r;
c_r[2] += a_r.s2*b_r;
c_r[3] += a_r.s3*b_r;
c_r[4] += a_r.s4*b_r;
c_r[5] += a_r.s5*b_r;
c_r[6] += a_r.s6*b_r;
c_r[7] += a_r.s7*b_r;
a_off += skip;
b_off += skip;
}
int c_off = a_off_thr*1024 + b_off_thr;
for (int i = 0; i < 8; i++) {
c[c_off] = down(c_r[i]);
c_off += skip;
}
}

View File

@@ -1,75 +0,0 @@
// https://github.com/moskewcz/boda/issues/13
//#define USE_FP16
#ifdef USE_FP16
#define xtype half4
#define read_imagep read_imageh
#define write_imagep write_imageh
#else
#define xtype float4
#define read_imagep read_imagef
#define write_imagep write_imagef
#endif
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void gemm(const int M, const int N, const int K,
read_only image2d_t A,
read_only image2d_t B,
write_only image2d_t C)
{
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
xtype c_r[4] = {0,0,0,0};
xtype a_r[4], b_r[4];
int const a_off_thr = get_global_id(0);
int const b_off_thr = get_global_id(1);
int2 a_samp = {0, a_off_thr};
int2 b_samp = {0, b_off_thr};
for (short k = 0; k < K/4; k++) {
for (short i = 0; i < 4; ++i) {
a_r[i] = read_imagep(A, smp, a_samp);
b_r[i] = read_imagep(B, smp, b_samp);
++a_samp.x;
++b_samp.x;
}
for (short i = 0; i < 4; ++i) {
float4 ov = c_r[i];
ov.x += a_r[i].x * b_r[0].x;
ov.x += a_r[i].y * b_r[0].y;
ov.x += a_r[i].z * b_r[0].z;
ov.x += a_r[i].w * b_r[0].w;
ov.y += a_r[i].x * b_r[1].x;
ov.y += a_r[i].y * b_r[1].y;
ov.y += a_r[i].z * b_r[1].z;
ov.y += a_r[i].w * b_r[1].w;
ov.z += a_r[i].x * b_r[2].x;
ov.z += a_r[i].y * b_r[2].y;
ov.z += a_r[i].z * b_r[2].z;
ov.z += a_r[i].w * b_r[2].w;
ov.w += a_r[i].x * b_r[3].x;
ov.w += a_r[i].y * b_r[3].y;
ov.w += a_r[i].z * b_r[3].z;
ov.w += a_r[i].w * b_r[3].w;
c_r[i] = ov;
}
}
int2 c_samp = {a_off_thr, b_off_thr*4};
for (short i = 0; i < 4; i++) {
write_imagep(C, c_samp, c_r[i]);
++c_samp.y;
}
}

View File

@@ -1,314 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <assert.h>
#include <time.h>
/*
block7b_project_conv (Conv2D) (None, 8, 16, 352) 743424 block7b_activation[0][0]
8448*8*4 = 8*16*2112 = 270336 = input = 128*2112
2112*88*4 = 743424 = weights = 2112*352
1408*8*4 = 8*16*352 = 45056 = output = 128*352
FLOPS = 128*2112*352 = 95158272 = 95 MFLOPS
RAM = 128*2112 + 2112*352 + 128*352 = 1058816 = 1 M accesses
# 22 groups
128*2112 + 2112*16 + 128*16 = 306176
306176*22 = 6735872 real accesses
This is a 128x2112 by 2112x352 matrix multiply
work_size = {88, 4, 8}
Each kernel run computes 16 outputs
0x7f7e8a6380 convolution_horizontal_reduced_reads_1x1 -- 88 4 8 -- 4 4 8
image2d_t input = 0x7f7f490b00 image 8448 x 8 rp 67840
short startPackedInputChannel = 0
short numPackedInputChannelsForGroup = 528
short totalNumPackedInputChannels = 528
short packedOuputChannelOffset = 0
short totalNumPackedOutputChannels = 88
image2d_t weights = 0x7f7f52fb80 image 2112 x 88 rp 16896
float* biases = 0x7f7f564d80 buffer 1408
short filterSizeX = 1
short filterSizeY = 1
image2d_t output = 0x7f7f490e80 image 1408 x 8 rp 11264
short paddingX = 0
short paddingY = 0
short strideX = 1
short strideY = 1
short neuron = 0
float a = 1.000000
float b = 1.000000
float min_clamp = 0.000000
float max_clamp = 0.000000
float* parameters = 0x0
float* batchNormBiases = 0x0
short numOutputColumns = 16
*/
#define GEMM
#define IMAGE
void dump_maps() {
FILE *f = fopen("/proc/self/maps", "rb");
char maps[0x100000];
int len = fread(maps, 1, sizeof(maps), f);
maps[len] = '\0';
maps[0x800] = '\0';
fclose(f);
printf("%s\n", maps);
}
static inline uint64_t nanos_since_boot() {
struct timespec t;
clock_gettime(CLOCK_BOOTTIME, &t);
return t.tv_sec * 1000000000ULL + t.tv_nsec;
}
int main(int argc, char *argv[]) {
cl_int err;
// cl init
cl_device_id device_id;
cl_context context;
cl_command_queue q;
{
cl_platform_id platform_id[2];
cl_uint num_devices;
cl_uint num_platforms;
err = clGetPlatformIDs(sizeof(platform_id)/sizeof(cl_platform_id), platform_id, &num_platforms);
assert(err == 0);
err = clGetDeviceIDs(platform_id[0], CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &num_devices);
assert(err == 0);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
assert(err == 0);
q = clCreateCommandQueue(context, device_id, 0, &err);
assert(err == 0);
}
printf("cl ready\n");
char tmp[0x10000];
memset(tmp, 0, sizeof(tmp));
FILE *f = fopen(argv[1], "rb");
fread(tmp, 1, sizeof(tmp), f);
fclose(f);
const char *strings[1];
size_t lengths[1];
strings[0] = tmp;
lengths[0] = strlen(tmp);
cl_program prog = clCreateProgramWithSource(context, 1, strings, lengths, &err);
assert(err == 0);
printf("creating program\n");
err = clBuildProgram(prog, 1, &device_id, "-D AVANTE_IS_GPU_A530_64", NULL, NULL);
if (err != 0) {
printf("got err %d\n", err);
size_t length;
char buffer[2048];
clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length);
buffer[length] = '\0';
printf("%s\n", buffer);
}
assert(err == 0);
printf("built program\n");
#ifdef GEMM
// 128x2112 by 2112x352
int M,N,K;
M = N = K = 1024;
//M = 128; K = 2112; N = 352;
cl_kernel kern = clCreateKernel(prog, "gemm", &err);
assert(err == 0);
printf("creating kernel %p\n", kern);
cl_mem A,B,C;
A = clCreateBuffer(context, CL_MEM_READ_WRITE, M*K*2, NULL, &err);
assert(err == 0);
B = clCreateBuffer(context, CL_MEM_READ_WRITE, K*N*2, NULL, &err);
assert(err == 0);
C = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*2, NULL, &err);
assert(err == 0);
printf("created buffers\n");
#ifdef IMAGE
cl_image_format fmt;
fmt.image_channel_order = CL_RGBA;
fmt.image_channel_data_type = CL_HALF_FLOAT;
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_depth = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0;
desc.image_width = K; desc.image_height = M/4;
desc.buffer = A;
desc.image_row_pitch = desc.image_width*8;
A = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = K; desc.image_height = N/4;
desc.buffer = B; desc.image_row_pitch = desc.image_width*8;
B = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = M/4; desc.image_height = N;
desc.buffer = C; desc.image_row_pitch = desc.image_width*8;
C = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
printf("created images\n");
#endif
clSetKernelArg(kern, 0, sizeof(int), &M);
clSetKernelArg(kern, 1, sizeof(int), &N);
clSetKernelArg(kern, 2, sizeof(int), &K);
clSetKernelArg(kern, 3, sizeof(cl_mem), &A);
clSetKernelArg(kern, 4, sizeof(cl_mem), &B);
clSetKernelArg(kern, 5, sizeof(cl_mem), &C);
printf("set args\n");
#ifdef IMAGE
size_t global_work_size[3] = {M/4, N/4, 1};
size_t local_work_size[3] = {4, 64, 1};
#else
size_t global_work_size[3] = {128, 128, 1};
size_t local_work_size[3] = {2, 128, 1};
#endif
#else
cl_kernel kern = clCreateKernel(prog, "convolution_horizontal_reduced_reads_1x1", &err);
assert(err == 0);
printf("creating kernel\n");
cl_mem input;
cl_mem weights;
cl_mem weights_buffer;
cl_mem biases;
cl_mem outputs;
cl_image_format fmt;
fmt.image_channel_order = CL_RGBA;
fmt.image_channel_data_type = CL_HALF_FLOAT;
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_depth = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0;
desc.buffer = NULL;
biases = clCreateBuffer(context, CL_MEM_READ_WRITE, 1408, NULL, &err);
assert(err == 0);
desc.image_width = 8448; desc.image_height = 8; desc.image_row_pitch = 67840;
desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
assert(err == 0);
input = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = 2112; desc.image_height = 88; desc.image_row_pitch = 16896;
weights_buffer = desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
assert(err == 0);
weights = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = 1408; desc.image_height = 8; desc.image_row_pitch = 11264;
desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
assert(err == 0);
outputs = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
void *n = NULL;
uint16_t v;
float fl;
clSetKernelArg(kern, 0, sizeof(cl_mem), &input);
v = 0; clSetKernelArg(kern, 1, sizeof(v), &v);
v = 528; clSetKernelArg(kern, 2, sizeof(v), &v);
v = 528; clSetKernelArg(kern, 3, sizeof(v), &v);
v = 0; clSetKernelArg(kern, 4, sizeof(v), &v);
v = 88; clSetKernelArg(kern, 5, sizeof(v), &v);
clSetKernelArg(kern, 6, sizeof(cl_mem), &weights);
//clSetKernelArg(kern, 6, sizeof(cl_mem), &weights_buffer);
clSetKernelArg(kern, 7, sizeof(cl_mem), &biases);
v = 1; clSetKernelArg(kern, 8, sizeof(v), &v);
v = 1; clSetKernelArg(kern, 9, sizeof(v), &v);
clSetKernelArg(kern, 10, sizeof(cl_mem), &outputs);
v = 0; clSetKernelArg(kern, 11, sizeof(v), &v);
v = 0; clSetKernelArg(kern, 12, sizeof(v), &v);
v = 1; clSetKernelArg(kern, 13, sizeof(v), &v);
v = 1; clSetKernelArg(kern, 14, sizeof(v), &v);
v = 0; clSetKernelArg(kern, 15, sizeof(v), &v);
fl = 1.0; clSetKernelArg(kern, 16, sizeof(fl), &fl);
fl = 0.0; clSetKernelArg(kern, 17, sizeof(fl), &fl);
fl = 0.0; clSetKernelArg(kern, 18, sizeof(fl), &fl);
fl = 0.0; clSetKernelArg(kern, 19, sizeof(fl), &fl);
clSetKernelArg(kern, 20, sizeof(n), &n);
clSetKernelArg(kern, 21, sizeof(n), &n);
v = 16; clSetKernelArg(kern, 22, sizeof(v), &v);
size_t global_work_size[3] = {88, 4, 8};
size_t local_work_size[3] = {4, 4, 8};
#endif
printf("ready to enqueue\n");
for (int i = 0; i < 20; i++) {
cl_event event;
err = clEnqueueNDRangeKernel(q, kern, 3, NULL, global_work_size, local_work_size, 0, NULL, &event);
assert(err == 0);
uint64_t tb = nanos_since_boot();
err = clWaitForEvents(1, &event);
assert(err == 0);
uint64_t te = nanos_since_boot();
uint64_t us = (te-tb)/1000;
float s = 1000000.0/us;
#ifdef GEMM
float flops = M*N*K*s;
float rams = (M*N + N*K + M*K)*s;
#else
float flops = 95158272.0*s;
float rams = 1058816.0*s;
//float rams = 6735872.0*s;
#endif
printf("%2d: wait %lu us -- %.2f GFLOPS -- %.2f GB/s\n", i, us, flops/1e9, rams*2/1e9);
}
size_t binary_size = 0;
err = clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, NULL);
assert(err == 0);
assert(binary_size > 0);
uint8_t *binary_buf = (uint8_t *)malloc(binary_size);
assert(binary_buf);
uint8_t* bufs[1] = { binary_buf, };
err = clGetProgramInfo(prog, CL_PROGRAM_BINARIES, sizeof(bufs), &bufs, NULL);
assert(err == 0);
FILE *g = fopen("/tmp/bin.bin", "wb");
fwrite(binary_buf, 1, binary_size, g);
fclose(g);
/*dump_maps();
for (uint64_t i = 0x7ffbd2000; i < 0x800000000; i += 0x1000) {
uint64_t cmd = *((uint64_t*)i);
printf("%llx: %llx\n", i, cmd);
}*/
return 0;
}

View File

@@ -1,2 +0,0 @@
#!/usr/bin/env bash
gcc -I/data/openpilot/phonelibs/opencl/include -L/system/vendor/lib64 -lOpenCL -lCB -lgsl go.c

View File

@@ -1,99 +0,0 @@
#include "../thneed.h"
#include "../../runners/snpemodel.h"
#define TEMPORAL_SIZE 512
#define DESIRE_LEN 8
#define TRAFFIC_CONVENTION_LEN 2
void hexdump(uint32_t *d, int len);
int main(int argc, char* argv[]) {
#define OUTPUT_SIZE 0x10000
float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float));
float *golden = (float*)calloc(OUTPUT_SIZE, sizeof(float));
SNPEModel mdl(argv[1], output, 0, USE_GPU_RUNTIME);
// cmd line test
if (argc > 2) {
for (int i = 2; i < argc; i++) {
float *buf[5];
FILE *f = fopen(argv[i], "rb");
size_t sz;
for (int j = 0; j < 5; j++) {
fread(&sz, 1, sizeof(sz), f);
printf("reading %zu\n", sz);
buf[j] = (float*)malloc(sz);
fread(buf[j], 1, sz, f);
}
if (sz != 9532) continue;
mdl.addRecurrent(buf[0], TEMPORAL_SIZE);
mdl.addTrafficConvention(buf[1], TRAFFIC_CONVENTION_LEN);
mdl.addDesire(buf[2], DESIRE_LEN);
mdl.execute(buf[3], 0);
hexdump((uint32_t*)buf[4], 0x100);
hexdump((uint32_t*)output, 0x100);
for (int j = 0; j < sz/4; j++) {
if (buf[4][j] != output[j]) {
printf("MISMATCH %d real:%f comp:%f\n", j, buf[4][j], output[j]);
}
}
}
return 0;
}
float state[TEMPORAL_SIZE];
mdl.addRecurrent(state, TEMPORAL_SIZE);
float desire[DESIRE_LEN];
mdl.addDesire(desire, DESIRE_LEN);
float traffic_convention[TRAFFIC_CONVENTION_LEN];
mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN);
float *input = (float*)calloc(0x1000000, sizeof(float));;
// first run
printf("************** execute 1 **************\n");
memset(output, 0, OUTPUT_SIZE * sizeof(float));
mdl.execute(input, 0);
hexdump((uint32_t *)output, 0x100);
memcpy(golden, output, OUTPUT_SIZE * sizeof(float));
// second run
printf("************** execute 2 **************\n");
memset(output, 0, OUTPUT_SIZE * sizeof(float));
Thneed *t = new Thneed();
t->record = 7; // debug print with record
mdl.execute(input, 0);
t->stop();
hexdump((uint32_t *)output, 0x100);
if (memcmp(golden, output, OUTPUT_SIZE * sizeof(float)) != 0) { printf("FAILURE\n"); return -1; }
// third run
printf("************** execute 3 **************\n");
memset(output, 0, OUTPUT_SIZE * sizeof(float));
t->record = 2; // debug print w/o record
float *inputs[4] = {state, traffic_convention, desire, input};
t->execute(inputs, output, true);
hexdump((uint32_t *)output, 0x100);
if (memcmp(golden, output, OUTPUT_SIZE * sizeof(float)) != 0) { printf("FAILURE\n"); return -1; }
printf("************** execute 4 **************\n");
while (1) {
memset(output, 0, OUTPUT_SIZE * sizeof(float));
//t->record = 2; // debug print w/o record
t->execute(inputs, output);
hexdump((uint32_t *)output, 0x100);
if (memcmp(golden, output, OUTPUT_SIZE * sizeof(float)) != 0) { printf("FAILURE\n"); return -1; }
break;
}
printf("************** execute done **************\n");
}

View File

@@ -1,4 +0,0 @@
#!/bin/sh
export LD_LIBRARY_PATH="/data/openpilot/phonelibs/snpe/aarch64/:$HOME/openpilot/phonelibs/snpe/larch64:$HOME/openpilot/phonelibs/snpe/x86_64-linux-clang:$LD_LIBRARY_PATH"
exec ./_thneed $@

View File

@@ -7,9 +7,16 @@
#include <errno.h>
#include "thneed.h"
//#define SAVE_KERNELS
//#define RUN_DISASSEMBLER
//#define RUN_OPTIMIZER
Thneed *g_thneed = NULL;
int g_fd = -1;
map<pair<cl_kernel, int>, string> g_args;
map<pair<cl_kernel, int>, int> g_args_size;
map<cl_program, string> g_program_source;
static inline uint64_t nanos_since_boot() {
struct timespec t;
@@ -27,6 +34,8 @@ void hexdump(uint32_t *d, int len) {
printf("\n");
}
// *********** ioctl interceptor ***********
extern "C" {
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
@@ -39,32 +48,33 @@ int ioctl(int filedes, unsigned long request, void *argp) {
// save the fd
if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
// note that this runs always, even without a thneed object
if (request == IOCTL_KGSL_DRAWCTXT_CREATE) {
struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp;
create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK;
create->flags |= 1 << KGSL_CONTEXT_PRIORITY_SHIFT; // priority from 1-15, 1 is max priority
printf("creating context with flags 0x%x\n", create->flags);
printf("IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x\n", create->flags);
}
if (thneed != NULL) {
if (request == IOCTL_KGSL_GPU_COMMAND) {
struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
if (thneed->record & 1) {
if (thneed->record & THNEED_RECORD) {
thneed->timestamp = cmd->timestamp;
thneed->context_id = cmd->context_id;
thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd)));
}
if (thneed->record & 2) {
printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u\n",
if (thneed->record & THNEED_DEBUG) {
printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u numcmds: %d numobjs: %d\n",
thneed->cmds.size(),
cmd->flags,
cmd->context_id, cmd->timestamp);
cmd->context_id, cmd->timestamp, cmd->numcmds, cmd->numobjs);
}
} else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
if (thneed->record & 2) {
if (thneed->record & THNEED_DEBUG) {
printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
for (int i = 0; i < cmd->count; i++) {
printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
@@ -72,20 +82,20 @@ int ioctl(int filedes, unsigned long request, void *argp) {
printf("\n");
}
if (thneed->record & 1) {
if (thneed->record & THNEED_RECORD) {
thneed->syncobjs.push_back(string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count));
}
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
if (thneed->record & 2) {
if (thneed->record & THNEED_DEBUG) {
printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n",
cmd->context_id, cmd->timestamp, cmd->timeout);
}
} else if (request == IOCTL_KGSL_SETPROPERTY) {
if (thneed->record & 2) {
if (thneed->record & THNEED_DEBUG) {
struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
if (thneed->record & 4) {
if (thneed->record & THNEED_VERBOSE_DEBUG) {
hexdump((uint32_t *)prop->value, prop->sizebytes);
if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
@@ -103,6 +113,8 @@ int ioctl(int filedes, unsigned long request, void *argp) {
}
// *********** GPUMalloc ***********
GPUMalloc::GPUMalloc(int size, int fd) {
struct kgsl_gpuobj_alloc alloc;
memset(&alloc, 0, sizeof(alloc));
@@ -128,30 +140,38 @@ void *GPUMalloc::alloc(int size) {
return ret;
}
// *********** CachedCommand, at the ioctl layer ***********
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
thneed = lthneed;
assert(cmd->numcmds == 2);
assert(cmd->numobjs == 1);
assert(cmd->numsyncs == 0);
memcpy(cmds, (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*2);
memcpy(objs, (void *)cmd->objlist, sizeof(struct kgsl_command_object)*1);
memcpy(&cache, cmd, sizeof(cache));
cache.cmdlist = (uint64_t)cmds;
cache.objlist = (uint64_t)objs;
for (int i = 0; i < cmd->numcmds; i++) {
void *nn = thneed->ram->alloc(cmds[i].size);
memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
cmds[i].gpuaddr = (uint64_t)nn;
if (cmd->numcmds > 0) {
cmds = make_unique<struct kgsl_command_object[]>(cmd->numcmds);
memcpy(cmds.get(), (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*cmd->numcmds);
cache.cmdlist = (uint64_t)cmds.get();
for (int i = 0; i < cmd->numcmds; i++) {
void *nn = thneed->ram->alloc(cmds[i].size);
memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
cmds[i].gpuaddr = (uint64_t)nn;
}
}
for (int i = 0; i < cmd->numobjs; i++) {
void *nn = thneed->ram->alloc(objs[i].size);
memset(nn, 0, objs[i].size);
objs[i].gpuaddr = (uint64_t)nn;
if (cmd->numobjs > 0) {
objs = make_unique<struct kgsl_command_object[]>(cmd->numobjs);
memcpy(objs.get(), (void *)cmd->objlist, sizeof(struct kgsl_command_object)*cmd->numobjs);
cache.objlist = (uint64_t)objs.get();
for (int i = 0; i < cmd->numobjs; i++) {
void *nn = thneed->ram->alloc(objs[i].size);
memset(nn, 0, objs[i].size);
objs[i].gpuaddr = (uint64_t)nn;
}
}
kq = thneed->ckq;
thneed->ckq.clear();
}
void CachedCommand::exec(bool wait) {
@@ -168,19 +188,33 @@ void CachedCommand::exec(bool wait) {
int wret = ioctl(thneed->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot();
if (thneed->record & 2) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000);
if (thneed->record & THNEED_DEBUG) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000);
} else {
if (thneed->record & 2) printf("CachedCommand::exec got %d\n", ret);
if (thneed->record & THNEED_DEBUG) printf("CachedCommand::exec got %d\n", ret);
}
if (thneed->record & THNEED_VERBOSE_DEBUG) {
for (auto &it : kq) {
it->debug_print(false);
}
#ifdef RUN_DISASSEMBLER
// assuming 2 commands
disassemble(0);
disassemble(1);
#endif
}
assert(ret == 0);
}
Thneed::Thneed() {
// *********** Thneed ***********
Thneed::Thneed(bool do_clinit) {
if (do_clinit) clinit();
assert(g_fd != -1);
fd = g_fd;
ram = make_unique<GPUMalloc>(0x40000, fd);
record = 1;
record = THNEED_RECORD;
timestamp = -1;
g_thneed = this;
}
@@ -189,29 +223,18 @@ void Thneed::stop() {
record = 0;
}
//#define SAVE_LOG
void Thneed::execute(float **finputs, float *foutput, bool slow) {
int ret;
uint64_t tb, te;
if (record & 2) tb = nanos_since_boot();
#ifdef SAVE_LOG
char fn[0x100];
snprintf(fn, sizeof(fn), "/tmp/thneed_log_%d", timestamp);
FILE *f = fopen(fn, "wb");
#endif
if (record & THNEED_DEBUG) tb = nanos_since_boot();
// ****** copy inputs
for (int idx = 0; idx < inputs.size(); ++idx) {
size_t sz;
clGetMemObjectInfo(inputs[idx], CL_MEM_SIZE, sizeof(sz), &sz, NULL);
#ifdef SAVE_LOG
fwrite(&sz, 1, sizeof(sz), f);
fwrite(finputs[idx], 1, sz, f);
#endif
if (record & 2) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]);
if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]);
// TODO: This shouldn't have to block
clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL);
}
@@ -229,40 +252,38 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
prop.type = KGSL_PROP_PWR_CONSTRAINT;
prop.value = (void*)&constraint;
prop.sizebytes = sizeof(constraint);
int ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop);
ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop);
assert(ret == 0);
// ****** run commands
int i = 0;
for (auto it = cmds.begin(); it != cmds.end(); ++it) {
for (auto &it : cmds) {
++i;
if (record & 2) printf("run %2d: ", i);
(*it)->exec((i == cmds.size()) || slow);
if (record & THNEED_DEBUG) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
it->exec((i == cmds.size()) || slow);
}
// ****** sync objects
for (auto it = syncobjs.begin(); it != syncobjs.end(); ++it) {
for (auto &it : syncobjs) {
struct kgsl_gpuobj_sync cmd;
cmd.objs = (uint64_t)it->data();
cmd.obj_len = it->length();
cmd.count = it->length() / sizeof(struct kgsl_gpuobj_sync_obj);
cmd.objs = (uint64_t)it.data();
cmd.obj_len = it.length();
cmd.count = it.length() / sizeof(struct kgsl_gpuobj_sync_obj);
ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
assert(ret == 0);
}
// ****** copy outputs
size_t sz;
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (record & 2) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL);
#ifdef SAVE_LOG
fwrite(&sz, 1, sizeof(sz), f);
fwrite(foutput, 1, sz, f);
fclose(f);
#endif
if (output != NULL) {
size_t sz;
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (record & THNEED_DEBUG) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL);
} else {
printf("CAUTION: model output is NULL, does it have no outputs?\n");
}
// ****** unset power constraint
constraint.type = KGSL_CONSTRAINT_NONE;
@@ -272,25 +293,59 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop);
assert(ret == 0);
if (record & 2) {
if (record & THNEED_DEBUG) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}
// TODO: with a different way of getting the input and output buffers, we don't have to intercept CL at all
void Thneed::clinit() {
cl_int err;
cl_platform_id platform_id[2];
cl_uint num_devices;
cl_uint num_platforms;
err = clGetPlatformIDs(sizeof(platform_id)/sizeof(cl_platform_id), platform_id, &num_platforms);
assert(err == 0);
err = clGetDeviceIDs(platform_id[0], CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &num_devices);
assert(err == 0);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
assert(err == 0);
//cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
command_queue = clCreateCommandQueueWithProperties(context, device_id, props, &err);
assert(err == 0);
printf("Thneed::clinit done\n");
}
cl_int Thneed::clexec() {
printf("Thneed::clexec: running %lu queued kernels\n", kq.size());
for (auto &k : kq) {
if (record & THNEED_RECORD) ckq.push_back(k);
cl_int ret = k->exec();
assert(ret == CL_SUCCESS);
}
return clFinish(command_queue);
}
// *********** OpenCL interceptor ***********
cl_int (*my_clSetKernelArg)(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) = NULL;
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
if (my_clSetKernelArg == NULL) my_clSetKernelArg = reinterpret_cast<decltype(my_clSetKernelArg)>(dlsym(RTLD_NEXT, "REAL_clSetKernelArg"));
g_args_size[make_pair(kernel, arg_index)] = arg_size;
if (arg_value != NULL) {
g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size);
} else {
g_args[make_pair(kernel, arg_index)] = string("");
}
cl_int ret = my_clSetKernelArg(kernel, arg_index, arg_size, arg_value);
cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
return ret;
}
cl_int (*my_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) = NULL;
cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint work_dim,
@@ -301,62 +356,183 @@ cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
const cl_event *event_wait_list,
cl_event *event) {
if (my_clEnqueueNDRangeKernel == NULL) my_clEnqueueNDRangeKernel = reinterpret_cast<decltype(my_clEnqueueNDRangeKernel)>(dlsym(RTLD_NEXT, "REAL_clEnqueueNDRangeKernel"));
Thneed *thneed = g_thneed;
// SNPE doesn't use these
assert(num_events_in_wait_list == 0);
assert(global_work_offset == NULL);
assert(event_wait_list == NULL);
char name[0x100];
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(name), name, NULL);
cl_int ret = 0;
if (thneed != NULL && thneed->record & THNEED_RECORD) {
if (thneed->context == NULL) {
thneed->command_queue = command_queue;
clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, sizeof(thneed->context), &thneed->context, NULL);
clGetContextInfo(thneed->context, CL_CONTEXT_DEVICES, sizeof(thneed->device_id), &thneed->device_id, NULL);
}
cl_uint num_args;
// if we are recording, we don't actually enqueue the kernel
thneed->kq.push_back(unique_ptr<CLQueuedKernel>(new CLQueuedKernel(thneed, kernel, work_dim, global_work_size, local_work_size)));
*event = NULL;
} else {
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
global_work_offset, global_work_size, local_work_size,
num_events_in_wait_list, event_wait_list, event);
}
return ret;
}
cl_int thneed_clFinish(cl_command_queue command_queue) {
Thneed *thneed = g_thneed;
if (thneed != NULL && thneed->record & THNEED_RECORD) {
#ifdef RUN_OPTIMIZER
thneed->optimize();
#endif
return thneed->clexec();
} else {
return clFinish(command_queue);
}
}
cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) {
assert(count == 1);
cl_program ret = clCreateProgramWithSource(context, count, strings, lengths, errcode_ret);
g_program_source[ret] = strings[0];
return ret;
}
void *dlsym(void *handle, const char *symbol) {
// TODO: Find dlsym in a better way. Currently this is hand looked up in libdl.so
#if defined QCOM
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4);
#elif defined QCOM2
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen+0x138);
#else
#error "Unsupported platform for thneed"
#endif
if (memcmp("REAL_", symbol, 5) == 0) {
return my_dlsym(handle, symbol+5);
} else if (strcmp("clFinish", symbol) == 0) {
return (void*)thneed_clFinish;
} else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) {
return (void*)thneed_clEnqueueNDRangeKernel;
} else if (strcmp("clSetKernelArg", symbol) == 0) {
return (void*)thneed_clSetKernelArg;
} else if (strcmp("clCreateProgramWithSource", symbol) == 0) {
return (void*)thneed_clCreateProgramWithSource;
} else {
return my_dlsym(handle, symbol);
}
}
// *********** CLQueuedKernel ***********
CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size) {
thneed = lthneed;
kernel = _kernel;
work_dim = _work_dim;
assert(work_dim <= 3);
for (int i = 0; i < work_dim; i++) {
global_work_size[i] = _global_work_size[i];
local_work_size[i] = _local_work_size[i];
}
char _name[0x100];
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(_name), _name, NULL);
name = string(_name);
clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
if (thneed != NULL && thneed->record & 1) {
thneed->command_queue = command_queue;
for (int i = 0; i < num_args; i++) {
// get args
for (int i = 0; i < num_args; i++) {
char arg_name[0x100];
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
args.push_back(g_args[make_pair(kernel, i)]);
args_size.push_back(g_args_size[make_pair(kernel, i)]);
}
// get program
clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, NULL);
}
int CLQueuedKernel::get_arg_num(const char *search_arg_name) {
for (int i = 0; i < num_args; i++) {
if (arg_names[i] == search_arg_name) return i;
}
printf("failed to find %s in %s\n", search_arg_name, name.c_str());
assert(false);
}
cl_int CLQueuedKernel::exec() {
if (kernel == NULL) {
kernel = clCreateKernel(program, name.c_str(), NULL);
arg_names.clear();
for (int j = 0; j < num_args; j++) {
char arg_name[0x100];
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
string arg = g_args[make_pair(kernel, i)];
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
if (strcmp(arg_name, "input") == 0 && strcmp(name, "zero_pad_image_float") == 0) {
cl_mem mem;
memcpy(&mem, (void*)arg.data(), sizeof(mem));
thneed->inputs.push_back(mem);
cl_int ret;
if (args[j].size() != 0) {
assert(args[j].size() == args_size[j]);
ret = thneed_clSetKernelArg(kernel, j, args[j].size(), args[j].data());
} else {
ret = thneed_clSetKernelArg(kernel, j, args_size[j], NULL);
}
assert(ret == CL_SUCCESS);
}
}
// save the global inputs/outputs
if (thneed->record & THNEED_RECORD) {
for (int i = 0; i < num_args; i++) {
if (name == "zero_pad_image_float" && arg_names[i] == "input") {
thneed->inputs.push_back(*(cl_mem*)(args[i].data()));
}
if (strcmp(arg_name, "output") == 0 && strcmp(name, "image2d_to_buffer_float") == 0) {
cl_mem mem;
memcpy(&mem, (void*)arg.data(), sizeof(mem));
thneed->output = mem;
if (name == "image2d_to_buffer_float" && arg_names[i] == "output") {
thneed->output = *(cl_mem*)(args[i].data());
}
}
}
if (thneed != NULL && thneed->record & 2) {
printf("%p %56s -- ", kernel, name);
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]);
}
printf(" -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", local_work_size[i]);
}
printf("\n");
if (thneed->record & THNEED_DEBUG) {
debug_print(thneed->record & THNEED_VERBOSE_DEBUG);
}
if (thneed != NULL && thneed->record & 4) {
// extreme debug
return clEnqueueNDRangeKernel(thneed->command_queue,
kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL);
}
void CLQueuedKernel::debug_print(bool verbose) {
printf("%p %56s -- ", kernel, name.c_str());
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]);
}
printf(" -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", local_work_size[i]);
}
printf("\n");
if (verbose) {
for (int i = 0; i < num_args; i++) {
char arg_type[0x100];
char arg_name[0x100];
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL);
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
string arg = g_args[make_pair(kernel, i)];
printf(" %s %s", arg_type, arg_name);
string arg = args[i];
printf(" %s %s", arg_type, arg_names[i].c_str());
void *arg_value = (void*)arg.data();
int arg_size = arg.size();
if (arg_size == 1) {
if (arg_size == 0) {
printf(" (size) %d", args_size[i]);
} else if (arg_size == 1) {
printf(" = %d", *((char*)arg_value));
} else if (arg_size == 2) {
printf(" = %d", *((short*)arg_value));
@@ -373,19 +549,24 @@ cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
if (strcmp("image2d_t", arg_type) == 0 || strcmp("image1d_t", arg_type) == 0) {
cl_image_format format;
size_t width, height, depth, array_size, row_pitch, slice_pitch;
cl_mem buf;
clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
assert(format.image_channel_order == CL_RGBA);
assert(format.image_channel_data_type == CL_HALF_FLOAT);
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
assert(depth == 0);
assert(array_size == 0);
assert(slice_pitch == 0);
printf(" image %zu x %zu rp %zu", width, height, row_pitch);
clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
size_t sz;
clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
} else {
size_t sz;
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
@@ -396,79 +577,5 @@ cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
printf("\n");
}
}
cl_int ret = my_clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
global_work_offset, global_work_size, local_work_size,
num_events_in_wait_list, event_wait_list, event);
/*uint64_t tb = nanos_since_boot();
clWaitForEvents(1, event);
uint64_t te = nanos_since_boot();
if (thneed != NULL && thneed->record & 2) {
printf(" wait %lu us\n", (te-tb)/1000);
}*/
return ret;
}
//#define SAVE_KERNELS
#ifdef SAVE_KERNELS
map<cl_program, string> program_source;
cl_program (*my_clCreateProgramWithSource)(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) = NULL;
cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) {
if (my_clCreateProgramWithSource == NULL) my_clCreateProgramWithSource = reinterpret_cast<decltype(my_clCreateProgramWithSource)>(dlsym(RTLD_NEXT, "REAL_clCreateProgramWithSource"));
assert(count == 1);
size_t my_lengths[1];
my_lengths[0] = lengths[0];
char fn[0x100];
snprintf(fn, sizeof(fn), "/tmp/program_%zu.cl", strlen(strings[0]));
FILE *f = fopen(fn, "wb");
fprintf(f, "%s", strings[0]);
fclose(f);
char tmp[0x10000];
memset(tmp, 0, sizeof(tmp));
snprintf(fn, sizeof(fn), "/tmp/patched_%zu.cl", strlen(strings[0]));
FILE *g = fopen(fn, "rb");
if (g != NULL) {
printf("LOADING PATCHED PROGRAM %s\n", fn);
fread(tmp, 1, sizeof(tmp), g);
fclose(g);
strings[0] = tmp;
my_lengths[0] = strlen(tmp);
}
program_source[ret] = strings[0];
cl_program ret = my_clCreateProgramWithSource(context, count, strings, my_lengths, errcode_ret);
return ret;
}
#endif
void *dlsym(void *handle, const char *symbol) {
// TODO: Find dlsym in a better way. Currently this is hand looked up in libdl.so
#if defined QCOM
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4);
#elif defined QCOM2
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen+0x138);
#else
#error "Unsupported platform for thneed"
#endif
if (memcmp("REAL_", symbol, 5) == 0) {
return my_dlsym(handle, symbol+5);
} else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) {
return (void*)thneed_clEnqueueNDRangeKernel;
} else if (strcmp("clSetKernelArg", symbol) == 0) {
return (void*)thneed_clSetKernelArg;
#ifdef SAVE_KERNELS
} else if (strcmp("clCreateProgramWithSource", symbol) == 0) {
return (void*)thneed_clCreateProgramWithSource;
#endif
} else {
return my_dlsym(handle, symbol);
}
}

View File

@@ -9,10 +9,18 @@
#include "include/msm_kgsl.h"
#include <vector>
#include <memory>
#include <string>
#include <CL/cl.h>
#define THNEED_RECORD 1
#define THNEED_DEBUG 2
#define THNEED_VERBOSE_DEBUG 4
using namespace std;
namespace json11 {
class Json;
}
class Thneed;
class GPUMalloc {
@@ -25,28 +33,59 @@ class GPUMalloc {
int remaining;
};
class CLQueuedKernel {
public:
CLQueuedKernel(Thneed *lthneed) { thneed = lthneed; }
CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size);
cl_int exec();
void debug_print(bool verbose);
int get_arg_num(const char *search_arg_name);
cl_program program;
string name;
cl_uint num_args;
vector<string> arg_names;
vector<string> args;
vector<int> args_size;
cl_kernel kernel = NULL;
json11::Json to_json() const;
cl_uint work_dim;
size_t global_work_size[3] = {0};
size_t local_work_size[3] = {0};
private:
Thneed *thneed;
};
class CachedCommand {
public:
CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
void exec(bool wait);
void disassemble();
void disassemble(int cmd_index);
private:
struct kgsl_gpu_command cache;
struct kgsl_command_object cmds[2];
struct kgsl_command_object objs[1];
unique_ptr<kgsl_command_object[]> cmds;
unique_ptr<kgsl_command_object[]> objs;
Thneed *thneed;
vector<shared_ptr<CLQueuedKernel> > kq;
};
class Thneed {
public:
Thneed();
Thneed(bool do_clinit=false);
void stop();
void execute(float **finputs, float *foutput, bool slow=false);
int optimize();
vector<cl_mem> inputs;
cl_mem output;
cl_mem output = NULL;
cl_context context = NULL;
cl_command_queue command_queue;
cl_device_id device_id;
int context_id;
// protected?
@@ -56,5 +95,19 @@ class Thneed {
vector<unique_ptr<CachedCommand> > cmds;
vector<string> syncobjs;
int fd;
// all CL kernels
cl_int clexec();
vector<shared_ptr<CLQueuedKernel> > kq;
// pending CL kernels
vector<shared_ptr<CLQueuedKernel> > ckq;
// loading and saving
void load(const char *filename);
void save(const char *filename);
private:
void clinit();
json11::Json to_json();
};