modeld: delete unused SNPE stuff after move to tinygrad (#25635)
* delete unused stuff * remove CL interceptor from thneed since we don't use SNPE anymore * remove dead files from release * that's removed * oops, didn't save old-commit-hash: 6c39382d71978d20f00286678b2f13e00134bb0a
This commit is contained in:
@@ -367,10 +367,7 @@ selfdrive/modeld/thneed/thneed.h
|
||||
selfdrive/modeld/thneed/thneed_common.cc
|
||||
selfdrive/modeld/thneed/thneed_qcom2.cc
|
||||
selfdrive/modeld/thneed/serialize.cc
|
||||
selfdrive/modeld/thneed/compile.cc
|
||||
selfdrive/modeld/thneed/optimizer.cc
|
||||
selfdrive/modeld/thneed/include/*
|
||||
selfdrive/modeld/thneed/kernels/*.cl
|
||||
|
||||
selfdrive/modeld/runners/snpemodel.cc
|
||||
selfdrive/modeld/runners/snpemodel.h
|
||||
|
||||
@@ -26,7 +26,6 @@ thneed_src = [
|
||||
"thneed/thneed_common.cc",
|
||||
"thneed/thneed_qcom2.cc",
|
||||
"thneed/serialize.cc",
|
||||
"thneed/optimizer.cc",
|
||||
"runners/thneedmodel.cc",
|
||||
]
|
||||
|
||||
@@ -95,18 +94,6 @@ if use_thneed and arch == "larch64" or GetOption('pc_thneed'):
|
||||
"#tinygrad_repo/tinygrad/nn/__init__.py"
|
||||
], cmd)
|
||||
|
||||
# old thneed compiler. TODO: remove this once tinygrad stuff is stable
|
||||
|
||||
#compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs)
|
||||
#cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}.thneed --binary --optimize"
|
||||
|
||||
#lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"])
|
||||
#kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels")
|
||||
#cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path})
|
||||
|
||||
#kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")]
|
||||
#cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd)
|
||||
|
||||
llenv = lenv.Clone()
|
||||
if GetOption('pc_thneed'):
|
||||
pc_thneed_src = [
|
||||
|
||||
@@ -186,75 +186,14 @@ std::unique_ptr<zdl::DlSystem::IUserBuffer> SNPEModel::addExtra(float *state, in
|
||||
}
|
||||
|
||||
void SNPEModel::execute() {
|
||||
#ifdef USE_THNEED
|
||||
if (Runtime == zdl::DlSystem::Runtime_t::GPU) {
|
||||
if (!thneed_recorded) {
|
||||
bool ret = inputBuffer->setBufferAddress(input);
|
||||
assert(ret == true);
|
||||
if (use_extra) {
|
||||
assert(extra != NULL);
|
||||
bool extra_ret = extraBuffer->setBufferAddress(extra);
|
||||
assert(extra_ret == true);
|
||||
}
|
||||
if (!snpe->execute(inputMap, outputMap)) {
|
||||
PrintErrorStringAndExit();
|
||||
}
|
||||
memset(recurrent, 0, recurrent_size*sizeof(float));
|
||||
thneed->record = true;
|
||||
if (!snpe->execute(inputMap, outputMap)) {
|
||||
PrintErrorStringAndExit();
|
||||
}
|
||||
thneed->stop();
|
||||
printf("thneed cached\n");
|
||||
|
||||
// doing self test
|
||||
float *outputs_golden = (float *)malloc(output_size*sizeof(float));
|
||||
memcpy(outputs_golden, output, output_size*sizeof(float));
|
||||
memset(output, 0, output_size*sizeof(float));
|
||||
memset(recurrent, 0, recurrent_size*sizeof(float));
|
||||
uint64_t start_time = nanos_since_boot();
|
||||
if (extra != NULL) {
|
||||
float *inputs[5] = {recurrent, trafficConvention, desire, extra, input};
|
||||
thneed->execute(inputs, output);
|
||||
} else {
|
||||
float *inputs[4] = {recurrent, trafficConvention, desire, input};
|
||||
thneed->execute(inputs, output);
|
||||
}
|
||||
uint64_t elapsed_time = nanos_since_boot() - start_time;
|
||||
printf("ran model in %.2f ms\n", float(elapsed_time)/1e6);
|
||||
|
||||
if (memcmp(output, outputs_golden, output_size*sizeof(float)) == 0) {
|
||||
printf("thneed selftest passed\n");
|
||||
} else {
|
||||
for (int i = 0; i < output_size; i++) {
|
||||
printf("mismatch %3d: %f %f\n", i, output[i], outputs_golden[i]);
|
||||
}
|
||||
assert(false);
|
||||
}
|
||||
free(outputs_golden);
|
||||
thneed_recorded = true;
|
||||
} else {
|
||||
if (use_extra) {
|
||||
float *inputs[5] = {recurrent, trafficConvention, desire, extra, input};
|
||||
thneed->execute(inputs, output);
|
||||
} else {
|
||||
float *inputs[4] = {recurrent, trafficConvention, desire, input};
|
||||
thneed->execute(inputs, output);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#endif
|
||||
bool ret = inputBuffer->setBufferAddress(input);
|
||||
assert(ret == true);
|
||||
if (use_extra) {
|
||||
bool extra_ret = extraBuffer->setBufferAddress(extra);
|
||||
assert(extra_ret == true);
|
||||
}
|
||||
if (!snpe->execute(inputMap, outputMap)) {
|
||||
PrintErrorStringAndExit();
|
||||
}
|
||||
#ifdef USE_THNEED
|
||||
bool ret = inputBuffer->setBufferAddress(input);
|
||||
assert(ret == true);
|
||||
if (use_extra) {
|
||||
bool extra_ret = extraBuffer->setBufferAddress(extra);
|
||||
assert(extra_ret == true);
|
||||
}
|
||||
if (!snpe->execute(inputMap, outputMap)) {
|
||||
PrintErrorStringAndExit();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ ThneedModel::ThneedModel(const char *path, float *loutput, size_t loutput_size,
|
||||
thneed = new Thneed(true, context);
|
||||
thneed->load(path);
|
||||
thneed->clexec();
|
||||
thneed->find_inputs_outputs();
|
||||
|
||||
recorded = false;
|
||||
output = loutput;
|
||||
|
||||
@@ -1,81 +0,0 @@
|
||||
#include <cstring>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "selfdrive/modeld/runners/snpemodel.h"
|
||||
#include "selfdrive/modeld/thneed/thneed.h"
|
||||
#include "system/hardware/hw.h"
|
||||
|
||||
#define TEMPORAL_SIZE 512
|
||||
#define DESIRE_LEN 8
|
||||
#define TRAFFIC_CONVENTION_LEN 2
|
||||
|
||||
// TODO: This should probably use SNPE directly.
|
||||
int main(int argc, char* argv[]) {
|
||||
bool run_optimizer = false, save_binaries = false;
|
||||
const char *input_file = NULL, *output_file = NULL;
|
||||
static struct option long_options[] = {
|
||||
{"in", required_argument, 0, 'i' },
|
||||
{"out", required_argument, 0, 'o' },
|
||||
{"binary", no_argument, 0, 'b' },
|
||||
{"optimize", no_argument, 0, 'f' },
|
||||
{0, 0, 0, 0 }
|
||||
};
|
||||
int long_index = 0, opt = 0;
|
||||
while ((opt = getopt_long_only(argc, argv,"", long_options, &long_index)) != -1) {
|
||||
switch (opt) {
|
||||
case 'i': input_file = optarg; break;
|
||||
case 'o': output_file = optarg; break;
|
||||
case 'b': save_binaries = true; break;
|
||||
case 'f': run_optimizer = true; break;
|
||||
}
|
||||
}
|
||||
|
||||
// no input?
|
||||
if (!input_file) {
|
||||
printf("usage: -i <input file> -o <output file> --binary --optimize\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
#define OUTPUT_SIZE 0x10000
|
||||
|
||||
float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float));
|
||||
SNPEModel mdl(input_file, output, 0, USE_GPU_RUNTIME, true);
|
||||
mdl.thneed->run_optimizer = run_optimizer;
|
||||
|
||||
float state[TEMPORAL_SIZE] = {0};
|
||||
float desire[DESIRE_LEN] = {0};
|
||||
float traffic_convention[TRAFFIC_CONVENTION_LEN] = {0};
|
||||
float *input = (float*)calloc(0x1000000, sizeof(float));
|
||||
float *extra = (float*)calloc(0x1000000, sizeof(float));
|
||||
|
||||
mdl.addRecurrent(state, TEMPORAL_SIZE);
|
||||
mdl.addDesire(desire, DESIRE_LEN);
|
||||
mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN);
|
||||
mdl.addImage(input, 0);
|
||||
mdl.addExtra(extra, 0);
|
||||
|
||||
// first run
|
||||
printf("************** execute 1 **************\n");
|
||||
memset(output, 0, OUTPUT_SIZE * sizeof(float));
|
||||
mdl.execute();
|
||||
|
||||
// don't save?
|
||||
if (!output_file) {
|
||||
printf("no output file, exiting\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// save model
|
||||
printf("saving %s with binary %d\n", output_file, save_binaries);
|
||||
mdl.thneed->save(output_file, save_binaries);
|
||||
|
||||
// test model
|
||||
auto thneed = new Thneed(true);
|
||||
thneed->record = false;
|
||||
thneed->load(output_file);
|
||||
thneed->clexec();
|
||||
thneed->find_inputs_outputs();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1,272 +0,0 @@
|
||||
read_only image2d_t input,
|
||||
#ifndef DEPTHWISE
|
||||
short startPackedInputChannel,
|
||||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
|
||||
// typo required for API compatibility
|
||||
short packedOuputChannelOffset, short totalNumPackedOutputChannels,
|
||||
#else
|
||||
short totalNumPackedChannels,
|
||||
#endif
|
||||
read_only image2d_t weights, __constant float *biases,
|
||||
short filterSizeX, short filterSizeY,
|
||||
write_only image2d_t output,
|
||||
short paddingX, short paddingY, short strideX, short strideY,
|
||||
#ifdef SUPPORT_DILATION
|
||||
short dilationX, short dilationY,
|
||||
#endif
|
||||
short neuron, float a, float b, float min_clamp, float max_clamp,
|
||||
#ifndef DEPTHWISE
|
||||
// note: these are not supported
|
||||
__constant float *parameters, __constant float *batchNormBiases,
|
||||
#endif
|
||||
short numOutputColumns
|
||||
#ifdef SUPPORT_ACCUMULATION
|
||||
, short doAccumulate, read_only image2d_t accumulator
|
||||
#endif
|
||||
) {
|
||||
|
||||
#ifndef NUM_OUTPUTS
|
||||
#define NUM_OUTPUTS 4
|
||||
#endif
|
||||
|
||||
// init
|
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
short packedOutputChannel = get_global_id(0);
|
||||
short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS);
|
||||
short outputRow = get_global_id(2);
|
||||
|
||||
#ifdef DEPTHWISE
|
||||
short totalNumPackedInputChannels = totalNumPackedChannels;
|
||||
short totalNumPackedOutputChannels = totalNumPackedChannels;
|
||||
short startPackedInputChannel = packedOutputChannel;
|
||||
#endif
|
||||
|
||||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel);
|
||||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
|
||||
|
||||
float4 outputValues[NUM_OUTPUTS];
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = (float4)(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
int2 inputLocation;
|
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY);
|
||||
|
||||
int2 weightLocation;
|
||||
weightLocation.x = 0;
|
||||
weightLocation.y = packedOutputChannel;
|
||||
|
||||
#ifdef DEPTHWISE
|
||||
|
||||
#ifdef SUPPORT_DILATION
|
||||
|
||||
// depthwise convolution
|
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
|
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
|
||||
short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
|
||||
inputLocation.x = mad24(rfColumn, dilatedStepX, startX);
|
||||
float4 inputValues[4];
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += strideWithChannels;
|
||||
}
|
||||
float4 weightValues = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
outputValues[0] += inputValues[0] * weightValues;
|
||||
outputValues[1] += inputValues[1] * weightValues;
|
||||
outputValues[2] += inputValues[2] * weightValues;
|
||||
outputValues[3] += inputValues[3] * weightValues;
|
||||
}
|
||||
inputLocation.y += dilationY;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// depthwise unstrided convolution
|
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
|
||||
float4 inputValues[4];
|
||||
inputLocation.x = startX;
|
||||
for (short i = 1; i < 4; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += totalNumPackedOutputChannels;
|
||||
}
|
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
|
||||
inputValues[0] = inputValues[1];
|
||||
inputValues[1] = inputValues[2];
|
||||
inputValues[2] = inputValues[3];
|
||||
inputValues[3] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += totalNumPackedChannels;
|
||||
float4 weightValues = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
outputValues[0] += inputValues[0] * weightValues;
|
||||
outputValues[1] += inputValues[1] * weightValues;
|
||||
outputValues[2] += inputValues[2] * weightValues;
|
||||
outputValues[3] += inputValues[3] * weightValues;
|
||||
}
|
||||
++inputLocation.y;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#elif defined(ONLY_1X1_CONV)
|
||||
|
||||
// 1x1 convolution
|
||||
short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
|
||||
for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
|
||||
float4 weightValues[4];
|
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
|
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
}
|
||||
|
||||
inputLocation.x = startX + packedInputChannel;
|
||||
float4 inputValues[NUM_OUTPUTS];
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += strideWithChannels;
|
||||
}
|
||||
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
float4 curOutputValues = outputValues[i];
|
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x;
|
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y;
|
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z;
|
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w;
|
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x;
|
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y;
|
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z;
|
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w;
|
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x;
|
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y;
|
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z;
|
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w;
|
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x;
|
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y;
|
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z;
|
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w;
|
||||
outputValues[i] = curOutputValues;
|
||||
}
|
||||
}
|
||||
packedOutputChannel += packedOuputChannelOffset;
|
||||
|
||||
#else
|
||||
|
||||
// normal convolution
|
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
|
||||
for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) {
|
||||
short startXForChannel = startX + packedInputChannel;
|
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
|
||||
|
||||
float4 weightValues[4];
|
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
|
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
}
|
||||
|
||||
#ifdef SUPPORT_DILATION
|
||||
short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
|
||||
inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
|
||||
#else
|
||||
inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
|
||||
#endif
|
||||
float4 inputValues[NUM_OUTPUTS];
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += strideWithChannels;
|
||||
}
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
float4 curOutputValues = outputValues[i];
|
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x;
|
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y;
|
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z;
|
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w;
|
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x;
|
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y;
|
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z;
|
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w;
|
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x;
|
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y;
|
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z;
|
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w;
|
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x;
|
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y;
|
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z;
|
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w;
|
||||
outputValues[i] = curOutputValues;
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef SUPPORT_DILATION
|
||||
inputLocation.y += dilationY;
|
||||
#else
|
||||
++inputLocation.y;
|
||||
#endif
|
||||
}
|
||||
packedOutputChannel += packedOuputChannelOffset;
|
||||
#endif
|
||||
|
||||
// bias
|
||||
short outputChannel = mul24(packedOutputChannel, 4);
|
||||
float4 biasValues = vload4(0, biases + outputChannel);
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] += biasValues;
|
||||
}
|
||||
|
||||
#ifdef SUPPORT_ACCUMULATION
|
||||
// accumulate
|
||||
if (doAccumulate) {
|
||||
int2 outputLocation;
|
||||
short outputColumn = startOutputColumn;
|
||||
outputLocation.y = outputRow;
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
|
||||
if (outputColumn < numOutputColumns) {
|
||||
outputValues[i] += read_imagef(accumulator, smp, outputLocation);
|
||||
}
|
||||
++outputColumn;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// activation
|
||||
switch (neuron) {
|
||||
case 1:
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = a * tanh(b * outputValues[i]);
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = max(outputValues[i], min_clamp);
|
||||
outputValues[i] = min(outputValues[i], max_clamp);
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// output
|
||||
int2 outputLocation;
|
||||
short outputColumn = startOutputColumn;
|
||||
outputLocation.y = outputRow;
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
|
||||
if (outputColumn < numOutputColumns) {
|
||||
write_imagef(output, outputLocation, outputValues[i]);
|
||||
}
|
||||
++outputColumn;
|
||||
}
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
#define SUPPORT_DILATION
|
||||
|
||||
__kernel void convolution_horizontal_reduced_reads(
|
||||
@@ -1,4 +0,0 @@
|
||||
#define ONLY_1X1_CONV
|
||||
#define SUPPORT_ACCUMULATION
|
||||
|
||||
__kernel void convolution_horizontal_reduced_reads_1x1(
|
||||
@@ -1,3 +0,0 @@
|
||||
#define NUM_OUTPUTS 5
|
||||
|
||||
__kernel void convolution_horizontal_reduced_reads_5_outputs(
|
||||
@@ -1,4 +0,0 @@
|
||||
#define DEPTHWISE
|
||||
#define SUPPORT_DILATION
|
||||
|
||||
__kernel void convolution_horizontal_reduced_reads_depthwise(
|
||||
@@ -1,3 +0,0 @@
|
||||
#define DEPTHWISE
|
||||
|
||||
__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1(
|
||||
@@ -1,261 +0,0 @@
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include "thneed.h"
|
||||
|
||||
#include "common/util.h"
|
||||
#include "common/clutil.h"
|
||||
|
||||
extern map<cl_program, string> g_program_source;
|
||||
|
||||
/*static int is_same_size_image(cl_mem a, cl_mem b) {
|
||||
size_t a_width, a_height, a_depth, a_array_size, a_row_pitch, a_slice_pitch;
|
||||
clGetImageInfo(a, CL_IMAGE_WIDTH, sizeof(a_width), &a_width, NULL);
|
||||
clGetImageInfo(a, CL_IMAGE_HEIGHT, sizeof(a_height), &a_height, NULL);
|
||||
clGetImageInfo(a, CL_IMAGE_DEPTH, sizeof(a_depth), &a_depth, NULL);
|
||||
clGetImageInfo(a, CL_IMAGE_ARRAY_SIZE, sizeof(a_array_size), &a_array_size, NULL);
|
||||
clGetImageInfo(a, CL_IMAGE_ROW_PITCH, sizeof(a_row_pitch), &a_row_pitch, NULL);
|
||||
clGetImageInfo(a, CL_IMAGE_SLICE_PITCH, sizeof(a_slice_pitch), &a_slice_pitch, NULL);
|
||||
|
||||
size_t b_width, b_height, b_depth, b_array_size, b_row_pitch, b_slice_pitch;
|
||||
clGetImageInfo(b, CL_IMAGE_WIDTH, sizeof(b_width), &b_width, NULL);
|
||||
clGetImageInfo(b, CL_IMAGE_HEIGHT, sizeof(b_height), &b_height, NULL);
|
||||
clGetImageInfo(b, CL_IMAGE_DEPTH, sizeof(b_depth), &b_depth, NULL);
|
||||
clGetImageInfo(b, CL_IMAGE_ARRAY_SIZE, sizeof(b_array_size), &b_array_size, NULL);
|
||||
clGetImageInfo(b, CL_IMAGE_ROW_PITCH, sizeof(b_row_pitch), &b_row_pitch, NULL);
|
||||
clGetImageInfo(b, CL_IMAGE_SLICE_PITCH, sizeof(b_slice_pitch), &b_slice_pitch, NULL);
|
||||
|
||||
return (a_width == b_width) && (a_height == b_height) &&
|
||||
(a_depth == b_depth) && (a_array_size == b_array_size) &&
|
||||
(a_row_pitch == b_row_pitch) && (a_slice_pitch == b_slice_pitch);
|
||||
}*/
|
||||
|
||||
static cl_mem make_image_like(cl_context context, cl_mem val) {
|
||||
cl_image_format format;
|
||||
size_t width, height, row_pitch;
|
||||
clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
|
||||
assert(format.image_channel_order == CL_RGBA);
|
||||
assert(format.image_channel_data_type == CL_HALF_FLOAT);
|
||||
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
|
||||
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
|
||||
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
|
||||
|
||||
cl_image_desc desc = {0};
|
||||
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
|
||||
desc.image_width = width;
|
||||
desc.image_height = height;
|
||||
desc.image_row_pitch = row_pitch;
|
||||
|
||||
cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, row_pitch*height, NULL, NULL);
|
||||
assert(buf != NULL);
|
||||
desc.buffer = buf;
|
||||
|
||||
cl_int err;
|
||||
cl_mem tmp = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
|
||||
//printf("got %d for image %zux%zu %zu\n", err, width, height, row_pitch);
|
||||
assert(tmp != NULL);
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
// convolution_horizontal_reduced_reads_1x1 is 66% of the model runtime
|
||||
// make that faster and the model gets faster
|
||||
|
||||
// this cuts ~2 ms off the model runtime right now
|
||||
int Thneed::optimize() {
|
||||
const char *kernel_path = getenv("KERNEL_PATH");
|
||||
if (!kernel_path) { kernel_path = "/data/openpilot/selfdrive/modeld/thneed/kernels"; printf("no KERNEL_PATH set, defaulting to %s\n", kernel_path); }
|
||||
|
||||
string convolution_;
|
||||
{
|
||||
char fn[0x100];
|
||||
snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, "convolution_");
|
||||
convolution_ = util::read_file(fn);
|
||||
}
|
||||
|
||||
// load custom kernels
|
||||
map<string, cl_program> g_programs;
|
||||
for (auto &k : kq) {
|
||||
// replace program?
|
||||
if (g_programs.find(k->name) == g_programs.end()) {
|
||||
char fn[0x100];
|
||||
snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, k->name.c_str());
|
||||
if (util::file_exists(fn)) {
|
||||
string kernel_src = util::read_file(fn);
|
||||
if (k->name.rfind("convolution_", 0) == 0) {
|
||||
kernel_src += convolution_;
|
||||
}
|
||||
printf("building kernel %s with len %lu\n", k->name.c_str(), kernel_src.length());
|
||||
k->program = cl_program_from_source(context, device_id, kernel_src);
|
||||
|
||||
// save in cache
|
||||
g_programs[k->name] = k->program;
|
||||
g_program_source[k->program] = kernel_src;
|
||||
} else {
|
||||
g_programs[k->name] = NULL;
|
||||
}
|
||||
} else {
|
||||
// cached replacement
|
||||
if (g_programs[k->name] != NULL) {
|
||||
k->program = g_programs[k->name];
|
||||
}
|
||||
}
|
||||
|
||||
// hack in accumulator to convolution_horizontal_reduced_reads_1x1
|
||||
if (k->name == "convolution_horizontal_reduced_reads_1x1") {
|
||||
k->arg_names.push_back("doAccumulate");
|
||||
short doAccumulate = 0;
|
||||
k->args.push_back(string((char *)&doAccumulate, sizeof(doAccumulate)));
|
||||
k->args_size.push_back(2);
|
||||
k->arg_names.push_back("accumulator");
|
||||
k->args.push_back(k->args[k->get_arg_num("output")]);
|
||||
k->args_size.push_back(8);
|
||||
k->num_args += 2;
|
||||
}
|
||||
|
||||
// assert that parameters + batchNormBiases are not used
|
||||
// since they aren't supported in custom replacement kernels
|
||||
if (k->name == "convolution_horizontal_reduced_reads_1x1" ||
|
||||
k->name == "convolution_horizontal_reduced_reads" ||
|
||||
k->name == "convolution_horizontal_reduced_reads_5_outputs") {
|
||||
string p1 = k->args[k->get_arg_num("parameters")];
|
||||
string p2 = k->args[k->get_arg_num("batchNormBiases")];
|
||||
assert(p1.length() == 8 && *((uint64_t*)p1.data()) == 0);
|
||||
assert(p2.length() == 8 && *((uint64_t*)p2.data()) == 0);
|
||||
}
|
||||
}
|
||||
|
||||
// optimizer
|
||||
size_t start_size;
|
||||
do {
|
||||
start_size = kq.size();
|
||||
|
||||
// get optimizations
|
||||
map<string, string> replacements;
|
||||
for (int i = 0; i < kq.size(); i++) {
|
||||
// fusing elementwise_sum + activate_image will save 3 enqueues
|
||||
|
||||
// delete useless copy layers
|
||||
// saves ~0.7 ms
|
||||
/*if (kq[i]->name == "concatenation" || kq[i]->name == "flatten") {
|
||||
string in = kq[i]->args[kq[i]->get_arg_num("input")];
|
||||
string out = kq[i]->args[kq[i]->get_arg_num("output")];
|
||||
if (is_same_size_image(*(cl_mem*)in.data(), *(cl_mem*)out.data())) {
|
||||
cl_mem tmp = make_image_like(context, *(cl_mem *)in.data());
|
||||
replacements[in] = string((char *)&tmp, sizeof(tmp));
|
||||
replacements[out] = string((char *)&tmp, sizeof(tmp));
|
||||
|
||||
kq.erase(kq.begin()+i); --i;
|
||||
}
|
||||
}*/
|
||||
|
||||
// NOTE: if activations/accumulation are done in the wrong order, this will be wrong
|
||||
|
||||
// fuse activations into convs and fc_Wtx
|
||||
// saves ~1.5 ms
|
||||
// NOTE: this changes the outputs because of rounding, should be better now!
|
||||
if (i != 0 && kq[i]->name == "activate_image") {
|
||||
if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
|
||||
kq[i-1]->name == "convolution_horizontal_reduced_reads_5_outputs" ||
|
||||
kq[i-1]->name == "convolution_horizontal_reduced_reads" ||
|
||||
kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise" ||
|
||||
kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise_stride_1" ||
|
||||
kq[i-1]->name == "fc_Wtx") {
|
||||
string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
|
||||
string in = kq[i]->args[kq[i]->get_arg_num("input")];
|
||||
string out = kq[i]->args[kq[i]->get_arg_num("output")];
|
||||
|
||||
if (lastout == in) {
|
||||
short neuron = *(int*)kq[i]->args[kq[i]->get_arg_num("neuron")].data();
|
||||
assert(neuron <= 5);
|
||||
|
||||
// ELU isn't supported in fc_Wtx
|
||||
assert(!(kq[i-1]->name == "fc_Wtx" && neuron == 5));
|
||||
|
||||
kq[i-1]->args[kq[i-1]->get_arg_num("neuron")] = string((char *)&neuron, sizeof(neuron));
|
||||
|
||||
cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
|
||||
replacements[in] = string((char *)&tmp, sizeof(tmp));
|
||||
replacements[out] = string((char *)&tmp, sizeof(tmp));
|
||||
|
||||
kq.erase(kq.begin()+i); --i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fuse accumulation into convs and fc_Wtx
|
||||
if (i != 0 && kq[i]->name == "elementwise_sum") {
|
||||
if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
|
||||
kq[i-1]->name == "fc_Wtx") {
|
||||
string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
|
||||
string a = kq[i]->args[kq[i]->get_arg_num("a")];
|
||||
string b = kq[i]->args[kq[i]->get_arg_num("b")];
|
||||
string out = kq[i]->args[kq[i]->get_arg_num("output")];
|
||||
|
||||
if (lastout == a) {
|
||||
kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = b;
|
||||
} else if (lastout == b) {
|
||||
kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = a;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
|
||||
replacements[lastout] = string((char *)&tmp, sizeof(tmp));
|
||||
replacements[out] = string((char *)&tmp, sizeof(tmp));
|
||||
|
||||
short doAccumulate = 1;
|
||||
kq[i-1]->args[kq[i-1]->get_arg_num("doAccumulate")] = string((char *)&doAccumulate, sizeof(doAccumulate));
|
||||
|
||||
kq.erase(kq.begin()+i); --i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// remap inputs and outputs, and clear the kernels
|
||||
for (int i = 0; i < kq.size(); i++) {
|
||||
kq[i]->kernel = NULL;
|
||||
for (int j = 0; j < kq[i]->num_args; j++) {
|
||||
if (replacements.find(kq[i]->args[j]) != replacements.end()) {
|
||||
kq[i]->args[j] = replacements[kq[i]->args[j]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("optimize %lu -> %lu\n", start_size, kq.size());
|
||||
} while (kq.size() != start_size);
|
||||
|
||||
size_t work_group_size = 0;
|
||||
clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(work_group_size), &work_group_size, NULL);
|
||||
printf("max work group size %lu\n", work_group_size);
|
||||
|
||||
// local work group optimizer
|
||||
for (auto &k : kq) {
|
||||
// only do it for convs, since others might share memory
|
||||
if (k->name.rfind("convolution_", 0) == 0) {
|
||||
int best = -1;
|
||||
if (k->local_work_size[0] * k->local_work_size[1] * k->local_work_size[2] < work_group_size/2) {
|
||||
uint64_t base_time = k->benchmark();
|
||||
uint64_t best_time = base_time;
|
||||
for (int i = 0; i < 3; i++) {
|
||||
k->local_work_size[i] *= 2;
|
||||
uint64_t this_time = k->benchmark();
|
||||
if (this_time < best_time) {
|
||||
best = i;
|
||||
best_time = this_time;
|
||||
}
|
||||
k->local_work_size[i] /= 2;
|
||||
}
|
||||
if (best != -1) {
|
||||
k->local_work_size[best] *= 2;
|
||||
//printf("%s %.2f ms doubled %d to %.2f ms\n", k->name.c_str(), base_time/1e6, best, best_time/1e6);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -152,155 +152,3 @@ void Thneed::load(const char *filename) {
|
||||
|
||||
clFinish(command_queue);
|
||||
}
|
||||
|
||||
void Thneed::save(const char *filename, bool save_binaries) {
|
||||
printf("Thneed::save: saving to %s\n", filename);
|
||||
|
||||
// get kernels
|
||||
std::vector<Json> kernels;
|
||||
std::set<string> saved_objects;
|
||||
std::vector<Json> objects;
|
||||
std::map<string, string> programs;
|
||||
std::map<string, string> binaries;
|
||||
|
||||
for (auto &k : kq) {
|
||||
kernels.push_back(k->to_json());
|
||||
|
||||
// check args for objects
|
||||
int i = 0;
|
||||
for (auto &a : k->args) {
|
||||
if (a.size() == 8) {
|
||||
if (saved_objects.find(a) == saved_objects.end()) {
|
||||
saved_objects.insert(a);
|
||||
cl_mem val = *(cl_mem*)(a.data());
|
||||
if (val != NULL) {
|
||||
bool needs_load = k->arg_names[i] == "weights" || k->arg_names[i] == "biases";
|
||||
|
||||
auto jj = Json::object({
|
||||
{"id", a},
|
||||
{"arg_type", k->arg_types[i]},
|
||||
});
|
||||
|
||||
if (k->arg_types[i] == "image2d_t" || k->arg_types[i] == "image1d_t") {
|
||||
cl_mem buf = NULL;
|
||||
clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
|
||||
string aa = string((char *)&buf, sizeof(buf));
|
||||
jj["buffer_id"] = aa;
|
||||
|
||||
size_t width, height, row_pitch;
|
||||
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
|
||||
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
|
||||
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
|
||||
jj["width"] = (int)width;
|
||||
jj["height"] = (int)height;
|
||||
jj["row_pitch"] = (int)row_pitch;
|
||||
jj["size"] = (int)(height * row_pitch);
|
||||
jj["needs_load"] = false;
|
||||
jj["float32"] = false;
|
||||
|
||||
if (saved_objects.find(aa) == saved_objects.end()) {
|
||||
saved_objects.insert(aa);
|
||||
size_t sz;
|
||||
clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
|
||||
// save the buffer
|
||||
objects.push_back(Json::object({
|
||||
{"id", aa},
|
||||
{"arg_type", "<image buffer>"},
|
||||
{"needs_load", needs_load},
|
||||
{"size", (int)sz}
|
||||
}));
|
||||
if (needs_load) assert(sz == height * row_pitch);
|
||||
}
|
||||
} else {
|
||||
size_t sz = 0;
|
||||
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
|
||||
jj["size"] = (int)sz;
|
||||
jj["needs_load"] = needs_load;
|
||||
}
|
||||
|
||||
objects.push_back(jj);
|
||||
}
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
if (save_binaries) {
|
||||
int err;
|
||||
size_t binary_size = 0;
|
||||
err = clGetProgramInfo(k->program, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, NULL);
|
||||
assert(err == 0);
|
||||
assert(binary_size > 0);
|
||||
string sv(binary_size, '\x00');
|
||||
|
||||
uint8_t* bufs[1] = { (uint8_t*)sv.data(), };
|
||||
err = clGetProgramInfo(k->program, CL_PROGRAM_BINARIES, sizeof(bufs), &bufs, NULL);
|
||||
assert(err == 0);
|
||||
|
||||
binaries[k->name] = sv;
|
||||
} else {
|
||||
programs[k->name] = g_program_source[k->program];
|
||||
}
|
||||
}
|
||||
|
||||
vector<string> saved_buffers;
|
||||
for (auto &obj : objects) {
|
||||
auto mobj = obj.object_items();
|
||||
cl_mem val = *(cl_mem*)(mobj["id"].string_value().data());
|
||||
int sz = mobj["size"].int_value();
|
||||
if (mobj["needs_load"].bool_value()) {
|
||||
char *buf = (char *)malloc(sz);
|
||||
if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
|
||||
assert(false);
|
||||
} else {
|
||||
// buffers allocated with CL_MEM_HOST_WRITE_ONLY, hence this hack
|
||||
//hexdump((uint32_t*)val, 0x100);
|
||||
|
||||
// the worst hack in thneed, the flags are at 0x14
|
||||
((uint32_t*)val)[0x14] &= ~CL_MEM_HOST_WRITE_ONLY;
|
||||
cl_int ret = clEnqueueReadBuffer(command_queue, val, CL_TRUE, 0, sz, buf, 0, NULL, NULL);
|
||||
assert(ret == CL_SUCCESS);
|
||||
}
|
||||
//printf("saving buffer: %d %p %s\n", sz, buf, mobj["arg_type"].string_value().c_str());
|
||||
saved_buffers.push_back(string(buf, sz));
|
||||
free(buf);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<Json> jbinaries;
|
||||
for (auto &obj : binaries) {
|
||||
jbinaries.push_back(Json::object({{"name", obj.first}, {"length", (int)obj.second.size()}}));
|
||||
saved_buffers.push_back(obj.second);
|
||||
}
|
||||
|
||||
Json jdat = Json::object({
|
||||
{"kernels", kernels},
|
||||
{"objects", objects},
|
||||
{"programs", programs},
|
||||
{"binaries", jbinaries},
|
||||
});
|
||||
|
||||
string str = jdat.dump();
|
||||
int jsz = str.length();
|
||||
|
||||
FILE *f = fopen(filename, "wb");
|
||||
fwrite(&jsz, 1, sizeof(jsz), f);
|
||||
fwrite(str.data(), 1, jsz, f);
|
||||
for (auto &s : saved_buffers) {
|
||||
fwrite(s.data(), 1, s.length(), f);
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
Json CLQueuedKernel::to_json() const {
|
||||
return Json::object {
|
||||
{ "name", name },
|
||||
{ "work_dim", (int)work_dim },
|
||||
{ "global_work_size", Json::array { (int)global_work_size[0], (int)global_work_size[1], (int)global_work_size[2] } },
|
||||
{ "local_work_size", Json::array { (int)local_work_size[0], (int)local_work_size[1], (int)local_work_size[2] } },
|
||||
{ "num_args", (int)num_args },
|
||||
{ "args", args },
|
||||
{ "args_size", args_size },
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
using namespace std;
|
||||
|
||||
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);
|
||||
cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret);
|
||||
|
||||
namespace json11 {
|
||||
class Json;
|
||||
@@ -43,7 +42,6 @@ class CLQueuedKernel {
|
||||
const size_t *_global_work_size,
|
||||
const size_t *_local_work_size);
|
||||
cl_int exec();
|
||||
uint64_t benchmark();
|
||||
void debug_print(bool verbose);
|
||||
int get_arg_num(const char *search_arg_name);
|
||||
cl_program program;
|
||||
@@ -96,8 +94,6 @@ class Thneed {
|
||||
void stop();
|
||||
void execute(float **finputs, float *foutput, bool slow=false);
|
||||
void wait();
|
||||
int optimize();
|
||||
bool run_optimizer = false;
|
||||
|
||||
vector<cl_mem> input_clmem;
|
||||
vector<void *> inputs;
|
||||
@@ -121,7 +117,6 @@ class Thneed {
|
||||
#endif
|
||||
|
||||
// all CL kernels
|
||||
void find_inputs_outputs();
|
||||
void copy_inputs(float **finputs, bool internal=false);
|
||||
void copy_output(float *foutput);
|
||||
cl_int clexec();
|
||||
@@ -130,9 +125,8 @@ class Thneed {
|
||||
// pending CL kernels
|
||||
vector<shared_ptr<CLQueuedKernel> > ckq;
|
||||
|
||||
// loading and saving
|
||||
// loading
|
||||
void load(const char *filename);
|
||||
void save(const char *filename, bool save_binaries=false);
|
||||
private:
|
||||
void clinit();
|
||||
};
|
||||
|
||||
@@ -11,6 +11,11 @@ map<pair<cl_kernel, int>, string> g_args;
|
||||
map<pair<cl_kernel, int>, int> g_args_size;
|
||||
map<cl_program, string> g_program_source;
|
||||
|
||||
void Thneed::stop() {
|
||||
printf("Thneed::stop: recorded %lu commands\n", cmds.size());
|
||||
record = false;
|
||||
}
|
||||
|
||||
void Thneed::clinit() {
|
||||
device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT);
|
||||
if (context == NULL) context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, NULL, NULL, &err));
|
||||
@@ -131,23 +136,6 @@ cl_int CLQueuedKernel::exec() {
|
||||
kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL);
|
||||
}
|
||||
|
||||
uint64_t CLQueuedKernel::benchmark() {
|
||||
uint64_t ret = 0;
|
||||
int old_record = thneed->record;
|
||||
thneed->record = 0;
|
||||
clFinish(thneed->command_queue);
|
||||
// TODO: benchmarking at a lower level will make this more accurate
|
||||
for (int i = 0; i < 10; i++) {
|
||||
uint64_t sb = nanos_since_boot();
|
||||
exec();
|
||||
clFinish(thneed->command_queue);
|
||||
uint64_t et = nanos_since_boot() - sb;
|
||||
if (ret == 0 || et < ret) ret = et;
|
||||
}
|
||||
thneed->record = old_record;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void CLQueuedKernel::debug_print(bool verbose) {
|
||||
printf("%p %56s -- ", kernel, name.c_str());
|
||||
for (int i = 0; i < work_dim; i++) {
|
||||
@@ -226,10 +214,3 @@ cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_siz
|
||||
cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
|
||||
return ret;
|
||||
}
|
||||
|
||||
cl_program thneed_clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) {
|
||||
assert(count == 1);
|
||||
cl_program ret = clCreateProgramWithSource(context, count, strings, lengths, errcode_ret);
|
||||
g_program_source[ret] = strings[0];
|
||||
return ret;
|
||||
}
|
||||
@@ -30,11 +30,3 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
|
||||
printf("model exec in %lu us\n", (te-tb)/1000);
|
||||
}
|
||||
}
|
||||
|
||||
void Thneed::stop() {
|
||||
}
|
||||
|
||||
void Thneed::find_inputs_outputs() {
|
||||
// thneed on PC doesn't work on old style inputs/outputs
|
||||
}
|
||||
|
||||
|
||||
@@ -218,39 +218,6 @@ Thneed::Thneed(bool do_clinit, cl_context _context) {
|
||||
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
|
||||
}
|
||||
|
||||
void Thneed::stop() {
|
||||
find_inputs_outputs();
|
||||
printf("Thneed::stop: recorded %lu commands\n", cmds.size());
|
||||
record = false;
|
||||
}
|
||||
|
||||
void Thneed::find_inputs_outputs() {
|
||||
cl_int err;
|
||||
if (inputs.size() > 0) return;
|
||||
|
||||
// save the global inputs/outputs
|
||||
for (auto &k : kq) {
|
||||
for (int i = 0; i < k->num_args; i++) {
|
||||
if (k->name == "zero_pad_image_float" && k->arg_names[i] == "input") {
|
||||
cl_mem aa = *(cl_mem*)(k->args[i].data());
|
||||
input_clmem.push_back(aa);
|
||||
|
||||
size_t sz;
|
||||
clGetMemObjectInfo(aa, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
|
||||
input_sizes.push_back(sz);
|
||||
|
||||
void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &err);
|
||||
assert(err == CL_SUCCESS);
|
||||
inputs.push_back(ret);
|
||||
}
|
||||
|
||||
if (k->name == "image2d_to_buffer_float" && k->arg_names[i] == "output") {
|
||||
output = *(cl_mem*)(k->args[i].data());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Thneed::wait() {
|
||||
struct kgsl_device_waittimestamp_ctxtid wait;
|
||||
wait.context_id = context_id;
|
||||
@@ -314,74 +281,3 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
|
||||
printf("model exec in %lu us\n", (te-tb)/1000);
|
||||
}
|
||||
}
|
||||
|
||||
// *********** OpenCL interceptor ***********
|
||||
|
||||
cl_int thneed_clEnqueueNDRangeKernel(cl_command_queue command_queue,
|
||||
cl_kernel kernel,
|
||||
cl_uint work_dim,
|
||||
const size_t *global_work_offset,
|
||||
const size_t *global_work_size,
|
||||
const size_t *local_work_size,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event *event_wait_list,
|
||||
cl_event *event) {
|
||||
|
||||
Thneed *thneed = g_thneed;
|
||||
|
||||
// SNPE doesn't use these
|
||||
assert(num_events_in_wait_list == 0);
|
||||
assert(global_work_offset == NULL);
|
||||
assert(event_wait_list == NULL);
|
||||
|
||||
cl_int ret = 0;
|
||||
if (thneed != NULL && thneed->record) {
|
||||
if (thneed->context == NULL) {
|
||||
thneed->command_queue = command_queue;
|
||||
clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, sizeof(thneed->context), &thneed->context, NULL);
|
||||
clGetContextInfo(thneed->context, CL_CONTEXT_DEVICES, sizeof(thneed->device_id), &thneed->device_id, NULL);
|
||||
}
|
||||
|
||||
// if we are recording, we don't actually enqueue the kernel
|
||||
thneed->kq.push_back(unique_ptr<CLQueuedKernel>(new CLQueuedKernel(thneed, kernel, work_dim, global_work_size, local_work_size)));
|
||||
*event = NULL;
|
||||
} else {
|
||||
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
|
||||
global_work_offset, global_work_size, local_work_size,
|
||||
num_events_in_wait_list, event_wait_list, event);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
cl_int thneed_clFinish(cl_command_queue command_queue) {
|
||||
Thneed *thneed = g_thneed;
|
||||
|
||||
if (thneed != NULL && thneed->record) {
|
||||
if (thneed->run_optimizer) thneed->optimize();
|
||||
return thneed->clexec();
|
||||
} else {
|
||||
return clFinish(command_queue);
|
||||
}
|
||||
}
|
||||
|
||||
void *dlsym(void *handle, const char *symbol) {
|
||||
#ifdef QCOM2
|
||||
void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen + DLSYM_OFFSET);
|
||||
#else
|
||||
#error "Unsupported platform for thneed"
|
||||
#endif
|
||||
if (memcmp("REAL_", symbol, 5) == 0) {
|
||||
return my_dlsym(handle, symbol+5);
|
||||
} else if (strcmp("clFinish", symbol) == 0) {
|
||||
return (void*)thneed_clFinish;
|
||||
} else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) {
|
||||
return (void*)thneed_clEnqueueNDRangeKernel;
|
||||
} else if (strcmp("clSetKernelArg", symbol) == 0) {
|
||||
return (void*)thneed_clSetKernelArg;
|
||||
} else if (strcmp("clCreateProgramWithSource", symbol) == 0) {
|
||||
return (void*)thneed_clCreateProgramWithSource;
|
||||
} else {
|
||||
return my_dlsym(handle, symbol);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,146 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import struct
|
||||
import zipfile
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
from common.basedir import BASEDIR
|
||||
from selfdrive.modeld.thneed.lib import load_thneed, save_thneed
|
||||
|
||||
# this is junk code, but it doesn't have deps
|
||||
def load_dlc_weights(fn):
|
||||
archive = zipfile.ZipFile(fn, 'r')
|
||||
dlc_params = archive.read("model.params")
|
||||
|
||||
def extract(rdat):
|
||||
idx = rdat.find(b"\x00\x00\x00\x09\x04\x00\x00\x00")
|
||||
rdat = rdat[idx+8:]
|
||||
ll = struct.unpack("I", rdat[0:4])[0]
|
||||
buf = np.frombuffer(rdat[4:4+ll*4], dtype=np.float32)
|
||||
rdat = rdat[4+ll*4:]
|
||||
dims = struct.unpack("I", rdat[0:4])[0]
|
||||
buf = buf.reshape(struct.unpack("I"*dims, rdat[4:4+dims*4]))
|
||||
if len(buf.shape) == 4:
|
||||
buf = np.transpose(buf, (3,2,0,1))
|
||||
return buf
|
||||
|
||||
def parse(tdat):
|
||||
ll = struct.unpack("I", tdat[0:4])[0] + 4
|
||||
return (None, [extract(tdat[0:]), extract(tdat[ll:])])
|
||||
|
||||
ptr = 0x20
|
||||
def r4():
|
||||
nonlocal ptr
|
||||
ret = struct.unpack("I", dlc_params[ptr:ptr+4])[0]
|
||||
ptr += 4
|
||||
return ret
|
||||
ranges = []
|
||||
cnt = r4()
|
||||
for _ in range(cnt):
|
||||
o = r4() + ptr
|
||||
# the header is 0xC
|
||||
plen, is_4, is_2 = struct.unpack("III", dlc_params[o:o+0xC])
|
||||
assert is_4 == 4 and is_2 == 2
|
||||
ranges.append((o+0xC, o+plen+0xC))
|
||||
ranges = sorted(ranges, reverse=True)
|
||||
|
||||
return [parse(dlc_params[s:e]) for s,e in ranges]
|
||||
|
||||
# this won't run on device without onnx
|
||||
def load_onnx_weights(fn):
|
||||
import onnx
|
||||
from onnx import numpy_helper
|
||||
|
||||
model = onnx.load(fn)
|
||||
graph = model.graph # pylint: disable=maybe-no-member
|
||||
init = {x.name:x for x in graph.initializer}
|
||||
|
||||
onnx_layers = []
|
||||
for node in graph.node:
|
||||
#print(node.name, node.op_type, node.input, node.output)
|
||||
vals = []
|
||||
for inp in node.input:
|
||||
if inp in init:
|
||||
vals.append(numpy_helper.to_array(init[inp]))
|
||||
if len(vals) > 0:
|
||||
onnx_layers.append((node.name, vals))
|
||||
return onnx_layers
|
||||
|
||||
def weights_fixup(target, source_thneed, dlc):
|
||||
#onnx_layers = load_onnx_weights(os.path.join(BASEDIR, "models/supercombo.onnx"))
|
||||
onnx_layers = load_dlc_weights(dlc)
|
||||
jdat = load_thneed(source_thneed)
|
||||
|
||||
bufs = {}
|
||||
for o in jdat['objects']:
|
||||
bufs[o['id']] = o
|
||||
|
||||
thneed_layers = []
|
||||
for k in jdat['kernels']:
|
||||
#print(k['name'])
|
||||
vals = []
|
||||
for a in k['args']:
|
||||
if a in bufs:
|
||||
o = bufs[a]
|
||||
if o['needs_load'] or ('buffer_id' in o and bufs[o['buffer_id']]['needs_load']):
|
||||
#print(" ", o['arg_type'])
|
||||
vals.append(o)
|
||||
if len(vals) > 0:
|
||||
thneed_layers.append((k['name'], vals))
|
||||
|
||||
assert len(thneed_layers) == len(onnx_layers)
|
||||
|
||||
# fix up weights
|
||||
for tl, ol in tqdm(zip(thneed_layers, onnx_layers), total=len(thneed_layers)):
|
||||
#print(tl[0], ol[0])
|
||||
assert len(tl[1]) == len(ol[1])
|
||||
for o, onnx_weight in zip(tl[1], ol[1]):
|
||||
if o['arg_type'] == "image2d_t":
|
||||
obuf = bufs[o['buffer_id']]
|
||||
saved_weights = np.frombuffer(obuf['data'], dtype=np.float16).reshape(o['height'], o['row_pitch']//2)
|
||||
|
||||
if len(onnx_weight.shape) == 4:
|
||||
# convolution
|
||||
oc,ic,ch,cw = onnx_weight.shape
|
||||
|
||||
if 'depthwise' in tl[0]:
|
||||
assert ic == 1
|
||||
weights = np.transpose(onnx_weight.reshape(oc//4,4,ch,cw), (0,2,3,1)).reshape(o['height'], o['width']*4)
|
||||
else:
|
||||
weights = np.transpose(onnx_weight.reshape(oc//4,4,ic//4,4,ch,cw), (0,4,2,5,1,3)).reshape(o['height'], o['width']*4)
|
||||
else:
|
||||
# fc_Wtx
|
||||
weights = onnx_weight
|
||||
|
||||
new_weights = np.zeros((o['height'], o['row_pitch']//2), dtype=np.float32)
|
||||
new_weights[:, :weights.shape[1]] = weights
|
||||
|
||||
# weights shouldn't be too far off
|
||||
err = np.mean((saved_weights.astype(np.float32) - new_weights)**2)
|
||||
assert err < 1e-3
|
||||
rerr = np.mean(np.abs((saved_weights.astype(np.float32) - new_weights)/(new_weights+1e-12)))
|
||||
assert rerr < 0.5
|
||||
|
||||
# fix should improve things
|
||||
fixed_err = np.mean((new_weights.astype(np.float16).astype(np.float32) - new_weights)**2)
|
||||
assert (err/fixed_err) >= 1
|
||||
|
||||
#print(" ", o['size'], onnx_weight.shape, o['row_pitch'], o['width'], o['height'], "err %.2fx better" % (err/fixed_err))
|
||||
|
||||
obuf['data'] = new_weights.astype(np.float16).tobytes()
|
||||
|
||||
elif o['arg_type'] == "float*":
|
||||
# unconverted floats are correct
|
||||
new_weights = np.zeros(o['size']//4, dtype=np.float32)
|
||||
new_weights[:onnx_weight.shape[0]] = onnx_weight
|
||||
assert new_weights.tobytes() == o['data']
|
||||
#print(" ", o['size'], onnx_weight.shape)
|
||||
|
||||
save_thneed(jdat, target)
|
||||
|
||||
if __name__ == "__main__":
|
||||
model_dir = os.path.join(BASEDIR, "selfdrive/modeld/models/")
|
||||
weights_fixup(os.path.join(model_dir, "supercombo_fixed.thneed"),
|
||||
os.path.join(model_dir, "supercombo.thneed"),
|
||||
os.path.join(model_dir, "supercombo.dlc"))
|
||||
Reference in New Issue
Block a user