mirror of https://github.com/commaai/tinygrad.git
368 lines
10 KiB
Plaintext
368 lines
10 KiB
Plaintext
kernel driver: AppleH11ANEInterface
|
|
requires entitlement: com.apple.ane.iokit-user-access
|
|
compiler is run in ANE_ProgramCreate_gated
|
|
|
|
2 helper processes:
|
|
/usr/libexec/aned
|
|
ANECompilerService
|
|
|
|
Espresso:
|
|
Contains ANECompilerEngine
|
|
|
|
AppleNeuralEngine: Objective-C interface called by Espresso
|
|
ANEServices: communication with the device
|
|
ANECompiler: compile plist into hwx file
|
|
com.apple.ANECompilerService.allow in AppleNeuralEngine?
|
|
Called from ANECompilerService.xpc in AppleNeuralEngine.framework
|
|
|
|
== Model Flow ==
|
|
|
|
Keras/ONNX model
|
|
|
|
|
| 1_build
|
|
| (coremltools, open source)
|
|
v
|
|
CoreML model
|
|
|
|
|
| TODO: automate this
|
|
| Grabbed plist by attaching lldb to ANECompilerService during 1_build
|
|
| (Espresso)
|
|
v
|
|
net.plist
|
|
|
|
|
| 2_compile
|
|
| (AppleNeuralEngine, ANECompiler)
|
|
v
|
|
model.hwx
|
|
|
|
|
| 3_run
|
|
| (AppleNeuralEngine, ANEServices, AppleH11ANEInterface)
|
|
v
|
|
<run on neural engine>
|
|
|
|
TODO: Write a nice plist grabber
|
|
DONE: Write a call to the compiler with plist+weights
|
|
DONE: Write an hwx runner
|
|
|
|
== Tracing the Compiler ==
|
|
|
|
ANECCompileProcedure
|
|
ZinAneCreateIr
|
|
ZinParseNeuronUnit
|
|
ZinAneCoreCompile
|
|
ZinAneCodeGeneration
|
|
ZinIrCodegenHandleKernels
|
|
ZinIrTargetH13::CodegenTds
|
|
ZinIrCacheHintTable
|
|
ZinIrCodegenHandleTds_v7
|
|
ZinIrCodegenHandleTdsMakeList<7u>
|
|
ZinAneInstruction
|
|
ZinAneTd<7u>::HandleEngineLayer
|
|
ZinAneInstruction::HandleTdHeader
|
|
HandleNELayer<7u>
|
|
ZinAneInstruction::HandleCommonConfig
|
|
ZinAneInstruction::HandleCommonConfigCommonOpcodes
|
|
ZinIrCodegenHandleTds<7u>
|
|
0x1bb93ae00 <-- this is the store of the first byte in the hwx
|
|
CalculateSizeInBytesFromRegCount (x*4+4)
|
|
0xf @ 0x128-0x168 (base 0x1003047b0)
|
|
0x1b @ 0x16c-0x1dc
|
|
0x11 @ 0x1e0-0x228
|
|
0x3 @ 0x22c-0x23c
|
|
0x4 @ 0x240-0x254
|
|
0x6 @ 0x258-0x274(end)
|
|
AddReloc (this is gold! x4 goes in the hwx)
|
|
ZinAneTd<7u>::HandleEngineLayer
|
|
|
|
rbreak ^ZinAneInstruction*
|
|
|
|
weeee ZinIrRegBitPrintOutDebug_7u_
|
|
print (void)debugregs(0, 0x0000000100211030+8, 3)
|
|
|
|
== min.plist ==
|
|
|
|
Types: GOC, Conv, Broadcast, ScaledElementWise, Reshape, InputView, Neuron, Concat
|
|
|
|
|
|
ops have length 0x300, seems like one basic op repeated
|
|
|
|
header 0x0-0x1c
|
|
|
|
u32 0x1c = next op offset
|
|
u16 0x20 = output address?
|
|
|
|
== section break 0x2c (weights) ==
|
|
reloc 0x2c-0x74 = K2DBE6976FEB616E6867A2E3853FC37D0F101C4C51BA4A80C103359643338C0C1_ne_0
|
|
K2DBE6976FEB616E6867A2E3853FC37D0F101C4C51BA4A80C103359643338C0C1_ne_1
|
|
|
|
16 output channel parallel:
|
|
u32[16] 0x34-0x74 = 0x80 | 1 if used
|
|
u32[16] 0x74-0xB4 = <channel data offset>
|
|
u32[16] 0xB4-0xF4 = <channel data length>
|
|
|
|
== section break 0x128 (conv) ==
|
|
u16 0x128 = InputWidth
|
|
u16 0x12A = InputHeight
|
|
u16 0x12C = InputDepth
|
|
|
|
u32 0x130 = (OutputType * 0x10) | InputType
|
|
|
|
u32 0x134 = InputChannels
|
|
u32 0x138 = OutputChannels
|
|
|
|
u16 0x13C = OutputWidth
|
|
u16 0x13E = OutputHeight
|
|
u16 0x140 = OutputDepth
|
|
|
|
u16 0x144 = 0xa000 | (KernelHeight * 0x20) | KernelWidth
|
|
u16 0x146 = 0x5000 | (PadTop * 0x40) | (PadLeft * 2)
|
|
|
|
u16 0x14C = BatchSize
|
|
u32 0x150 = OutputHeight?
|
|
|
|
== section break 0x16c (input) ==
|
|
reloc 0x16c-0x174 = image
|
|
|
|
u32 0x178 = InputRowStride
|
|
u32 0x17C = InputPlaneStride
|
|
u32 0x180 = InputDepthStride
|
|
u32 0x184 = InputBatchStride
|
|
|
|
u8 0x1A7 = InputInterleave
|
|
|
|
== section break 0x1e0 ==
|
|
u8 0x1E5 = InputInterleave
|
|
|
|
u32 0x1F4 = InputChannels * 0x10
|
|
u32 0x1F8 = InputDepth * InputChannels * 0x10
|
|
|
|
u8 0x211 = OutputInterleave
|
|
|
|
u32 0x220 = OutputChannels * 0x10
|
|
u32 0x224 = OutputDepth * OutputChannels * 0x10
|
|
|
|
== section break 0x22c (scaling) ==
|
|
u16 0x230 = BiasScalar
|
|
u16 0x232 = ScaleScalar
|
|
|
|
== section break 0x240 ==
|
|
u8 0x240 = 0x80 | KernelType
|
|
u8 0x241 = 4 * hasbatch
|
|
u16 0x246 = 0x10 | 2 * neuron?
|
|
|
|
== section break 0x258 (output) ==
|
|
reloc 0x258-0x25c = probs@output/src
|
|
|
|
u32 0x260 = OutputRowStride
|
|
u32 0x264 = OutputPlaneStride
|
|
u32 0x268 = OutputDepthStride
|
|
u32 0x26C = OutputBatchStride
|
|
|
|
u8 0x273 = OutputInterleave
|
|
|
|
== Zin Constants ==
|
|
|
|
kZinIrOpCodeConv = 0?
|
|
kZinIrOpCodePool = 1
|
|
kZinIrOpCodeElementWiseOp = 6
|
|
kZinIrOpCodeConcat = 8
|
|
kZinIrOpCodeFlattenComposite
|
|
kZinIrOpCodeNEConvOp
|
|
kZinIrOpCodeTranspose
|
|
|
|
0: CONV
|
|
1: POOL
|
|
2: SCALE_BIAS
|
|
3: TERNARY_DYNAMIC_GOC
|
|
4: BINARY_DYNAMIC_GOC
|
|
5: ACTIVATION
|
|
6: EW
|
|
7: SCALED_EW
|
|
8: CONCAT
|
|
9: SPLIT
|
|
10: COPY
|
|
11: FLATTEN
|
|
12: UNFLATTEN
|
|
13: CROSS_CORRELATION
|
|
14: KERNEL_RASTERIZER
|
|
15: ARG_MIN_MAX
|
|
16: MATRIX_MULT
|
|
17: BROADCAST
|
|
18: FLATTEN_COMPOSITE
|
|
19: UNFLATTEN_COMPOSITE
|
|
20: KERNEL_RASTERIZER_COMPOSITE
|
|
21: CROSS_CORRELATION_COMPOSITE
|
|
22: LIVE_IN
|
|
23: CONST_IN
|
|
24: LIVE_OUT
|
|
25: REDUCTION
|
|
26: ALIAS
|
|
27: Typecast
|
|
28: RESHAPE
|
|
29: VIEW
|
|
30: TRANSPOSE
|
|
31: SPACE_TO_BATCH
|
|
32: BATCH_TO_SPACE
|
|
33: SOFTMAX
|
|
34: INSTANCE_NORM
|
|
35: L2_NORM
|
|
36: MINMAX_NORM
|
|
37: LRN
|
|
38: COST_VOLUME
|
|
39: PIXEL_SHUFFLE
|
|
40: FPS
|
|
41: RS
|
|
42: PEFUSED_ELEMENTWISE
|
|
43: PEFUSED_POOL
|
|
44: PEFUSED_GOC
|
|
45: NEFUSED_CONV
|
|
46: NEFUSED_POOL
|
|
47: NEFUSED_EW
|
|
48: NEFUSED_BYPASS
|
|
|
|
# guessing from the hwx
|
|
kZinTensorFormatUInt8 = 0
|
|
kZinTensorFormatInt8 = 1
|
|
kZinTensorFormatFloat16 = 2
|
|
kZinTensorFormatInvalid
|
|
|
|
Zin (plist format) ---(ZinAneCoreCompile)---> Mir (hwx format)?
|
|
ZinAneCodeGeneration?
|
|
|
|
ZinIrStatus GetKernelFormat<6u>(ZinKernelFormat param_1,ane_ne_kernel_cfg_kernel_fmt *param_2)
|
|
List of allowed numbers
|
|
|
|
NeuronTypes (changes the LUT):
|
|
Tanh
|
|
Log2
|
|
Exp2
|
|
Sign = ZinMirActivationV7::GetSignLut
|
|
...many more in ANECompiler
|
|
|
|
Investigate:
|
|
ZinMirActivationV7::PrintLut(ZinMirActivationV7 *this,ane_nonlinear_lut_v7up_t *param_1)
|
|
|
|
0: NONE
|
|
1: RELU
|
|
2: SIGMOID
|
|
3: SIGMOID_HIGH_PRECISION
|
|
4: TANH
|
|
5: CLAMPED_RELU
|
|
6: PRELU
|
|
7: DIRAC
|
|
8: INT
|
|
9: FRAC
|
|
10: SQRT
|
|
11: RSQRT
|
|
12: INV
|
|
13: SQR
|
|
14: LOG2
|
|
15: EXP2
|
|
16: ELU
|
|
17: SIGN
|
|
18: EQUAL_ZERO
|
|
19: NON_ZERO
|
|
20: LESS_THAN_ZERO
|
|
21: LESS_EQUAL_ZERO
|
|
22: GREATER_EQUAL_ZERO
|
|
23: GREATER_THAN_ZERO
|
|
24: CUSTOM_LUT
|
|
25: C_DIM_CONCAT
|
|
26: C_DIM_STRIDED_CONCAT
|
|
27: H_DIM_CONCAT
|
|
28: W_DIM_CONCAT
|
|
29: D_DIM_CONCAT
|
|
30: N_DIM_CONCAT
|
|
31: H_DIM_STRIDED_CONCAT
|
|
|
|
CacheHint
|
|
0: ALLOC
|
|
1: NOALLOC
|
|
2: DROP
|
|
3: DEPRI
|
|
|
|
conv kinds
|
|
0: regular
|
|
1: channelwise
|
|
2: grouped
|
|
|
|
== plist exploration ==
|
|
|
|
Float16 -> UInt8 for output works, Float32 doesn't
|
|
Same for input
|
|
All weights must be float
|
|
|
|
Index 0: model.espresso.weights @ 192 is weights
|
|
Index 1: net.additional.weights @ 0 is bias
|
|
|
|
Float16 -> Float32 for bias works
|
|
|
|
It's possible the compiler converts Float32 -> Float16, and the engine itself only supports Float16 and UInt8
|
|
|
|
== call to the compiler (in dmesg!) ==
|
|
|
|
[54476.282258]: H11ANEIn: ANE_ProgramCreate_gated:, ZinComputeProgramMake, get Mcache size: 0x0
|
|
[54476.282259]: H11ANEIn: ANE_ProgramCreate_gated:,Program Identifier:ANEC v1
|
|
zin_ane_compiler v4.2.1
|
|
-t h13
|
|
--fdram-allocator=ffreuse
|
|
--fdram-tensor-priority=sizethenliverange
|
|
--fl2-allocator=ffreuse
|
|
--fl3-allocator=ffreuse
|
|
--fl2-cache-mode=resident
|
|
--fsignature=ident
|
|
--memcache-strategy=
|
|
[54476.282262]: --memcache-size=4194304
|
|
--fspatial-split=disabled
|
|
--fkernel-rewind=enabled
|
|
--Wl-undefined=fvmlib
|
|
-i /Library/Caches/com.apple.aned/tmp/Python/DB7E897E7F4D5D27501A998428B6D3863AFD96CEA82DAF2207A75394E6BAC44C/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/net.plist
|
|
-o /Library/Caches/com.apple.aned/20A2411/Python/C9981871BC59572E74AFA3014B183EA37567EE9A2A08328446CE4A2B754E109D/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/model.hwx.tmp
|
|
|
|
== ANECCompile (in ANECompiler framework) ==
|
|
ANECCompile(__CFDictionary *param_1, __CFDictionary *param_2, unsigned long param_3)
|
|
|
|
param_1:
|
|
{
|
|
InputNetworks = (
|
|
{
|
|
NetworkPlistName = "net.plist";
|
|
NetworkPlistPath = "/Library/Caches/com.apple.aned/tmp/run/A2ACB9D5AA31B301563A4F62885BA379E62B0E1240E95C6902A93900FE0A9B54/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/";
|
|
}
|
|
);
|
|
OutputFileName = "model.hwx.tmp";
|
|
OutputFilePath = "/Library/Caches/com.apple.aned/20A2411/run/E68910CD1994681121EEDAFAE1BC524AA8E84CF80C42AFC0C7DE2C082C58BDFD/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/";
|
|
}
|
|
|
|
param_2:
|
|
{
|
|
TargetArchitecture = h13;
|
|
}
|
|
|
|
== Backtrace of device open ==
|
|
|
|
* frame #0: 0x00000001a68fac54 ANEServices`H11ANEDeviceOpen
|
|
frame #1: 0x00000001a78405b8 AppleNeuralEngine`__29-[_ANEDeviceController start]_block_invoke + 436
|
|
frame #2: 0x0000000193c84420 libdispatch.dylib`_dispatch_client_callout + 20
|
|
frame #3: 0x0000000193c92a98 libdispatch.dylib`_dispatch_lane_barrier_sync_invoke_and_complete + 60
|
|
frame #4: 0x00000001a78403e8 AppleNeuralEngine`-[_ANEDeviceController start] + 136
|
|
...
|
|
frame #23: 0x00000001a64a4f38 Espresso`Espresso::ANERuntimeEngine::compiler::build_segment(std::__1::shared_ptr<Espresso::abstract_batch> const&, int, Espresso::net_compiler_segment_based::segment_t const&) + 2080
|
|
...
|
|
frame #31: 0x000000019ab6099c CoreML`-[MLNeuralNetworkEngine rebuildPlan:] + 1640
|
|
|
|
== Backtrace of run? ==
|
|
|
|
* frame #0: 0x00000001a68f9108 ANEServices`H11ANEProgramProcessRequestDirect
|
|
frame #1: 0x00000001a7839694 AppleNeuralEngine`-[_ANEProgramForEvaluation processRequest:qos:qIndex:modelStringID:options:error:] + 1904
|
|
frame #2: 0x00000001a7843ba4 AppleNeuralEngine`-[_ANEClient doEvaluateDirectWithModel:options:request:qos:error:] + 1236
|
|
frame #3: 0x00000001a7842034 AppleNeuralEngine`-[_ANEClient evaluateWithModel:options:request:qos:error:] + 104
|
|
frame #4: 0x00000001a64a2988 Espresso`Espresso::ANERuntimeEngine::compiler::__forward_segment(std::__1::shared_ptr<Espresso::abstract_batch> const&, int, Espresso::net_compiler_segment_based::segment_t const&) + 2008
|
|
frame #5: 0x00000001a6414548 Espresso`Espresso::net_compiler_segment_based::__forward(std::__1::shared_ptr<Espresso::abstract_batch> const&) + 992
|
|
frame #6: 0x00000001a67e2e3c Espresso`EspressoLight::espresso_plan::dispatch_task_on_compute_batch(std::__1::shared_ptr<Espresso::abstract_batch> const&, std::__1::shared_ptr<EspressoLight::plan_task_t> const&) + 612
|
|
frame #7: 0x00000001a67ebab0 Espresso`EspressoLight::espresso_plan::execute_sync() + 356
|
|
frame #8: 0x00000001a67f26fc Espresso`espresso_plan_execute_sync + 120
|
|
frame #9: 0x000000019ab674b8 CoreML`-[MLNeuralNetworkEngine executePlan:error:] + 136
|
|
frame #10: 0x000000019ab6799c CoreML`-[MLNeuralNetworkEngine evaluateInputs:bufferIndex:options:error:] + 368
|
|
|