tinygrad/extra/accel/ane/README.old

kernel driver: AppleH11ANEInterface
  requires entitlement: com.apple.ane.iokit-user-access
	compiler is run in ANE_ProgramCreate_gated

2 helper processes:
  /usr/libexec/aned
  ANECompilerService

Espresso:
	Contains ANECompilerEngine

AppleNeuralEngine: Objective-C interface called by Espresso
  ANEServices: communication with the device
  ANECompiler: compile plist into hwx file
		com.apple.ANECompilerService.allow in AppleNeuralEngine?
		Called from ANECompilerService.xpc in AppleNeuralEngine.framework

== Model Flow ==

    Keras/ONNX model
          |
          |   1_build
          | (coremltools, open source)
          v
      CoreML model
          |
          |   TODO: automate this
          |   Grabbed plist by attaching lldb to ANECompilerService during 1_build
          | (Espresso)
          v
       net.plist
          |
          |   2_compile
          | (AppleNeuralEngine, ANECompiler)
          v
       model.hwx
          |
          |   3_run
          | (AppleNeuralEngine, ANEServices, AppleH11ANEInterface)
          v
 <run on neural engine>

TODO: Write a nice plist grabber
DONE: Write a call to the compiler with plist+weights
DONE: Write an hwx runner

== Tracing the Compiler ==

ANECCompileProcedure
  ZinAneCreateIr
    ZinParseNeuronUnit
  ZinAneCoreCompile
    ZinAneCodeGeneration
      ZinIrCodegenHandleKernels
      ZinIrTargetH13::CodegenTds
        ZinIrCacheHintTable
        ZinIrCodegenHandleTds_v7
          ZinIrCodegenHandleTdsMakeList<7u>
            ZinAneInstruction
            ZinAneTd<7u>::HandleEngineLayer
              ZinAneInstruction::HandleTdHeader
              HandleNELayer<7u>
                ZinAneInstruction::HandleCommonConfig
                  ZinAneInstruction::HandleCommonConfigCommonOpcodes
        ZinIrCodegenHandleTds<7u>
          0x1bb93ae00 <-- this is the store of the first byte in the hwx
          CalculateSizeInBytesFromRegCount (x*4+4)
            0xf   @ 0x128-0x168  (base 0x1003047b0)
            0x1b  @ 0x16c-0x1dc
            0x11  @ 0x1e0-0x228
            0x3   @ 0x22c-0x23c
            0x4   @ 0x240-0x254
            0x6   @ 0x258-0x274(end)
          AddReloc (this is gold! x4 goes in the hwx)
          ZinAneTd<7u>::HandleEngineLayer

rbreak ^ZinAneInstruction*

weeee ZinIrRegBitPrintOutDebug_7u_
print (void)debugregs(0, 0x0000000100211030+8, 3)

== min.plist ==

Types: GOC, Conv, Broadcast, ScaledElementWise, Reshape, InputView, Neuron, Concat


Ops have length 0x300; it seems like one basic op is repeated

header 0x0-0x1c

u32 0x1c = next op offset
u16 0x20 = output address?

== section break 0x2c (weights) ==
reloc 0x2c-0x74 = K2DBE6976FEB616E6867A2E3853FC37D0F101C4C51BA4A80C103359643338C0C1_ne_0
                  K2DBE6976FEB616E6867A2E3853FC37D0F101C4C51BA4A80C103359643338C0C1_ne_1

16 output channels in parallel:
u32[16] 0x34-0x74 = 0x80 | 1 if used
u32[16] 0x74-0xB4 = <channel data offset>
u32[16] 0xB4-0xF4 = <channel data length>

== section break 0x128 (conv) ==
u16 0x128 = InputWidth
u16 0x12A = InputHeight
u16 0x12C = InputDepth

u32 0x130 = (OutputType * 0x10) | InputType

u32 0x134 = InputChannels
u32 0x138 = OutputChannels

u16 0x13C = OutputWidth
u16 0x13E = OutputHeight
u16 0x140 = OutputDepth

u16 0x144 = 0xa000 | (KernelHeight * 0x20) | KernelWidth
u16 0x146 = 0x5000 | (PadTop * 0x40) | (PadLeft * 2)

u16 0x14C = BatchSize
u32 0x150 = OutputHeight?

== section break 0x16c (input) ==
reloc 0x16c-0x174 = image

u32 0x178 = InputRowStride
u32 0x17C = InputPlaneStride
u32 0x180 = InputDepthStride
u32 0x184 = InputBatchStride

u8 0x1A7 = InputInterleave

== section break 0x1e0 ==
u8 0x1E5 = InputInterleave

u32 0x1F4 = InputChannels * 0x10
u32 0x1F8 = InputDepth * InputChannels * 0x10

u8 0x211 = OutputInterleave

u32 0x220 = OutputChannels * 0x10
u32 0x224 = OutputDepth * OutputChannels * 0x10

== section break 0x22c (scaling) ==
u16 0x230 = BiasScalar
u16 0x232 = ScaleScalar

== section break 0x240 ==
u8 0x240 = 0x80 | KernelType
u8 0x241 = 4 * hasbatch
u16 0x246 = 0x10 | 2 * neuron?

== section break 0x258 (output) ==
reloc 0x258-0x25c = probs@output/src

u32 0x260 = OutputRowStride
u32 0x264 = OutputPlaneStride
u32 0x268 = OutputDepthStride
u32 0x26C = OutputBatchStride

u8 0x273 = OutputInterleave

== Zin Constants ==

kZinIrOpCodeConv = 0?
kZinIrOpCodePool = 1
kZinIrOpCodeElementWiseOp = 6
kZinIrOpCodeConcat = 8
kZinIrOpCodeFlattenComposite
kZinIrOpCodeNEConvOp
kZinIrOpCodeTranspose

 0: CONV
 1: POOL
 2: SCALE_BIAS
 3: TERNARY_DYNAMIC_GOC
 4: BINARY_DYNAMIC_GOC
 5: ACTIVATION
 6: EW
 7: SCALED_EW
 8: CONCAT
 9: SPLIT
10: COPY
11: FLATTEN
12: UNFLATTEN
13: CROSS_CORRELATION
14: KERNEL_RASTERIZER
15: ARG_MIN_MAX
16: MATRIX_MULT
17: BROADCAST
18: FLATTEN_COMPOSITE
19: UNFLATTEN_COMPOSITE
20: KERNEL_RASTERIZER_COMPOSITE
21: CROSS_CORRELATION_COMPOSITE
22: LIVE_IN
23: CONST_IN
24: LIVE_OUT
25: REDUCTION
26: ALIAS
27: Typecast
28: RESHAPE
29: VIEW
30: TRANSPOSE
31: SPACE_TO_BATCH
32: BATCH_TO_SPACE
33: SOFTMAX
34: INSTANCE_NORM
35: L2_NORM
36: MINMAX_NORM
37: LRN
38: COST_VOLUME
39: PIXEL_SHUFFLE
40: FPS
41: RS
42: PEFUSED_ELEMENTWISE
43: PEFUSED_POOL
44: PEFUSED_GOC
45: NEFUSED_CONV
46: NEFUSED_POOL
47: NEFUSED_EW
48: NEFUSED_BYPASS

# guessing from the hwx
kZinTensorFormatUInt8 = 0
kZinTensorFormatInt8 = 1
kZinTensorFormatFloat16 = 2
kZinTensorFormatInvalid

Zin (plist format) ---(ZinAneCoreCompile)---> Mir (hwx format)?
  ZinAneCodeGeneration?

ZinIrStatus GetKernelFormat<6u>(ZinKernelFormat param_1,ane_ne_kernel_cfg_kernel_fmt *param_2)
  List of allowed numbers

NeuronTypes (changes the LUT):
  Tanh
  Log2
  Exp2
  Sign = ZinMirActivationV7::GetSignLut
  ...many more in ANECompiler

Investigate:
  ZinMirActivationV7::PrintLut(ZinMirActivationV7 *this,ane_nonlinear_lut_v7up_t *param_1)

 0: NONE
 1: RELU
 2: SIGMOID
 3: SIGMOID_HIGH_PRECISION
 4: TANH
 5: CLAMPED_RELU
 6: PRELU
 7: DIRAC
 8: INT
 9: FRAC
10: SQRT
11: RSQRT
12: INV
13: SQR
14: LOG2
15: EXP2
16: ELU
17: SIGN
18: EQUAL_ZERO
19: NON_ZERO
20: LESS_THAN_ZERO
21: LESS_EQUAL_ZERO
22: GREATER_EQUAL_ZERO
23: GREATER_THAN_ZERO
24: CUSTOM_LUT
25: C_DIM_CONCAT
26: C_DIM_STRIDED_CONCAT
27: H_DIM_CONCAT
28: W_DIM_CONCAT
29: D_DIM_CONCAT
30: N_DIM_CONCAT
31: H_DIM_STRIDED_CONCAT

CacheHint
0: ALLOC
1: NOALLOC
2: DROP
3: DEPRI

conv kinds
0: regular
1: channelwise
2: grouped

== plist exploration ==

Float16 -> UInt8 for output works, Float32 doesn't
Same for input
All weights must be float

Index 0: model.espresso.weights @ 192 is weights
Index 1: net.additional.weights @ 0 is bias

Float16 -> Float32 for bias works

It's possible the compiler converts Float32 -> Float16, and the engine only supports Float16 + UInt8

== call to the compiler (in dmesg!) ==

[54476.282258]: H11ANEIn: ANE_ProgramCreate_gated:, ZinComputeProgramMake, get Mcache size: 0x0
[54476.282259]: H11ANEIn: ANE_ProgramCreate_gated:,Program Identifier:ANEC v1
zin_ane_compiler v4.2.1
	-t h13
	--fdram-allocator=ffreuse
	--fdram-tensor-priority=sizethenliverange
	--fl2-allocator=ffreuse
	--fl3-allocator=ffreuse
	--fl2-cache-mode=resident
	--fsignature=ident
	--memcache-strategy=
[54476.282262]: 	--memcache-size=4194304
	--fspatial-split=disabled
	--fkernel-rewind=enabled
	--Wl-undefined=fvmlib
	-i /Library/Caches/com.apple.aned/tmp/Python/DB7E897E7F4D5D27501A998428B6D3863AFD96CEA82DAF2207A75394E6BAC44C/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/net.plist
	-o /Library/Caches/com.apple.aned/20A2411/Python/C9981871BC59572E74AFA3014B183EA37567EE9A2A08328446CE4A2B754E109D/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/model.hwx.tmp

== ANECCompile (in ANECompiler framework) ==
  ANECCompile(__CFDictionary *param_1, __CFDictionary *param_2, unsigned long param_3)

param_1:
{
    InputNetworks =     (
                {
            NetworkPlistName = "net.plist";
            NetworkPlistPath = "/Library/Caches/com.apple.aned/tmp/run/A2ACB9D5AA31B301563A4F62885BA379E62B0E1240E95C6902A93900FE0A9B54/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/";
        }
    );
    OutputFileName = "model.hwx.tmp";
    OutputFilePath = "/Library/Caches/com.apple.aned/20A2411/run/E68910CD1994681121EEDAFAE1BC524AA8E84CF80C42AFC0C7DE2C082C58BDFD/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/";
}

param_2:
{
    TargetArchitecture = h13;
}

== Backtrace of device open ==

  * frame #0: 0x00000001a68fac54 ANEServices`H11ANEDeviceOpen
    frame #1: 0x00000001a78405b8 AppleNeuralEngine`__29-[_ANEDeviceController start]_block_invoke + 436
    frame #2: 0x0000000193c84420 libdispatch.dylib`_dispatch_client_callout + 20
    frame #3: 0x0000000193c92a98 libdispatch.dylib`_dispatch_lane_barrier_sync_invoke_and_complete + 60
    frame #4: 0x00000001a78403e8 AppleNeuralEngine`-[_ANEDeviceController start] + 136
    ...
    frame #23: 0x00000001a64a4f38 Espresso`Espresso::ANERuntimeEngine::compiler::build_segment(std::__1::shared_ptr<Espresso::abstract_batch> const&, int, Espresso::net_compiler_segment_based::segment_t const&) + 2080
    ...
    frame #31: 0x000000019ab6099c CoreML`-[MLNeuralNetworkEngine rebuildPlan:] + 1640

== Backtrace of run? ==

  * frame #0: 0x00000001a68f9108 ANEServices`H11ANEProgramProcessRequestDirect
    frame #1: 0x00000001a7839694 AppleNeuralEngine`-[_ANEProgramForEvaluation processRequest:qos:qIndex:modelStringID:options:error:] + 1904
    frame #2: 0x00000001a7843ba4 AppleNeuralEngine`-[_ANEClient doEvaluateDirectWithModel:options:request:qos:error:] + 1236
    frame #3: 0x00000001a7842034 AppleNeuralEngine`-[_ANEClient evaluateWithModel:options:request:qos:error:] + 104
    frame #4: 0x00000001a64a2988 Espresso`Espresso::ANERuntimeEngine::compiler::__forward_segment(std::__1::shared_ptr<Espresso::abstract_batch> const&, int, Espresso::net_compiler_segment_based::segment_t const&) + 2008
    frame #5: 0x00000001a6414548 Espresso`Espresso::net_compiler_segment_based::__forward(std::__1::shared_ptr<Espresso::abstract_batch> const&) + 992
    frame #6: 0x00000001a67e2e3c Espresso`EspressoLight::espresso_plan::dispatch_task_on_compute_batch(std::__1::shared_ptr<Espresso::abstract_batch> const&, std::__1::shared_ptr<EspressoLight::plan_task_t> const&) + 612
    frame #7: 0x00000001a67ebab0 Espresso`EspressoLight::espresso_plan::execute_sync() + 356
    frame #8: 0x00000001a67f26fc Espresso`espresso_plan_execute_sync + 120
    frame #9: 0x000000019ab674b8 CoreML`-[MLNeuralNetworkEngine executePlan:error:] + 136
    frame #10: 0x000000019ab6799c CoreML`-[MLNeuralNetworkEngine evaluateInputs:bufferIndex:options:error:] + 368