From bdfdbc8f8dd91379a63aea366ff0ecc8aec082e8 Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Sat, 13 Aug 2022 10:41:25 +0200
Subject: [PATCH] broken amfi patch

---
 README.md                   |   4 +-
 accel/ane/README.md         |  14 +++++
 accel/ane/amfi/new_patch.py | 102 ++++++++++++++++++++++++++++++++++++
 3 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 accel/ane/amfi/new_patch.py

diff --git a/README.md b/README.md
index a1ac79c6..5df2983d 100644
--- a/README.md
+++ b/README.md
@@ -95,9 +95,9 @@ from tinygrad.tensor import Tensor
 
 If all you want to do is ReLU, you are in luck! You can do very fast ReLU (at least 30 MEGAReLUs/sec confirmed)
 
-Requires your Python to be signed with `ane/lib/sign_python.sh` to add the `com.apple.ane.iokit-user-access` entitlement, which also requires `amfi_get_out_of_my_way=0x1` in your `boot-args`. Build the library with `ane/lib/build.sh`
+Requires your Python to be signed with `ane/lib/sign_python.sh` to add the `com.apple.ane.iokit-user-access` entitlement, which also requires `sudo nvram boot-args="amfi_get_out_of_my_way=1 ipc_control_port_options=0"`. Build the library with `ane/lib/build.sh`
 
-In order to set arg and for the AMFI kext to respect that arg, csrutil must have `csrutil enable --without-kext --without-nvram` in recovery mode.
+In order to set boot-args and for the AMFI kext to respect that arg, run `csrutil enable --without-kext --without-nvram` in recovery mode.
 
 ```python
 from tinygrad.tensor import Tensor
diff --git a/accel/ane/README.md b/accel/ane/README.md
index c9cf67fe..289cff00 100644
--- a/accel/ane/README.md
+++ b/accel/ane/README.md
@@ -74,6 +74,7 @@ brew install keith/formulae/dyld-shared-cache-extractor
 dyld-shared-cache-extractor /System/Library/dyld/dyld_shared_cache_arm64e /tmp/libraries
 cp /tmp/libraries/System/Library/PrivateFrameworks/ANECompiler.framework/Versions/A/ANECompiler .
 cp /tmp/libraries/System/Library/PrivateFrameworks/ANEServices.framework/Versions/A/ANEServices .
+cp /tmp/libraries/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/Versions/A/AppleNeuralEngine .
 ```
 
 ## Other work
@@ -81,4 +82,17 @@ cp /tmp/libraries/System/Library/PrivateFrameworks/ANEServices.framework/Version
 ```
 # sadly also relies on ZinIrRegBitPrintOutDebug
 https://github.com/antgroup-arclab/ANETools.git
+
+# sadly looks like we do actually need a direct connection to run hwx files, aned is at the espresso level
+* frame #0: 0x00000001c250fecc AppleNeuralEngine`-[_ANEDaemonConnection loadModel:sandboxExtension:options:qos:withReply:]
+(lldb) po $x2
+_ANEModel: { modelURL=file:///var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/test_F48D9B88-A68D-476F-ADC8-32BDAF9A2498.mlmodelc/ : key={"isegment":0,"inputs":{"image":{"shape":[1,1,1,64,1]},"image2":{"shape":[1,1,1,64,1]}},"outputs":{"probs":{"shape":[1,1,1,64,1]}}} : string_id=0x00000000 : program=(null) : state=1 : programHandle=0 : intermediateBufferHandle=0 : queueDepth=0 : attr={
+} : perfStatsMask=0}
 ```
+
+## Choices
+
+* Disable amfid (breaks vscode)
+* Patch amfid to allow restricted entitlements
+* Sign with a "provisioning profile" to allow the entitlement
+* Patch the ANE kext to not require a special entitlement (this is ideal, as we don't need to resign python)
diff --git a/accel/ane/amfi/new_patch.py b/accel/ane/amfi/new_patch.py
new file mode 100644
index 00000000..5fcd1d37
--- /dev/null
+++ b/accel/ane/amfi/new_patch.py
@@ -0,0 +1,102 @@
+"""Experimental in-memory patch of amfid using the Mach VM API.
+
+Finds amfid's task port, dumps the 4 bytes at a fixed offset and overwrites them.
+"""
+import ctypes
+from ctypes.util import find_library
+from subprocess import CalledProcessError, check_output
+from hexdump import hexdump
+
+# vm_prot_t bits from <mach/vm_prot.h>
+VM_PROT_READ, VM_PROT_WRITE, VM_PROT_EXECUTE, VM_PROT_COPY = 0x01, 0x02, 0x04, 0x10
+
+def get_pid(name):
+  """Return the pid printed by `pgrep name`, or None if that is not exactly one pid."""
+  try:
+    output = check_output(["pgrep", name])
+    return int(output)
+  except (CalledProcessError, OSError, ValueError):
+    # no match (non-zero exit), pgrep not runnable, or several pids that int() rejects
+    return None
+
+libc = ctypes.CDLL(find_library('c'))
+
+amfid_pid = get_pid("amfid")
+if amfid_pid is None:
+  raise SystemExit("could not resolve a single amfid pid with pgrep")
+
+# task_for_pid stores the name of amfid's task port in `task`
+task = ctypes.c_uint32()
+mytask = libc.mach_task_self()
+ret = libc.task_for_pid(mytask, ctypes.c_int(amfid_pid), ctypes.pointer(task))
+print(amfid_pid, ret, task, mytask)
+if ret != 0:  # KERN_SUCCESS is 0; every call below needs a valid task port
+  raise SystemExit("task_for_pid failed with kern_return_t %d" % ret)
+
+class vm_region_submap_short_info_data_64(ctypes.Structure):
+  """Info buffer that mach_vm_region_recurse fills in for the queried region."""
+  _pack_ = 1
+  _fields_ = [
+    ("protection", ctypes.c_uint32),
+    ("max_protection", ctypes.c_uint32),
+    ("inheritance", ctypes.c_uint32),
+    ("offset", ctypes.c_ulonglong),
+    ("user_tag", ctypes.c_uint32),
+    ("ref_count", ctypes.c_uint32),
+    ("shadow_depth", ctypes.c_uint16),
+    ("external_pager", ctypes.c_byte),
+    ("share_mode", ctypes.c_byte),
+    ("is_submap", ctypes.c_uint32),
+    ("behavior", ctypes.c_uint32),
+    ("object_id", ctypes.c_uint32),
+    ("user_wired_count", ctypes.c_uint32),
+  ]
+
+# the in/out count argument is expressed in 32-bit words, not bytes
+submap_info_size = ctypes.sizeof(vm_region_submap_short_info_data_64) // 4
+
+address = ctypes.c_ulong(0)
+mapsize = ctypes.c_ulong(0)
+count = ctypes.c_uint32(submap_info_size)
+sub_info = vm_region_submap_short_info_data_64()
+depth = 0
+
+# only the first region at or above address 0 is queried; presumably amfid's main image
+c_depth = ctypes.c_uint32(depth)
+for i in range(1):
+  ret = libc.mach_vm_region_recurse(task,
+    ctypes.pointer(address), ctypes.pointer(mapsize),
+    ctypes.pointer(c_depth), ctypes.pointer(sub_info),
+    ctypes.pointer(count))
+  print("aslr", hex(ret), hex(address.value), mapsize, count, sub_info.protection)
+
+# fixed offset of the instruction to replace (presumably tied to one amfid build)
+patch_address = address.value + 0x8e38
+# 0xd2800000 little-endian, i.e. arm64 `mov x0, #0`
+patch = b"\x00\x00\x80\xd2"
+
+# dump the bytes currently at the patch site before touching them
+pdata = ctypes.c_void_p(0)
+data_cnt = ctypes.c_uint32(0)
+ret = libc.mach_vm_read(task, ctypes.c_ulong(patch_address), len(patch),
+  ctypes.pointer(pdata), ctypes.pointer(data_cnt))
+buf = ctypes.string_at(pdata.value, data_cnt.value)
+hexdump(buf)
+
+# Make the page writable. set_maximum must be False: a maximum-protection request can
+# only take rights away, so it never makes the page writable. VM_PROT_COPY requests a
+# private copy-on-write page, which is what permits write access to a mapped text page.
+rw_copy = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_COPY
+ret = libc.mach_vm_protect(task, ctypes.c_ulong(patch_address), len(patch), False, rw_copy)
+print("protect", ret)
+
+# the size must be that of the patch itself, not of the bytes read back above
+longptr = ctypes.POINTER(ctypes.c_ulong)
+shellcodePtr = ctypes.cast(patch, longptr)
+ret = libc.mach_vm_write(task, ctypes.c_ulong(patch_address), shellcodePtr, len(patch))
+print("write", ret)
+
+# restore read+execute so the patched instruction can run
+rx = VM_PROT_READ | VM_PROT_EXECUTE
+ret = libc.mach_vm_protect(task, ctypes.c_ulong(patch_address), len(patch), False, rx)
+print("protect", ret)
\ No newline at end of file