scan all gpu agents at launch (#3535)

This commit is contained in:
nimlgen 2024-02-29 20:37:37 +03:00 committed by GitHub
parent 2e82c5b7a4
commit 0afde98ba5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 22 additions and 28 deletions

View File

@ -1,4 +1,4 @@
import ctypes
import ctypes, collections
import tinygrad.runtime.autogen.hsa as hsa
from tinygrad.helpers import init_c_var
@ -124,20 +124,17 @@ class AQLQueue:
def _alloc_signal(self, reusable=False): return self.device.alloc_signal(reusable=reusable)
def find_agent(typ, device_id):
def scan_agents():
agents = collections.defaultdict(list)
@ctypes.CFUNCTYPE(hsa.hsa_status_t, hsa.hsa_agent_t, ctypes.c_void_p)
def __filter_agents(agent, data):
def __scan_agents(agent, data):
status = hsa.hsa_agent_get_info(agent, hsa.HSA_AGENT_INFO_DEVICE, ctypes.byref(device_type := hsa.hsa_device_type_t()))
if status == 0 and device_type.value == typ:
ret = ctypes.cast(data, ctypes.POINTER(hsa.hsa_agent_t))
if ret[0].handle == device_id:
ret[0] = agent
return hsa.HSA_STATUS_INFO_BREAK
ret[0].handle = ret[0].handle + 1
if status == 0: agents[device_type.value].append(agent)
return hsa.HSA_STATUS_SUCCESS
hsa.hsa_iterate_agents(__filter_agents, ctypes.byref(agent := hsa.hsa_agent_t()))
return agent
hsa.hsa_iterate_agents(__scan_agents, None)
return agents
def find_memory_pool(agent, segtyp=-1, flags=-1, location=-1):
@ctypes.CFUNCTYPE(hsa.hsa_status_t, hsa.hsa_amd_memory_pool_t, ctypes.c_void_p)

View File

@ -96,11 +96,6 @@ class HSAGraph(MultiDeviceJITGraph):
self.transfers.append((dest._buf, dest_dev.agent, src._buf, src_dev.agent, dest.nbytes, len(wait_signals),
(hsa.hsa_signal_t*len(wait_signals))(*wait_signals), sync_signal, hsa.HSA_AMD_SDMA_ENGINE_0, True))
# Make sure the src buffer can be by other devices.
c_agents = (hsa.hsa_agent_t * len(HSADevice.devices))(*[dev.agent for dev in HSADevice.devices])
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.devices), c_agents, None, src._buf))
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.devices), c_agents, None, dest._buf))
# Wait for all active signals to finish the graph
wait_signals_to_finish: Dict[HSADevice, List[hsa.hsa_signal_t]] = collections.defaultdict(list)
for v in dedup_signals([s for s in list(self.w_dependency_map.values())+list(self.r_dependency_map.values()) if isinstance(s, hsa.hsa_signal_t)]):

View File

@ -1,11 +1,11 @@
from __future__ import annotations
import ctypes, functools, subprocess, io, atexit
from typing import Tuple, TypeVar, List
from typing import Tuple, TypeVar, List, Dict
import tinygrad.runtime.autogen.hsa as hsa
from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t
from tinygrad.device import Compiled, LRUAllocator
from tinygrad.runtime.ops_hip import HIPCompiler
from tinygrad.runtime.driver.hsa import check, find_agent, find_memory_pool, AQLQueue
from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
HSACompiler = HIPCompiler
@ -61,9 +61,9 @@ class HSAAllocator(LRUAllocator):
super().__init__()
def _alloc(self, size:int):
c_agents = (hsa.hsa_agent_t * len(HSADevice.devices))(*[dev.agent for dev in HSADevice.devices])
c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU])
check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p())))
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.devices), c_agents, None, buf))
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf))
return buf.value
def _free(self, opaque:T):
@ -74,7 +74,7 @@ class HSAAllocator(LRUAllocator):
# Async copyin sync model uses barriers on the main hw queue, since barriers are guaranteed to execute in order with all other packets.
copy_signal = self.device.alloc_signal(reusable=True)
sync_signal = self.device.hw_queue.submit_barrier(need_signal=True)
c_agents = (hsa.hsa_agent_t*2)(*[HSADevice.cpu_agent, self.device.agent])
c_agents = (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent)
check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, src.nbytes, 0, ctypes.byref(mem := ctypes.c_void_p())))
check(hsa.hsa_amd_agents_allow_access(2, c_agents, None, mem))
ctypes.memmove(mem, from_mv(src), src.nbytes)
@ -87,7 +87,7 @@ class HSAAllocator(LRUAllocator):
sync_signal = self.device.hw_queue.submit_barrier(need_signal=True)
if not hasattr(self, 'hb'):
c_agents = (hsa.hsa_agent_t*2)(*[HSADevice.cpu_agent, self.device.agent])
c_agents = (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent)
self.hb = []
for _ in range(2):
check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, CHUNK_SIZE, 0, ctypes.byref(mem := ctypes.c_void_p())))
@ -128,7 +128,7 @@ class HSAAllocator(LRUAllocator):
def copyout(self, dest:memoryview, src:T):
HSADevice.synchronize_system()
copy_signal = self.device.alloc_signal(reusable=True)
c_agents = (hsa.hsa_agent_t*2)(*[HSADevice.cpu_agent, self.device.agent])
c_agents = (hsa.hsa_agent_t*2)(self.device.agent, HSADevice.cpu_agent)
check(hsa.hsa_amd_memory_lock_to_pool(from_mv(dest), dest.nbytes, c_agents, 2, HSADevice.cpu_mempool, 0, ctypes.byref(addr:=ctypes.c_void_p())))
check(hsa.hsa_amd_memory_async_copy(addr, HSADevice.cpu_agent, src, self.device.agent, dest.nbytes, 0, None, copy_signal))
hsa.hsa_signal_wait_scacquire(copy_signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
@ -144,18 +144,20 @@ class HSAAllocator(LRUAllocator):
dest_dev.hw_queue.submit_barrier(wait_signals=[copy_signal])
class HSADevice(Compiled):
cpu_agent = None
cpu_mempool = None
devices: List[HSADevice] = []
agents: Dict[int, List[hsa.hsa_agent_t]] = {}
cpu_agent: hsa.hsa_agent_t
cpu_mempool: hsa.hsa_amd_memory_pool_t
def __init__(self, device:str=""):
if not HSADevice.cpu_agent:
if not HSADevice.agents:
check(hsa.hsa_init())
atexit.register(lambda: hsa.hsa_shut_down())
HSADevice.cpu_agent = find_agent(hsa.HSA_DEVICE_TYPE_CPU, device_id=0)
HSADevice.agents = scan_agents()
HSADevice.cpu_agent = HSADevice.agents[hsa.HSA_DEVICE_TYPE_CPU][0]
HSADevice.cpu_mempool = find_memory_pool(HSADevice.cpu_agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, location=hsa.HSA_AMD_MEMORY_POOL_LOCATION_CPU)
self.device_id = int(device.split(":")[1]) if ":" in device else 0
self.agent = find_agent(hsa.HSA_DEVICE_TYPE_GPU, device_id=self.device_id)
self.agent = HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU][self.device_id]
self.gpu_mempool = find_memory_pool(self.agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, location=hsa.HSA_AMD_MEMORY_POOL_LOCATION_GPU)
self.kernargs_pool = find_memory_pool(self.agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, flags=hsa.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)
self.hw_queue = AQLQueue(self)