fix gpu hangs when exiting while aql queues are executing (#3700)

This commit is contained in:
nimlgen 2024-03-12 19:23:23 +03:00 committed by GitHub
parent 02ca067bdf
commit 798970cfad
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 8 additions and 1 deletion

View File

@ -155,7 +155,7 @@ class HSADevice(Compiled):
def __init__(self, device:str=""):
if not HSADevice.agents:
check(hsa.hsa_init())
atexit.register(lambda: hsa.hsa_shut_down())
atexit.register(hsa_terminate)
HSADevice.agents = scan_agents()
HSADevice.cpu_agent = HSADevice.agents[hsa.HSA_DEVICE_TYPE_CPU][0]
HSADevice.cpu_mempool = find_memory_pool(HSADevice.cpu_agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, location=hsa.HSA_AMD_MEMORY_POOL_LOCATION_CPU)
@ -222,3 +222,10 @@ class HSADevice(Compiled):
self.kernarg_pool_sz: int = sz
def flush_hdp(self):
  """Request an HDP flush by storing 1 into slot 0 of `self.hdp_flush.HDP_MEM_FLUSH_CNTL`."""
  # NOTE(review): presumably a memory-mapped flush-control register -- confirm against where hdp_flush is set up.
  flush_ctl = self.hdp_flush.HDP_MEM_FLUSH_CNTL
  flush_ctl[0] = 1
def hsa_terminate():
  """Drop every device's AQL hardware queue, then shut the HSA runtime down.

  The queues have to be stopped/deleted before `hsa_shut_down()` is called;
  shutting down while they still exist leads to GPU hangs.
  """
  def _skip_sync(): return None

  for device in HSADevice.devices:
    # Destructors that run later may still try to synchronize, but hw_queue is gone by then,
    # so the instance's synchronize is replaced with a no-op first.
    setattr(device, 'synchronize', _skip_sync)
    delattr(device, 'hw_queue')

  hsa.hsa_shut_down()