Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-07 12:21:27 +01:00
[ROCm] tools/stats/monitor.py support (#91732)
Initial support for rocm-smi monitoring of GPU utilization. Works around the difficulty of using the rocm-smi Python bindings, which are not installed as an explicit Python package, by importing the low-level ctypes wrappers directly from the ROCm installation path.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/91732
Approved by: https://github.com/huydhn, https://github.com/pruthvistony
This commit is contained in:
parent 9262ffc692
commit f11dc26ed5
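The heart of the workaround is small enough to show in isolation. The sketch below is illustrative only and is not part of the patch; it assumes a standard ROCm install under /opt/rocm and uses only the rsmiBindings entry points the patch itself relies on (rsmi_init, rsmi_num_monitor_devices, rsmi_dev_busy_percent_get):

# Illustrative sketch only (not part of the patch): query GPU busy percent
# through the low-level ctypes bindings, assuming ROCm lives under /opt/rocm.
import sys
from ctypes import byref, c_uint32

# rsmiBindings is not pip-installable, so import it from the ROCm install path.
sys.path.append("/opt/rocm/libexec/rocm_smi")
from rsmiBindings import rocmsmi, rsmi_status_t  # type: ignore[import]

if rocmsmi.rsmi_init(0) == rsmi_status_t.RSMI_STATUS_SUCCESS:
    num = c_uint32(0)
    if rocmsmi.rsmi_num_monitor_devices(byref(num)) == rsmi_status_t.RSMI_STATUS_SUCCESS:
        for dev in range(num.value):
            busy = c_uint32()
            if rocmsmi.rsmi_dev_busy_percent_get(dev, byref(busy)) == rsmi_status_t.RSMI_STATUS_SUCCESS:
                print(f"GPU {dev}: {busy.value}% busy")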
@@ -2,12 +2,29 @@
import datetime
import json
import signal
import sys
import time
from typing import Any, Dict, List

import psutil  # type: ignore[import]
import pynvml  # type: ignore[import]

# ROCm does not currently have the rocm_smi module installed to a pythonic location.
# Must import from ROCm installation path.
# Cannot use the high-level rocm_smi cmdline module due to its use of exit().
# Must use the lower-level ctypes wrappers exposed through rsmiBindings.
sys.path.append("/opt/rocm/libexec/rocm_smi")
try:
    from ctypes import byref, c_uint32, c_uint64

    from rsmiBindings import (  # type: ignore[import]
        rocmsmi,
        rsmi_process_info_t,
        rsmi_status_t,
    )
except ImportError as e:
    pass


def get_processes_running_python_tests() -> List[Any]:
    python_processes = []
@@ -59,6 +76,63 @@ def get_per_process_gpu_info(handle: Any) -> List[Dict[str, Any]]:
    return per_process_info


def rocm_ret_ok(ret: int) -> Any:
    return ret == rsmi_status_t.RSMI_STATUS_SUCCESS


def rocm_list_devices() -> List[int]:
    num = c_uint32(0)
    ret = rocmsmi.rsmi_num_monitor_devices(byref(num))
    if rocm_ret_ok(ret):
        return list(range(num.value))
    return []


def rocm_get_mem_use(device: int) -> float:
    memoryUse = c_uint64()
    memoryTot = c_uint64()

    ret = rocmsmi.rsmi_dev_memory_usage_get(device, 0, byref(memoryUse))
    if rocm_ret_ok(ret):
        ret = rocmsmi.rsmi_dev_memory_total_get(device, 0, byref(memoryTot))
        if rocm_ret_ok(ret):
            return float(memoryUse.value) / float(memoryTot.value)
    return 0.0


def rocm_get_gpu_use(device: int) -> float:
    percent = c_uint32()
    ret = rocmsmi.rsmi_dev_busy_percent_get(device, byref(percent))
    if rocm_ret_ok(ret):
        return float(percent.value)
    return 0.0


def rocm_get_pid_list() -> List[Any]:
    num_items = c_uint32()
    ret = rocmsmi.rsmi_compute_process_info_get(None, byref(num_items))
    if rocm_ret_ok(ret):
        buff_sz = num_items.value + 10
        procs = (rsmi_process_info_t * buff_sz)()
        procList = []
        ret = rocmsmi.rsmi_compute_process_info_get(byref(procs), byref(num_items))
        for i in range(num_items.value):
            procList.append(procs[i].process_id)
        return procList
    return []


def rocm_get_per_process_gpu_info() -> List[Dict[str, Any]]:
    per_process_info = []
    for pid in rocm_get_pid_list():
        proc = rsmi_process_info_t()
        ret = rocmsmi.rsmi_compute_process_info_by_pid_get(int(pid), byref(proc))
        if rocm_ret_ok(ret):
            info = {"pid": pid, "gpu_memory": proc.vram_usage}
            per_process_info.append(info)
    return per_process_info


if __name__ == "__main__":

    handle = None
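As an aside (again illustrative, not part of the patch), the helpers added above can be exercised on their own once rsmiBindings imports successfully; the main block below iterates over exactly the device indices that rocm_list_devices() returns:

# Illustrative only: exercise the new ROCm helpers standalone.
if rocm_ret_ok(rocmsmi.rsmi_init(0)):
    for dev in rocm_list_devices():
        print(dev, rocm_get_gpu_use(dev), rocm_get_mem_use(dev))
    print(rocm_get_per_process_gpu_info())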
@@ -69,6 +143,14 @@ if __name__ == "__main__":
        # no pynvml available, probably because not cuda
        pass

    rsmi_handles = []
    try:
        ret = rocmsmi.rsmi_init(0)
        rsmi_handles = rocm_list_devices()
    except Exception:
        # no rocmsmi available, probably because not rocm
        pass

    kill_now = False

    def exit_gracefully(*args: Any) -> None:
@@ -90,6 +172,17 @@ if __name__ == "__main__":
                gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                stats["total_gpu_utilization"] = gpu_utilization.gpu
                stats["total_gpu_mem_utilization"] = gpu_utilization.memory
            if rsmi_handles:
                stats["per_process_gpu_info"] = rocm_get_per_process_gpu_info()
                # There are 1 to 4 GPUs in use; these values may sum > 1.0.
                gpu_utilization = 0.0
                gpu_memory = 0.0
                for dev in rsmi_handles:
                    gpu_utilization += rocm_get_gpu_use(dev)
                    gpu_memory += rocm_get_mem_use(dev)
                stats["total_gpu_utilization"] = gpu_utilization
                stats["total_gpu_mem_utilization"] = gpu_memory

        except Exception as e:
            stats = {
                "time": datetime.datetime.utcnow().isoformat("T") + "Z",