[ROCm] tools/stats/monitor.py support (#91732)

Initial support for rocm-smi monitoring of GPU utilization. Works around difficulties of using the rocm-smi python bindings without having an explicit package. Pull Request resolved: https://github.com/pytorch/pytorch/pull/91732 Approved by: https://github.com/huydhn, https://github.com/pruthvistony
2025-12-07 12:21:27 +01:00 · 2023-01-05 18:34:11 +00:00 · 2023-01-05 18:34:11 +00:00 · f11dc26ed5
commit f11dc26ed5
parent 9262ffc692
1 changed files with 93 additions and 0 deletions
--- a/tools/stats/monitor.py
+++ b/tools/stats/monitor.py
@ -2,12 +2,29 @@
 import datetime
 import json
 import signal
+import sys
 import time
 from typing import Any, Dict, List

 import psutil  # type: ignore[import]
 import pynvml  # type: ignore[import]

+# ROCm does not currently have the rocm_smi module installed to a pythonic location.
+# Must import from ROCm installation path.
+# Cannot use the high-level rocm_smi cmdline module due to its use of exit().
+# Must use the lower-level ctypes wrappers exposed through rsmiBindings.
+sys.path.append("/opt/rocm/libexec/rocm_smi")
+try:
+    from ctypes import byref, c_uint32, c_uint64
+
+    from rsmiBindings import (  # type: ignore[import]
+        rocmsmi,
+        rsmi_process_info_t,
+        rsmi_status_t,
+    )
+except ImportError as e:
+    pass
+

 def get_processes_running_python_tests() -> List[Any]:
    python_processes = []
@ -59,6 +76,63 @@ def get_per_process_gpu_info(handle: Any) -> List[Dict[str, Any]]:
    return per_process_info


+def rocm_ret_ok(ret: int) -> Any:
+    """Tell whether an rsmi status code equals ``RSMI_STATUS_SUCCESS``."""
+    success = rsmi_status_t.RSMI_STATUS_SUCCESS
+    return ret == success
+
+
+def rocm_list_devices() -> List[int]:
+    """Return the device indices known to rocm-smi.
+
+    Asks ``rsmi_num_monitor_devices`` for the device count and returns
+    ``[0, 1, ..., count - 1]``. An empty list is returned when the query
+    does not report success.
+    """
+    device_count = c_uint32(0)
+    status = rocmsmi.rsmi_num_monitor_devices(byref(device_count))
+    if not rocm_ret_ok(status):
+        return []
+    return list(range(device_count.value))
+
+
+def rocm_get_mem_use(device: int) -> float:
+    """Return the fraction of memory in use on ``device``.
+
+    The result is the value from ``rsmi_dev_memory_usage_get`` divided by the
+    value from ``rsmi_dev_memory_total_get``, both queried with memory type
+    ``0`` (presumably VRAM -- confirm against the rsmi memory-type enum).
+
+    Returns 0.0 when either query does not report success, or when the
+    reported total is zero.
+    """
+    memory_used = c_uint64()
+    memory_total = c_uint64()
+
+    ret = rocmsmi.rsmi_dev_memory_usage_get(device, 0, byref(memory_used))
+    if not rocm_ret_ok(ret):
+        return 0.0
+
+    ret = rocmsmi.rsmi_dev_memory_total_get(device, 0, byref(memory_total))
+    if not rocm_ret_ok(ret) or memory_total.value == 0:
+        # A zero total would otherwise raise ZeroDivisionError below.
+        return 0.0
+
+    return float(memory_used.value) / float(memory_total.value)
+
+
+def rocm_get_gpu_use(device: int) -> float:
+    """Return the ``rsmi_dev_busy_percent_get`` reading for ``device``.
+
+    The reading is converted to ``float``. Returns 0.0 when the query does
+    not report success.
+    """
+    busy = c_uint32()
+    status = rocmsmi.rsmi_dev_busy_percent_get(device, byref(busy))
+    return float(busy.value) if rocm_ret_ok(status) else 0.0
+
+
+def rocm_get_pid_list() -> List[Any]:
+    """Return the process ids reported by ``rsmi_compute_process_info_get``.
+
+    The rsmi call is made twice: first without a buffer to obtain the number
+    of entries, then with a buffer to read them. Returns an empty list when
+    the first call does not report success.
+    """
+    count = c_uint32()
+    status = rocmsmi.rsmi_compute_process_info_get(None, byref(count))
+    if not rocm_ret_ok(status):
+        return []
+
+    # The buffer holds 10 more entries than the reported count.
+    capacity = count.value + 10
+    entries = (rsmi_process_info_t * capacity)()
+    # NOTE(review): the status of this second call is never inspected --
+    # confirm whether ignoring a failure here is intentional.
+    status = rocmsmi.rsmi_compute_process_info_get(byref(entries), byref(count))
+    return [entries[slot].process_id for slot in range(count.value)]
+
+
+def rocm_get_per_process_gpu_info() -> List[Dict[str, Any]]:
+    """Collect ``vram_usage`` for each pid from ``rocm_get_pid_list``.
+
+    Returns a list of ``{"pid": ..., "gpu_memory": ...}`` dicts. Pids whose
+    per-pid query does not report success are left out.
+    """
+    results = []
+    for pid in rocm_get_pid_list():
+        details = rsmi_process_info_t()
+        status = rocmsmi.rsmi_compute_process_info_by_pid_get(
+            int(pid), byref(details)
+        )
+        if not rocm_ret_ok(status):
+            continue
+        results.append({"pid": pid, "gpu_memory": details.vram_usage})
+    return results
+
+
 if __name__ == "__main__":

    handle = None
@ -69,6 +143,14 @@ if __name__ == "__main__":
        # no pynvml available, probably because not cuda
        pass

+    rsmi_handles = []
+    try:
+        ret = rocmsmi.rsmi_init(0)
+        rsmi_handles = rocm_list_devices()
+    except Exception:
+        # no rocmsmi available, probably because not rocm
+        pass
+
    kill_now = False

    def exit_gracefully(*args: Any) -> None:
@ -90,6 +172,17 @@ if __name__ == "__main__":
                gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                stats["total_gpu_utilization"] = gpu_utilization.gpu
                stats["total_gpu_mem_utilization"] = gpu_utilization.memory
+            if rsmi_handles:
+                stats["per_process_gpu_info"] = rocm_get_per_process_gpu_info()
+                # There are 1 to 4 GPUs in use; these values may sum > 1.0.
+                gpu_utilization = 0.0
+                gpu_memory = 0.0
+                for dev in rsmi_handles:
+                    gpu_utilization += rocm_get_gpu_use(dev)
+                    gpu_memory += rocm_get_mem_use(dev)
+                stats["total_gpu_utilization"] = gpu_utilization
+                stats["total_gpu_mem_utilization"] = gpu_memory
+
        except Exception as e:
            stats = {
                "time": datetime.datetime.utcnow().isoformat("T") + "Z",