Create a new torch.cuda.device_memory_used API (#140870)

Summary:
The existing torch.cuda.memory_usage does not report how much device memory is in use; it returns memory utilization, i.e. the percent of time over the past sample period during which global memory was being read or written (on Nvidia GPUs). This change adds torch.cuda.device_memory_used, which returns the used global (device) memory in bytes, as given by nvidia-smi or amd-smi.
See https://github.com/pytorch/pytorch/issues/140638 for more details.
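
For illustration, a minimal usage sketch of the new API next to the existing one (not part of the diff; assumes a CUDA build where pynvml, or amdsmi on ROCm, is available):

    import torch

    # New API: device memory currently in use, in bytes, as reported by the driver.
    used_bytes = torch.cuda.device_memory_used()

    # Existing API: percent of time over the past sample period during which
    # global memory was being read or written.
    util_percent = torch.cuda.memory_usage()

    print(f"used: {used_bytes / 1024**2:.1f} MiB, utilization: {util_percent}%")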

Test Plan: added a new unit test (test_device_memory_used)

Differential Revision: D65960134

Pull Request resolved: https://github.com/pytorch/pytorch/pull/140870
Approved by: https://github.com/ngimel, https://github.com/eqy
Yu Guo 2024-11-19 06:36:30 +00:00 committed by PyTorch MergeBot
parent 7156d0824d
commit 808da50c2d
4 changed files with 49 additions and 0 deletions


@@ -16,6 +16,7 @@ torch.cuda
default_stream
device
device_count
device_memory_used
device_of
get_arch_list
get_device_capability


@@ -3947,6 +3947,25 @@ class TestCudaMallocAsync(TestCase):
    def test_temperature(self):
        self.assertTrue(0 <= torch.cuda.temperature() <= 150)

    @unittest.skipIf(TEST_WITH_ROCM, "flaky for AMD gpu")
    @unittest.skipIf(TEST_PYNVML, "pynvml/amdsmi is not available")
    def test_device_memory_used(self):
        """
        Verify used device memory in bytes
        """
        torch.cuda.synchronize()
        gc.collect()
        torch.cuda.empty_cache()
        a = torch.cuda.device_memory_used()
        num_bytes = 512 * 1024**2
        _ = torch.empty(num_bytes, dtype=torch.int8, device="cuda")
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        b = torch.cuda.device_memory_used()
        mem_bytes = b - a
        # test the order of magnitude
        self.assertTrue(num_bytes // 32 <= mem_bytes <= num_bytes * 32)

    @unittest.skipIf(TEST_PYNVML, "pynvml/amdsmi is not available")
    def test_power_draw(self):
        self.assertTrue(torch.cuda.power_draw() >= 0)


@@ -2523,6 +2523,7 @@ torch_non_c_binding_in_graph_functions = dict.fromkeys(
"torch.cuda.current_stream",
"torch.cuda.default_stream",
"torch.cuda.device_count",
"torch.cuda.device_memory_used",
"torch.cuda.get_arch_list",
"torch.cuda.get_device_capability",
"torch.cuda.get_device_name",


@@ -1110,6 +1110,15 @@ def _get_amdsmi_device_index(device: Optional[Union[int, Device]]) -> int:
    return idx_map[idx]


def _get_amdsmi_device_memory_used(device: Optional[Union[Device, int]] = None) -> int:
    handle = _get_amdsmi_handler()
    device = _get_amdsmi_device_index(device)
    # amdsmi_get_gpu_vram_usage returns mem usage in megabytes
    mem_mega_bytes = amdsmi.amdsmi_get_gpu_vram_usage(handle)["vram_used"]
    mem_bytes = mem_mega_bytes * 1024 * 1024
    return mem_bytes


def _get_amdsmi_memory_usage(device: Optional[Union[Device, int]] = None) -> int:
    handle = _get_amdsmi_handler()
    device = _get_amdsmi_device_index(device)

@@ -1150,6 +1159,24 @@ def _get_amdsmi_clock_rate(device: Optional[Union[Device, int]] = None) -> int:
    return clock_info["clk"]


def device_memory_used(device: Optional[Union[Device, int]] = None) -> int:
    r"""Return used global (device) memory in bytes as given by `nvidia-smi` or `amd-smi`.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).
    """
    if not torch.version.hip:
        handle = _get_pynvml_handler()

        device = _get_nvml_device_index(device)
        handle = pynvml.nvmlDeviceGetHandleByIndex(device)
        return pynvml.nvmlDeviceGetMemoryInfo(handle).used
    else:
        return _get_amdsmi_device_memory_used(device)


def memory_usage(device: Optional[Union[Device, int]] = None) -> int:
    r"""Return the percent of time over the past sample period during which global (device)
    memory was being read or written as given by `nvidia-smi`.
@@ -1609,6 +1636,7 @@ __all__ = [
"default_stream",
"device",
"device_count",
"device_memory_used",
"device_of",
"empty_cache",
"get_allocator_backend",