create a new torch.cuda.device_memory_used api (#140870)

Summary:
The existing torch.cuda.memory_usage returns memory utilization, i.e., the percent of time over the past sample period during which global memory was being read or written (on NVIDIA GPUs). This PR adds torch.cuda.device_memory_used, which returns the used global (device) memory in bytes, as reported by nvidia-smi or amd-smi.
See https://github.com/pytorch/pytorch/issues/140638 for more details.
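
For reference, a minimal usage sketch of the new call next to the existing utilization-style call (assuming a CUDA build with pynvml or amdsmi available; not part of this diff):

    import torch

    if torch.cuda.is_available():
        # New API: used global (device) memory in bytes, as reported by the
        # driver (nvidia-smi / amd-smi), not by the caching allocator.
        used_bytes = torch.cuda.device_memory_used()
        # Existing API: percent of time over the past sample period during
        # which global memory was being read or written.
        util_pct = torch.cuda.memory_usage()
        print(f"used: {used_bytes / 1024**2:.1f} MiB, utilization: {util_pct}%")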

Test Plan: added a new unit test, test_device_memory_used.

Differential Revision: D65960134

Pull Request resolved: https://github.com/pytorch/pytorch/pull/140870
Approved by: https://github.com/ngimel, https://github.com/eqy
Author: Yu Guo, 2024-11-19 06:36:30 +00:00 (committed by PyTorch MergeBot)
Parent: 7156d0824d
Commit: 808da50c2d
4 changed files with 49 additions and 0 deletions


@@ -16,6 +16,7 @@ torch.cuda
     default_stream
     device
     device_count
+    device_memory_used
     device_of
     get_arch_list
     get_device_capability


@@ -3947,6 +3947,25 @@ class TestCudaMallocAsync(TestCase):
     def test_temperature(self):
         self.assertTrue(0 <= torch.cuda.temperature() <= 150)
 
+    @unittest.skipIf(TEST_WITH_ROCM, "flaky for AMD gpu")
+    @unittest.skipIf(TEST_PYNVML, "pynvml/amdsmi is not available")
+    def test_device_memory_used(self):
+        """
+        Verify used device memory in bytes
+        """
+        torch.cuda.synchronize()
+        gc.collect()
+        torch.cuda.empty_cache()
+        a = torch.cuda.device_memory_used()
+        num_bytes = 512 * 1024**2
+        _ = torch.empty(num_bytes, dtype=torch.int8, device="cuda")
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+        b = torch.cuda.device_memory_used()
+        mem_bytes = b - a
+        # test the order of magnitude
+        self.assertTrue(num_bytes // 32 <= mem_bytes <= num_bytes * 32)
+
     @unittest.skipIf(TEST_PYNVML, "pynvml/amdsmi is not available")
     def test_power_draw(self):
         self.assertTrue(torch.cuda.power_draw() >= 0)


@@ -2523,6 +2523,7 @@ torch_non_c_binding_in_graph_functions = dict.fromkeys(
         "torch.cuda.current_stream",
         "torch.cuda.default_stream",
         "torch.cuda.device_count",
+        "torch.cuda.device_memory_used",
         "torch.cuda.get_arch_list",
         "torch.cuda.get_device_capability",
         "torch.cuda.get_device_name",


@@ -1110,6 +1110,15 @@ def _get_amdsmi_device_index(device: Optional[Union[int, Device]]) -> int:
     return idx_map[idx]
 
 
+def _get_amdsmi_device_memory_used(device: Optional[Union[Device, int]] = None) -> int:
+    handle = _get_amdsmi_handler()
+    device = _get_amdsmi_device_index(device)
+    # amdsmi_get_gpu_vram_usage returns mem usage in megabytes
+    mem_mega_bytes = amdsmi.amdsmi_get_gpu_vram_usage(handle)["vram_used"]
+    mem_bytes = mem_mega_bytes * 1024 * 1024
+    return mem_bytes
+
+
 def _get_amdsmi_memory_usage(device: Optional[Union[Device, int]] = None) -> int:
     handle = _get_amdsmi_handler()
     device = _get_amdsmi_device_index(device)
@@ -1150,6 +1159,24 @@ def _get_amdsmi_clock_rate(device: Optional[Union[Device, int]] = None) -> int:
     return clock_info["clk"]
 
 
+def device_memory_used(device: Optional[Union[Device, int]] = None) -> int:
+    r"""Return used global (device) memory in bytes as given by `nvidia-smi` or `amd-smi`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    if not torch.version.hip:
+        handle = _get_pynvml_handler()
+
+        device = _get_nvml_device_index(device)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(device)
+        return pynvml.nvmlDeviceGetMemoryInfo(handle).used
+    else:
+        return _get_amdsmi_device_memory_used(device)
+
+
 def memory_usage(device: Optional[Union[Device, int]] = None) -> int:
     r"""Return the percent of time over the past sample period during which global (device)
     memory was being read or written as given by `nvidia-smi`.
@@ -1609,6 +1636,7 @@ __all__ = [
     "default_stream",
     "device",
     "device_count",
+    "device_memory_used",
     "device_of",
     "empty_cache",
     "get_allocator_backend",