diff --git a/.github/actions/linux-test/action.yml b/.github/actions/linux-test/action.yml index fb46709d9b0..32fe1d7385b 100644 --- a/.github/actions/linux-test/action.yml +++ b/.github/actions/linux-test/action.yml @@ -126,7 +126,7 @@ runs: shell: bash continue-on-error: true run: | - python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 + python3 -m pip install psutil==5.9.8 nvidia-ml-py==11.525.84 python3 -m tools.stats.monitor > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 7eaa962995b..7929ecfe1e4 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -16,7 +16,7 @@ packaging==25.0 parameterized==0.8.1 pillow==10.3.0 protobuf==5.29.4 -psutil==5.9.1 +psutil==5.9.8 pygments==2.15.0 pytest-cpp==2.3.0 pytest-flakefinder==1.1.0 diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index bce80701827..1f1146fcde1 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -225,7 +225,7 @@ jobs: MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | mkdir -p ../../usage_logs - python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 python3 -m tools.stats.monitor \ --log-interval "$MONITOR_LOG_INTERVAL" \ --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \ diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index d19a7b51938..1848586d3ce 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -205,7 +205,7 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | - python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 
nvidia-ml-py==11.525.84 + python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 550053de732..063c97e449c 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -136,7 +136,7 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | - "$VENV_PATH/bin/python3" -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7 "$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 006ab43da29..dd3790c41a9 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -132,7 +132,7 @@ jobs: shell: bash continue-on-error: true run: | - python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 36b4e5cd753..0c95503928f 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -138,7 +138,7 @@ jobs: continue-on-error: true run: | # Windows conda doesn't have python3 binary, only python, but it's python3 - ${CONDA_RUN} python -m 
pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 + ${CONDA_RUN} python -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 ${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index de1be3115c9..177e6ca4bbe 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -133,7 +133,7 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | - python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 + python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" diff --git a/tools/stats/monitor.py b/tools/stats/monitor.py index a5affc2510b..38d1f94b178 100644 --- a/tools/stats/monitor.py +++ b/tools/stats/monitor.py @@ -78,6 +78,9 @@ class GpuData: uuid: str utilization: float mem_utilization: float + allocated_mem: float + allocated_mem_value: float + total_mem_value: float try: @@ -259,6 +262,7 @@ class UsageLogger: return UtilizationStats( avg=round(avg, 2), max=round(maxi, 2), + raw=data_list, ) def _output_data(self) -> None: @@ -338,20 +342,33 @@ class UsageLogger: calculate_gpu = [] gpu_mem_utilization = defaultdict(list) gpu_utilization = defaultdict(list) + gpu_allocated_mem = defaultdict(list) + gpu_allocated_mem_values = defaultdict(list) + gpu_total_mem_values = defaultdict(float) for data in data_list: for gpu in data.gpu_list: gpu_mem_utilization[gpu.uuid].append(gpu.mem_utilization) 
gpu_utilization[gpu.uuid].append(gpu.utilization) + gpu_allocated_mem[gpu.uuid].append(gpu.allocated_mem) + gpu_allocated_mem_values[gpu.uuid].append(gpu.allocated_mem_value) + gpu_total_mem_values[gpu.uuid] = gpu.total_mem_value for gpu_uuid in gpu_utilization.keys(): gpu_util_stats = self._generate_stats(gpu_utilization[gpu_uuid]) gpu_mem_util_stats = self._generate_stats(gpu_mem_utilization[gpu_uuid]) + gpu_allocated_mem_stats = self._generate_stats(gpu_allocated_mem[gpu_uuid]) + gpu_allocated_mem_value_stats = self._generate_stats( + gpu_allocated_mem_values[gpu_uuid] + ) calculate_gpu.append( GpuUsage( uuid=gpu_uuid, util_percent=gpu_util_stats, mem_util_percent=gpu_mem_util_stats, + allocated_mem_percent=gpu_allocated_mem_stats, + allocated_mem_value=gpu_allocated_mem_value_stats, + total_mem_value=gpu_total_mem_values[gpu_uuid], ) ) return calculate_gpu @@ -382,11 +399,21 @@ class UsageLogger: # see https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle) gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle) + gpu_memory_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle) + mem_utilization = gpu_utilization.memory + + allocate_mem_MB = gpu_memory_info.used / 1024**2 + total_mem_MB = gpu_memory_info.total / 1024**2 + allocate_mem_percent = allocate_mem_MB / total_mem_MB * 100 + gpu_data_list.append( GpuData( uuid=gpu_uuid, utilization=gpu_utilization.gpu, - mem_utilization=gpu_utilization.memory, + mem_utilization=mem_utilization, + allocated_mem=allocate_mem_percent, + allocated_mem_value=allocate_mem_MB, + total_mem_value=total_mem_MB, ) ) elif self._has_amdsmi: @@ -397,11 +424,20 @@ class UsageLogger: gpu_uuid = amdsmi.amdsmi_get_gpu_device_uuid(handle) gpu_utilization = engine_usage["gfx_activity"] gpu_mem_utilization = gpu_utilization["umc_activity"] + mem_info = amdsmi.amdsmi_get_gpu_memory_usage(handle) + + allocate_mem_MB = mem_info["vram_usage"] / 1024**2 + total_mem_MB = 
mem_info["vram_total"] / 1024**2 + allocate_mem_percent = allocate_mem_MB / total_mem_MB * 100 + gpu_data_list.append( GpuData( uuid=gpu_uuid, utilization=gpu_utilization, mem_utilization=gpu_mem_utilization, + allocated_mem=allocate_mem_percent, + allocated_mem_value=allocate_mem_MB, + total_mem_value=total_mem_MB, ) ) return gpu_data_list @@ -499,7 +535,9 @@ class UsageLogger: cmd = " ".join(process.cmdline()) processName = process.name() pid = process.pid - if "python" in processName and cmd.startswith("python"): + is_python = "python" in processName and "python" in cmd + is_pytest = "pytest" in cmd + if is_python or is_pytest: python_test_processes.append({"pid": pid, "cmd": cmd}) except Exception: pass diff --git a/tools/stats/utilization_stats_lib.py b/tools/stats/utilization_stats_lib.py index 740fe71f176..33551fd55de 100644 --- a/tools/stats/utilization_stats_lib.py +++ b/tools/stats/utilization_stats_lib.py @@ -5,7 +5,7 @@ from typing import Optional from dataclasses_json import DataClassJsonMixin -_DATA_MODEL_VERSION = 1.0 +_DATA_MODEL_VERSION = 1.5 # data model for test log usage @@ -13,6 +13,7 @@ _DATA_MODEL_VERSION = 1.0 class UtilizationStats: avg: Optional[float] = None max: Optional[float] = None + raw: Optional[list[float]] = None @dataclass @@ -36,6 +37,9 @@ class GpuUsage(DataClassJsonMixin): uuid: Optional[str] = None util_percent: Optional[UtilizationStats] = None mem_util_percent: Optional[UtilizationStats] = None + allocated_mem_percent: Optional[UtilizationStats] = None + allocated_mem_value: Optional[UtilizationStats] = None + total_mem_value: Optional[float] = None @dataclass