Add parameters for monitor (#152541)

Add log interval and log-data-collect interval to all test yml files.
Add upload step for all test yml files.

Next step: enable perf test with utilization.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152541
Approved by: https://github.com/huydhn
2025-12-06 12:20:52 +01:00 · 2025-05-02 20:24:06 +00:00 · 2025-05-02 20:24:06 +00:00 · 44f29a3669
commit 44f29a3669
parent ec68d082a1
11 changed files with 202 additions and 47 deletions
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -55,6 +55,18 @@ on:
        required: false
        type: boolean
        default: false
+      monitor-log-interval:
+        description: |
+          Set the interval for the monitor script to log utilization.
+        required: false
+        type: number
+        default: 5
+      monitor-data-collect-interval:
+        description: |
+          Set the interval for the monitor script to collect data.
+        required: false
+        type: number
+        default: 1
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
@ -172,9 +184,11 @@ jobs:
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          WORKFLOW_NAME: ${{ github.workflow }}
          WORKFLOW_RUN_ID: ${{github.run_id}}
+          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
+          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
        run: |
-          python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 dataclasses_json==0.6.7
-          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
+          python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
+          python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download build artifacts
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -38,6 +38,18 @@ on:
        required: false
        type: boolean
        default: true
+      monitor-log-interval:
+        description: |
+          Set the interval for the monitor script to log utilization.
+        required: false
+        type: number
+        default: 5
+      monitor-data-collect-interval:
+        description: |
+          Set the interval for the monitor script to collect data.
+        required: false
+        type: number
+        default: 1
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
@ -93,12 +105,27 @@ jobs:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

+      - name: Get workflow job id
+        id: get-job-id
+        uses: ./.github/actions/get-workflow-job-id
+        if: always()
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
      - name: Start monitoring script
        id: monitor-script
        if: ${{ !inputs.disable-monitor }}
        continue-on-error: true
+        env:
+          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
+          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
+          WORKFLOW_NAME: ${{ github.workflow }}
+          WORKFLOW_RUN_ID: ${{github.run_id}}
+          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
+          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
        run: |
-          ${CONDA_RUN} python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
+          ${CONDA_RUN} python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
+          ${CONDA_RUN} python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download build artifacts
@ -124,13 +151,6 @@ jobs:
        id: parse-ref
        run: .github/scripts/parse_ref.py

-      - name: Get workflow job id
-        id: get-job-id
-        uses: ./.github/actions/get-workflow-job-id
-        if: always()
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Check for keep-going label and re-enabled test issues
        # This uses the filter-test-configs action because it conveniently
        # checks for labels and re-enabled test issues.  It does not actually do
@ -237,6 +257,17 @@ jobs:
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}

+      - name: Upload utilization stats
+        if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
+        continue-on-error: true
+        uses: ./.github/actions/upload-utilization-stats
+        with:
+          job_id: ${{ steps.get-job-id.outputs.job-id }}
+          job_name: ${{ steps.get-job-id.outputs.job-name }}
+          workflow_name: ${{ github.workflow }}
+          workflow_run_id: ${{github.run_id}}
+          workflow_attempt: ${{github.run_attempt}}
+
      - name: Clean up disk space
        if: always()
        continue-on-error: true
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@ -50,7 +50,18 @@ on:
        required: false
        type: boolean
        default: true
-
+      monitor-log-interval:
+        description: |
+          Set the interval for the monitor script to log utilization.
+        required: false
+        type: number
+        default: 5
+      monitor-data-collect-interval:
+        description: |
+          Set the interval for the monitor script to collect data.
+        required: false
+        type: number
+        default: 1
 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

@ -101,14 +112,28 @@ jobs:
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

+      - name: Get workflow job id
+        id: get-job-id
+        uses: ./.github/actions/get-workflow-job-id
+        if: always()
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
      - name: Start monitoring script
        id: monitor-script
+        env:
+          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
+          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
+          WORKFLOW_NAME: ${{ github.workflow }}
+          WORKFLOW_RUN_ID: ${{github.run_id}}
+          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
+          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        run: |
-          python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
-          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
+          python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
+          python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download build artifacts
@ -124,13 +149,6 @@ jobs:
        id: parse-ref
        run: .github/scripts/parse_ref.py

-      - name: Get workflow job id
-        id: get-job-id
-        uses: ./.github/actions/get-workflow-job-id
-        if: always()
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Check for keep-going label and re-enabled test issues
        # This uses the filter-test-configs action because it conveniently
        # checks for labels and re-enabled test issues.  It does not actually do
@ -309,5 +327,16 @@ jobs:
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}

+      - name: Upload utilization stats
+        if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
+        continue-on-error: true
+        uses: ./.github/actions/upload-utilization-stats
+        with:
+          job_id: ${{ steps.get-job-id.outputs.job-id }}
+          job_name: ${{ steps.get-job-id.outputs.job-name }}
+          workflow_name: ${{ github.workflow }}
+          workflow_run_id: ${{github.run_id}}
+          workflow_attempt: ${{github.run_attempt}}
+
      - name: Teardown ROCm
        uses: ./.github/actions/teardown-rocm
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -36,7 +36,18 @@ on:
        required: false
        type: boolean
        default: true
-
+      monitor-log-interval:
+        description: |
+          Set the interval for the monitor script to log utilization.
+        required: false
+        type: number
+        default: 5
+      monitor-data-collect-interval:
+        description: |
+          Set the interval for the monitor script to collect data.
+        required: false
+        type: number
+        default: 1
 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

@ -106,14 +117,29 @@ jobs:
            set -eu
            python3 -m pip install 'xdoctest>=1.1.0'

+      - name: Get workflow job id
+        id: get-job-id
+        uses: ./.github/actions/get-workflow-job-id
+        if: always()
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
      - name: Start monitoring script
        id: monitor-script
+        env:
+          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
+          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
+          WORKFLOW_NAME: ${{ github.workflow }}
+          WORKFLOW_RUN_ID: ${{github.run_id}}
+          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
+          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
        shell: bash
        if: ${{ !inputs.disable-monitor }}
        continue-on-error: true
        run: |
          # Windows conda doesn't have python3 binary, only python, but it's python3
-          ${CONDA_RUN} python -m tools.stats.monitor > usage_log.txt 2>&1 &
+          ${CONDA_RUN} python -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
+          ${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download PyTorch Build Artifacts
@ -131,13 +157,6 @@ jobs:
        continue-on-error: true
        uses: ./.github/actions/download-td-artifacts

-      - name: Get workflow job id
-        id: get-job-id
-        uses: ./.github/actions/get-workflow-job-id
-        if: always()
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Check for keep-going label and re-enabled test issues
        # This uses the filter-test-configs action because it conveniently
        # checks for labels and re-enabled test issues.  It does not actually do
@ -236,6 +255,17 @@ jobs:
        with:
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}

+      - name: Upload utilization stats
+        if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
+        continue-on-error: true
+        uses: ./.github/actions/upload-utilization-stats
+        with:
+          job_id: ${{ steps.get-job-id.outputs.job-id }}
+          job_name: ${{ steps.get-job-id.outputs.job-name }}
+          workflow_name: ${{ github.workflow }}
+          workflow_run_id: ${{github.run_id}}
+          workflow_attempt: ${{github.run_attempt}}
+
      - name: Parse ref
        id: parse-ref
        shell: bash
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@ -46,7 +46,18 @@ on:
        required: false
        type: boolean
        default: true
-
+      monitor-log-interval:
+        description: |
+          Set the interval for the monitor script to log utilization.
+        required: false
+        type: number
+        default: 5
+      monitor-data-collect-interval:
+        description: |
+          Set the interval for the monitor script to collect data.
+        required: false
+        type: number
+        default: 1
 permissions:
  id-token: write
  contents: read
@ -102,14 +113,28 @@ jobs:
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

+      - name: Get workflow job id
+        id: get-job-id
+        uses: ./.github/actions/get-workflow-job-id
+        if: always()
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
      - name: Start monitoring script
        id: monitor-script
        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
+        env:
+          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
+          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
+          WORKFLOW_NAME: ${{ github.workflow }}
+          WORKFLOW_RUN_ID: ${{github.run_id}}
+          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
+          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
        run: |
-          python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
-          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
+          python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
+          python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download build artifacts
@ -121,13 +146,6 @@ jobs:
        id: parse-ref
        run: .github/scripts/parse_ref.py

-      - name: Get workflow job id
-        id: get-job-id
-        uses: ./.github/actions/get-workflow-job-id
-        if: always()
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Check for keep-going label and re-enabled test issues
        # This uses the filter-test-configs action because it conveniently
        # checks for labels and re-enabled test issues.  It does not actually do
@ -284,6 +302,17 @@ jobs:
          use-gha: true
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}

+      - name: Upload utilization stats
+        if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
+        continue-on-error: true
+        uses: ./.github/actions/upload-utilization-stats
+        with:
+          job_id: ${{ steps.get-job-id.outputs.job-id }}
+          job_name: ${{ steps.get-job-id.outputs.job-name }}
+          workflow_name: ${{ github.workflow }}
+          workflow_run_id: ${{github.run_id}}
+          workflow_attempt: ${{github.run_attempt}}
+
      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
--- a/.github/workflows/inductor-perf-compare.yml
+++ b/.github/workflows/inductor-perf-compare.yml
@ -52,4 +52,6 @@ jobs:
      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
      # disable monitor in perf tests for more investigation
      disable-monitor: true
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
    secrets: inherit
--- a/.github/workflows/inductor-perf-test-nightly-h100.yml
+++ b/.github/workflows/inductor-perf-test-nightly-h100.yml
@ -120,8 +120,10 @@ jobs:
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 720
-      # disable monitor in perf tests for more investigation
+      # disable monitor in perf tests, next step is to enable it
      disable-monitor: true
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
    secrets: inherit

  test-weekly:
@ -135,8 +137,10 @@ jobs:
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 1440
-      # disable monitor in perf tests for more investigation
+      # disable monitor in perf tests, next step is to enable it
      disable-monitor: true
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
    secrets: inherit

  test:
@ -152,4 +156,6 @@ jobs:
      timeout-minutes: 720
      # disable monitor in perf tests for more investigation
      disable-monitor: true
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
    secrets: inherit
--- a/.github/workflows/inductor-perf-test-nightly-rocm.yml
+++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml
@ -117,4 +117,6 @@ jobs:
      timeout-minutes: 720
      # Disable monitor in perf tests for more investigation
      disable-monitor: true
+      monitor-log-interval: 10
+      monitor-data-collect-interval: 2
    secrets: inherit
--- a/.github/workflows/inductor-perf-test-nightly-x86.yml
+++ b/.github/workflows/inductor-perf-test-nightly-x86.yml
@ -101,8 +101,10 @@ jobs:
      docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
-      # disable monitor in perf tests for more investigation
+      # disable monitor in perf tests
      disable-monitor: true
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
    secrets: inherit


@ -117,6 +119,8 @@ jobs:
      docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
-      # disable monitor in perf tests for more investigation
+      # disable monitor in perf tests
      disable-monitor: true
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
    secrets: inherit
--- a/.github/workflows/inductor-perf-test-nightly.yml
+++ b/.github/workflows/inductor-perf-test-nightly.yml
@ -122,8 +122,10 @@ jobs:
      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
-      # disable monitor in perf tests for more investigation
+      # disable monitor in perf tests, next step is to enable it
      disable-monitor: true
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
    secrets: inherit

  linux-focal-cuda12_6-py3_10-gcc9-inductor-test-weekly:
@ -137,8 +139,10 @@ jobs:
      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
      timeout-minutes: 1440
-      # disable monitor in perf tests for more investigation
+      # disable monitor in perf tests, next step is to enable it
      disable-monitor: true
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
    secrets: inherit

  linux-focal-cuda12_6-py3_10-gcc9-inductor-test:
@ -152,6 +156,8 @@ jobs:
      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
-      # disable monitor in perf tests for more investigation
+      # disable monitor in perf tests, next step is to enable it
      disable-monitor: true
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
    secrets: inherit
--- a/.github/workflows/inductor-periodic.yml
+++ b/.github/workflows/inductor-periodic.yml
@ -132,8 +132,10 @@ jobs:
      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
-      # disable monitor in smoke perf tests for more investigation
+      # disable monitor in perf tests, next step is to enable it
      disable-monitor: true
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
    secrets: inherit

  linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build: