Add parameters for monitor (#152541)

Add `monitor-log-interval` and `monitor-data-collect-interval` inputs to all test yml files

Add upload step for all test yml files

Next step:
Enable utilization monitoring in perf tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152541
Approved by: https://github.com/huydhn
This commit is contained in:
Yang Wang 2025-05-02 20:24:06 +00:00 committed by PyTorch MergeBot
parent ec68d082a1
commit 44f29a3669
11 changed files with 202 additions and 47 deletions

View File

@ -55,6 +55,18 @@ on:
required: false
type: boolean
default: false
# Forwarded to `tools.stats.monitor` as `--log-interval` when the monitor
# is launched.
# NOTE(review): the unit is not stated here — presumably seconds; confirm
# against the monitor script.
monitor-log-interval:
description: |
Set the interval for the monitor script to log utilization.
required: false
type: number
default: 5
# Forwarded to `tools.stats.monitor` as `--data-collect-interval` when the
# monitor is launched.
# NOTE(review): presumably the same unit as monitor-log-interval — confirm.
monitor-data-collect-interval:
description: |
Set the interval for the monitor script to collect data.
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
@ -172,9 +184,11 @@ jobs:
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
WORKFLOW_NAME: ${{ github.workflow }}
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 dataclasses_json==0.6.7
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download build artifacts

View File

@ -38,6 +38,18 @@ on:
required: false
type: boolean
default: true
# Forwarded to `tools.stats.monitor` as `--log-interval` by the
# "Start monitoring script" step.
# NOTE(review): the unit is not stated here — presumably seconds; confirm
# against the monitor script.
monitor-log-interval:
description: |
Set the interval for the monitor script to log utilization.
required: false
type: number
default: 5
# Forwarded to `tools.stats.monitor` as `--data-collect-interval` by the
# "Start monitoring script" step.
# NOTE(review): presumably the same unit as monitor-log-interval — confirm.
monitor-data-collect-interval:
description: |
Set the interval for the monitor script to collect data.
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
@ -93,12 +105,27 @@ jobs:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
# Starts `tools.stats.monitor` in the background (via ${CONDA_RUN}), with its
# output redirected to usage_log.txt, and publishes the background PID as the
# `monitor-script-pid` step output.
# Skipped when the `disable-monitor` input is set; continue-on-error keeps a
# failure here from failing the job.
# JOB_ID / JOB_NAME come from the `get-job-id` step's outputs; the two
# MONITOR_* variables carry the workflow's interval inputs into the script.
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
WORKFLOW_NAME: ${{ github.workflow }}
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
${CONDA_RUN} python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
${CONDA_RUN} python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
${CONDA_RUN} python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download build artifacts
@ -124,13 +151,6 @@ jobs:
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Check for keep-going label and re-enabled test issues
# This uses the filter-test-configs action because it conveniently
# checks for labels and re-enabled test issues. It does not actually do
@ -237,6 +257,17 @@ jobs:
schema-version: v3
github-token: ${{ secrets.GITHUB_TOKEN }}
# Runs the repo-local upload-utilization-stats action, identifying the data
# by job id/name, workflow name, run id and run attempt.
# Gate: evaluated even after earlier failures (always()), but the step only
# runs when the `test` step has a conclusion other than 'skipped' and the
# monitor was not disabled.
# continue-on-error keeps an upload failure from failing the job.
- name: Upload utilization stats
if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
continue-on-error: true
uses: ./.github/actions/upload-utilization-stats
with:
job_id: ${{ steps.get-job-id.outputs.job-id }}
job_name: ${{ steps.get-job-id.outputs.job-name }}
workflow_name: ${{ github.workflow }}
workflow_run_id: ${{github.run_id}}
workflow_attempt: ${{github.run_attempt}}
- name: Clean up disk space
if: always()
continue-on-error: true

View File

@ -50,7 +50,18 @@ on:
required: false
type: boolean
default: true
# Forwarded to `tools.stats.monitor` as `--log-interval` by the
# "Start monitoring script" step.
# NOTE(review): the unit is not stated here — presumably seconds; confirm
# against the monitor script.
monitor-log-interval:
description: |
Set the interval for the monitor script to log utilization.
required: false
type: number
default: 5
# Forwarded to `tools.stats.monitor` as `--data-collect-interval` by the
# "Start monitoring script" step.
# NOTE(review): presumably the same unit as monitor-log-interval — confirm.
monitor-data-collect-interval:
description: |
Set the interval for the monitor script to collect data.
required: false
type: number
default: 1
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -101,14 +112,28 @@ jobs:
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
# Installs pinned Python packages with pip, then starts `tools.stats.monitor`
# in the background with its output redirected to usage_log.txt, and
# publishes the background PID as the `monitor-script-pid` step output.
# Skipped when the `disable-monitor` input is set; continue-on-error keeps a
# failure here from failing the job.
# JOB_ID / JOB_NAME come from the `get-job-id` step's outputs; the two
# MONITOR_* variables carry the workflow's interval inputs into the script.
- name: Start monitoring script
id: monitor-script
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
WORKFLOW_NAME: ${{ github.workflow }}
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download build artifacts
@ -124,13 +149,6 @@ jobs:
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Check for keep-going label and re-enabled test issues
# This uses the filter-test-configs action because it conveniently
# checks for labels and re-enabled test issues. It does not actually do
@ -309,5 +327,16 @@ jobs:
schema-version: v3
github-token: ${{ secrets.GITHUB_TOKEN }}
# Runs the repo-local upload-utilization-stats action, identifying the data
# by job id/name, workflow name, run id and run attempt.
# Gate: evaluated even after earlier failures (always()), but the step only
# runs when the `test` step has a conclusion other than 'skipped' and the
# monitor was not disabled.
# continue-on-error keeps an upload failure from failing the job.
- name: Upload utilization stats
if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
continue-on-error: true
uses: ./.github/actions/upload-utilization-stats
with:
job_id: ${{ steps.get-job-id.outputs.job-id }}
job_name: ${{ steps.get-job-id.outputs.job-name }}
workflow_name: ${{ github.workflow }}
workflow_run_id: ${{github.run_id}}
workflow_attempt: ${{github.run_attempt}}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm

View File

@ -36,7 +36,18 @@ on:
required: false
type: boolean
default: true
# Forwarded to `tools.stats.monitor` as `--log-interval` by the
# "Start monitoring script" step.
# NOTE(review): the unit is not stated here — presumably seconds; confirm
# against the monitor script.
monitor-log-interval:
description: |
Set the interval for the monitor script to log utilization.
required: false
type: number
default: 5
# Forwarded to `tools.stats.monitor` as `--data-collect-interval` by the
# "Start monitoring script" step.
# NOTE(review): presumably the same unit as monitor-log-interval — confirm.
monitor-data-collect-interval:
description: |
Set the interval for the monitor script to collect data.
required: false
type: number
default: 1
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -106,14 +117,29 @@ jobs:
set -eu
python3 -m pip install 'xdoctest>=1.1.0'
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
# Starts `tools.stats.monitor` in the background (via ${CONDA_RUN}, using the
# `python` binary), with its output redirected to usage_log.txt, and publishes
# the background PID as the `monitor-script-pid` step output.
# Skipped when the `disable-monitor` input is set; continue-on-error keeps a
# failure here from failing the job.
# JOB_ID / JOB_NAME come from the `get-job-id` step's outputs; the two
# MONITOR_* variables carry the workflow's interval inputs into the script.
- name: Start monitoring script
id: monitor-script
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
WORKFLOW_NAME: ${{ github.workflow }}
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
shell: bash
if: ${{ !inputs.disable-monitor }}
continue-on-error: true
run: |
# Windows conda doesn't have python3 binary, only python, but it's python3
${CONDA_RUN} python -m tools.stats.monitor > usage_log.txt 2>&1 &
${CONDA_RUN} python -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download PyTorch Build Artifacts
@ -131,13 +157,6 @@ jobs:
continue-on-error: true
uses: ./.github/actions/download-td-artifacts
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Check for keep-going label and re-enabled test issues
# This uses the filter-test-configs action because it conveniently
# checks for labels and re-enabled test issues. It does not actually do
@ -236,6 +255,17 @@ jobs:
with:
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
# Runs the repo-local upload-utilization-stats action, identifying the data
# by job id/name, workflow name, run id and run attempt.
# Gate: evaluated even after earlier failures (always()), but the step only
# runs when the `test` step has a conclusion other than 'skipped' and the
# monitor was not disabled.
# continue-on-error keeps an upload failure from failing the job.
- name: Upload utilization stats
if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
continue-on-error: true
uses: ./.github/actions/upload-utilization-stats
with:
job_id: ${{ steps.get-job-id.outputs.job-id }}
job_name: ${{ steps.get-job-id.outputs.job-name }}
workflow_name: ${{ github.workflow }}
workflow_run_id: ${{github.run_id}}
workflow_attempt: ${{github.run_attempt}}
- name: Parse ref
id: parse-ref
shell: bash

View File

@ -46,7 +46,18 @@ on:
required: false
type: boolean
default: true
# Forwarded to `tools.stats.monitor` as `--log-interval` by the
# "Start monitoring script" step.
# NOTE(review): the unit is not stated here — presumably seconds; confirm
# against the monitor script.
monitor-log-interval:
description: |
Set the interval for the monitor script to log utilization.
required: false
type: number
default: 5
# Forwarded to `tools.stats.monitor` as `--data-collect-interval` by the
# "Start monitoring script" step.
# NOTE(review): presumably the same unit as monitor-log-interval — confirm.
monitor-data-collect-interval:
description: |
Set the interval for the monitor script to collect data.
required: false
type: number
default: 1
permissions:
id-token: write
contents: read
@ -102,14 +113,28 @@ jobs:
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
# Installs pinned Python packages with pip, then starts `tools.stats.monitor`
# in the background with its output redirected to usage_log.txt, and
# publishes the background PID as the `monitor-script-pid` step output.
# Skipped when the `disable-monitor` input is set; continue-on-error keeps a
# failure here from failing the job.
# JOB_ID / JOB_NAME come from the `get-job-id` step's outputs; the two
# MONITOR_* variables carry the workflow's interval inputs into the script.
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
WORKFLOW_NAME: ${{ github.workflow }}
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download build artifacts
@ -121,13 +146,6 @@ jobs:
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Check for keep-going label and re-enabled test issues
# This uses the filter-test-configs action because it conveniently
# checks for labels and re-enabled test issues. It does not actually do
@ -284,6 +302,17 @@ jobs:
use-gha: true
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
# Runs the repo-local upload-utilization-stats action, identifying the data
# by job id/name, workflow name, run id and run attempt.
# Gate: evaluated even after earlier failures (always()), but the step only
# runs when the `test` step has a conclusion other than 'skipped' and the
# monitor was not disabled.
# continue-on-error keeps an upload failure from failing the job.
- name: Upload utilization stats
if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
continue-on-error: true
uses: ./.github/actions/upload-utilization-stats
with:
job_id: ${{ steps.get-job-id.outputs.job-id }}
job_name: ${{ steps.get-job-id.outputs.job-name }}
workflow_name: ${{ github.workflow }}
workflow_run_id: ${{github.run_id}}
workflow_attempt: ${{github.run_attempt}}
- name: Collect backtraces from coredumps (if any)
if: always()
run: |

View File

@ -52,4 +52,6 @@ jobs:
test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
# disable monitor in perf tests for more investigation
disable-monitor: true
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@ -120,8 +120,10 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests for more investigation
# disable monitor in perf tests, next step is to enable it
disable-monitor: true
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly:
@ -135,8 +137,10 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
# disable monitor in perf tests for more investigation
# disable monitor in perf tests, next step is to enable it
disable-monitor: true
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test:
@ -152,4 +156,6 @@ jobs:
timeout-minutes: 720
# disable monitor in perf tests for more investigation
disable-monitor: true
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@ -117,4 +117,6 @@ jobs:
timeout-minutes: 720
# Disable monitor in perf tests for more investigation
disable-monitor: true
monitor-log-interval: 10
monitor-data-collect-interval: 2
secrets: inherit

View File

@ -101,8 +101,10 @@ jobs:
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests for more investigation
# disable monitor in perf tests
disable-monitor: true
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
@ -117,6 +119,8 @@ jobs:
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests for more investigation
# disable monitor in perf tests
disable-monitor: true
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@ -122,8 +122,10 @@ jobs:
docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests for more investigation
# disable monitor in perf tests, next step is to enable it
disable-monitor: true
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
linux-focal-cuda12_6-py3_10-gcc9-inductor-test-weekly:
@ -137,8 +139,10 @@ jobs:
docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
timeout-minutes: 1440
# disable monitor in perf tests for more investigation
# disable monitor in perf tests, next step is to enable it
disable-monitor: true
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
linux-focal-cuda12_6-py3_10-gcc9-inductor-test:
@ -152,6 +156,8 @@ jobs:
docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests for more investigation
# disable monitor in perf tests, next step is to enable it
disable-monitor: true
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@ -132,8 +132,10 @@ jobs:
build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
# disable monitor in smoke perf tests for more investigation
# disable monitor in perf tests, next step is to enable it
disable-monitor: true
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build: