mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 00:20:18 +01:00
Linux, Windows and macOS CI workflows should all use `.ci/docker/requirements-ci.txt`. TODOs: (1) investigate why `choco install cmake` is needed to successfully detect MKL; (2) move `psutil` installation from specific scripts into requirements-ci.txt. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163396 Approved by: https://github.com/Skylion007
276 lines
11 KiB
YAML
276 lines
11 KiB
YAML
# Reusable workflow (invoked through `workflow_call`) that runs the sharded
# PyTorch test suite on Windows runners against a previously uploaded build
# artifact.
#
# Step order: configure git, clean up and prepare the runner, check out the
# repo, run the setup-win action, optionally start the utilization monitor,
# download the build artifacts, run `.ci/pytorch/win-test.sh`, then upload
# logs/stats and tear the runner down.
name: win-test

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      cuda-version:
        required: true
        type: string
        description: What CUDA version to build with, "cpu" for none.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      timeout-minutes:
        required: false
        type: number
        default: 240
        description: |
          Set the maximum (in minutes) how long the workflow should take to finish
      disable-monitor:
        description: |
          [Experimental] Disable utilization monitoring for tests.
          Currently, by default we disable the monitor job and only look for specific tests,
          since we are investigating the behaviour of the monitor script with different tests.
        required: false
        type: boolean
        default: true
      monitor-log-interval:
        description: |
          Set the interval for the monitor script to log utilization.
        required: false
        type: number
        default: 5
      monitor-data-collect-interval:
        description: |
          Set the interval for the monitor script to collect data.
        required: false
        type: number
        default: 1

env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

jobs:
  test:
    # Don't run on forked repos or empty test matrix
    if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    strategy:
      # One job per entry of the caller-supplied JSON matrix; shards are
      # independent, so one failing shard does not cancel the others.
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    # Memory-leak-check configs get a fixed 600-minute budget; everything else
    # uses the caller-supplied timeout.
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
    defaults:
      run:
        shell: bash
    steps:
      # Duplicated in win-build because this MUST go before a checkout
      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
        shell: bash
        run: |
          git config --global core.longpaths true
          git config --global core.symlinks true
          git config --global core.ignorecase false

          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
          # the directory on Windows and prevent GHA from checking out as reported
          # in https://github.com/actions/checkout/issues/1018
          git config --global core.fsmonitor false

      - name: Clean up leftover processes on non-ephemeral Windows runner
        uses: pytorch/test-infra/.github/actions/cleanup-runner@main

      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
            To forward remote desktop on your local machine ssh as follows:
            ssh -L 3389:localhost:3389 %%username%%@%%hostname%%
            And then change password using `passwd` command.

            To start tests locally, change working folder to \actions-runner\_work\pytorch\pytorch\test,
            Activate miniconda and Visual Studio environment and set PYTHON_PATH, by running:
            call C:\Jenkins\Miniconda3\Scripts\activate.bat C:\Jenkins\Miniconda3
            call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64
            set PYTHONPATH=C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          no-sudo: true

      - name: Setup Windows
        uses: ./.github/actions/setup-win
        with:
          cuda-version: ${{ inputs.cuda-version }}

      # The job id/name produced here are consumed by the monitor, test and
      # upload steps below.
      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      # Launches the utilization monitor in the background and publishes its
      # PID as a step output so "Stop monitoring script" can kill it later.
      # Best-effort: a failure here must not fail the job.
      - name: Start monitoring script
        id: monitor-script
        env:
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          WORKFLOW_NAME: ${{ github.workflow }}
          WORKFLOW_RUN_ID: ${{github.run_id}}
          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
        shell: bash
        if: ${{ !inputs.disable-monitor }}
        continue-on-error: true
        run: |
          # Windows conda doesn't have python3 binary, only python, but it's python3
          ${CONDA_RUN} python -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
          ${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      # Downloaded into a per-run directory; the Test step reads the same
      # location through PYTORCH_FINAL_PACKAGE_DIR.
      - name: Download PyTorch Build Artifacts
        uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0
        with:
          name: ${{ inputs.build-environment }}
          path: C:\${{ github.run_id }}\build-results

      - name: Check build-results folder
        shell: powershell
        run: |
          tree /F C:\$Env:GITHUB_RUN_ID\build-results

      - name: Download TD artifacts
        continue-on-error: true
        uses: ./.github/actions/download-td-artifacts

      - name: Check for keep-going label and re-enabled test issues
        # This uses the filter-test-configs action because it conveniently
        # checks for labels and re-enabled test issues. It does not actually do
        # any filtering. All filtering is done in the build step.
        id: keep-going
        uses: ./.github/actions/filter-test-configs
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          test-matrix: ${{ inputs.test-matrix }}
          job-name: ${{ steps.get-job-id.outputs.job-name }}

      # The Test step gets the job timeout minus 30 minutes (presumably to leave
      # headroom for the upload and teardown steps that follow -- confirm).
      - name: Set Test step time
        id: test-timeout
        shell: bash
        env:
          JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
        run: |
          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

      # Installs the wheel from the downloaded build artifacts, then runs the
      # Windows test entry point. Everything the test script needs is passed
      # through the environment below.
      - name: Test
        id: test
        shell: bash
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        env:
          USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
          INSTALL_WINDOWS_SDK: 1
          PYTHON_VERSION: "3.10"
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
          NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
          VC_PRODUCT: "BuildTools"
          VC_VERSION: ""
          VS_VERSION: "17.4.1"
          VC_YEAR: "2022"
          AWS_DEFAULT_REGION: us-east-1
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          CUDA_VERSION: ${{ inputs.cuda-version }}
          PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          TEST_CONFIG: ${{ matrix.config }}
          # Read from the keep-going step like the other keep-going-derived
          # variables above. The pull_request webhook payload has no
          # `reenabled-issues` field, so the previous expression
          # (github.event.pull_request.reenabled-issues) was always empty.
          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          TORCH_CUDA_ARCH_LIST: "8.6"
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
        run: |
          pushd "${PYTORCH_FINAL_PACKAGE_DIR}"
          # shellcheck disable=SC2046,SC2102
          python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
          popd

          .ci/pytorch/win-test.sh

      - name: Upload pytest cache if tests failed
        uses: ./.github/actions/pytest-cache-upload
        continue-on-error: true
        if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
        with:
          cache_dir: .pytest_cache
          shard: ${{ matrix.shard }}
          sha: ${{ github.event.pull_request.head.sha || github.sha }}
          test_config: ${{ matrix.config }}
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

      # `|| true` keeps this step green when no *_toprint.log files exist.
      - name: Print remaining test logs
        shell: bash
        if: always() && steps.test.conclusion
        run: |
          cat test/**/*_toprint.log || true

      # Only runs when the monitor step actually published a PID.
      - name: Stop monitoring script
        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
        run: |
          kill "$MONITOR_SCRIPT_PID"

      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
        with:
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}

      - name: Upload utilization stats
        if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
        continue-on-error: true
        uses: ./.github/actions/upload-utilization-stats
        with:
          job_id: ${{ steps.get-job-id.outputs.job-id }}
          job_name: ${{ steps.get-job-id.outputs.job-name }}
          workflow_name: ${{ github.workflow }}
          workflow_run_id: ${{github.run_id}}
          workflow_attempt: ${{github.run_attempt}}

      - name: Parse ref
        id: parse-ref
        shell: bash
        run: python3 .github/scripts/parse_ref.py

      - name: Uninstall PyTorch
        if: always()
        continue-on-error: true
        shell: bash
        run: |
          # This step removes PyTorch installed by the test to give a clean slate
          # to the next job
          python3 -mpip uninstall -y torch

      - name: Teardown Windows
        uses: ./.github/actions/teardown-win
        if: always()
        timeout-minutes: 120