Added DevOps PR and Nightly Build logic (#58007)

Summary:
This PR adds Azure DevOps support for running custom PyTorch unit tests on PyTorch PR and Nightly builds.

PR Builds on Azure DevOps:
- Ensures that the wheel artifacts for a given PR build are ready
- Once the wheels are ready, PyTorch custom tests are run on torch installation from build wheels

Nightly Builds on Azure DevOps:
- Queues 4 builds {Win,Linux}*{cpu, CUDA} to run PyTorch custom unit tests on nightly PyTorch builds.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/58007

Reviewed By: seemethere, mruberry

Differential Revision: D28342428

Pulled By: malfet

fbshipit-source-id: a454accf69163f9ba77845eeb54831ef91437981
This commit is contained in:
Mustafa Bal 2021-05-12 12:21:12 -07:00 committed by Facebook GitHub Bot
parent 7156168f71
commit 53bc6f79f3
7 changed files with 249 additions and 1 deletions

View File

@ -0,0 +1,51 @@
# PyTorch build steps template with Unix images on Azure DevOps instances
#
# This build depends on 5 secret variables set as environment variables in the pipeline:
#   - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps
#   - AZURE_STORAGE_KEY: Secret var for authenticating to Azure Storage
#   - TS_CLONE_PASSWORD, TS_PAT, TS_SM_PAT: Secret vars for specific unit tests,
#     handed to the test script as _TS_CLONE_P, _TS_P and _TS_SM_P
# The steps also reference the pipeline variables
# AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL and PYTORCH_TESTS_CHECKOUT_BRANCH.
#
# Template parameters:
#   - name: job name
#   - pool: agent pool name
#   - container_endpoint: accepted but not referenced by the steps below
#   - customMatrixes: mapping inserted verbatim under strategy.matrix
parameters:
  name: ''
  pool: ''
  container_endpoint: ''
  customMatrixes: ''

jobs:
  - job: ${{parameters.name}}
    timeoutInMinutes: 600
    strategy:
      matrix:
        ${{ insert }}: ${{parameters.customMatrixes}}
    pool:
      name: ${{ parameters.pool}}
    variables:
      # NOTE(review): agent setting for percent-decoding of logging commands; confirm
      # against the Azure Pipelines agent documentation before changing.
      DECODE_PERCENTS: false

    steps:
      # Don't checkout repo contents to save time and CPU compute. Environment variables
      # related to checkout branch such as $(BUILD_SOURCEBRANCH) are still available.
      - checkout: none

      # Delete pytorch_tests repo from previous builds if exists
      - bash: rm -rf pytorch_tests/
        displayName: Delete pytorch_tests repo from previous builds if exists

      # Clone PyTorch Tests repository
      - bash: |
          # Basic-auth credential: empty user name, PAT as password.
          # The token is passed as a printf argument (not spliced into the format
          # string) so a '%' or '\' in it is taken literally, and newlines are
          # stripped because base64 may line-wrap long output, which would break
          # the HTTP header.
          B64_PAT=$(printf ':%s' "$_ADOTOKEN" | base64 | tr -d '\n')
          git -c http.extraHeader="Authorization: Basic ${B64_PAT}" clone $(AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL)
          cd pytorch_tests
          git checkout $(PYTORCH_TESTS_CHECKOUT_BRANCH)
        env:
          _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT)
        displayName: Clone PyTorch Tests repo

      # Run PyTorch Unit Tests
      - bash: bash $(Build.SourcesDirectory)/pytorch_tests/scripts/linux/run.sh
        env:
          _AZURE_STORAGE_KEY: $(AZURE_STORAGE_KEY)
          _TS_CLONE_P: $(TS_CLONE_PASSWORD)
          _TS_P: $(TS_PAT)
          _TS_SM_P: $(TS_SM_PAT)
        displayName: Run PyTorch Unit Tests

View File

@ -0,0 +1,49 @@
# PyTorch build steps template with Windows images on Azure DevOps instances
#
# This build depends on 5 secret variables set as environment variables in the pipeline:
#   - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps
#   - AZURE_STORAGE_KEY: Secret var for authenticating to Azure Storage
#   - TS_CLONE_PASSWORD, TS_PAT, TS_SM_PAT: Secret vars for specific unit tests,
#     handed to the test script as _TS_CLONE_P, _TS_P and _TS_SM_P
# The steps also reference the pipeline variables
# AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL and PYTORCH_TESTS_CHECKOUT_BRANCH.
#
# Template parameters:
#   - name: job name
#   - pool: agent pool name
#   - customMatrixes: mapping inserted verbatim under strategy.matrix
parameters:
  name: ''
  pool: ''
  customMatrixes: ''

jobs:
  - job: ${{parameters.name}}
    timeoutInMinutes: 600
    strategy:
      matrix:
        ${{ insert }}: ${{parameters.customMatrixes}}
    pool:
      name: ${{ parameters.pool}}

    steps:
      # Don't checkout repo contents to save time and CPU compute. Environment variables
      # related to checkout branch such as $(BUILD_SOURCEBRANCH) are still available.
      - checkout: none

      # Delete pytorch_tests repo from previous builds if exists
      - script: if exist "pytorch_tests/" rmdir "pytorch_tests/" /q /s
        displayName: Delete pytorch_tests repo from previous builds if exists

      # Clone PyTorch Tests repository
      - powershell: |
          # Basic-auth credential: empty user name, PAT as password.
          $env:B64Pat = [Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes(":$env:_ADOTOKEN"))
          # Variable name spelled in upper case, matching the Unix template's
          # $(AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL) reference.
          git -c http.extraHeader="Authorization: Basic $env:B64Pat" clone $env:AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL
          cd pytorch_tests
          git checkout $(PYTORCH_TESTS_CHECKOUT_BRANCH)
        env:
          _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT)
        displayName: Clone PyTorch Tests repo

      # Run PyTorch Unit Tests
      - script: call $(Build.SourcesDirectory)\pytorch_tests\scripts\windows\run.bat
        env:
          _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT)
          _AZURE_STORAGE_KEY: $(AZURE_STORAGE_KEY)
          _TS_CLONE_P: $(TS_CLONE_PASSWORD)
          _TS_P: $(TS_PAT)
          _TS_SM_P: $(TS_SM_PAT)
        displayName: Run PyTorch Unit Tests

View File

@ -0,0 +1,14 @@
# Step template that starts the wait for the PR artifact to become ready.
#
# A single InvokeRESTAPI task POSTs to the api/JobStatus endpoint of the
# `circleciconn` service connection. The branch and CircleCI job to watch,
# plus the identifiers of the current pipeline run, are sent as request
# headers; the request body is empty.
steps:
  - task: InvokeRESTAPI@1
    displayName: Wait for job success and wheel ready
    timeoutInMinutes: 60
    inputs:
      serviceConnection: 'circleciconn'
      connectionType: connectedServiceName
      urlSuffix: api/JobStatus
      method: POST
      # Kept on one line: the value is passed to the task as-is.
      headers: '{"Content-Type":"application/json", "BranchName":"$(TARGET_BRANCH_TO_CHECK_PR)", "JobName":"$(TARGET_CIRCLECI_PR)", "PlanUrl":"$(System.CollectionUri)", "ProjectId":"$(System.TeamProjectId)", "HubName":"$(System.HostType)", "PlanId":"$(System.PlanId)", "JobId":"$(System.JobId)", "TimelineId":"$(System.TimelineId)", "TaskInstanceId":"$(System.TaskInstanceId)", "AuthToken":"$(System.AccessToken)"}'
      body: ""
      waitForCompletion: true

View File

@ -0,0 +1,49 @@
# Chain of 5 agentless (server) jobs that poll the status of the PR artifact
# build. Each job waits up to 60 minutes and starts only after the previous
# one, giving a maximum total wait of 5 * 60 min = 300 minutes. A job passes
# immediately once the targeted CircleCI build is ready.
#
# Every job sets continueOnError so that a check that did not succeed does
# not stop the next check in the chain from starting.
jobs:
  # First check: no dependency, starts right away.
  - job: checkjob1
    continueOnError: true
    timeoutInMinutes: 60
    pool: server
    steps:
      - template: wheel-wait-job-template.yml

  - job: checkjob2
    dependsOn: checkjob1
    continueOnError: true
    timeoutInMinutes: 60
    pool: server
    steps:
      - template: wheel-wait-job-template.yml

  - job: checkjob3
    dependsOn: checkjob2
    continueOnError: true
    timeoutInMinutes: 60
    pool: server
    steps:
      - template: wheel-wait-job-template.yml

  - job: checkjob4
    dependsOn: checkjob3
    continueOnError: true
    timeoutInMinutes: 60
    pool: server
    steps:
      - template: wheel-wait-job-template.yml

  # Last check in the chain.
  - job: checkjob5
    dependsOn: checkjob4
    continueOnError: true
    timeoutInMinutes: 60
    pool: server
    steps:
      - template: wheel-wait-job-template.yml

View File

@ -0,0 +1,50 @@
# Nightly PyTorch custom-tests pipeline on Azure DevOps
#
# Runs the custom PyTorch unit tests against nightly PyTorch wheels.
# One stage with four jobs: two from the Unix job template and two from
# the Windows job template, each with a single-entry matrix whose values
# come from pipeline variables.
stages:
  - stage: NightlyCustomTests
    displayName: Run custom unit tests on PyTorch wheels
    jobs:
      # Linux, first configuration (pool / image / versions suffixed _LIN_1)
      - template: job_templates/pytorch-template-unix.yml
        parameters:
          name: ubuntu_1804_CPU_docker
          pool: $(BUILD_POOL_LIN_1)
          customMatrixes:
            Nightly_Custom_Tests:
              _DOCKER_IMAGE: $(DOCKER_IMAGE_LIN_1)
              _PYTHON_VERSION: $(PYTHON_VERSION_LIN_1)
              _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_LIN_1)
              _RUN_TESTS: $(RUN_TESTS_LIN)

      # Linux, second configuration (suffix _LIN_2)
      - template: job_templates/pytorch-template-unix.yml
        parameters:
          name: ubuntu_1804_GPU_docker
          pool: $(BUILD_POOL_LIN_2)
          customMatrixes:
            Nightly_Custom_Tests:
              _DOCKER_IMAGE: $(DOCKER_IMAGE_LIN_2)
              _PYTHON_VERSION: $(PYTHON_VERSION_LIN_2)
              _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_LIN_2)
              _RUN_TESTS: $(RUN_TESTS_LIN)

      # Windows, first configuration (suffix _WIN_1); no Docker image is passed
      - template: job_templates/pytorch-template-win.yml
        parameters:
          name: windows_2019_CPU
          pool: $(BUILD_POOL_WIN_1)
          customMatrixes:
            Nightly_Custom_Tests:
              _PYTHON_VERSION: $(PYTHON_VERSION_WIN_1)
              _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_WIN_1)
              _RUN_TESTS: $(RUN_TESTS_WIN)

      # Windows, second configuration (suffix _WIN_2)
      - template: job_templates/pytorch-template-win.yml
        parameters:
          name: windows_2019_GPU
          pool: $(BUILD_POOL_WIN_2)
          customMatrixes:
            Nightly_Custom_Tests:
              _PYTHON_VERSION: $(PYTHON_VERSION_WIN_2)
              _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_WIN_2)
              _RUN_TESTS: $(RUN_TESTS_WIN)

View File

@ -0,0 +1,30 @@
# PR PyTorch custom-tests pipeline on Azure DevOps
#
# Two stages:
#   1) EnsureArtifactsReady - makes sure the CircleCI builds for a given PR
#      have finished and that their artifacts are ready for download.
#   2) PRCustomTests - runs the custom PyTorch unit tests on the PyTorch
#      wheels generated during the PR builds.
stages:
  - stage: EnsureArtifactsReady
    displayName: Ensure PyTorch PR Artifacts are ready
    jobs:
      - template: job_templates/wheel-wait-template.yml

  - stage: PRCustomTests
    displayName: Run custom unit tests on PyTorch wheels
    jobs:
      # Single Unix-template job; all matrix values come from the *_PR
      # pipeline variables.
      - template: job_templates/pytorch-template-unix.yml
        parameters:
          name: ubuntu_1804_GPU_docker
          pool: $(BUILD_POOL_PR)
          customMatrixes:
            PR_Custom_Tests:
              _DOCKER_IMAGE: $(DOCKER_IMAGE_PR)
              _PYTHON_VERSION: $(PYTHON_VERSION_PR)
              _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_PR)
              _TARGET_CIRCLECI_BUILD: $(TARGET_CIRCLECI_PR)
              _TARGET_BRANCH_TO_CHECK: $(TARGET_BRANCH_TO_CHECK_PR)
              _RUN_TESTS: $(RUN_TESTS_PR)

View File

@ -49,8 +49,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
cmake \
curl \
git \
git-lfs \
libjpeg-dev \
libpng-dev \
openmpi-bin \
wget && \
rm -rf /var/lib/apt/lists/*
RUN /usr/sbin/update-ccache-symlinks
@ -86,7 +88,10 @@ ARG TORCH_CUDA_ARCH_LIST_VAR
RUN if [ -z "$TORCH_CUDA_ARCH_LIST_VAR" ] ; then \
echo "Continuing CPU build ..."; \
else \
echo "Setting CUDA env vars ..."; \
echo "Setting CUDA env vars and installing openmpi ..."; \
# Set MPI links to avoid libmpi_cxx.so.1 not found error
ln -s /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.20 /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.1; \
ln -s /usr/lib/x86_64-linux-gnu/libmpi.so.20.10.1 /usr/lib/x86_64-linux-gnu/libmpi.so.12; \
fi
# If the build argument TORCH_CUDA_ARCH_LIST_VAR is given, container will be
# set for GPU/CUDA build, else for CPU build.