diff --git a/.azure_pipelines/job_templates/pytorch-template-unix.yml b/.azure_pipelines/job_templates/pytorch-template-unix.yml
new file mode 100644
index 00000000000..d52dd27faa7
--- /dev/null
+++ b/.azure_pipelines/job_templates/pytorch-template-unix.yml
@@ -0,0 +1,51 @@
+# PyTorch build steps template with Unix images Azure DevOps Instances
+#
+# This build depends on 5 secret variables set in the pipeline:
+#   - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps
+#   - AZURE_STORAGE_KEY: Secret var for authenticating to Azure Storage
+#   - TS_CLONE_PASSWORD, TS_PAT, TS_SM_PAT: Secret vars for specific unit tests (passed as _TS_CLONE_P, _TS_P, _TS_SM_P)
+
+parameters:
+  name: ''
+  pool: ''
+  container_endpoint: ''
+  customMatrixes: ''
+
+jobs:
+- job: ${{parameters.name}}
+  timeoutInMinutes: 600
+  strategy:
+    matrix:
+      ${{ insert }}: ${{parameters.customMatrixes}}
+  pool:
+    name: ${{ parameters.pool}}
+  variables:
+    DECODE_PERCENTS: false
+
+  steps:
+  # Don't checkout repo contents to save time and CPU compute. Environment variables
+  # related to checkout branch such as $(BUILD_SOURCEBRANCH) are still available.
+  - checkout: none
+
+  # Delete pytorch_tests repo from previous builds if exists
+  - bash: rm -rf pytorch_tests/
+    displayName: Delete pytorch_tests repo from previous builds if exists
+
+  # Clone PyTorch Tests repository; the PAT is sent as a single-line Basic auth header (empty user name)
+  - bash: |
+      B64_PAT=$(printf ':%s' "$_ADOTOKEN" | base64 | tr -d '\n')
+      git -c http.extraHeader="Authorization: Basic ${B64_PAT}" clone "$(AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL)"
+      cd pytorch_tests
+      git checkout "$(PYTORCH_TESTS_CHECKOUT_BRANCH)"
+    env:
+      _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT)
+    displayName: Clone PyTorch Tests repo
+
+  # Run PyTorch Unit Tests; secrets are mapped explicitly because secret variables are not exported to scripts
+  - bash: bash $(Build.SourcesDirectory)/pytorch_tests/scripts/linux/run.sh
+    env:
+      _AZURE_STORAGE_KEY: $(AZURE_STORAGE_KEY)
+      _TS_CLONE_P: $(TS_CLONE_PASSWORD)
+      _TS_P: $(TS_PAT)
+      _TS_SM_P: $(TS_SM_PAT)
+    displayName: Run PyTorch Unit Tests
diff --git a/.azure_pipelines/job_templates/pytorch-template-win.yml b/.azure_pipelines/job_templates/pytorch-template-win.yml
new file mode 100644
index 00000000000..76027d83633
--- /dev/null
+++ b/.azure_pipelines/job_templates/pytorch-template-win.yml
@@ -0,0 +1,49 @@
+# PyTorch build steps template with Windows images Azure DevOps Instances
+#
+# This build depends on 5 secret variables set in the pipeline:
+#   - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps
+#   - AZURE_STORAGE_KEY: Secret var for authenticating to Azure Storage
+#   - TS_CLONE_PASSWORD, TS_PAT, TS_SM_PAT: Secret vars for specific unit tests (passed as _TS_CLONE_P, _TS_P, _TS_SM_P)
+
+parameters:
+  name: ''
+  pool: ''
+  customMatrixes: ''
+
+jobs:
+- job: ${{parameters.name}}
+  timeoutInMinutes: 600
+  strategy:
+    matrix:
+      ${{ insert }}: ${{parameters.customMatrixes}}
+  pool:
+    name: ${{ parameters.pool}}
+
+  steps:
+  # Don't checkout repo contents to save time and CPU compute. Environment variables
+  # related to checkout branch such as $(BUILD_SOURCEBRANCH) are still available.
+  - checkout: none
+
+  # Delete pytorch_tests repo from previous builds if exists
+  - script: if exist "pytorch_tests/" rmdir "pytorch_tests/" /q /s
+    displayName: Delete pytorch_tests repo from previous builds if exists
+
+  # Clone PyTorch Tests repository; the PAT is sent as a Basic auth header (empty user name)
+  - powershell: |
+      $env:B64Pat = [Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes(":$env:_ADOTOKEN"))
+      git -c http.extraHeader="Authorization: Basic $env:B64Pat" clone $env:AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL
+      cd pytorch_tests
+      git checkout $(PYTORCH_TESTS_CHECKOUT_BRANCH)
+    env:
+      _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT)
+    displayName: Clone PyTorch Tests repo
+
+  # Run PyTorch Unit Tests; secrets are mapped explicitly because secret variables are not exported to scripts
+  - script: call $(Build.SourcesDirectory)\pytorch_tests\scripts\windows\run.bat
+    env:
+      _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT)
+      _AZURE_STORAGE_KEY: $(AZURE_STORAGE_KEY)
+      _TS_CLONE_P: $(TS_CLONE_PASSWORD)
+      _TS_P: $(TS_PAT)
+      _TS_SM_P: $(TS_SM_PAT)
+    displayName: Run PyTorch Unit Tests
diff --git a/.azure_pipelines/job_templates/wheel-wait-job-template.yml b/.azure_pipelines/job_templates/wheel-wait-job-template.yml
new file mode 100644
index 00000000000..c01f8c4201f
--- /dev/null
+++ b/.azure_pipelines/job_templates/wheel-wait-job-template.yml
@@ -0,0 +1,14 @@
+# Main logic to initiate wait for PR artifact to be ready: POSTs the run identifiers to api/JobStatus and waits for completion
+
+steps:
+- task: InvokeRESTAPI@1
+  displayName: 'Wait for job success and wheel ready'
+  timeoutInMinutes: 60
+  inputs:
+    connectionType: 'connectedServiceName'
+    serviceConnection: circleciconn
+    method: 'POST'
+    headers: '{"Content-Type":"application/json", "BranchName":"$(TARGET_BRANCH_TO_CHECK_PR)", "JobName":"$(TARGET_CIRCLECI_PR)", "PlanUrl":"$(System.CollectionUri)", "ProjectId":"$(System.TeamProjectId)", "HubName":"$(System.HostType)", "PlanId":"$(System.PlanId)", "JobId":"$(System.JobId)", "TimelineId":"$(System.TimelineId)", "TaskInstanceId":"$(System.TaskInstanceId)", "AuthToken":"$(System.AccessToken)"}'
+    body: ''
+    urlSuffix: 'api/JobStatus'
+    waitForCompletion: true
diff --git a/.azure_pipelines/job_templates/wheel-wait-template.yml b/.azure_pipelines/job_templates/wheel-wait-template.yml
new file mode 100644
index 00000000000..6f04f504dab
--- /dev/null
+++ b/.azure_pipelines/job_templates/wheel-wait-template.yml
@@ -0,0 +1,49 @@
+# Initiate 5 chained agentless (server) waiting jobs to check on the
+# status of PR artifact builds, for a maximum wait time of
+# 5 * 60 min = 300 minutes. These jobs will pass immediately
+# once the targeted CircleCI build is ready.
+
+jobs:
+- job: checkjob1
+  pool: server
+  timeoutInMinutes: 60
+  continueOnError: true
+
+  steps:
+  - template: wheel-wait-job-template.yml
+
+- job: checkjob2
+  pool: server
+  timeoutInMinutes: 60
+  dependsOn: checkjob1
+  continueOnError: true
+
+  steps:
+  - template: wheel-wait-job-template.yml
+
+- job: checkjob3
+  pool: server
+  timeoutInMinutes: 60
+  dependsOn: checkjob2
+  continueOnError: true
+
+  steps:
+  - template: wheel-wait-job-template.yml
+
+- job: checkjob4
+  pool: server
+  timeoutInMinutes: 60
+  dependsOn: checkjob3
+  continueOnError: true
+
+  steps:
+  - template: wheel-wait-job-template.yml
+
+- job: checkjob5
+  pool: server
+  timeoutInMinutes: 60
+  dependsOn: checkjob4
+  continueOnError: true
+
+  steps:
+  - template: wheel-wait-job-template.yml
diff --git a/.azure_pipelines/nightly-pytorch-tests-pipeline.yml b/.azure_pipelines/nightly-pytorch-tests-pipeline.yml
new file mode 100644
index 00000000000..6cc26b9b540
--- /dev/null
+++ b/.azure_pipelines/nightly-pytorch-tests-pipeline.yml
@@ -0,0 +1,50 @@
+# PyTorch Nightly PyTorch Tests Builds Pipeline on Azure DevOps
+#
+# This pipeline runs custom PyTorch unit-tests on nightly
+# PyTorch wheels.
+
+stages:
+- stage: 'NightlyCustomTests'
+  displayName: 'Run custom unit tests on PyTorch wheels'
+  jobs:
+  - template: job_templates/pytorch-template-unix.yml
+    parameters:
+      name: ubuntu_1804_CPU_docker
+      pool: $(BUILD_POOL_LIN_1)
+      customMatrixes:
+        Nightly_Custom_Tests:
+          _DOCKER_IMAGE: $(DOCKER_IMAGE_LIN_1)
+          _PYTHON_VERSION: $(PYTHON_VERSION_LIN_1)
+          _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_LIN_1)
+          _RUN_TESTS: $(RUN_TESTS_LIN)
+
+  - template: job_templates/pytorch-template-unix.yml
+    parameters:
+      name: ubuntu_1804_GPU_docker
+      pool: $(BUILD_POOL_LIN_2)
+      customMatrixes:
+        Nightly_Custom_Tests:
+          _DOCKER_IMAGE: $(DOCKER_IMAGE_LIN_2)
+          _PYTHON_VERSION: $(PYTHON_VERSION_LIN_2)
+          _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_LIN_2)
+          _RUN_TESTS: $(RUN_TESTS_LIN)
+
+  - template: job_templates/pytorch-template-win.yml
+    parameters:
+      name: windows_2019_CPU
+      pool: $(BUILD_POOL_WIN_1)
+      customMatrixes:
+        Nightly_Custom_Tests:
+          _PYTHON_VERSION: $(PYTHON_VERSION_WIN_1)
+          _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_WIN_1)
+          _RUN_TESTS: $(RUN_TESTS_WIN)
+
+  - template: job_templates/pytorch-template-win.yml
+    parameters:
+      name: windows_2019_GPU
+      pool: $(BUILD_POOL_WIN_2)
+      customMatrixes:
+        Nightly_Custom_Tests:
+          _PYTHON_VERSION: $(PYTHON_VERSION_WIN_2)
+          _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_WIN_2)
+          _RUN_TESTS: $(RUN_TESTS_WIN)
diff --git a/.azure_pipelines/pytorch-tests-pipeline.yml b/.azure_pipelines/pytorch-tests-pipeline.yml
new file mode 100644
index 00000000000..4b6ac825549
--- /dev/null
+++ b/.azure_pipelines/pytorch-tests-pipeline.yml
@@ -0,0 +1,30 @@
+# PyTorch PR PyTorch Tests Builds Pipeline on Azure DevOps
+#
+# This pipeline:
+#   1) ensures that CircleCI builds for a given PR
+#      have finished, and that its artifacts are
+#      ready for download
+#   2) runs custom PyTorch unit-tests on PyTorch
+#      wheels generated during PR builds.
+
+stages:
+- stage: 'EnsureArtifactsReady'
+  displayName: 'Ensure PyTorch PR Artifacts are ready'
+  jobs:
+  - template: job_templates/wheel-wait-template.yml
+
+- stage: 'PRCustomTests'
+  displayName: 'Run custom unit tests on PyTorch wheels'
+  jobs:
+  - template: job_templates/pytorch-template-unix.yml
+    parameters:
+      name: ubuntu_1804_GPU_docker
+      pool: $(BUILD_POOL_PR)
+      customMatrixes:
+        PR_Custom_Tests:
+          _PYTHON_VERSION: $(PYTHON_VERSION_PR)
+          _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_PR)
+          _TARGET_CIRCLECI_BUILD: $(TARGET_CIRCLECI_PR)
+          _TARGET_BRANCH_TO_CHECK: $(TARGET_BRANCH_TO_CHECK_PR)
+          _DOCKER_IMAGE: $(DOCKER_IMAGE_PR)
+          _RUN_TESTS: $(RUN_TESTS_PR)
diff --git a/docker/pytorch/ubuntu_cpu_gpu/Dockerfile b/docker/pytorch/ubuntu_cpu_gpu/Dockerfile
index c7209e8f3d7..7cb1d9309f2 100644
--- a/docker/pytorch/ubuntu_cpu_gpu/Dockerfile
+++ b/docker/pytorch/ubuntu_cpu_gpu/Dockerfile
@@ -49,8 +49,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
          cmake \
          curl \
          git \
+         git-lfs \
          libjpeg-dev \
          libpng-dev \
+         openmpi-bin \
          wget && \
      rm -rf /var/lib/apt/lists/*
 RUN /usr/sbin/update-ccache-symlinks
@@ -86,7 +88,10 @@ ARG TORCH_CUDA_ARCH_LIST_VAR
 RUN if [ -z "$TORCH_CUDA_ARCH_LIST_VAR" ] ; then \
         echo "Continuing CPU build ..."; \
     else \
-        echo "Setting CUDA env vars ..."; \
+        echo "Setting CUDA env vars and installing openmpi ..."; \
+        # Set MPI links to avoid libmpi_cxx.so.1 not found error
+        ln -s /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.20 /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.1; \
+        ln -s /usr/lib/x86_64-linux-gnu/libmpi.so.20.10.1 /usr/lib/x86_64-linux-gnu/libmpi.so.12; \
     fi
 # If the build argument TORCH_CUDA_ARCH_LIST_VAR is given, container will be
 # set for GPU/CUDA build, else for CPU build.