diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 18329c52625..01fb48f5f85 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -9,3 +9,4 @@ self-hosted-runner: - windows.4xlarge - windows.8xlarge.nvidia.gpu - bm-runner + - linux.rocm.gpu diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index 81abc2237bc..27ccb7d4f06 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -44,7 +44,8 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "pytorch-xla-linux-bionic-py3.7-clang8", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" + "win-vs2019-cuda11.3-py3", + "win-vs2019-cuda11.3-py3-smoke" ], "ciflow/android": [ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", @@ -120,7 +121,8 @@ "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", "periodic-win-vs2019-cuda11.1-py3", "periodic-win-vs2019-cuda11.5-py3", - "win-vs2019-cuda11.3-py3" + "win-vs2019-cuda11.3-py3", + "win-vs2019-cuda11.3-py3-smoke" ], "ciflow/default": [ "linux-binary-conda", @@ -149,7 +151,7 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3", + "win-vs2019-cuda11.3-py3-smoke", "windows-binary-libtorch-cxx11-abi", "windows-binary-libtorch-pre-cxx11", "windows-binary-wheel" @@ -281,7 +283,8 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "pytorch-xla-linux-bionic-py3.7-clang8", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" + "win-vs2019-cuda11.3-py3", + "win-vs2019-cuda11.3-py3-smoke" ], "ciflow/vulkan": [ "linux-vulkan-bionic-py3.7-clang9" @@ -290,7 +293,8 @@ "periodic-win-vs2019-cuda11.1-py3", "periodic-win-vs2019-cuda11.5-py3", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" + "win-vs2019-cuda11.3-py3", + "win-vs2019-cuda11.3-py3-smoke" ], "ciflow/xla": [ "pytorch-xla-linux-bionic-py3.7-clang8" diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 0d5a86a11d3..da84f89b710 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -2,17 +2,16 @@ from dataclasses import asdict, dataclass, field from pathlib import Path -from typing import Dict, Set, List, Iterable +from typing import Dict, Set, List, Iterable, Any import jinja2 import json import os import sys -from typing_extensions import Literal +from typing_extensions import Literal, TypedDict import generate_binary_build_matrix # type: ignore[import] -YamlShellBool = Literal["''", 1] Arch = Literal["windows", "linux", "macos"] DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com" @@ -142,6 +141,11 @@ class CIFlowRuleset: outfile.write('\n') +class Config(TypedDict): + num_shards: int + runner: str + + @dataclass class CIWorkflow: # Required fields @@ -162,50 +166,38 @@ class CIWorkflow: is_scheduled: str = '' is_default: bool = False num_test_shards: int = 1 - only_run_smoke_tests_on_pull_request: bool = False - num_test_shards_on_pull_request: int = -1 - distributed_test: bool = True timeout_after: int = 240 xcode_version: str = '' only_on_pr: bool = False ios_arch: str = '' ios_platform: str = '' + test_jobs: Any = field(default_factory=list) - # The following variables will be set as environment variables, - # so it's easier for both shell and Python scripts to consume it if false is represented 
as the empty string. - enable_jit_legacy_test: YamlShellBool = "''" - enable_distributed_test: YamlShellBool = "''" - enable_multigpu_test: YamlShellBool = "''" - enable_nogpu_no_avx_test: YamlShellBool = "''" - enable_nogpu_no_avx2_test: YamlShellBool = "''" - enable_slow_test: YamlShellBool = "''" - enable_docs_test: YamlShellBool = "''" - enable_backwards_compat_test: YamlShellBool = "''" - enable_xla_test: YamlShellBool = "''" - enable_noarch_test: YamlShellBool = "''" - enable_force_on_cpu_test: YamlShellBool = "''" + enable_default_test: bool = True + enable_smoke_test: bool = True + enable_jit_legacy_test: bool = False + enable_distributed_test: bool = True + enable_multigpu_test: bool = False + enable_nogpu_no_avx_test: bool = False + enable_nogpu_no_avx2_test: bool = False + enable_slow_test: bool = False + enable_docs_test: bool = False + enable_backwards_compat_test: bool = False + enable_xla_test: bool = False + enable_noarch_test: bool = False + enable_force_on_cpu_test: bool = False def __post_init__(self) -> None: if not self.build_generates_artifacts: self.exclude_test = True - if self.distributed_test: - self.enable_distributed_test = 1 - self.multigpu_runner_type = LINUX_MULTIGPU_RUNNERS.get(self.test_runner_type, "linux.16xlarge.nvidia.gpu") self.distributed_gpu_runner_type = LINUX_DISTRIBUTED_GPU_RUNNERS.get(self.test_runner_type, "linux.8xlarge.nvidia.gpu") if LABEL_CIFLOW_DEFAULT in self.ciflow_config.labels: self.is_default = True - # If num_test_shards_on_pull_request is not user-defined, default to num_test_shards unless we are - # only running smoke tests on the pull request. - if self.num_test_shards_on_pull_request == -1: - # Don't run the default if we are only running smoke tests - if self.only_run_smoke_tests_on_pull_request: - self.num_test_shards_on_pull_request = 0 - else: - self.num_test_shards_on_pull_request = self.num_test_shards + self.test_jobs = self._gen_test_jobs() self.assert_valid() def assert_valid(self) -> None: @@ -254,6 +246,83 @@ class CIWorkflow: output_file.write("\n") print(output_file_path) + def normalized_build_environment(self, suffix: str) -> str: + return self.build_environment.replace(".", "_") + suffix + + def _gen_test_jobs(self) -> Any: + if self.arch == "linux": + MULTIGPU_RUNNER_TYPE = "linux.16xlarge.nvidia.gpu" + DISTRIBUTED_GPU_RUNNER_TYPE = "linux.8xlarge.nvidia.gpu" + NOGPU_RUNNER_TYPE = "linux.2xlarge" + elif self.arch == "windows": + DISTRIBUTED_GPU_RUNNER_TYPE = self.test_runner_type + NOGPU_RUNNER_TYPE = "windows.4xlarge" + + test_jobs = [] + + configs: Dict[str, Config] = {} + if self.enable_jit_legacy_test: + configs["jit_legacy"] = {"num_shards": 1, "runner": self.test_runner_type} + if self.enable_multigpu_test: + configs["multigpu"] = {"num_shards": 1, "runner": MULTIGPU_RUNNER_TYPE} + + if self.enable_nogpu_no_avx_test: + configs["nogpu_NO_AVX"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE} + if self.enable_nogpu_no_avx2_test: + configs["nogpu_NO_AVX2"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE} + if self.enable_force_on_cpu_test: + configs["force_on_cpu"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE} + if self.enable_distributed_test: + configs["distributed"] = { + "num_shards": 1, + "runner": DISTRIBUTED_GPU_RUNNER_TYPE + if "cuda" in str(self.build_environment) + else self.test_runner_type, + } + if self.enable_slow_test: + configs["slow"] = {"num_shards": 1, "runner": self.test_runner_type} + if self.enable_docs_test: + configs["docs_test"] = {"num_shards": 1, "runner": self.test_runner_type} + 
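+        # NOTE (illustration, hypothetical values): an entry such as
+        #   configs["docs_test"] = {"num_shards": 1, "runner": "linux.2xlarge"}
+        # is expanded by the loop further below into one job dict per shard, e.g.
+        #   {"id": "test_docs_test_1_1",
+        #    "name": "test (docs_test, 1, 1, linux.2xlarge)",
+        #    "config": "docs_test", "shard": 1, "num_shards": 1,
+        #    "runner": "linux.2xlarge"}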
if self.enable_backwards_compat_test:
+            configs["backwards_compat"] = {
+                "num_shards": 1,
+                "runner": self.test_runner_type,
+            }
+        if self.enable_xla_test:
+            configs["xla"] = {"num_shards": 1, "runner": self.test_runner_type}
+        if self.enable_noarch_test:
+            configs["noarch"] = {"num_shards": 1, "runner": self.test_runner_type}
+
+        if self.enable_smoke_test:
+            configs["smoke_tests"] = {"num_shards": 1, "runner": self.test_runner_type}
+
+        for name, config in configs.items():
+            for shard in range(1, config["num_shards"] + 1):
+                test_jobs.append(
+                    {
+                        "id": f"test_{name}_{shard}_{config['num_shards']}",
+                        "name": f"test ({name}, {shard}, {config['num_shards']}, {config['runner']})",
+                        "config": name,
+                        "shard": shard,
+                        "num_shards": config["num_shards"],
+                        "runner": config["runner"],
+                    }
+                )
+
+        if self.enable_default_test:
+            for shard in range(1, self.num_test_shards + 1):
+                test_jobs.append(
+                    {
+                        "id": f"test_default_{shard}_{self.num_test_shards}",
+                        "name": f"test (default, {shard}, {self.num_test_shards}, {self.test_runner_type})",
+                        "config": "default",
+                        "shard": shard,
+                        "num_shards": self.num_test_shards,
+                        "runner": self.test_runner_type,
+                    }
+                )
+        return test_jobs
+
 @dataclass
 class DockerWorkflow:
     build_environment: str
@@ -327,17 +396,30 @@ WINDOWS_WORKFLOWS = [
             labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CPU, LABEL_CIFLOW_WIN}
         ),
     ),
+    CIWorkflow(
+        arch="windows",
+        build_environment="win-vs2019-cuda11.3-py3-smoke",
+        cuda_version="11.3",
+        test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
+        enable_default_test=False,
+        enable_smoke_test=True,
+        enable_force_on_cpu_test=True,
+        only_on_pr=True,
+        ciflow_config=CIFlowConfig(
+            run_on_canary=True,
+            labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
+        ),
+    ),
     CIWorkflow(
         arch="windows",
         build_environment="win-vs2019-cuda11.3-py3",
         cuda_version="11.3",
         test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
         num_test_shards=2,
-        only_run_smoke_tests_on_pull_request=True,
-        enable_force_on_cpu_test=1,
+        enable_force_on_cpu_test=True,
         ciflow_config=CIFlowConfig(
             run_on_canary=True,
-            labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
+            labels={LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
         ),
     ),
     CIWorkflow(
@@ -346,7 +428,7 @@ WINDOWS_WORKFLOWS = [
         cuda_version="11.5",
         test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
         num_test_shards=2,
-        enable_force_on_cpu_test=1,
+        enable_force_on_cpu_test=True,
         is_scheduled="45 4,10,16,22 * * *",
         ciflow_config=CIFlowConfig(
             run_on_canary=True,
@@ -372,9 +454,9 @@ LINUX_WORKFLOWS = [
         build_environment="linux-xenial-py3.7-gcc5.4",
         docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4",
         test_runner_type=LINUX_CPU_TEST_RUNNER,
-        enable_jit_legacy_test=1,
-        enable_backwards_compat_test=1,
-        enable_docs_test=1,
+        enable_jit_legacy_test=True,
+        enable_backwards_compat_test=True,
+        enable_docs_test=True,
         num_test_shards=2,
         ciflow_config=CIFlowConfig(
             run_on_canary=True,
@@ -475,7 +557,7 @@ LINUX_WORKFLOWS = [
         docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-asan",
         test_runner_type=LINUX_CPU_TEST_RUNNER,
         num_test_shards=3,
-        distributed_test=False,
+        enable_distributed_test=False,
         ciflow_config=CIFlowConfig(
             labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_SANITIZERS, LABEL_CIFLOW_CPU},
         ),
     ),
@@ -486,7 +568,7 @@ LINUX_WORKFLOWS = [
         build_environment="linux-xenial-py3-clang7-onnx",
         docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx",
         test_runner_type=LINUX_CPU_TEST_RUNNER,
         num_test_shards=2,
-        distributed_test=False,
+        enable_distributed_test=False,
ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_ONNX, LABEL_CIFLOW_CPU}, ), @@ -496,11 +578,11 @@ LINUX_WORKFLOWS = [ build_environment="linux-bionic-cuda10.2-py3.9-gcc7", docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, - enable_jit_legacy_test=1, - enable_multigpu_test=1, - enable_nogpu_no_avx_test=1, - enable_nogpu_no_avx2_test=1, - enable_slow_test=1, + enable_jit_legacy_test=True, + enable_multigpu_test=True, + enable_nogpu_no_avx_test=True, + enable_nogpu_no_avx2_test=True, + enable_slow_test=True, num_test_shards=2, ciflow_config=CIFlowConfig( run_on_canary=True, @@ -623,8 +705,8 @@ LINUX_WORKFLOWS = [ docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9", test_runner_type=LINUX_CPU_TEST_RUNNER, num_test_shards=2, - distributed_test=False, - enable_noarch_test=1, + enable_distributed_test=False, + enable_noarch_test=True, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_NOARCH}, ), @@ -635,7 +717,7 @@ LINUX_WORKFLOWS = [ docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9", test_runner_type=LINUX_CPU_TEST_RUNNER, num_test_shards=1, - distributed_test=False, + enable_distributed_test=False, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_VULKAN}, ), @@ -646,7 +728,7 @@ LINUX_WORKFLOWS = [ docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, num_test_shards=2, - distributed_test=False, + enable_distributed_test=False, timeout_after=360, # Only run this on master 4 times per day since it does take a while is_scheduled="0 */4 * * *", @@ -663,8 +745,9 @@ XLA_WORKFLOWS = [ docker_image_base=f"{DOCKER_REGISTRY}/pytorch/xla_base", test_runner_type=LINUX_CPU_TEST_RUNNER, num_test_shards=2, - distributed_test=False, - enable_xla_test=1, + enable_distributed_test=False, + enable_xla_test=True, + enable_default_test=False, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_XLA}, ), @@ -801,7 +884,7 @@ MACOS_WORKFLOWS = [ xcode_version="12.4", test_runner_type=MACOS_TEST_RUNNER_11, num_test_shards=2, - distributed_test=False, + enable_distributed_test=False, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_MACOS}, ), diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py deleted file mode 100755 index 4cbc20691a1..00000000000 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env python3 - -"""Generates a matrix to be utilized through github actions - -Will output a matrix to represent our testing configurations, which is currently -dictated by just sharding. - -""" - -import json -import os -from typing import Dict - -from typing_extensions import TypedDict - - -BUILD_ENVIRONMENT = os.getenv('BUILD_ENVIRONMENT') -assert BUILD_ENVIRONMENT is not None - -class Config(TypedDict): - num_shards: int - runner: str - - -# When the user specifies labels that are NOT ciflow/default, the expectation is -# that the workflows should be triggered as if they are on trunk. For example, when -# ciflow/all is specified, we should run the full test suite for Windows CUDA -# and NOT only the smoke tests. 
-def run_as_if_on_trunk() -> bool: - ON_PULL_REQUEST = os.getenv('GITHUB_HEAD_REF') - if not ON_PULL_REQUEST: - return True - - from pathlib import Path - GITHUB_DIR = Path(__file__).resolve().parent.parent - - with open(f'{GITHUB_DIR}/generated-ciflow-ruleset.json') as f: - labels_to_workflows = json.load(f)['label_rules'] - - pr_labels = json.loads(os.getenv('PR_LABELS', '[]')) - current_workflow_triggered_by_label = False - for label in pr_labels: - if label != 'ciflow/default' and label in labels_to_workflows: - workflows_triggered_by_label = labels_to_workflows[label] - if any([BUILD_ENVIRONMENT in workflow for workflow in workflows_triggered_by_label]): - current_workflow_triggered_by_label = True - break - - return current_workflow_triggered_by_label - -def main() -> None: - INCLUDE_DEFAULT_TEST = True - TEST_RUNNER_TYPE = os.getenv('TEST_RUNNER_TYPE') - assert TEST_RUNNER_TYPE is not None - RUN_SMOKE_TESTS_ONLY_ON_PR = os.getenv('RUN_SMOKE_TESTS_ONLY_ON_PR') - RUN_SMOKE_TESTS = RUN_SMOKE_TESTS_ONLY_ON_PR == "true" and not run_as_if_on_trunk() - NUM_TEST_SHARDS_ON_PULL_REQUEST = os.getenv('NUM_TEST_SHARDS_ON_PULL_REQUEST') - NUM_TEST_SHARDS = int(os.getenv('NUM_TEST_SHARDS', '0')) - if not run_as_if_on_trunk() and NUM_TEST_SHARDS_ON_PULL_REQUEST: - NUM_TEST_SHARDS = int(NUM_TEST_SHARDS_ON_PULL_REQUEST) - MULTIGPU_RUNNER_TYPE = os.getenv('MULTIGPU_RUNNER_TYPE') - DISTRIBUTED_GPU_RUNNER_TYPE = os.getenv('DISTRIBUTED_GPU_RUNNER_TYPE', TEST_RUNNER_TYPE) - NOGPU_RUNNER_TYPE = os.getenv('NOGPU_RUNNER_TYPE') - configs: Dict[str, Config] = {} - if os.getenv('ENABLE_JIT_LEGACY_TEST'): - configs['jit_legacy'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if MULTIGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_MULTIGPU_TEST'): - configs['multigpu'] = {'num_shards': 1, 'runner': MULTIGPU_RUNNER_TYPE} - if NOGPU_RUNNER_TYPE is not None: - if os.getenv('ENABLE_NOGPU_NO_AVX_TEST'): - configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'): - configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_FORCE_ON_CPU_TEST'): - configs['force_on_cpu'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_DISTRIBUTED_TEST'): - configs['distributed'] = { - 'num_shards': 1, - 'runner': DISTRIBUTED_GPU_RUNNER_TYPE if "cuda" in str(BUILD_ENVIRONMENT) else TEST_RUNNER_TYPE - } - if os.getenv('ENABLE_SLOW_TEST'): - configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_DOCS_TEST'): - configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'): - configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_XLA_TEST'): - configs['xla'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - INCLUDE_DEFAULT_TEST = False - if os.getenv('ENABLE_NOARCH_TEST'): - configs['noarch'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if RUN_SMOKE_TESTS: - configs['smoke_tests'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - matrix = { - 'include': [ - { - 'config': 'default', - 'shard': shard, - 'num_shards': NUM_TEST_SHARDS, - 'runner': TEST_RUNNER_TYPE, - } - for shard in range(1, NUM_TEST_SHARDS + 1) - if INCLUDE_DEFAULT_TEST - ] + [ - { - 'config': name, - 'shard': shard, - 'num_shards': config['num_shards'], - 'runner': config['runner'], - } - for name, config in configs.items() - for shard in range(1, config['num_shards'] + 1) - ] - } - render_matrix = {'config': 
list(dict.fromkeys(x['config'] for x in matrix['include']))} - print(json.dumps({'matrix': matrix, 'render-matrix': render_matrix}, indent=2)) - print(f'::set-output name=matrix::{json.dumps(matrix)}') - print(f'::set-output name=render-matrix::{json.dumps(render_matrix)}') - - -if __name__ == "__main__": - main() diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 123d498363f..855917e0742 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -219,13 +219,12 @@ concurrency: {%- endif %} {%- endmacro -%} -{%- macro upload_downloaded_files(name, artifact_name="", use_s3=True, when="always()") -%} +{%- macro upload_downloaded_files(name, config=None, shard=None, num_shards=None, runner=None, artifact_name="", use_s3=True, when="always()") -%} - name: Zip JSONs for upload if: !{{ when }} env: {%- if name == 'linux' or name == 'windows' or name == 'macos' %} - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' -{%- else %} + FILE_SUFFIX: '${{ github.job }}-!{{ config }}-!{{ shard }}-!{{ num_shards }}-!{{ runner }}'{%- else %} FILE_SUFFIX: '!{{ name }}-${{ github.job }}' {%- endif %} {%- if name == 'windows' %} @@ -257,12 +256,12 @@ concurrency: test-jsons-*.zip {%- endmacro -%} -{%- macro upload_test_reports(name, artifact_name="", use_s3=True) -%} +{%- macro upload_test_reports(name, config=None, shard=None, num_shards=None, runner=None, artifact_name="", use_s3=True) -%} - name: Zip test reports for upload if: always() env: {%- if name == 'linux' or name == 'windows' or name == 'macos' %} - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-!{{ config }}-!{{ shard }}-!{{ num_shards }}-!{{ runner }}' {%- else %} FILE_SUFFIX: '!{{ name }}-${{ github.job }}' {%- endif %} diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 02a1a8fb026..775394b1789 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -176,53 +176,18 @@ jobs: {%- endblock %} {%- if not exclude_test %} {% block test +%} - generate-test-matrix: + {%- for test_job in test_jobs %} + !{{ test_job.id }}: + name: !{{ test_job.name }} needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ test_runner_type }} - ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }} - ENABLE_JIT_LEGACY_TEST: !{{ enable_jit_legacy_test }} - ENABLE_MULTIGPU_TEST: !{{ enable_multigpu_test }} - ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }} - ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }} - ENABLE_SLOW_TEST: !{{ enable_slow_test }} - ENABLE_DOCS_TEST: !{{ enable_docs_test }} - ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }} - ENABLE_XLA_TEST: !{{ enable_xla_test }} - ENABLE_NOARCH_TEST: !{{ enable_noarch_test }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - MULTIGPU_RUNNER_TYPE: !{{ multigpu_runner_type }} - DISTRIBUTED_GPU_RUNNER_TYPE: !{{ distributed_gpu_runner_type }} - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: !{{ test_job.runner }} timeout-minutes: !{{ common.timeout_minutes }} env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: !{{ build_environment }}-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: !{{ test_job.config }} + SHARD_NUMBER: !{{ test_job.shard }} + NUM_TEST_SHARDS: !{{ test_job.num_shards }} PR_BODY: ${{ github.event.pull_request.body }} steps: {%- if 'rocm' in test_runner_type %} @@ -235,14 +200,12 @@ jobs: run: | !{{ common.add_retry_to_env() }} retry docker pull "${DOCKER_IMAGE}" -{%- if 'rocm' in test_runner_type %} +{%- if 'rocm' in test_runner_type and "nogpu" not in test_job.config %} - name: ROCm set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'rocm') && !contains(matrix.config, 'nogpu') }} run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" -{%- else %} +{%- elif "cuda" in build_environment and "nogpu" not in test_job.config %} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -365,11 +328,11 @@ jobs: {%- endif %} !{{ common.render_test_results() }} {%- if 'rocm' in test_runner_type %} - !{{ common.upload_downloaded_files(name='linux', use_s3=False) }} - !{{ common.upload_test_reports(name='linux', artifact_name="test-reports", use_s3=False) }} + !{{ common.upload_downloaded_files(name='linux', use_s3=False, config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} + !{{ common.upload_test_reports(name='linux', artifact_name="test-reports", use_s3=False, config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} {%- else %} - !{{ common.upload_downloaded_files(name='linux') }} - !{{ common.upload_test_reports(name='linux') }} + !{{ common.upload_downloaded_files(name='linux', config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} + !{{ common.upload_test_reports(name='linux', config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} {%- endif %} !{{ common.upload_test_statistics(build_environment) }} {%- if 'rocm' in test_runner_type %} @@ -377,6 +340,7 @@ jobs: {%- else %} !{{ common.teardown_ec2_linux() }} {%- endif %} +{%- endfor %} {% endblock %} {%- endif -%} {%- if enable_doc_jobs %} diff --git a/.github/templates/macos_ci_workflow.yml.j2 b/.github/templates/macos_ci_workflow.yml.j2 index 413df391183..ea7aa370cb5 100644 --- a/.github/templates/macos_ci_workflow.yml.j2 +++ b/.github/templates/macos_ci_workflow.yml.j2 @@ -87,40 +87,17 @@ jobs: {% endblock +%} {%- if not exclude_test %} {% block test +%} - generate-test-matrix: + {%- for test_job in test_jobs %} + !{{ test_job.id }}: + name: !{{ test_job.name }} needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ 
test_runner_type }} - ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: !{{ test_job.runner }} timeout-minutes: !{{ common.timeout_minutes }} env: JOB_BASE_NAME: !{{ build_environment }}-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: !{{ test_job.config }} + SHARD_NUMBER: !{{ test_job.shard }} + NUM_TEST_SHARDS: !{{ test_job.num_shards }} PR_BODY: ${{ github.event.pull_request.body }} steps: !{{ common.checkout(submodules="false") }} @@ -143,9 +120,10 @@ jobs: python3 -mpip install dist/*.whl .jenkins/pytorch/macos-test.sh !{{ common.render_test_results() }} - !{{ common.upload_downloaded_files(name='macos', artifact_name="test-jsons", use_s3=False) }} - !{{ common.upload_test_reports("macos", artifact_name="test-reports", use_s3=False) }} + !{{ common.upload_downloaded_files(name='macos', config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner, artifact_name="test-jsons", use_s3=False) }} + !{{ common.upload_test_reports("macos", config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner, artifact_name="test-reports", use_s3=False) }} !{{ common.upload_test_statistics(build_environment, needs_credentials=True) }} +{%- endfor %} {% endblock +%} {%- endif %} diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 53f50aa37de..db392d9cbe7 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -31,11 +31,12 @@ on: - '!{{ label }}/*' {%- endif %} {%- endfor %} -{%- if not is_scheduled %} +{%- if not is_scheduled and not only_on_pr %} branches: - master - release/* -{%- else %} +{%- endif %} +{%- if is_scheduled and not only_on_pr %} schedule: - cron: !{{ is_scheduled }} {%- endif %} @@ -130,46 +131,20 @@ jobs: rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ test_runner_type }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - NUM_TEST_SHARDS_ON_PULL_REQUEST: !{{ num_test_shards_on_pull_request }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: !{{ enable_force_on_cpu_test }} - RUN_SMOKE_TESTS_ONLY_ON_PR: !{{ only_run_smoke_tests_on_pull_request }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - 
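Where the deleted generate-test-matrix job built a matrix at run time, the templates below simply loop over the precomputed test_jobs list. A minimal, self-contained sketch of that rendering step, assuming the generator's jinja2 environment uses the !{{ ... }} variable delimiters seen in these templates:

    import jinja2

    # Hypothetical stand-in for one entry of CIWorkflow.test_jobs.
    test_jobs = [{"id": "test_force_on_cpu_1_1", "runner": "windows.4xlarge"}]

    env = jinja2.Environment(
        variable_start_string="!{{",  # assumption, to match "!{{ ... }}" in the templates
        variable_end_string="}}",
    )
    template = env.from_string(
        "{%- for test_job in test_jobs %}\n"
        "  !{{ test_job.id }}:\n"
        "    runs-on: !{{ test_job.runner }}\n"
        "{%- endfor %}"
    )
    # Prints (roughly):
    #   test_force_on_cpu_1_1:
    #     runs-on: windows.4xlarge
    print(template.render(test_jobs=test_jobs))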
- test: + {%- for test_job in test_jobs %} + !{{ test_job.id }}: + name: !{{ test_job.name }} timeout-minutes: !{{ common.timeout_minutes }} env: JOB_BASE_NAME: !{{ build_environment }}-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: !{{ test_job.shard }} + NUM_TEST_SHARDS: !{{ test_job.num_shards }} + TEST_CONFIG: !{{ test_job.config }} http_proxy: "!{{ common.squid_proxy }}" https_proxy: "!{{ common.squid_proxy }}" PR_BODY: ${{ github.event.pull_request.body }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + needs: build + runs-on: !{{ test_job.runner }} steps: !{{ common.display_ec2_information() }} - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" @@ -181,14 +156,12 @@ jobs: shell: powershell run: | .\.circleci\scripts\vs_install.ps1 -{%- if cuda_version != "cpu" %} +{%- if cuda_version != "cpu" and not test_job.config == 'force_on_cpu' %} - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} shell: bash run: | .circleci/scripts/windows_cuda_install.sh - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} shell: bash run: | .circleci/scripts/windows_cudnn_install.sh @@ -215,8 +188,8 @@ jobs: timeout-minutes: 210 run: | .jenkins/pytorch/win-test.sh - !{{ common.upload_downloaded_files(name='windows') }} - !{{ common.upload_test_reports(name='windows') }} + !{{ common.upload_downloaded_files(name='windows', config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} + !{{ common.upload_test_reports(name='windows', config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} !{{ common.render_test_results() }} !{{ common.wait_and_kill_ssh_windows() }} !{{ common.parse_ref() }} @@ -227,3 +200,4 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* + {%- endfor %} diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index c78dc63a446..e2631900a36 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -249,53 +249,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_jit_legacy_1_1: + name: test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: 1 - ENABLE_MULTIGPU_TEST: 1 - ENABLE_NOGPU_NO_AVX_TEST: 1 - ENABLE_NOGPU_NO_AVX2_TEST: 1 - ENABLE_SLOW_TEST: 1 - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: 
.github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.4xlarge.nvidia.gpu timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: jit_legacy + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,7 +323,6 @@ jobs: } retry docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -475,7 +438,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-jit_legacy-1-1-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -491,7 +454,2015 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-jit_legacy-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
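+      # NOTE: everything this job previously read from ${{ matrix.config }},
+      # ${{ matrix.shard }} and ${{ matrix.runner }} is now rendered as a
+      # literal at generation time: TEST_CONFIG: jit_legacy, SHARD_NUMBER: 1,
+      # runs-on: linux.4xlarge.nvidia.gpu, and the FILE_SUFFIX on the zipped
+      # test artifacts above.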
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_multigpu_1_1: + name: test (multigpu, 1, 1, linux.16xlarge.nvidia.gpu) + needs: build + runs-on: linux.16xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: multigpu + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-multigpu-1-1-linux.16xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-multigpu-1-1-linux.16xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
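+      # NOTE: the Test step above dispatches on TEST_CONFIG -- 'multigpu' runs
+      # .jenkins/pytorch/multigpu-test.sh, onnx build environments run
+      # .jenkins/caffe2/test.sh, and every other config falls through to
+      # .jenkins/pytorch/test.sh inside the detached container started by
+      # docker run.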
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_nogpu_NO_AVX_1_1: + name: test (nogpu_NO_AVX, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: nogpu_NO_AVX + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-nogpu_NO_AVX-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-nogpu_NO_AVX-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_nogpu_NO_AVX2_1_1: + name: test (nogpu_NO_AVX2, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: nogpu_NO_AVX2 + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
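+      # NOTE: the nogpu_* configs exercise this CUDA build on a CPU-only
+      # runner (linux.2xlarge), so unlike the GPU jobs above this one never
+      # installs the nvidia driver or sets GPU_FLAG.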
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-nogpu_NO_AVX2-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-nogpu_NO_AVX2-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_distributed_1_1: + name: test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) + needs: build + runs-on: linux.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
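The ECR login step above extracts the account id by grepping and cutting the JSON from aws sts get-caller-identity, which depends on the CLI's exact output formatting. A sketch of a less fragile variant, assuming the AWS CLI's built-in --query/--output options are acceptable in this environment:

# Extract the AWS account id without text-scraping the JSON output.
AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
aws ecr get-login-password --region "$AWS_DEFAULT_REGION" \
  | docker login --username AWS --password-stdin \
      "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"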
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_slow_1_1: + name: test (slow, 1, 1, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: slow + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-slow-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-slow-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
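The "Preserve github env variables for use in docker" step and the --env-file flag on docker run work as a pair: the runner's GITHUB_* variables are dumped to a KEY=VALUE file, which docker then injects into the test container. A self-contained sketch of that handoff (busybox is only a stand-in image, and the file path is hypothetical):

# Dump the GitHub-provided variables, then forward them into a container.
env | grep '^GITHUB' > /tmp/github_env_demo
docker run --rm --env-file=/tmp/github_env_demo busybox env | grep '^GITHUB'

One caveat: --env-file expects one KEY=VALUE per line, so a variable whose value contains newlines would not survive this scheme intact.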
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml index aa28d07c1b2..65880d1b982 100644 --- a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml @@ -250,53 +250,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_noarch_1_1: + name: test (noarch, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: 1 - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: 
linux-bionic-py3.7-clang9-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: noarch + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,11 +323,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -476,7 +435,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-noarch-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -492,7 +451,751 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-noarch-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-py3.7-clang9-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-py3.7-clang9-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-py3.7-clang9-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  test_default_1_1:
+    name: test (default, 1, 2, linux.2xlarge)
+    needs: build
+    runs-on: linux.2xlarge
+    timeout-minutes: 240
+    env:
+      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
+      JOB_BASE_NAME: linux-bionic-py3.7-clang9-test
+      TEST_CONFIG: default
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 2
+      PR_BODY: ${{ github.event.pull_request.body }}
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        env:
+          AWS_RETRY_MODE: standard
+          AWS_MAX_ATTEMPTS: 5
+        run: |
+          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
+            --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
+      - name: Chown workspace
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${ALPINE_IMAGE}"
+          # Ensure the working directory gets chowned back to the current user
+          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from previous checkouts
+          git clean -fxd
+      - name: Pull Docker image
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${DOCKER_IMAGE}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          sudo df -H
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Test
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+        # Time out the test phase after 240 minutes
+        timeout-minutes: 240
+        run: |
+          set -x
+
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
+            TEST_COMMAND=.jenkins/caffe2/test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          PROXY_ENV=
+          # NOTE: XLA multiprocessing tests appear to have issues with the squid proxy, so it is disabled for now.
+          # We should investigate whether there is a list of hostnames we can add to no_proxy
+          # so that we do not have to fully disable squid for the XLA tests.
+          if [[ $TEST_CONFIG != 'xla' ]]; then
+            # shellcheck disable=SC2089
+            PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock"
+          fi
+          # detached container should get cleaned up by teardown_ec2_linux
+          # TODO: Stop building test binaries as part of the build phase
+          # shellcheck is disabled because GPU_FLAG and PROXY_ENV must expand unquoted
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IS_GHA \
+            -e BRANCH \
+            -e SHA1 \
+            -e AWS_DEFAULT_REGION \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PR_BODY \
+            -e PYTORCH_RETRY_TEST_CASES \
+            -e PR_LABELS \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            ${PROXY_ENV} \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --ulimit stack=10485760:83886080 \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --ipc=host \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --detach \
+            --name="${container_name}" \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}"
+          )
+          docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on Windows; default to utf-8 where possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge'
+        run: |
+          # Remove any previous test jsons if they exist
+          rm -f test-jsons-*.zip
+          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Downloaded JSONs on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Reports on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: linux-bionic-py3.7-clang9-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  test_default_2_1:
+    name: test (default, 2, 2, linux.2xlarge)
+    needs: build
+    runs-on: linux.2xlarge
+    timeout-minutes: 240
+    env:
+      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
+      JOB_BASE_NAME: linux-bionic-py3.7-clang9-test
+      TEST_CONFIG: default
+      SHARD_NUMBER: 2
+      NUM_TEST_SHARDS: 2
+      PR_BODY: ${{ github.event.pull_request.body }}
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        env:
+          AWS_RETRY_MODE: standard
+          AWS_MAX_ATTEMPTS: 5
+        run: |
+          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
+            --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
+      - name: Chown workspace
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${ALPINE_IMAGE}"
+          # Ensure the working directory gets chowned back to the current user
+          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
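+      # Editorial annotation: `retry` above re-runs a command up to three
+      # times with 1s/2s backoff, and `--pull=never` ensures the chown uses
+      # exactly the alpine image that was just pulled (annotation only, not
+      # part of the generated template).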
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from previous checkouts
+          git clean -fxd
+      - name: Pull Docker image
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${DOCKER_IMAGE}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          sudo df -H
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Test
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+        # Time out the test phase after 240 minutes
+        timeout-minutes: 240
+        run: |
+          set -x
+
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
+            TEST_COMMAND=.jenkins/caffe2/test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          PROXY_ENV=
+          # NOTE: XLA multiprocessing tests appear to have issues with the squid proxy, so it is disabled for now.
+          # We should investigate whether there is a list of hostnames we can add to no_proxy
+          # so that we do not have to fully disable squid for the XLA tests.
+          if [[ $TEST_CONFIG != 'xla' ]]; then
+            # shellcheck disable=SC2089
+            PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock"
+          fi
+          # detached container should get cleaned up by teardown_ec2_linux
+          # TODO: Stop building test binaries as part of the build phase
+          # shellcheck is disabled because GPU_FLAG and PROXY_ENV must expand unquoted
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IS_GHA \
+            -e BRANCH \
+            -e SHA1 \
+            -e AWS_DEFAULT_REGION \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PR_BODY \
+            -e PYTORCH_RETRY_TEST_CASES \
+            -e PR_LABELS \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            ${PROXY_ENV} \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --ulimit stack=10485760:83886080 \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --ipc=host \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --detach \
+            --name="${container_name}" \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}"
+          )
+          docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on Windows; default to utf-8 where possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge'
+        run: |
+          # Remove any previous test jsons if they exist
+          rm -f test-jsons-*.zip
+          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Downloaded JSONs on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge'
         run: |
           # Remove any previous test reports if they exist
           rm -f test-reports-*.zip
diff --git a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml
index 5dd6543e1f5..0922d5a9e62 100644
--- a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml
+++ b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml
@@ -249,53 +249,17 @@ jobs:
           # Prune all of the docker images
           docker system prune -af
 
-  generate-test-matrix:
+  test_distributed_1_1:
+    name: test (distributed, 1, 1, linux.rocm.gpu)
     needs: build
-    runs-on: ubuntu-18.04
-    timeout-minutes: 240
-    env:
-      TEST_RUNNER_TYPE: linux.rocm.gpu
-      ENABLE_DISTRIBUTED_TEST: 1
-      ENABLE_JIT_LEGACY_TEST: ''
-      ENABLE_MULTIGPU_TEST: ''
-      ENABLE_NOGPU_NO_AVX_TEST: ''
-      ENABLE_NOGPU_NO_AVX2_TEST: ''
-      ENABLE_SLOW_TEST: ''
-      ENABLE_DOCS_TEST: ''
-      ENABLE_BACKWARDS_COMPAT_TEST: ''
-      ENABLE_XLA_TEST: ''
-      ENABLE_NOARCH_TEST: ''
-      NUM_TEST_SHARDS: 2
-      MULTIGPU_RUNNER_TYPE: linux.rocm.gpu
-      DISTRIBUTED_GPU_RUNNER_TYPE: linux.rocm.gpu
-      NOGPU_RUNNER_TYPE: linux.2xlarge
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
-    container:
-      image: python:3.9
-    steps:
-      - name: Install dependencies
-        run: pip install typing-extensions==3.10
-      - name: Clone pytorch/pytorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-      - name: Generating test matrix
-        id: set-matrix
-        run: .github/scripts/generate_pytorch_test_matrix.py
-
-  test:
-    needs: [build, generate-test-matrix]
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
+    runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
       DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
       JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test
-      TEST_CONFIG: ${{ matrix.config }}
-      SHARD_NUMBER: ${{ matrix.shard }}
-      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+      TEST_CONFIG: distributed
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
       PR_BODY: ${{ github.event.pull_request.body }}
     steps:
       - name: Clean workspace
@@ -352,7 +316,6 @@ jobs:
           }
           retry docker pull "${DOCKER_IMAGE}"
       - name: ROCm set GPU_FLAG
-        if: ${{ contains(env.BUILD_ENVIRONMENT, 'rocm') && !contains(matrix.config, 'nogpu') }}
         run: |
           echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
       - name: Determine shm-size
@@ -455,7 +418,7 @@ jobs:
       - name: Zip JSONs for upload
         if: always()
         env:
-          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+          FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.rocm.gpu'
         run: |
           # Remove any previous test jsons if they exist
           rm -f test-jsons-*.zip
@@ -471,7 +434,679 @@ jobs:
       - name: Zip test reports for upload
         if: always()
         env:
-          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+          FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.rocm.gpu'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
+      - uses: actions/upload-artifact@v2
+        name: Store Test Reports on Github
+        if: always()
+        with:
+          name: test-reports
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  test_smoke_tests_1_1:
+    name: test (smoke_tests, 1, 1, linux.rocm.gpu)
+    needs: build
+    runs-on: linux.rocm.gpu
+    timeout-minutes: 240
+    env:
+      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
+      JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test
+      TEST_CONFIG: smoke_tests
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
+      PR_BODY: ${{ github.event.pull_request.body }}
+    steps:
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: Set DOCKER_HOST
+        run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}"
+      - name: Runner health check system info
+        if: always()
+        run: |
+          cat /etc/os-release || true
+          cat /etc/apt/sources.list.d/rocm.list || true
+          cat /opt/rocm/.info/version || true
+          whoami
+      - name: Runner health check rocm-smi
+        if: always()
+        run: |
+          rocm-smi
+      - name: Runner health check rocminfo
+        if: always()
+        run: |
+          rocminfo
+      - name: Runner health check GPU count
+        if: always()
+        run: |
+          ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
+          if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
+              echo "Failed to detect GPUs on the runner"
+              exit 1
+          fi
+      - name: Runner health check disconnect on failure
+        if: ${{ failure() }}
+        run: |
+          killall runsvc.sh
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from previous checkouts
+          git clean -fxd
+      - name: Pull Docker image
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${DOCKER_IMAGE}"
+      - name: ROCm set GPU_FLAG
+        run: |
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          df -H
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Test
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+        # Time out the test phase after 240 minutes
+        timeout-minutes: 240
+        run: |
+          set -x
+
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
+            TEST_COMMAND=.jenkins/caffe2/test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          # detached container should get cleaned up by teardown_ec2_linux
+          # TODO: Stop building test binaries as part of the build phase
+          # shellcheck is disabled because GPU_FLAG must expand unquoted
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IS_GHA \
+            -e BRANCH \
+            -e SHA1 \
+            -e AWS_DEFAULT_REGION \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PR_BODY \
+            -e PYTORCH_RETRY_TEST_CASES \
+            -e PR_LABELS \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --ulimit stack=10485760:83886080 \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --detach \
+            --name="${container_name}" \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}"
+          )
+          # the jenkins user does not have write permission to the mounted workspace; work around it by copying the checkout into the jenkins home inside the container
+          docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}"
+          # copy the test results back to the mounted workspace; this needs sudo, and the resulting permissions come out correct
+          docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on Windows; default to utf-8 where possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.rocm.gpu'
+        run: |
+          # Remove any previous test jsons if they exist
+          rm -f test-jsons-*.zip
+          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
+      - uses: actions/upload-artifact@v2
+        name: Store Test Downloaded JSONs on Github
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.rocm.gpu'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
+      - uses: actions/upload-artifact@v2
+        name: Store Test Reports on Github
+        if: always()
+        with:
+          name: test-reports
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  test_default_1_1:
+    name: test (default, 1, 2, linux.rocm.gpu)
+    needs: build
+    runs-on: linux.rocm.gpu
+    timeout-minutes: 240
+    env:
+      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
+      JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test
+      TEST_CONFIG: default
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 2
+      PR_BODY: ${{ github.event.pull_request.body }}
+    steps:
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: Set DOCKER_HOST
+        run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}"
+      - name: Runner health check system info
+        if: always()
+        run: |
+          cat /etc/os-release || true
+          cat /etc/apt/sources.list.d/rocm.list || true
+          cat /opt/rocm/.info/version || true
+          whoami
+      - name: Runner health check rocm-smi
+        if: always()
+        run: |
+          rocm-smi
+      - name: Runner health check rocminfo
+        if: always()
+        run: |
+          rocminfo
+      - name: Runner health check GPU count
+        if: always()
+        run: |
+          ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
+          if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
+              echo "Failed to detect GPUs on the runner"
+              exit 1
+          fi
+      - name: Runner health check disconnect on failure
+        if: ${{ failure() }}
+        run: |
+          killall runsvc.sh
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from previous checkouts
+          git clean -fxd
+      - name: Pull Docker image
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${DOCKER_IMAGE}"
+      - name: ROCm set GPU_FLAG
+        run: |
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          df -H
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Test
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+        # Time out the test phase after 240 minutes
+        timeout-minutes: 240
+        run: |
+          set -x
+
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
+            TEST_COMMAND=.jenkins/caffe2/test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          # detached container should get cleaned up by teardown_ec2_linux
+          # TODO: Stop building test binaries as part of the build phase
+          # shellcheck is disabled because GPU_FLAG must expand unquoted
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IS_GHA \
+            -e BRANCH \
+            -e SHA1 \
+            -e AWS_DEFAULT_REGION \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PR_BODY \
+            -e PYTORCH_RETRY_TEST_CASES \
+            -e PR_LABELS \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --ulimit stack=10485760:83886080 \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --detach \
+            --name="${container_name}" \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}"
+          )
+          # the jenkins user does not have write permission to the mounted workspace; work around it by copying the checkout into the jenkins home inside the container
+          docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}"
+          # copy the test results back to the mounted workspace; this needs sudo, and the resulting permissions come out correct
+          docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on Windows; default to utf-8 where possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.rocm.gpu'
+        run: |
+          # Remove any previous test jsons if they exist
+          rm -f test-jsons-*.zip
+          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
+      - uses: actions/upload-artifact@v2
+        name: Store Test Downloaded JSONs on Github
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.rocm.gpu'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
+      - uses: actions/upload-artifact@v2
+        name: Store Test Reports on Github
+        if: always()
+        with:
+          name: test-reports
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  test_default_2_1:
+    name: test (default, 2, 2, linux.rocm.gpu)
+    needs: build
+    runs-on: linux.rocm.gpu
+    timeout-minutes: 240
+    env:
+      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
+      JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test
+      TEST_CONFIG: default
+      SHARD_NUMBER: 2
+      NUM_TEST_SHARDS: 2
+      PR_BODY: ${{ github.event.pull_request.body }}
+    steps:
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: Set DOCKER_HOST
+        run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}"
+      - name: Runner health check system info
+        if: always()
+        run: |
+          cat /etc/os-release || true
+          cat /etc/apt/sources.list.d/rocm.list || true
+          cat /opt/rocm/.info/version || true
+          whoami
+      - name: Runner health check rocm-smi
+        if: always()
+        run: |
+          rocm-smi
+      - name: Runner health check rocminfo
+        if: always()
+        run: |
+          rocminfo
+      - name: Runner health check GPU count
+        if: always()
+        run: |
+          ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
+          if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
+              echo "Failed to detect GPUs on the runner"
+              exit 1
+          fi
+      - name: Runner health check disconnect on failure
+        if: ${{ failure() }}
+        run: |
+          killall runsvc.sh
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from previous checkouts
+          git clean -fxd
+      - name: Pull Docker image
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${DOCKER_IMAGE}"
+      - name: ROCm set GPU_FLAG
+        run: |
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          df -H
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Test
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+        # Time out the test phase after 240 minutes
+        timeout-minutes: 240
+        run: |
+          set -x
+
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
+            TEST_COMMAND=.jenkins/caffe2/test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          # detached container should get cleaned up by teardown_ec2_linux
+          # TODO: Stop building test binaries as part of the build phase
+          # shellcheck is disabled because GPU_FLAG must expand unquoted
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IS_GHA \
+            -e BRANCH \
+            -e SHA1 \
+            -e AWS_DEFAULT_REGION \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PR_BODY \
+            -e PYTORCH_RETRY_TEST_CASES \
+            -e PR_LABELS \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --ulimit stack=10485760:83886080 \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --detach \
+            --name="${container_name}" \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}"
+          )
+          # the jenkins user does not have write permission to the mounted workspace; work around it by copying the checkout into the jenkins home inside the container
+          docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}"
+          # copy the test results back to the mounted workspace; this needs sudo, and the resulting permissions come out correct
+          docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on Windows; default to utf-8 where possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.rocm.gpu'
+        run: |
+          # Remove any previous test jsons if they exist
+          rm -f test-jsons-*.zip
+          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
+      - uses: actions/upload-artifact@v2
+        name: Store Test Downloaded JSONs on Github
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.rocm.gpu'
         run: |
           # Remove any previous test reports if they exist
           rm -f test-reports-*.zip
diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml
index e6ea8bde928..e836ddf691b 100644
--- a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml
+++ b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml
@@ -250,53 +250,17 @@ jobs:
           # Prune all of the docker images
           docker system prune -af
 
-  generate-test-matrix:
+  test_smoke_tests_1_1:
+    name: test (smoke_tests, 1, 1, linux.2xlarge)
     needs: build
-    runs-on: ubuntu-18.04
-    timeout-minutes: 240
-    env:
-      TEST_RUNNER_TYPE: linux.2xlarge
-      ENABLE_DISTRIBUTED_TEST: ''
-      ENABLE_JIT_LEGACY_TEST: ''
-      ENABLE_MULTIGPU_TEST: ''
-      ENABLE_NOGPU_NO_AVX_TEST: ''
-      ENABLE_NOGPU_NO_AVX2_TEST: ''
-      ENABLE_SLOW_TEST: ''
-      ENABLE_DOCS_TEST: ''
-      ENABLE_BACKWARDS_COMPAT_TEST: ''
-      ENABLE_XLA_TEST: ''
-      ENABLE_NOARCH_TEST: ''
-      NUM_TEST_SHARDS: 1
-      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
-      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
-      NOGPU_RUNNER_TYPE: linux.2xlarge
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
-    container:
-      image: python:3.9
-    steps:
-      - name: Install dependencies
-        run: pip install typing-extensions==3.10
-      - name: Clone pytorch/pytorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-      - name: Generating test matrix
-        id: set-matrix
-        run: .github/scripts/generate_pytorch_test_matrix.py
-
-  test:
-    needs: [build, generate-test-matrix]
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
+    runs-on: linux.2xlarge
     timeout-minutes: 240
     env:
       DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
       JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test
-      TEST_CONFIG: ${{ matrix.config }}
-      SHARD_NUMBER: ${{ matrix.shard }}
-      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+      TEST_CONFIG: smoke_tests
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
       PR_BODY: ${{ github.event.pull_request.body }}
     steps:
       - name: Display EC2 information
@@ -359,11 +323,6 @@ jobs:
             "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
           }
           retry docker pull "${DOCKER_IMAGE}"
-      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
-        run: |
-          bash .github/scripts/install_nvidia_utils_linux.sh
-          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
       - name: Determine shm-size
@@ -476,7 +435,7 @@ jobs:
       - name: Zip JSONs for upload
         if: always()
         env:
-          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+          FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge'
         run: |
           # Remove any previous test jsons if they exist
           rm -f test-jsons-*.zip
@@ -492,7 +451,255 @@ jobs:
       - name: Zip test reports for upload
         if: always()
         env:
-          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+          FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Reports on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  test_default_1_1:
+    name: test (default, 1, 1, linux.2xlarge)
+    needs: build
+    runs-on: linux.2xlarge
+    timeout-minutes: 240
+    env:
+      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
+      JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test
+      TEST_CONFIG: default
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
+      PR_BODY: ${{ github.event.pull_request.body }}
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        env:
+          AWS_RETRY_MODE: standard
+          AWS_MAX_ATTEMPTS: 5
+        run: |
+          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
+            --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
+      - name: Chown workspace
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${ALPINE_IMAGE}"
+          # Ensure the working directory gets chowned back to the current user
+          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
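+      # Editorial annotation: the FILE_SUFFIX values used by the zip steps
+      # below follow <job>-<config>-<shard>-<num_shards>-<runner> now that
+      # the former matrix variables have been inlined per job (annotation
+      # only, not part of the generated template).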
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from previous checkouts
+          git clean -fxd
+      - name: Pull Docker image
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${DOCKER_IMAGE}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          sudo df -H
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Test
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+        # Time out the test phase after 240 minutes
+        timeout-minutes: 240
+        run: |
+          set -x
+
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
+            TEST_COMMAND=.jenkins/caffe2/test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          PROXY_ENV=
+          # NOTE: XLA multiprocessing tests appear to have issues with the squid proxy, so it is disabled for now.
+          # We should investigate whether there is a list of hostnames we can add to no_proxy
+          # so that we do not have to fully disable squid for the XLA tests.
+          if [[ $TEST_CONFIG != 'xla' ]]; then
+            # shellcheck disable=SC2089
+            PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock"
+          fi
+          # detached container should get cleaned up by teardown_ec2_linux
+          # TODO: Stop building test binaries as part of the build phase
+          # shellcheck is disabled because GPU_FLAG and PROXY_ENV must expand unquoted
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IS_GHA \
+            -e BRANCH \
+            -e SHA1 \
+            -e AWS_DEFAULT_REGION \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PR_BODY \
+            -e PYTORCH_RETRY_TEST_CASES \
+            -e PR_LABELS \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            ${PROXY_ENV} \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --ulimit stack=10485760:83886080 \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --ipc=host \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --detach \
+            --name="${container_name}" \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}"
+          )
+          docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on Windows; default to utf-8 where possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-1-1-linux.2xlarge'
+        run: |
+          # Remove any previous test jsons if they exist
+          rm -f test-jsons-*.zip
+          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Downloaded JSONs on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-1-1-linux.2xlarge'
         run: |
           # Remove any previous test reports if they exist
           rm -f test-reports-*.zip
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml
index eaa71fb310b..8b26d013935 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml
@@ -249,53 +249,17 @@ jobs:
           # Prune all of the docker images
           docker system prune -af
 
-  generate-test-matrix:
+  test_distributed_1_1:
+    name: test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)
     needs: build
-    runs-on: ubuntu-18.04
-    timeout-minutes: 240
-    env:
-      TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
-      ENABLE_DISTRIBUTED_TEST: 1
-      ENABLE_JIT_LEGACY_TEST: ''
-      ENABLE_MULTIGPU_TEST: ''
-      ENABLE_NOGPU_NO_AVX_TEST: ''
-      ENABLE_NOGPU_NO_AVX2_TEST: ''
-      ENABLE_SLOW_TEST: ''
-      ENABLE_DOCS_TEST: ''
-      ENABLE_BACKWARDS_COMPAT_TEST: ''
-      ENABLE_XLA_TEST: ''
-      ENABLE_NOARCH_TEST: ''
-      NUM_TEST_SHARDS: 2
-      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
-      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
-      NOGPU_RUNNER_TYPE: linux.2xlarge
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
-    container:
-      image: python:3.9
-    steps:
-      - name: Install dependencies
-        run: pip install typing-extensions==3.10
-      - name: Clone pytorch/pytorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-      - name: Generating test matrix
-        id: set-matrix
-        run: .github/scripts/generate_pytorch_test_matrix.py
-
-  test:
-    needs: [build, generate-test-matrix]
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
+    runs-on: linux.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
       DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
       JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test
-      TEST_CONFIG: ${{ matrix.config }}
-      SHARD_NUMBER: ${{ matrix.shard }}
-      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+      TEST_CONFIG: distributed
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
       PR_BODY: ${{ github.event.pull_request.body }}
     steps:
       - name: Display EC2 information
@@ -359,7 +323,6 @@ jobs:
          }
          retry docker pull "${DOCKER_IMAGE}"
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
         run: |
           bash .github/scripts/install_nvidia_utils_linux.sh
           echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
@@ -475,7 +438,7 @@ jobs:
       - name: Zip JSONs for upload
         if: always()
         env:
-          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+          FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu'
         run: |
           # Remove any previous test jsons if they exist
           rm -f test-jsons-*.zip
@@ -491,7 +454,763 @@ jobs:
       - name: Zip test reports for upload
         if: always()
         env:
-          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+          FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Reports on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  test_smoke_tests_1_1:
+    name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu)
+    needs: build
+    runs-on: linux.4xlarge.nvidia.gpu
+    timeout-minutes: 240
+    env:
+      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
+      JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test
+      TEST_CONFIG: smoke_tests
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
+      PR_BODY: ${{ github.event.pull_request.body }}
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        env:
+          AWS_RETRY_MODE: standard
+          AWS_MAX_ATTEMPTS: 5
+        run: |
+          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
+            --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
+      - name: Chown workspace
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${ALPINE_IMAGE}"
+          # Ensure the working directory gets chowned back to the current user
+          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
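+      # Editorial annotation: unlike the CPU jobs, the steps below install
+      # the nvidia driver after pulling the image and export
+      # GPU_FLAG=--gpus all, which the test container picks up through the
+      # ${GPU_FLAG:-} expansion in docker run (annotation only).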
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
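
One consequence of flattening the matrix shows up in the GPU-driver step: the hunk near the top of this file drops the runtime guard `if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}` and keeps the step unconditionally in this CUDA workflow, while the CPU-only workflows further down (asan, onnx) delete the step altogether. The predicate can now be evaluated once at generation time; a hedged sketch (the function name is illustrative):

```python
def needs_nvidia_driver(build_environment: str, config: str) -> bool:
    # Generation-time version of the removed runtime guard:
    #   contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu')
    return "cuda" in build_environment and "nogpu" not in config

assert needs_nvidia_driver("linux-xenial-cuda11.3-py3.7-gcc7", "default")
assert not needs_nvidia_driver("linux-xenial-py3.7-clang7-asan", "default")
assert not needs_nvidia_driver("linux-xenial-cuda11.3-py3.7-gcc7", "nogpu_NO_AVX")  # illustrative config name
```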
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
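
`FILE_SUFFIX` follows the same rule: everything except `${{ github.job }}` is now baked in at generation time, e.g. `'${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu'` below, where the old expression interpolated `matrix.config`, `matrix.shard`, `matrix.num_shards`, and `matrix.runner` at run time. A tiny sketch of composing the literal part (hypothetical function):

```python
def file_suffix(config: str, shard: int, num_shards: int, runner: str) -> str:
    """Build the literal tail of FILE_SUFFIX; the '${{ github.job }}-' prefix
    stays a runtime expression in the emitted YAML."""
    return f"{config}-{shard}-{num_shards}-{runner}"

assert file_suffix("default", 2, 2, "linux.4xlarge.nvidia.gpu") == \
    "default-2-2-linux.4xlarge.nvidia.gpu"
```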
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml index 6a002fda318..0b6fea00e1f 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml @@ -250,53 +250,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 3 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ 
needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,11 +323,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -476,7 +435,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -492,7 +451,751 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 3, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 3 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
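
Each job opens by printing the runner's AMI, instance id, and instance type from the EC2 instance metadata service at 169.254.169.254. For completeness, the same lookup in Python (a hypothetical helper; like the curl version it assumes IMDSv1 is enabled on the runner):

```python
import urllib.request

def get_ec2_metadata(category: str) -> str:
    """Fetch one field from the EC2 instance metadata endpoint, mirroring the
    get_ec2_metadata() shell function used in the steps above (IMDSv1 assumed)."""
    url = f"http://169.254.169.254/latest/meta-data/{category}"
    with urllib.request.urlopen(url, timeout=2) as resp:
        return resp.read().decode().strip()

for category in ("ami-id", "instance-id", "instance-type"):
    print(f"{category}: {get_ec2_metadata(category)}")
```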
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-3-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-3-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 3, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 3 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-3-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-3-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_3_1: + name: test (default, 3, 3, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + TEST_CONFIG: default + SHARD_NUMBER: 3 + NUM_TEST_SHARDS: 3 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-3-3-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-3-3-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml index 6ebf5dbdd0b..7d49630c027 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml @@ -250,53 +250,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: 
smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,11 +323,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -476,7 +435,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -492,7 +451,503 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
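The "Determine shm-size" step above picks 1g by default, 2g for CUDA builds, and 8g for ROCm, since GPU test runs allocate large shared-memory segments. The same selection, sketched in Python (the function name is illustrative):

    # Mirrors the shell `case "${BUILD_ENVIRONMENT}"` selection:
    # ROCm containers get the largest /dev/shm, CUDA a middle size,
    # and everything else keeps the 1g default.
    def shm_size_for(build_environment: str) -> str:
        if "rocm" in build_environment:
            return "8g"
        if "cuda" in build_environment:
            return "2g"
        return "1g"

    assert shm_size_for("linux-xenial-py3.7-clang7-onnx") == "1g"
    assert shm_size_for("linux-bionic-cuda11.3-py3.7") == "2g"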
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml index b1c63e596df..189b085f8ae 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml @@ -249,53 +249,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_jit_legacy_1_1: + name: test (jit_legacy, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: 1 - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: 1 - ENABLE_BACKWARDS_COMPAT_TEST: 1 - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: jit_legacy + SHARD_NUMBER: 1 + 
NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -358,11 +322,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -475,7 +434,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-jit_legacy-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -491,7 +450,1495 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-jit_legacy-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
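The deleted generate-test-matrix job used to compute this fan-out at run time from ENABLE_* variables inside a python:3.9 container; the generator now emits one concrete job per (config, shard) pair, as in test_jit_legacy_1_1 above. A hypothetical sketch of that expansion — only the visible display-name and env shape is taken from the diff; the real generator renders full step lists through its Jinja templates:

    # Hypothetical expansion from per-workflow test configs to the
    # static jobs visible in this file (display names and env only).
    def expand_test_jobs(configs):
        jobs = []
        for config, num_shards, runner in configs:
            for shard in range(1, num_shards + 1):
                jobs.append({
                    "name": f"test ({config}, {shard}, {num_shards}, {runner})",
                    "env": {"TEST_CONFIG": config,
                            "SHARD_NUMBER": shard,
                            "NUM_TEST_SHARDS": num_shards},
                })
        return jobs

    jobs = expand_test_jobs([("jit_legacy", 1, "linux.2xlarge"),
                             ("default", 2, "linux.2xlarge")])
    assert jobs[0]["name"] == "test (jit_legacy, 1, 1, linux.2xlarge)"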
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_distributed_1_1: + name: test (distributed, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
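SHARD_NUMBER and NUM_TEST_SHARDS are exported into the container for the .jenkins test scripts to split the suite across runners. A generic sketch of the kind of deterministic round-robin split those variables enable — illustrative only, not the actual .jenkins sharding logic:

    # Deterministic sharding on SHARD_NUMBER/NUM_TEST_SHARDS (1-based
    # shard index, as in the jobs above). Every shard sorts the same
    # list, so the union of all shards covers each test exactly once.
    def my_shard(tests, shard_number, num_shards):
        return [t for i, t in enumerate(sorted(tests))
                if i % num_shards == shard_number - 1]

    tests = ["test_nn", "test_ops", "test_torch"]
    assert sorted(my_shard(tests, 1, 2) + my_shard(tests, 2, 2)) == sorted(tests)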
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_docs_test_1_1: + name: test (docs_test, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: docs_test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-docs_test-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-docs_test-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_backwards_compat_1_1: + name: test (backwards_compat, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: backwards_compat + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-backwards_compat-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-backwards_compat-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
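The new smoke_tests job reuses the same harness with TEST_CONFIG=smoke_tests, so the test scripts can presumably select a reduced suite when they see that value. A purely hypothetical sketch of such a gate — SMOKE_TESTS below is an invented placeholder; the real selection lives in the .jenkins test scripts:

    # Hypothetical gate on TEST_CONFIG, as exported into the container.
    import os

    def select_tests(all_tests):
        if os.environ.get("TEST_CONFIG") == "smoke_tests":
            smoke = {"test_torch", "test_nn"}  # placeholder subset
            return [t for t in all_tests if t in smoke]
        return list(all_tests)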
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
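The inline `retry` helper that appears throughout these jobs makes exactly three attempts with 1 s and 2 s pauses between them. A generalized sketch of the same idea, assuming a hypothetical `retry_n` helper that does not exist in the repo:

    retry_n () {
      # Run "$@" up to $1 times, sleeping a growing number of seconds
      # between failed attempts.
      local attempts=$1; shift
      local i
      for ((i = 1; i <= attempts; i++)); do
        "$@" && return 0
        (( i < attempts )) && sleep "$i"
      done
      return 1
    }
    retry_n 3 docker pull "${ALPINE_IMAGE}"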
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml index fd394cc99fa..42507986059 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml @@ -249,53 +249,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_distributed_1_1: + name: test (distributed, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + 
NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -358,11 +322,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -475,7 +434,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -491,7 +450,751 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
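The "Determine shm-size" step visible in the context above communicates with the later docker run through the $GITHUB_ENV file: a KEY=value line appended there becomes an environment variable in every subsequent step of the same job. Minimal sketch:

    # In one step:
    echo "SHM_SIZE=2g" >> "${GITHUB_ENV}"
    # In any later step of the same job, SHM_SIZE is a normal env var:
    # docker run --shm-size="${SHM_SIZE}" ...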
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
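get_ec2_metadata above uses the unauthenticated IMDSv1 GET, which these runners evidently still allow. On instances that enforce IMDSv2 the same lookup needs a session token first; a fallback sketch (the v2 helper name is invented here):

    get_ec2_metadata_v2 () {
      local category=$1 token
      token=$(curl -fsSL -X PUT "http://169.254.169.254/latest/api/token" \
        -H "X-aws-ec2-metadata-token-ttl-seconds: 60")
      curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
        "http://169.254.169.254/latest/meta-data/${category}"
    }
    get_ec2_metadata_v2 instance-type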
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
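SHARD_NUMBER and NUM_TEST_SHARDS above are consumed inside the container by the test scripts, which split the test list across shards. The real partitioning lives in .jenkins/pytorch/test.sh and the Python test tooling; this is only an illustrative round-robin over made-up test names:

    SHARD_NUMBER=${SHARD_NUMBER:-1}
    NUM_TEST_SHARDS=${NUM_TEST_SHARDS:-2}
    i=0
    for t in test_torch test_nn test_ops test_autograd; do
      if (( i % NUM_TEST_SHARDS == SHARD_NUMBER - 1 )); then
        echo "shard ${SHARD_NUMBER}/${NUM_TEST_SHARDS} runs ${t}"
      fi
      i=$((i + 1))
    done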
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
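The ECR login above scrapes the account id out of raw JSON with grep/cut. The CLI can return the field directly, which is less brittle if the output formatting ever changes; a drop-in sketch:

    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"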
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml index de79f0b3df9..da7c8c0d9ff 100644 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ b/.github/workflows/generated-macos-11-py3-x86-64.yml @@ -85,40 +85,16 @@ jobs: artifacts.zip - generate-test-matrix: + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, macos-11) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: macos-11 - ENABLE_DISTRIBUTED_TEST: '' - NUM_TEST_SHARDS: 2 - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: macos-11 timeout-minutes: 240 env: JOB_BASE_NAME: macos-11-py3-x86-64-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Checkout PyTorch @@ -173,7 +149,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-macos-11' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -190,7 +166,235 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ 
github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-macos-11' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store Test Reports on Github + if: always() + with: + name: test-reports + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: macos-11-py3-x86-64-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_SECRET_ACCESS_KEY }} + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + test_default_1_1: + name: test (default, 1, 2, macos-11) + needs: build + runs-on: macos-11 + timeout-minutes: 240 + env: + JOB_BASE_NAME: macos-11-py3-x86-64-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - uses: actions/download-artifact@v2 + name: Download PyTorch Build Artifacts from GHA + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: . 
+ - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + - name: Install macOS homebrew dependencies + run: | + # Install dependencies + brew install libomp + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + run: | + python3 -mpip install dist/*.whl + .jenkins/pytorch/macos-test.sh + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-macos-11' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: actions/upload-artifact@v2 + name: Store Test Downloaded JSONs on Github + if: always() + with: + name: test-jsons + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-macos-11' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store Test Reports on Github + if: always() + with: + name: test-reports + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: macos-11-py3-x86-64-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_SECRET_ACCESS_KEY }} + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + test_default_2_1: + name: test (default, 2, 2, macos-11) + needs: build + runs-on: macos-11 + timeout-minutes: 240 + env: + JOB_BASE_NAME: macos-11-py3-x86-64-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - uses: actions/download-artifact@v2 + name: Download PyTorch Build Artifacts from GHA + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: . 
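The macOS Test steps above install the freshly built wheel and then hand off to the test script. A minimal smoke check of that same hand-off, for local reproduction (the full suite is .jenkins/pytorch/macos-test.sh):

    python3 -mpip install dist/*.whl
    python3 -c "import torch; print(torch.__version__)"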
+ - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + - name: Install macOS homebrew dependencies + run: | + # Install dependencies + brew install libomp + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + run: | + python3 -mpip install dist/*.whl + .jenkins/pytorch/macos-test.sh + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-macos-11' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: actions/upload-artifact@v2 + name: Store Test Downloaded JSONs on Github + if: always() + with: + name: test-jsons + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-macos-11' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml index 5c6698e2e59..46e142b53b8 100644 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml @@ -248,53 +248,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_distributed_1_1: + name: test (distributed, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 1 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: 
distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -357,11 +321,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -474,7 +433,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -490,7 +449,503 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
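The checkout steps in all of these jobs pin the ref with the expression `${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}`, GitHub Actions' ternary idiom: PR runs test the PR head commit, everything else tests the triggering commit. The shell analogue, with placeholder values:

    event_name="pull_request"; pr_head_sha="1111111"; push_sha="2222222"
    if [ "${event_name}" = "pull_request" ]; then
      ref="${pr_head_sha}"
    else
      ref="${push_sha}"
    fi
    echo "checking out ${ref}"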
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
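The "Preserve github env variables" step that follows snapshots every GITHUB_* variable into a file, which the later docker run replays via --env-file (docker reads the file as KEY=value lines). The round trip in two commands:

    env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
    docker run --rm --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
      alpine:3.14 sh -c 'env | grep ^GITHUB'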
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
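+    # NOTE: the inline `retry` helper above makes up to three attempts with a
+    # 1s/2s backoff. A hypothetical N-attempt generalization (sketch, assuming
+    # bash; `retry_n` is an illustrative name, not part of the generated file):
+    #   retry_n () {
+    #     local n=$1; shift
+    #     for i in $(seq "$n"); do "$@" && return 0; sleep "$i"; done
+    #     return 1
+    #   }
+    #   retry_n 5 docker pull "${ALPINE_IMAGE}"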
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-1-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml index 2debf29f0cc..a118b22f61a 100644 --- a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml @@ -247,53 +247,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_distributed_1_1: + name: test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.8xlarge.nvidia.gpu timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - 
SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -357,7 +321,6 @@ jobs: } retry docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -473,7 +436,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -489,7 +452,763 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
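+    # NOTE: SHARD_NUMBER/NUM_TEST_SHARDS in this job's env are consumed by the
+    # test scripts to split the suite. A hypothetical round-robin split (sketch,
+    # assuming bash; `ALL_TESTS` and `run_test` are illustrative names):
+    #   i=0
+    #   for t in $ALL_TESTS; do
+    #     [ $(( i % NUM_TEST_SHARDS + 1 )) -eq "$SHARD_NUMBER" ] && run_test "$t"
+    #     i=$((i + 1))
+    #   done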
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
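+    # NOTE: the ECR login above scrapes the account id out of the JSON with
+    # grep/cut. A hypothetical equivalent using the CLI's built-in JMESPath
+    # filter instead:
+    #   AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)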
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
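+    # NOTE: PROXY_ENV above injects http_proxy/https_proxy (the internal squid
+    # load balancer) and a no_proxy allowlist into every config except xla. A
+    # hypothetical in-container sanity check:
+    #   docker exec "${container_name}" sh -c 'env | grep -i _proxy'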
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
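+    # NOTE: get_ec2_metadata above is a thin curl over the EC2 instance metadata
+    # service, so any IMDS category resolves the same way, e.g. (hypothetical
+    # extra lookup):
+    #   get_ec2_metadata local-ipv4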
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
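+    # NOTE: FILE_SUFFIX above follows <github.job>-<config>-<shard>-<num shards>-<runner>,
+    # so for this job (id test_default_1_1) the report archive expands to
+    #   test-reports-test_default_1_1-default-1-2-linux.4xlarge.nvidia.gpu.zip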
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
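+    # NOTE: the "Preserve github env variables" step below writes plain KEY=value
+    # lines (GITHUB_RUN_ID=..., GITHUB_REF=..., etc.), which is exactly the
+    # format docker's --env-file flag consumes when the test container is
+    # started later in this job.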
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml index 2f03e0f409e..c3bee6a3aa2 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml @@ -249,53 +249,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - 
runs-on: ${{ matrix.runner }} + runs-on: linux.4xlarge.nvidia.gpu timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,7 +323,6 @@ jobs: } retry docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -475,7 +438,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -491,7 +454,511 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
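+    # NOTE: per the "temporary hack" comment above, the stats step exports
+    # CircleCI-style variables (BRANCH, SHA1, TAG, WORKFLOW_ID, ...) because
+    # tools/stats/print_test_stats.py still keys its S3 upload off them;
+    # WORKFLOW_ID is presumably quoted so YAML keeps the run id a string.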
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
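+    # NOTE: since JOB_BASE_NAME shows this workflow's BUILD_ENVIRONMENT contains
+    # "cuda", the "Determine shm-size" case below resolves to SHM_SIZE=2g
+    # (rocm builds get 8g; everything else keeps the 1g default).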
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 360 minutes + timeout-minutes: 360 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
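+    # NOTE: in the Test step above, TEST_CONFIG=default and a non-onnx
+    # BUILD_ENVIRONMENT fall through to TEST_COMMAND=.jenkins/pytorch/test.sh,
+    # and MAX_JOBS="$(nproc --ignore=2)" reserves two cores for the runner
+    # (e.g. 14 on a hypothetical 16-vCPU instance).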
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
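+    # NOTE: with the generate-test-matrix job gone, each shard is a concrete job:
+    # the id (test_default_2_1) is derived from the config and shard, while the
+    # name: carries the full tuple the old matrix.* expressions used to supply,
+    # namely config, shard number, total shards, and runner label.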
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 360 minutes + timeout-minutes: 360 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml index 13a1cf744e3..6fe981b2ff1 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml @@ -248,53 +248,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_distributed_1_1: + name: test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} 
+ runs-on: linux.8xlarge.nvidia.gpu timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -358,7 +322,6 @@ jobs: } retry docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -474,7 +437,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -490,7 +453,763 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
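+      # NOTE: every teardown step here runs with `if: always()` so a failed or
+      # cancelled run still leaves the self-hosted runner reusable: hold for any
+      # live ssh sessions, chown the workspace back from the jenkins uid used
+      # inside the container, then stop containers and prune images. A minimal
+      # sketch of the same cleanup, runnable by hand on a runner:
+      #   docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      #   docker stop $(docker ps -q) || true
+      #   docker system prune -af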
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
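+      # NOTE: the ECR login and image pulls above share a tiny retry helper that
+      # simply re-runs a command up to three times with a short backoff:
+      #   retry () { "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") }
+      #   retry docker pull "${ALPINE_IMAGE}"
+      # `--pull=never` on the chown container then guarantees it reuses the image
+      # that was just pulled instead of hitting the network again.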
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
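+      # NOTE: the Test step above starts the container with `--detach` and drives
+      # it via `docker exec`, so a timeout or cancellation cannot orphan the test
+      # process; the "Kill containers, clean up images" step below is what reaps
+      # it. The lifecycle, sketched with a hypothetical image name:
+      #   cid=$(docker run --detach --tty some-test-image)  # hypothetical image
+      #   docker exec -t "$cid" sh -c "pip install dist/*.whl && .jenkins/pytorch/test.sh"
+      #   docker stop "$cid"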
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
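+      # NOTE: self-hosted runners keep state between jobs, so after chowning the
+      # previous run's files back to the runner user (above), the job recreates
+      # the workspace from scratch before checkout (below), effectively:
+      #   rm -rf "${GITHUB_WORKSPACE}" && mkdir "${GITHUB_WORKSPACE}"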
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
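+      # NOTE: with the generated matrix removed, FILE_SUFFIX above is hardcoded to
+      # <config>-<shard>-<num_shards>-<runner> (default-1-2-linux.4xlarge.nvidia.gpu
+      # here) rather than assembled from matrix.* values, and `github.job` now
+      # resolves to the per-config job id (test_default_1_1) instead of the old
+      # shared `test` job name.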
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
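+      # NOTE: SHARD_NUMBER / NUM_TEST_SHARDS from the job env are passed into the
+      # container (via the -e flags in the Test step) and consumed by the .jenkins
+      # test scripts to split the test list. A minimal sketch of that kind of
+      # sharding (hypothetical one-liner and file, not the real script):
+      #   awk -v n="$NUM_TEST_SHARDS" -v s="$SHARD_NUMBER" 'NR % n == s - 1' all_tests.txt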
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 22da8807cfd..a24e8f22446 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -131,47 +131,19 @@ jobs: run: | rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: + test_distributed_1_1: + name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 env: JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: distributed http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" PR_BODY: ${{ 
github.event.pull_request.body }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu steps: - name: Display EC2 information shell: bash @@ -206,12 +178,10 @@ jobs: run: | .\.circleci\scripts\vs_install.ps1 - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} shell: bash run: | .circleci/scripts/windows_cuda_install.sh - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} shell: bash run: | .circleci/scripts/windows_cudnn_install.sh @@ -240,7 +210,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' shell: powershell run: | # -ir => recursive include all files in pattern @@ -256,7 +226,481 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: smoke_tests + http_proxy: 
"http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to 
default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_1_1: + name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: 
powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_2_1: + name: test (default, 2, 2, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: 
Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu' shell: powershell run: | # -ir => recursive include all files in pattern diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml index 2cb44ccf28f..f11536461b6 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml @@ -131,47 +131,19 @@ jobs: run: | rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: 1 - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: 
Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: + test_force_on_cpu_1_1: + name: test (force_on_cpu, 1, 1, windows.4xlarge) timeout-minutes: 240 env: JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: force_on_cpu http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" PR_BODY: ${{ github.event.pull_request.body }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + needs: build + runs-on: windows.4xlarge steps: - name: Display EC2 information shell: bash @@ -205,16 +177,6 @@ jobs: shell: powershell run: | .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b name: Download PyTorch Build Artifacts with: @@ -240,7 +202,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' shell: powershell run: | # -ir => recursive include all files in pattern @@ -256,7 +218,639 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively 
support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_distributed_1_1: + name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: distributed + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - 
uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: smoke_tests + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: 
Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ 
steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_1_1: + name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu' + 
shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_2_1: + name: test (default, 2, 2, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # 
Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu' shell: powershell run: | # -ir => recursive include all files in pattern diff --git a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml index 2186e05ecbe..ecde072b15d 100644 --- a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml +++ b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml @@ -215,53 +215,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_xla_1_1: + name: test (xla, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: 1 - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test - TEST_CONFIG: ${{ matrix.config }} 
-      SHARD_NUMBER: ${{ matrix.shard }}
-      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+      TEST_CONFIG: xla
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
       PR_BODY: ${{ github.event.pull_request.body }}
     steps:
     - name: Display EC2 information
@@ -324,11 +288,6 @@ jobs:
           "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
         }
         retry docker pull "${DOCKER_IMAGE}"
-    - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-      if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
-      run: |
-        bash .github/scripts/install_nvidia_utils_linux.sh
-        echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
     - name: Determine shm-size
       run: |
         shm_size="1g"
@@ -442,7 +401,7 @@ jobs:
     - name: Zip JSONs for upload
       if: always()
       env:
-        FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+        FILE_SUFFIX: '${{ github.job }}-xla-1-1-linux.2xlarge'
       run: |
         # Remove any previous test jsons if they exist
         rm -f test-jsons-*.zip
@@ -458,7 +417,256 @@ jobs:
     - name: Zip test reports for upload
       if: always()
       env:
-        FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+        FILE_SUFFIX: '${{ github.job }}-xla-1-1-linux.2xlarge'
+      run: |
+        # Remove any previous test reports if they exist
+        rm -f test-reports-*.zip
+        zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
+    - uses: seemethere/upload-artifact-s3@v3
+      name: Store Test Reports on S3
+      if: always()
+      with:
+        retention-days: 14
+        if-no-files-found: error
+        path:
+          test-reports-*.zip
+    - name: Display and upload test statistics (Click Me)
+      if: always()
+      # temporary hack: set CIRCLE_* vars, until we update
+      # tools/stats/print_test_stats.py to natively support GitHub Actions
+      env:
+        AWS_DEFAULT_REGION: us-east-1
+        BRANCH: ${{ steps.parse-ref.outputs.branch }}
+        JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test
+        PR_NUMBER: ${{ github.event.pull_request.number }}
+        SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+        TAG: ${{ steps.parse-ref.outputs.tag }}
+        WORKFLOW_ID: '${{ github.run_id }}'
+      shell: bash
+      run: |
+        python3 -m pip install -r requirements.txt
+        python3 -m pip install boto3==1.19.12
+        python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+    - name: Hold runner for 2 hours or until ssh sessions have drained
+      # Always hold for active ssh sessions
+      if: always()
+      run: .github/scripts/wait_for_ssh_to_drain.sh
+    - name: Chown workspace
+      if: always()
+      run: |
+        # Ensure the working directory gets chowned back to the current user
+        docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
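The hunks above replace runtime matrix expressions such as ${{ matrix.config }} with values fixed at workflow-generation time. A minimal Python sketch of that idea, assuming hypothetical names (TestShard, file_suffix) rather than the actual generator API, shows how a literal suffix like "xla-1-1-linux.2xlarge" could be derived from a (config, shard, num_shards, runner) tuple:

    from typing import NamedTuple

    class TestShard(NamedTuple):
        config: str      # e.g. "xla" or "smoke_tests"
        shard: int       # 1-based shard index
        num_shards: int  # total shards for this test config
        runner: str      # e.g. "linux.2xlarge"

    def file_suffix(t: TestShard) -> str:
        # The '${{ github.job }}-' prefix stays a runtime expression in the YAML;
        # everything after it is a literal written out by the generator.
        return f"{t.config}-{t.shard}-{t.num_shards}-{t.runner}"

    assert file_suffix(TestShard("xla", 1, 1, "linux.2xlarge")) == "xla-1-1-linux.2xlarge"

Baking the values in removes the generate-test-matrix job and the fromJson() indirection, so each test job is readable directly in the generated workflow file.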
+    - name: Kill containers, clean up images
+      if: always()
+      run: |
+        # ignore expansion of "docker ps -q" since it could be empty
+        # shellcheck disable=SC2046
+        docker stop $(docker ps -q) || true
+        # Prune all of the docker images
+        docker system prune -af
+  test_smoke_tests_1_1:
+    name: test (smoke_tests, 1, 1, linux.2xlarge)
+    needs: build
+    runs-on: linux.2xlarge
+    timeout-minutes: 240
+    env:
+      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
+      JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test
+      TEST_CONFIG: smoke_tests
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
+      PR_BODY: ${{ github.event.pull_request.body }}
+    steps:
+    - name: Display EC2 information
+      shell: bash
+      run: |
+        set -euo pipefail
+        function get_ec2_metadata() {
+          # Pulled from instance metadata endpoint for EC2
+          # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+          category=$1
+          curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+        }
+        echo "ami-id: $(get_ec2_metadata ami-id)"
+        echo "instance-id: $(get_ec2_metadata instance-id)"
+        echo "instance-type: $(get_ec2_metadata instance-type)"
+    - name: Log in to ECR
+      env:
+        AWS_RETRY_MODE: standard
+        AWS_MAX_ATTEMPTS: 5
+      run: |
+        AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+        retry () {
+          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+        }
+        retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
+          --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
+    - name: Chown workspace
+      run: |
+        retry () {
+          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+        }
+        retry docker pull "${ALPINE_IMAGE}"
+        # Ensure the working directory gets chowned back to the current user
+        docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
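Several of the steps above guard flaky network operations (docker pull, ECR login) with a three-attempt shell helper: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@"). A rough Python equivalent of the same pattern, with an illustrative name and signature rather than anything from this repository:

    import subprocess
    import time

    def retry(cmd, delays=(1, 2)):
        """Run cmd, retrying with short pauses, mirroring the shell helper."""
        for delay in delays:
            if subprocess.run(cmd).returncode == 0:
                return
            time.sleep(delay)
        # Final attempt propagates the failure, like the trailing "$@" above.
        subprocess.run(cmd, check=True)

    # Example: retry(["docker", "pull", image]) mirrors: retry docker pull "${DOCKER_IMAGE}"

The last attempt is left unguarded so a persistent failure still fails the step.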
+    - name: Clean workspace
+      run: |
+        rm -rf "${GITHUB_WORKSPACE}"
+        mkdir "${GITHUB_WORKSPACE}"
+    - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+      uses: seemethere/add-github-ssh-key@v1
+      with:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    - name: Preserve github env variables for use in docker
+      run: |
+        env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+    - name: Checkout PyTorch
+      uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+      with:
+        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+        # deep clone, to allow use of git merge-base
+        fetch-depth: 0
+        submodules: recursive
+    - name: Clean PyTorch checkout
+      run: |
+        # Remove any artifacts from the previous checkouts
+        git clean -fxd
+    - name: Pull Docker image
+      run: |
+        retry () {
+          "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+        }
+        retry docker pull "${DOCKER_IMAGE}"
+    - name: Determine shm-size
+      run: |
+        shm_size="1g"
+        case "${BUILD_ENVIRONMENT}" in
+          *cuda*)
+            shm_size="2g"
+            ;;
+          *rocm*)
+            shm_size="8g"
+            ;;
+        esac
+        echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+    - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+      name: Download PyTorch Build Artifacts
+      with:
+        name: ${{ env.BUILD_ENVIRONMENT }}
+    - name: Unzip artifacts
+      run: |
+        unzip -o artifacts.zip
+    - name: Output disk space left
+      run: |
+        sudo df -H
+    - name: Parse ref
+      id: parse-ref
+      run: .github/scripts/parse_ref.py
+    - name: Test
+      env:
+        PR_NUMBER: ${{ github.event.pull_request.number }}
+        BRANCH: ${{ steps.parse-ref.outputs.branch }}
+      # Time out the test phase after 240 minutes
+      timeout-minutes: 240
+      run: |
+        set -x
+
+        if [[ $TEST_CONFIG == 'multigpu' ]]; then
+          TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+        elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
+          TEST_COMMAND=.jenkins/caffe2/test.sh
+        else
+          TEST_COMMAND=.jenkins/pytorch/test.sh
+        fi
+        PROXY_ENV=
+        # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now
+        # We should investigate whether or not there's a list of hostnames we can add to no_proxy to
+        # make it so that we shouldn't have to fully disable squid for XLA tests
+        if [[ $TEST_CONFIG != 'xla' ]]; then
+          # shellcheck disable=SC2089
+          PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock"
+        fi
+        # detached container should get cleaned up by teardown_ec2_linux
+        # TODO: Stop building test binaries as part of the build phase
+        # Used for GPU_FLAG since that doesn't play nice
+        # shellcheck disable=SC2086,SC2090
+        container_name=$(docker run \
+          ${GPU_FLAG:-} \
+          -e BUILD_ENVIRONMENT \
+          -e PR_NUMBER \
+          -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+          -e GITHUB_ACTIONS \
+          -e IN_CI \
+          -e IS_GHA \
+          -e BRANCH \
+          -e SHA1 \
+          -e AWS_DEFAULT_REGION \
+          -e IN_WHEEL_TEST \
+          -e SHARD_NUMBER \
+          -e JOB_BASE_NAME \
+          -e TEST_CONFIG \
+          -e NUM_TEST_SHARDS \
+          -e PR_BODY \
+          -e PYTORCH_RETRY_TEST_CASES \
+          -e PR_LABELS \
+          -e MAX_JOBS="$(nproc --ignore=2)" \
+          -e SCCACHE_BUCKET \
+          -e XLA_CUDA \
+          -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+          ${PROXY_ENV} \
+          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+          --ulimit stack=10485760:83886080 \
+          --security-opt seccomp=unconfined \
+          --cap-add=SYS_PTRACE \
+
--ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 2774ac4b66e..fe37c106670 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -124,47 +124,19 @@ jobs: run: | rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.4xlarge - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: + test_distributed_1_1: + name: test (distributed, 1, 1, windows.4xlarge) timeout-minutes: 240 env: JOB_BASE_NAME: win-vs2019-cpu-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: distributed http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" PR_BODY: ${{ github.event.pull_request.body }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + needs: build + runs-on: windows.4xlarge steps: - name: Display EC2 information 
shell: bash @@ -223,7 +195,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.4xlarge' shell: powershell run: | # -ir => recursive include all files in pattern @@ -239,7 +211,457 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cpu-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, windows.4xlarge) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cpu-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: smoke_tests + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.4xlarge + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: 
$(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cpu-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ 
github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_1_1: + name: test (default, 1, 2, windows.4xlarge) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cpu-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.4xlarge + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: 
seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cpu-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_2_1: + name: test (default, 2, 2, windows.4xlarge) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cpu-py3-test + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.4xlarge + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - uses: 
seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.4xlarge' shell: powershell run: | # -ir => recursive include all files in pattern diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml new file mode 100644 index 00000000000..64322fc55e2 --- /dev/null +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml @@ -0,0 +1,598 @@ +# @generated DO NOT EDIT MANUALLY +# Template is at: .github/templates/windows_ci_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: win-vs2019-cuda11.3-py3-smoke + +on: + pull_request: + push: + tags: + - 'ciflow/all/*' + - 'ciflow/cuda/*' + - 'ciflow/trunk/*' + - 'ciflow/win/*' + workflow_dispatch: + +env: + BUILD_ENVIRONMENT: win-vs2019-cuda11.3-py3-smoke + BUILD_WHEEL: 1 + MAX_JOBS: 8 + CUDA_VERSION: "11.3" + IN_CI: 1 + IS_GHA: 1 + INSTALL_WINDOWS_SDK: 1 + PYTHON_VERSION: "3.8" + PYTORCH_RETRY_TEST_CASES: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + SCCACHE_BUCKET: "ossci-compiler-cache" + VC_PRODUCT: "BuildTools" + VC_VERSION: "" + VS_VERSION: "16.8.6" + VC_YEAR: "2019" + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + no_proxy: localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TORCH_CUDA_ARCH_LIST: "7.0" + USE_CUDA: 1 + +concurrency: + group: win-vs2019-cuda11.3-py3-smoke-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build: + runs-on: "windows.4xlarge" + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-build + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + steps: + - name: print labels + run: echo "${PR_LABELS}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Build + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + BRANCH: ${{ steps.parse-ref.outputs.branch }} + run: | + .jenkins/pytorch/win-build.sh + # Upload to github so that people can click and download artifacts + - name: Upload artifacts to s3 + uses: seemethere/upload-artifact-s3@v3 + with: + retention-days: 14 + if-no-files-found: error + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Cleanup build-results and workspaces + if: always() + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" + rm -rf ./* + test_force_on_cpu_1_1: + name: test (force_on_cpu, 1, 1, windows.4xlarge) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: force_on_cpu + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.4xlarge + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats 
--upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*
+  test_distributed_1_1:
+    name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu)
+    timeout-minutes: 240
+    env:
+      JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
+      TEST_CONFIG: distributed
+      http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      PR_BODY: ${{ github.event.pull_request.body }}
+    needs: build
+    runs-on: windows.8xlarge.nvidia.gpu
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+      - name: Install Visual Studio 2019 toolchain
+        shell: powershell
+        run: |
+          .\.circleci\scripts\vs_install.ps1
+      - name: Install Cuda
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cuda_install.sh
+      - name: Install Cudnn
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cudnn_install.sh
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          path: C:\${{ github.run_id }}\build-results
+      - name: Check build-results folder
+        shell: powershell
+        run: |
+          tree /F C:\$Env:GITHUB_RUN_ID\build-results
+      # Needed for coverage in win-test.sh
+      - uses: actions/setup-python@v2
+        name: Setup Python3
+        with:
+          python-version: '3.x'
+      - name: Test
+        shell: bash
+        env:
+          PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/
+        # Time out the test phase after 3.5 hours
+        timeout-minutes: 210
+        run: |
+          .jenkins/pytorch/win-test.sh
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Downloaded JSONs on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Reports on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on windows, just try to default to utf-8 if possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*
+  test_smoke_tests_1_1:
+    name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu)
+    timeout-minutes: 240
+    env:
+      JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
+      TEST_CONFIG: smoke_tests
+      http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      PR_BODY: ${{ github.event.pull_request.body }}
+    needs: build
+    runs-on: windows.8xlarge.nvidia.gpu
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+      - name: Install Visual Studio 2019 toolchain
+        shell: powershell
+        run: |
+          .\.circleci\scripts\vs_install.ps1
+      - name: Install Cuda
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cuda_install.sh
+      - name: Install Cudnn
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cudnn_install.sh
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          path: C:\${{ github.run_id }}\build-results
+      - name: Check build-results folder
+        shell: powershell
+        run: |
+          tree /F C:\$Env:GITHUB_RUN_ID\build-results
+      # Needed for coverage in win-test.sh
+      - uses: actions/setup-python@v2
+        name: Setup Python3
+        with:
+          python-version: '3.x'
+      - name: Test
+        shell: bash
+        env:
+          PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/
+        # Time out the test phase after 3.5 hours
+        timeout-minutes: 210
+        run: |
+          .jenkins/pytorch/win-test.sh
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Downloaded JSONs on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Reports on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on windows, just try to default to utf-8 if possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*
diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index 294d56f7409..98fef6ac396 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -4,7 +4,6 @@
 name: win-vs2019-cuda11.3-py3
 
 on:
-  pull_request:
   push:
     tags:
       - 'ciflow/all/*'
@@ -133,47 +132,19 @@ jobs:
         run: |
           rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}"
           rm -rf ./*
-
-  generate-test-matrix:
-    needs: build
-    runs-on: ubuntu-18.04
-    timeout-minutes: 240
-    env:
-      TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu
-      NUM_TEST_SHARDS: 2
-      NUM_TEST_SHARDS_ON_PULL_REQUEST: 0
-      NOGPU_RUNNER_TYPE: windows.4xlarge
-      ENABLE_FORCE_ON_CPU_TEST: 1
-      RUN_SMOKE_TESTS_ONLY_ON_PR: True
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
-    container:
-      image: python:3.9
-    steps:
-      - name: Install dependencies
-        run: pip install typing-extensions==3.10
-      - name: Clone pytorch/pytorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-      - name: Generating test matrix
-        id: set-matrix
-        run: .github/scripts/generate_pytorch_test_matrix.py
-
-  test:
+  test_force_on_cpu_1_1:
+    name: test (force_on_cpu, 1, 1, windows.4xlarge)
     timeout-minutes: 240
     env:
       JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
-      SHARD_NUMBER: ${{ matrix.shard }}
-      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
-      TEST_CONFIG: ${{ matrix.config }}
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
+      TEST_CONFIG: force_on_cpu
       http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       PR_BODY: ${{ github.event.pull_request.body }}
-    needs: [build, generate-test-matrix]
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
+    needs: build
+    runs-on: windows.4xlarge
     steps:
       - name: Display EC2 information
         shell: bash
@@ -207,16 +178,6 @@ jobs:
         shell: powershell
         run: |
           .\.circleci\scripts\vs_install.ps1
-      - name: Install Cuda
-        if: ${{ matrix.config != 'force_on_cpu' }}
-        shell: bash
-        run: |
-          .circleci/scripts/windows_cuda_install.sh
-      - name: Install Cudnn
-        if: ${{ matrix.config != 'force_on_cpu' }}
-        shell: bash
-        run: |
-          .circleci/scripts/windows_cudnn_install.sh
       - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
         name: Download PyTorch Build Artifacts
         with:
@@ -242,7 +203,7 @@ jobs:
       - name: Zip JSONs for upload
         if: always()
         env:
-          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+          FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge'
         shell: powershell
         run: |
           # -ir => recursive include all files in pattern
@@ -258,7 +219,639 @@ jobs:
       - name: Zip test reports for upload
         if: always()
         env:
-          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+          FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Reports on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on windows, just try to default to utf-8 if possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*
+  test_distributed_1_1:
+    name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu)
+    timeout-minutes: 240
+    env:
+      JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
+      TEST_CONFIG: distributed
+      http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      PR_BODY: ${{ github.event.pull_request.body }}
+    needs: build
+    runs-on: windows.8xlarge.nvidia.gpu
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+      - name: Install Visual Studio 2019 toolchain
+        shell: powershell
+        run: |
+          .\.circleci\scripts\vs_install.ps1
+      - name: Install Cuda
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cuda_install.sh
+      - name: Install Cudnn
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cudnn_install.sh
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          path: C:\${{ github.run_id }}\build-results
+      - name: Check build-results folder
+        shell: powershell
+        run: |
+          tree /F C:\$Env:GITHUB_RUN_ID\build-results
+      # Needed for coverage in win-test.sh
+      - uses: actions/setup-python@v2
+        name: Setup Python3
+        with:
+          python-version: '3.x'
+      - name: Test
+        shell: bash
+        env:
+          PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/
+        # Time out the test phase after 3.5 hours
+        timeout-minutes: 210
+        run: |
+          .jenkins/pytorch/win-test.sh
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Downloaded JSONs on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Reports on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on windows, just try to default to utf-8 if possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*
+  test_smoke_tests_1_1:
+    name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu)
+    timeout-minutes: 240
+    env:
+      JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 1
+      TEST_CONFIG: smoke_tests
+      http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      PR_BODY: ${{ github.event.pull_request.body }}
+    needs: build
+    runs-on: windows.8xlarge.nvidia.gpu
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+      - name: Install Visual Studio 2019 toolchain
+        shell: powershell
+        run: |
+          .\.circleci\scripts\vs_install.ps1
+      - name: Install Cuda
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cuda_install.sh
+      - name: Install Cudnn
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cudnn_install.sh
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          path: C:\${{ github.run_id }}\build-results
+      - name: Check build-results folder
+        shell: powershell
+        run: |
+          tree /F C:\$Env:GITHUB_RUN_ID\build-results
+      # Needed for coverage in win-test.sh
+      - uses: actions/setup-python@v2
+        name: Setup Python3
+        with:
+          python-version: '3.x'
+      - name: Test
+        shell: bash
+        env:
+          PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/
+        # Time out the test phase after 3.5 hours
+        timeout-minutes: 210
+        run: |
+          .jenkins/pytorch/win-test.sh
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Downloaded JSONs on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Reports on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on windows, just try to default to utf-8 if possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*
+  test_default_1_1:
+    name: test (default, 1, 2, windows.8xlarge.nvidia.gpu)
+    timeout-minutes: 240
+    env:
+      JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
+      SHARD_NUMBER: 1
+      NUM_TEST_SHARDS: 2
+      TEST_CONFIG: default
+      http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      PR_BODY: ${{ github.event.pull_request.body }}
+    needs: build
+    runs-on: windows.8xlarge.nvidia.gpu
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+      - name: Install Visual Studio 2019 toolchain
+        shell: powershell
+        run: |
+          .\.circleci\scripts\vs_install.ps1
+      - name: Install Cuda
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cuda_install.sh
+      - name: Install Cudnn
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cudnn_install.sh
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          path: C:\${{ github.run_id }}\build-results
+      - name: Check build-results folder
+        shell: powershell
+        run: |
+          tree /F C:\$Env:GITHUB_RUN_ID\build-results
+      # Needed for coverage in win-test.sh
+      - uses: actions/setup-python@v2
+        name: Setup Python3
+        with:
+          python-version: '3.x'
+      - name: Test
+        shell: bash
+        env:
+          PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/
+        # Time out the test phase after 3.5 hours
+        timeout-minutes: 210
+        run: |
+          .jenkins/pytorch/win-test.sh
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Downloaded JSONs on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Reports on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on windows, just try to default to utf-8 if possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*
+  test_default_2_1:
+    name: test (default, 2, 2, windows.8xlarge.nvidia.gpu)
+    timeout-minutes: 240
+    env:
+      JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
+      SHARD_NUMBER: 2
+      NUM_TEST_SHARDS: 2
+      TEST_CONFIG: default
+      http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
+      PR_BODY: ${{ github.event.pull_request.body }}
+    needs: build
+    runs-on: windows.8xlarge.nvidia.gpu
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+      - name: Install Visual Studio 2019 toolchain
+        shell: powershell
+        run: |
+          .\.circleci\scripts\vs_install.ps1
+      - name: Install Cuda
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cuda_install.sh
+      - name: Install Cudnn
+        shell: bash
+        run: |
+          .circleci/scripts/windows_cudnn_install.sh
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          path: C:\${{ github.run_id }}\build-results
+      - name: Check build-results folder
+        shell: powershell
+        run: |
+          tree /F C:\$Env:GITHUB_RUN_ID\build-results
+      # Needed for coverage in win-test.sh
+      - uses: actions/setup-python@v2
+        name: Setup Python3
+        with:
+          python-version: '3.x'
+      - name: Test
+        shell: bash
+        env:
+          PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/
+        # Time out the test phase after 3.5 hours
+        timeout-minutes: 210
+        run: |
+          .jenkins/pytorch/win-test.sh
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu'
+        shell: powershell
+        run: |
+          # -ir => recursive include all files in pattern
+          7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json'
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store Test Downloaded JSONs on S3
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu'
         shell: powershell
         run: |
           # -ir => recursive include all files in pattern