diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json
index 804826b1d1a..2951c6ada72 100644
--- a/.github/generated-ciflow-ruleset.json
+++ b/.github/generated-ciflow-ruleset.json
@@ -16,6 +16,7 @@
       "libtorch-linux-xenial-cuda11.3-py3.7-gcc7",
       "linux-bionic-cuda10.2-py3.9-gcc7",
       "linux-bionic-py3.7-clang9",
+      "linux-bionic-rocm4.5-py3.7",
       "linux-docs",
       "linux-docs-push",
       "linux-vulkan-bionic-py3.7-clang9",
@@ -150,6 +151,7 @@
       "libtorch-linux-xenial-cuda11.3-py3.7-gcc7",
       "linux-bionic-cuda10.2-py3.9-gcc7",
       "linux-bionic-py3.7-clang9",
+      "linux-bionic-rocm4.5-py3.7",
       "linux-docs",
       "linux-docs-push",
       "linux-vulkan-bionic-py3.7-clang9",
@@ -196,6 +198,9 @@
     "ciflow/onnx": [
       "linux-xenial-py3.7-clang7-onnx"
     ],
+    "ciflow/rocm": [
+      "linux-bionic-rocm4.5-py3.7"
+    ],
     "ciflow/sanitizers": [
       "linux-xenial-py3.7-clang7-asan"
     ],
@@ -231,6 +236,7 @@
       "libtorch-linux-xenial-cuda11.3-py3.7-gcc7",
       "linux-bionic-cuda10.2-py3.9-gcc7",
       "linux-bionic-py3.7-clang9",
+      "linux-bionic-rocm4.5-py3.7",
       "linux-docs",
       "linux-vulkan-bionic-py3.7-clang9",
       "linux-xenial-cuda11.3-py3.7-gcc7",
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index 506d62aae08..91d49165cbf 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -29,9 +29,22 @@ WINDOWS_RUNNERS = {
 LINUX_CPU_TEST_RUNNER = "linux.2xlarge"
 # contains 1 gpu
 LINUX_CUDA_TEST_RUNNER = "linux.4xlarge.nvidia.gpu"
+# contains 4 gpus
+LINUX_ROCM_TEST_RUNNER = "linux.rocm.gpu"
 LINUX_RUNNERS = {
     LINUX_CPU_TEST_RUNNER,
     LINUX_CUDA_TEST_RUNNER,
+    LINUX_ROCM_TEST_RUNNER,
+}
+
+LINUX_DISTRIBUTED_GPU_RUNNERS = {
+    LINUX_CUDA_TEST_RUNNER : "linux.8xlarge.nvidia.gpu",
+    LINUX_ROCM_TEST_RUNNER : LINUX_ROCM_TEST_RUNNER,
+}
+
+LINUX_MULTIGPU_RUNNERS = {
+    LINUX_CUDA_TEST_RUNNER : "linux.16xlarge.nvidia.gpu",
+    LINUX_ROCM_TEST_RUNNER : LINUX_ROCM_TEST_RUNNER,
 }
 
 MACOS_TEST_RUNNER_10_15 = "macos-10.15"
@@ -46,6 +59,9 @@ CUDA_RUNNERS = {
     WINDOWS_CUDA_TEST_RUNNER,
     LINUX_CUDA_TEST_RUNNER,
 }
+ROCM_RUNNERS = {
+    LINUX_ROCM_TEST_RUNNER,
+}
 CPU_RUNNERS = {
     WINDOWS_CPU_TEST_RUNNER,
     LINUX_CPU_TEST_RUNNER,
@@ -55,6 +71,7 @@ LABEL_CIFLOW_ALL = "ciflow/all"
 LABEL_CIFLOW_BAZEL = "ciflow/bazel"
 LABEL_CIFLOW_CPU = "ciflow/cpu"
 LABEL_CIFLOW_CUDA = "ciflow/cuda"
+LABEL_CIFLOW_ROCM = "ciflow/rocm"
 LABEL_CIFLOW_DOCS = "ciflow/docs"
 LABEL_CIFLOW_DEFAULT = "ciflow/default"
 LABEL_CIFLOW_LIBTORCH = "ciflow/libtorch"
@@ -164,6 +181,8 @@ class CIWorkflow:
 
     # Optional fields
     test_runner_type: str = ''
+    multigpu_runner_type: str = ''
+    distributed_gpu_runner_type: str = ''
     ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig)
     cuda_version: str = ''
     docker_image_base: str = ''
@@ -205,6 +224,9 @@ class CIWorkflow:
         if self.fx2trt_test:
             self.enable_fx2trt_test = 1
 
+        self.multigpu_runner_type = LINUX_MULTIGPU_RUNNERS.get(self.test_runner_type, "linux.16xlarge.nvidia.gpu")
+        self.distributed_gpu_runner_type = LINUX_DISTRIBUTED_GPU_RUNNERS.get(self.test_runner_type, "linux.8xlarge.nvidia.gpu")
+
         # If num_test_shards_on_pull_request is not user-defined, default to num_test_shards unless we are
         # only running smoke tests on the pull request.
         if self.num_test_shards_on_pull_request == -1:
@@ -235,6 +257,8 @@ class CIWorkflow:
         assert self.test_runner_type != ''
         if self.test_runner_type in CUDA_RUNNERS:
             assert LABEL_CIFLOW_CUDA in self.ciflow_config.labels
+        if self.test_runner_type in ROCM_RUNNERS:
+            assert LABEL_CIFLOW_ROCM in self.ciflow_config.labels
         if self.test_runner_type in CPU_RUNNERS and not self.exclude_test:
             assert LABEL_CIFLOW_CPU in self.ciflow_config.labels
         if self.is_scheduled:
@@ -576,6 +600,16 @@ LINUX_WORKFLOWS = [
             labels=set([LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU]),
         ),
     ),
+    CIWorkflow(
+        arch="linux",
+        build_environment="linux-bionic-rocm4.5-py3.7",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.5-py3.7",
+        test_runner_type=LINUX_ROCM_TEST_RUNNER,
+        num_test_shards=2,
+        ciflow_config=CIFlowConfig(
+            labels=set([LABEL_CIFLOW_LINUX, LABEL_CIFLOW_ROCM]),
+        ),
+    ),
     CIWorkflow(
         arch="linux",
         build_environment="libtorch-linux-xenial-cuda11.3-py3.7-gcc7",
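
Aside on the runner maps above: LINUX_MULTIGPU_RUNNERS and LINUX_DISTRIBUTED_GPU_RUNNERS let CUDA jobs scale up to larger EC2 instances while ROCm jobs stay on the single linux.rocm.gpu pool (each of those hosts already has 4 GPUs). A minimal sketch of the lookup as done in __post_init__ (constant names are copied from the script; the resolve_multigpu_runner helper is illustrative only, not part of the PR):

import sys

LINUX_CUDA_TEST_RUNNER = "linux.4xlarge.nvidia.gpu"
LINUX_ROCM_TEST_RUNNER = "linux.rocm.gpu"

LINUX_MULTIGPU_RUNNERS = {
    LINUX_CUDA_TEST_RUNNER: "linux.16xlarge.nvidia.gpu",
    LINUX_ROCM_TEST_RUNNER: LINUX_ROCM_TEST_RUNNER,
}

def resolve_multigpu_runner(test_runner_type: str) -> str:
    # dict.get() keeps the historical CUDA-sized default for any runner
    # type without an explicit mapping (e.g. linux.2xlarge CPU jobs)
    return LINUX_MULTIGPU_RUNNERS.get(test_runner_type, "linux.16xlarge.nvidia.gpu")

assert resolve_multigpu_runner(LINUX_ROCM_TEST_RUNNER) == "linux.rocm.gpu"
assert resolve_multigpu_runner("linux.2xlarge") == "linux.16xlarge.nvidia.gpu"
print("runner mapping behaves as expected", file=sys.stderr)
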
diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2
index 910cc1524c2..2071d8e26bf 100644
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@@ -104,6 +104,45 @@ concurrency:
           env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
 {%- endmacro -%}
 
+{%- macro setup_rocm_linux() -%}
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: Set DOCKER_HOST
+        run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}"
+      - name: Runner health check system info
+        if: always()
+        run: |
+          cat /etc/os-release || true
+          cat /etc/apt/sources.list.d/rocm.list || true
+          cat /opt/rocm/.info/version || true
+          whoami
+      - name: Runner health check rocm-smi
+        if: always()
+        run: |
+          rocm-smi
+      - name: Runner health check rocminfo
+        if: always()
+        run: |
+          rocminfo
+      - name: Runner health check GPU count
+        if: always()
+        run: |
+          ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
+          if [[ "x$ngpu" != "x4" ]]; then
+            echo "Failed to detect 4 GPUs on the runner"
+            exit 1
+          fi
+      - name: Runner health check disconnect on failure
+        if: ${{ failure() }}
+        run: |
+          killall runsvc.sh
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+{%- endmacro -%}
+
 {%- macro teardown_ec2_linux(pytorch_directory="") -%}
       - name: Hold runner for 2 hours or until ssh sessions have drained
 {%- if pytorch_directory %}
@@ -127,6 +166,17 @@ concurrency:
           docker system prune -af
 {%- endmacro -%}
 
+{%- macro teardown_rocm_linux() -%}
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+{%- endmacro -%}
+
 {%- macro checkout_pytorch(submodules) -%}
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
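
The GPU-count health check above greps rocminfo for agent names beginning with gfx and fails the job unless exactly four are found. A rough Python equivalent of that counting logic, run against captured rocminfo output (the sample text is made up; real output lists one "Name: gfx..." block per GPU agent, plus CPU agents whose names do not start with gfx):

import re

sample = """\
  Name:                    AMD EPYC 7282 16-Core Processor
  Name:                    gfx906
  Name:                    gfx906
  Name:                    gfx906
  Name:                    gfx906
"""

# mirrors: rocminfo | grep -c -E 'Name:.*\sgfx'
# (one match per line here, so findall counts lines just as grep -c does)
ngpu = len(re.findall(r"Name:.*\sgfx", sample))
if ngpu != 4:
    raise SystemExit("Failed to detect 4 GPUs on the runner")
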
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 8252e661a38..b670f2a6873 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -162,8 +162,8 @@ jobs:
       ENABLE_XLA_TEST: !{{ enable_xla_test }}
       ENABLE_NOARCH_TEST: !{{ enable_noarch_test }}
       NUM_TEST_SHARDS: !{{ num_test_shards }}
-      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
-      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
+      MULTIGPU_RUNNER_TYPE: !{{ multigpu_runner_type }}
+      DISTRIBUTED_GPU_RUNNER_TYPE: !{{ distributed_gpu_runner_type }}
       NOGPU_RUNNER_TYPE: linux.2xlarge
       PR_BODY: ${{ github.event.pull_request.body }}
     outputs:
@@ -196,17 +196,28 @@ jobs:
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
     steps:
+{%- if 'rocm' in test_runner_type %}
+    !{{ common.setup_rocm_linux() }}
+{%- else %}
     !{{ common.setup_ec2_linux() }}
+{%- endif %}
     !{{ common.checkout_pytorch("recursive") }}
       - name: Pull Docker image
         run: |
           !{{ common.add_retry_to_env() }}
           retry docker pull "${DOCKER_IMAGE}"
+{%- if 'rocm' in test_runner_type %}
+      - name: ROCm set GPU_FLAG
+        if: ${{ contains(env.BUILD_ENVIRONMENT, 'rocm') && !contains(matrix.config, 'nogpu') }}
+        run: |
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+{%- else %}
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
         if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
        run: |
          bash .github/scripts/install_nvidia_utils_linux.sh
          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
+{%- endif %}
       - name: Determine shm-size
         run: |
           shm_size="1g"
@@ -228,7 +239,11 @@ jobs:
           unzip -o artifacts.zip
       - name: Output disk space left
         run: |
+{%- if 'rocm' in test_runner_type %}
+          df -H
+{%- else %}
           sudo df -H
+{%- endif %}
     !{{ common.parse_ref() }}
       - name: Test
         env:
@@ -246,6 +261,7 @@ jobs:
           else
             TEST_COMMAND=.jenkins/pytorch/test.sh
           fi
+{%- if 'rocm' not in test_runner_type %}
           PROXY_ENV=
           # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now
           # We should investigate whether or not there's a list of hostnames we can add to no_proxy to
@@ -254,6 +270,7 @@ jobs:
             # shellcheck disable=SC2089
             PROXY_ENV="-e http_proxy=!{{ common.squid_proxy }} -e https_proxy=!{{ common.squid_proxy }} -e no_proxy=!{{ common.squid_no_proxy }}"
           fi
+{%- endif %}
           # detached container should get cleaned up by teardown_ec2_linux
           # TODO: Stop building test binaries as part of the build phase
           # Used for GPU_FLAG since that doesn't play nice
@@ -280,12 +297,16 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+{%- if 'rocm' not in test_runner_type %}
             ${PROXY_ENV} \
+{%- endif %}
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --ulimit stack=10485760:83886080 \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
+{%- if 'rocm' not in test_runner_type %}
             --ipc=host \
+{%- endif %}
             --shm-size="${SHM_SIZE}" \
             --tty \
             --detach \
@@ -295,17 +316,35 @@ jobs:
             -w /var/lib/jenkins/workspace \
             "${DOCKER_IMAGE}"
           )
+{%- if 'rocm' in test_runner_type %}
+          # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
+          docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}"
+          # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
+          docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
+{%- else %}
           docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
+{%- endif %}
+{%- if 'rocm' not in test_runner_type %}
       - name: Chown workspace
         if: always()
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+{%- endif %}
     !{{ common.render_test_results() }}
+{%- if 'rocm' in test_runner_type %}
+    !{{ common.upload_downloaded_files(name='linux', use_s3=False) }}
+    !{{ common.upload_test_reports(name='linux', artifact_name="test-reports", use_s3=False) }}
+{%- else %}
     !{{ common.upload_downloaded_files(name='linux') }}
     !{{ common.upload_test_reports(name='linux') }}
+{%- endif %}
     !{{ common.upload_test_statistics(build_environment) }}
+{%- if 'rocm' in test_runner_type %}
    !{{ common.teardown_rocm_linux() }}
+{%- else %}
    !{{ common.teardown_ec2_linux() }}
+{%- endif %}
 {% endblock %}
 {%- endif -%}
 {%- if enable_doc_jobs %}
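
The template now branches on the runner-type string itself ('rocm' in test_runner_type) rather than on a dedicated flag. A small jinja2 sketch of how such a conditional renders alongside the generator's "!{{ ... }}" variable delimiters; the Environment settings here are an assumption inferred from the template syntax above, not copied from generate_ci_workflows.py:

import jinja2

# "!{{ ... }}" for variables matches the templates above; block tags stay "{% ... %}"
env = jinja2.Environment(
    variable_start_string="!{{",
    variable_end_string="}}",
    undefined=jinja2.StrictUndefined,
)
template = env.from_string(
    "{%- if 'rocm' in test_runner_type %}"
    "setup_rocm_linux"
    "{%- else %}"
    "setup_ec2_linux"
    "{%- endif %}"
)
assert template.render(test_runner_type="linux.rocm.gpu") == "setup_rocm_linux"
assert template.render(test_runner_type="linux.4xlarge.nvidia.gpu") == "setup_ec2_linux"

Keying on the runner-type string keeps the template's two code paths (EC2 setup/teardown vs. the rootless-docker ROCm runners) in one file instead of forking the whole workflow template.
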
diff --git a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml
new file mode 100644
index 00000000000..839a97ed190
--- /dev/null
+++ b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml
@@ -0,0 +1,517 @@
+# @generated DO NOT EDIT MANUALLY
+# Template is at:    .github/templates/linux_ci_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: linux-bionic-rocm4.5-py3.7
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, unassigned]
+  push:
+    branches:
+      - master
+      - release/*
+  workflow_dispatch:
+
+env:
+  BUILD_ENVIRONMENT: linux-bionic-rocm4.5-py3.7
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.5-py3.7
+  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
+  TORCH_CUDA_ARCH_LIST: 5.2
+  IN_CI: 1
+  IS_GHA: 1
+  # This is used for the phase of adding wheel tests only, will be removed once completed
+  IN_WHEEL_TEST: 1
+  # Used for custom_operator, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
+  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  AWS_DEFAULT_REGION: us-east-1
+  PR_NUMBER: ${{ github.event.pull_request.number }}
+  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+  PYTORCH_RETRY_TEST_CASES: 1
+concurrency:
+  group: linux-bionic-rocm4.5-py3.7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+
+  build:
+    runs-on: linux.2xlarge
+    timeout-minutes: 240
+    if: ${{ (github.repository == 'pytorch/pytorch') && (
+            (github.event_name == 'push') ||
+            (github.event_name == 'schedule') ||
+            (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/rocm') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) ||
+            (false))
+        }}
+    env:
+      JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-build
+      IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assignee.login == 'pytorchbot') }}
+      LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/rocm') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }}
+    outputs:
+      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
+    steps:
+      - name: print labels
+        run: echo "${PR_LABELS}"
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        env:
+          AWS_RETRY_MODE: standard
+          AWS_MAX_ATTEMPTS: 5
+        run: |
+          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
+              --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
+      - name: Chown workspace
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${ALPINE_IMAGE}"
+          # Ensure the working directory gets chowned back to the current user
+          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+      - name: Calculate docker image tag
+        id: calculate-tag
+        run: |
+          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+          echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
+          echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"
+          echo "::set-output name=docker_tag::${DOCKER_TAG}"
+          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
+      - name: Check if image should be built
+        id: check
+        env:
+          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
+        run: |
+          set -x
+          # Check if image already exists, if it does then skip building it
+          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
+            exit 0
+          fi
+          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
+            # if we're on the base branch then use the parent commit
+            MERGE_BASE=$(git rev-parse HEAD~)
+          else
+            # otherwise we're on a PR, so use the most recent base commit
+            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
+          fi
+          # Covers the case where a previous tag doesn't exist for the tree
+          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
git rev-parse "$MERGE_BASE:.circleci/docker"; then + echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" + exit 1 + fi + PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") + # If no image exists but the hash is the same as the previous hash then we should error out here + if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then + echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" + echo " contact the PyTorch team to restore the original images" + exit 1 + fi + echo ::set-output name=rebuild::yes + - name: Build and push docker image + if: ${{ steps.check.outputs.rebuild }} + env: + DOCKER_SKIP_S3_UPLOAD: 1 + working-directory: .circleci/docker + run: | + export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} + ./build_docker.sh + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Build + env: + BRANCH: ${{ steps.parse-ref.outputs.branch }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e IS_GHA \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e GITHUB_RUN_ID \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + run: | + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests==2.26 boto3==1.16.34 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+      - name: Archive artifacts into zip
+        run: |
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store PyTorch Build Artifacts on S3
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            artifacts.zip
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Prune all of the docker images
+          docker system prune -af
+
+  generate-test-matrix:
+    needs: build
+    runs-on: ubuntu-18.04
+    timeout-minutes: 240
+    env:
+      TEST_RUNNER_TYPE: linux.rocm.gpu
+      ENABLE_DISTRIBUTED_TEST: 1
+      ENABLE_JIT_LEGACY_TEST: ''
+      ENABLE_FX2TRT_TEST: ''
+      ENABLE_MULTIGPU_TEST: ''
+      ENABLE_NOGPU_NO_AVX_TEST: ''
+      ENABLE_NOGPU_NO_AVX2_TEST: ''
+      ENABLE_SLOW_TEST: ''
+      ENABLE_DOCS_TEST: ''
+      ENABLE_BACKWARDS_COMPAT_TEST: ''
+      ENABLE_XLA_TEST: ''
+      ENABLE_NOARCH_TEST: ''
+      NUM_TEST_SHARDS: 2
+      MULTIGPU_RUNNER_TYPE: linux.rocm.gpu
+      DISTRIBUTED_GPU_RUNNER_TYPE: linux.rocm.gpu
+      NOGPU_RUNNER_TYPE: linux.2xlarge
+      PR_BODY: ${{ github.event.pull_request.body }}
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
+      ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
+    container:
+      image: python:3.9
+    steps:
+      - name: Install dependencies
+        run: pip install typing-extensions==3.10
+      - name: Clone pytorch/pytorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+      - name: Generating test matrix
+        id: set-matrix
+        run: .github/scripts/generate_pytorch_test_matrix.py
+
+  test:
+    needs: [build, generate-test-matrix]
+    strategy:
+      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 240
+    env:
+      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
+      JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test
+      TEST_CONFIG: ${{ matrix.config }}
+      SHARD_NUMBER: ${{ matrix.shard }}
+      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+      PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+    steps:
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: Set DOCKER_HOST
+        run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}"
+      - name: Runner health check system info
+        if: always()
+        run: |
+          cat /etc/os-release || true
+          cat /etc/apt/sources.list.d/rocm.list || true
+          cat /opt/rocm/.info/version || true
+          whoami
+      - name: Runner health check rocm-smi
+        if: always()
+        run: |
+          rocm-smi
+      - name: Runner health check rocminfo
+        if: always()
+        run: |
+          rocminfo
+      - name: Runner health check GPU count
+        if: always()
+        run: |
+          ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
+          if [[ "x$ngpu" != "x4" ]]; then
+            echo "Failed to detect 4 GPUs on the runner"
+            exit 1
+          fi
+      - name: Runner health check disconnect on failure
+        if: ${{ failure() }}
+        run: |
+          killall runsvc.sh
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+      - name: Pull Docker image
+        run: |
+          retry () {
+            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${DOCKER_IMAGE}"
+      - name: ROCm set GPU_FLAG
+        if: ${{ contains(env.BUILD_ENVIRONMENT, 'rocm') && !contains(matrix.config, 'nogpu') }}
+        run: |
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          df -H
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Test
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+        # Time out the test phase after 240 minutes
+        timeout-minutes: 240
+        run: |
+          set -x
+
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
+            TEST_COMMAND=.jenkins/caffe2/test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          # detached container should get cleaned up by teardown_ec2_linux
+          # TODO: Stop building test binaries as part of the build phase
+          # Used for GPU_FLAG since that doesn't play nice
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IS_GHA \
+            -e BRANCH \
+            -e SHA1 \
+            -e AWS_DEFAULT_REGION \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e PYTORCH_RETRY_TEST_CASES \
+            -e PR_LABELS \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --ulimit stack=10485760:83886080 \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --detach \
+            --name="${container_name}" \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}"
+          )
+          # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
+          docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}"
+          # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
+          docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
+      - name: Install render_test_results dependencies
+        if: always()
+        shell: bash
+        run: |
+          python3 -m pip install junitparser==2.1.1 rich==10.9.0
+      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
+        if: always()
+        shell: bash
+        # Encoding is weird on windows, just try to default to utf-8 if possible
+        env:
+          PYTHONIOENCODING: "utf-8"
+        run: |
+          python3 tools/render_junit.py test/
+      - name: Zip JSONs for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+        run: |
+          # Remove any previous test jsons if they exist
+          rm -f test-jsons-*.zip
+          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
+      - uses: actions/upload-artifact@v2
+        name: Store Test Downloaded JSONs on Github
+        if: always()
+        with:
+          retention-days: 14
+          if-no-files-found: warn
+          path:
+            test-jsons-*.zip
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
+      - uses: actions/upload-artifact@v2
+        name: Store Test Reports on Github
+        if: always()
+        with:
+          name: test-reports
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Display and upload test statistics (Click Me)
+        if: always()
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          TAG: ${{ steps.parse-ref.outputs.tag }}
+          WORKFLOW_ID: '${{ github.run_id }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.19.12
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
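
For reference, the "Calculate docker image tag" / "Check if image should be built" steps in the build job above key the CI image off the git tree hash of .circleci/docker, so the image is rebuilt only when the docker scripts change. A rough Python transliteration of that decision, under the assumption that git and docker are on PATH; the function names are mine, not from the repo, and error handling is simplified:

import subprocess

DOCKER_IMAGE_BASE = "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.5-py3.7"

def docker_tree_hash(rev: str) -> str:
    # mirrors: git rev-parse "<rev>:.circleci/docker"
    return subprocess.check_output(
        ["git", "rev-parse", f"{rev}:.circleci/docker"], text=True
    ).strip()

def image_exists(image: str) -> bool:
    # mirrors: docker manifest inspect "<image>"
    return subprocess.run(
        ["docker", "manifest", "inspect", image], capture_output=True
    ).returncode == 0

def should_rebuild(merge_base: str) -> bool:
    tag = docker_tree_hash("HEAD")
    if image_exists(f"{DOCKER_IMAGE_BASE}:{tag}"):
        return False  # image for this tree hash already pushed; skip the build
    if docker_tree_hash(merge_base) == tag:
        # no image, yet the docker tree is unchanged: the old image was lost
        raise SystemExit("previous image unavailable for the merge-base of this branch")
    return True
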
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index 5628eaf6f0b..1fc4fecf2f8 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -148,23 +148,8 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
     export PYTORCH_ROCM_ARCH="gfx900;gfx906"
   fi
 
+  # hipify sources
   python tools/amd_build/build_amd.py
-  python setup.py install
-
-  # remove sccache wrappers post-build; runtime compilation of MIOpen kernels does not yet fully support them
-  sudo rm -f /opt/cache/bin/cc
-  sudo rm -f /opt/cache/bin/c++
-  sudo rm -f /opt/cache/bin/gcc
-  sudo rm -f /opt/cache/bin/g++
-  pushd /opt/rocm/llvm/bin
-  if [[ -d original ]]; then
-    sudo mv original/clang .
-    sudo mv original/clang++ .
-  fi
-  sudo rm -rf original
-  popd
-
-  exit 0
 fi
 
 # sccache will fail for CUDA builds if all cores are used for compiling
@@ -227,7 +212,7 @@ else
   # ppc64le build fails when WERROR=1
   # set only when building other architectures
   # only use for "python setup.py install" line
-  if [[ "$BUILD_ENVIRONMENT" != *ppc64le* ]]; then
+  if [[ "$BUILD_ENVIRONMENT" != *ppc64le* && "$BUILD_ENVIRONMENT" != *rocm* ]]; then
     WERROR=1 python setup.py bdist_wheel
   else
     python setup.py bdist_wheel
@@ -251,6 +236,25 @@ else
     cp build/.ninja_log dist
   fi
 
+  if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
+    # remove sccache wrappers post-build; runtime compilation of MIOpen kernels does not yet fully support them
+    sudo rm -f /opt/cache/bin/cc
+    sudo rm -f /opt/cache/bin/c++
+    sudo rm -f /opt/cache/bin/gcc
+    sudo rm -f /opt/cache/bin/g++
+    pushd /opt/rocm/llvm/bin
+    if [[ -d original ]]; then
+      sudo mv original/clang .
+      sudo mv original/clang++ .
+    fi
+    sudo rm -rf original
+    popd
+
+    # exit before building custom test artifacts until we resolve cmake error:
+    # static library kineto_LIBRARY-NOTFOUND not found.
+    exit 0
+  fi
+
   CUSTOM_TEST_ARTIFACT_BUILD_DIR=${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-${PWD}/../}
   mkdir -pv "${CUSTOM_TEST_ARTIFACT_BUILD_DIR}"
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 237daa4f162..dfd79233acf 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -80,7 +80,10 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
   rocminfo | grep -E 'Name:.*\sgfx|Marketing'
 
   # Manually set NUM_TEST_SHARDS since Jenkins doesn't do it
-  export NUM_TEST_SHARDS=2
+  # TODO: Can remove this once ROCm migration from Jenkins to GHA is complete.
+  if [[ -z "${GITHUB_ACTIONS}" ]]; then
+    export NUM_TEST_SHARDS=2
+  fi
 fi
 
 # --user breaks ppc64le builds and these packages are already in ppc64le docker
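
The test.sh change above keeps the hard-coded two-shard default for Jenkins only: on GitHub Actions, NUM_TEST_SHARDS arrives through the job environment from the generated test matrix and must not be clobbered. A minimal Python sketch of the same guard, assuming the GITHUB_ACTIONS variable that Actions sets on its runners:

import os

# Jenkins does not define GITHUB_ACTIONS, so it gets the legacy default of 2;
# GHA runs keep whatever the test matrix injected into the environment.
if not os.environ.get("GITHUB_ACTIONS"):
    os.environ.setdefault("NUM_TEST_SHARDS", "2")

print("NUM_TEST_SHARDS =", os.environ.get("NUM_TEST_SHARDS"))
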