Update base for Update on "(2/2) Make TorchScript Preserve Fully Qualified Class Name for Python Exceptions: frontend change"

Reland D33282878. This is the frontend change.

Differential Revision: [D33342569](https://our.internmc.facebook.com/intern/diff/D33342569/)

**NOTE FOR REVIEWERS**: This PR has internal, Facebook-specific changes or comments; please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D33342569/)!

[ghstack-poisoned]
Shunting Zhang 2022-02-15 12:13:40 -08:00
commit 36b39d6692
336 changed files with 18536 additions and 20565 deletions
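
For context on what the commit title refers to, here is a minimal sketch of the intended user-visible behavior. It is not taken from this diff (the hunks below are CI workflow template and generated-YAML churn from rebasing the ghstack base); the custom exception class, the sample message, and the exact error formatting are illustrative assumptions.

```python
# Illustrative sketch only (not part of this diff): what "preserve fully
# qualified class name for Python exceptions" means for TorchScript users.
# The custom exception, message, and printed output are assumptions.
import torch

class MyCustomError(Exception):
    pass

@torch.jit.script
def checked_add(x: int, y: int) -> int:
    if x < 0:
        # Raised inside scripted code; historically this surfaced as a
        # generic torch.jit.Error/RuntimeError without the original class name.
        raise MyCustomError("x must be non-negative")
    return x + y

try:
    checked_add(-1, 2)
except Exception as e:
    # With the frontend change, the error is expected to carry the fully
    # qualified name of the Python exception class (e.g. "__main__.MyCustomError")
    # rather than only a generic TorchScript error type.
    print(type(e).__name__, e)
```
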

View File

@ -14,7 +14,13 @@
{
"name": "OSS CI",
"patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**"],
"approved_by": ["seemethere", "malfet", "suo"],
"approved_by": ["seemethere", "malfet", "suo", "janeyx99"],
"mandatory_app_id": 12274
},
{
"name": "Documentation",
"patterns": ["docs/**", "torch/*docs.py"],
"approved_by": ["mruberry", "ngimel", "albanD", "janeyx99"],
"mandatory_app_id": 12274
}
]

View File

@ -1,4 +1,5 @@
{% import 'common.yml.j2' as common %}
{% import 'upload.yml.j2' as upload %}
{%- block name -%}
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
@ -6,24 +7,6 @@
name: !{{ build_environment }}
{%- endblock %}
{%- macro binary_env(config) -%}
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: !{{ config["package_type"] }}
SKIP_ALL_TESTS: 1
DESIRED_CUDA: cpu
{%- if config["package_type"] == "libtorch" %}
LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }}
DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }}
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
{%- else %}
DESIRED_PYTHON: "!{{ config["python_version"] }}"
{%- endif %}
{%- endmacro %}
{%- macro set_runner_specific_vars() -%}
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -83,7 +66,7 @@ jobs:
{%- else %}
timeout-minutes: !{{ common.timeout_minutes }}
{%- endif %}
!{{ binary_env(config) }}
!{{ upload.binary_env(config, true) }}
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -96,16 +79,8 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
with:
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder") }}
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -129,53 +104,5 @@ jobs:
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
!{{ config["build_name"] }}-upload: # Uploading
runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts
if: ${{ github.repository_owner == 'pytorch' }}
needs: !{{ config["build_name"] }}-build
!{{ binary_env(config) }}
steps:
!{{ common.setup_ec2_linux() }}
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- uses: actions/download-artifact@v2
name: Download Build Artifacts
with:
name: !{{ config["build_name"] }}
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
fi
- name: Upload binaries
env:
PKG_DIR: "${{ runner.temp }}/artifacts"
UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}"
# When running these on pull_request events these should be blank
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }}
ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
run: |
docker run --rm -i \
-e ANACONDA_API_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-e DRY_RUN \
-e PACKAGE_TYPE \
-e PKG_DIR=/artifacts \
-e UPLOAD_CHANNEL \
-e UPLOAD_SUBFOLDER \
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
-v "${GITHUB_WORKSPACE}:/v" \
-w /v \
308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \
bash -c '.circleci/scripts/binary_upload.sh'
!{{ common.teardown_ec2_linux() }}
!{{ upload.upload_binaries(config, has_test=False, use_s3=False) }}
{%- endfor %}

View File

@ -32,17 +32,25 @@
{%- endmacro %}
{%- macro upload_binaries(config, is_windows=False) -%}
{%- macro upload_binaries(config, is_windows=False, has_test=True, use_s3=True) -%}
!{{ config["build_name"] }}-upload: # Uploading
runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts
if: ${{ github.repository_owner == 'pytorch' }}
{%- if has_test %}
needs: !{{ config["build_name"] }}-test
{%- else %}
needs: !{{ config["build_name"] }}-build
{%- endif %}
!{{ binary_env(config, is_windows) }}
steps:
!{{ common.setup_ec2_linux() }}
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
{%- if use_s3 %}
- uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
{%- else %}
- uses: actions/download-artifact@v2
{%- endif %}
name: Download Build Artifacts
with:
name: !{{ config["build_name"] }}

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-arm64-binary-conda
@ -43,8 +44,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -69,16 +73,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -107,11 +122,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_8-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
steps:
- name: Display EC2 information
@ -165,11 +182,11 @@ jobs:
name: conda-py3_8-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -222,8 +239,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -248,16 +268,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -286,11 +317,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_9-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
@ -344,11 +377,11 @@ jobs:
name: conda-py3_9-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -401,8 +434,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -427,16 +463,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -465,11 +512,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_10-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
@ -523,11 +572,11 @@ jobs:
name: conda-py3_10-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-arm64-binary-wheel
@ -43,8 +44,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -69,16 +73,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -107,11 +122,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_7-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
@ -165,11 +182,11 @@ jobs:
name: wheel-py3_7-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -222,8 +239,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -248,16 +268,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -286,11 +317,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_8-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
steps:
- name: Display EC2 information
@ -344,11 +377,11 @@ jobs:
name: wheel-py3_8-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -401,8 +434,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -427,16 +463,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -465,11 +512,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_9-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
@ -523,11 +572,11 @@ jobs:
name: wheel-py3_9-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -580,8 +629,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -606,16 +658,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -644,11 +707,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_10-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
@ -702,11 +767,11 @@ jobs:
name: wheel-py3_10-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-binary-conda
@ -41,8 +42,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -67,16 +71,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -105,11 +120,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_7-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
@ -163,11 +180,11 @@ jobs:
name: conda-py3_7-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -220,8 +237,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -246,16 +266,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -284,11 +315,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_8-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
steps:
- name: Display EC2 information
@ -342,11 +375,11 @@ jobs:
name: conda-py3_8-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -399,8 +432,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -425,16 +461,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -463,11 +510,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_9-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
@ -521,11 +570,11 @@ jobs:
name: conda-py3_9-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -578,8 +627,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -604,16 +656,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -642,11 +705,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_10-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
@ -700,11 +765,11 @@ jobs:
name: conda-py3_10-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-binary-libtorch-cxx11-abi
@ -42,8 +43,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -72,16 +76,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -110,16 +125,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-shared-with-deps-cxx11-abi-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -172,11 +186,11 @@ jobs:
name: libtorch-cpu-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -230,8 +244,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-without-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -260,16 +277,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -298,16 +326,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-shared-without-deps-cxx11-abi-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-without-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -360,11 +387,11 @@ jobs:
name: libtorch-cpu-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -418,8 +445,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -448,16 +478,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -486,16 +527,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-static-with-deps-cxx11-abi-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -548,11 +588,11 @@ jobs:
name: libtorch-cpu-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -606,8 +646,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-without-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -636,16 +679,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -674,16 +728,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-static-without-deps-cxx11-abi-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-without-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -736,11 +789,11 @@ jobs:
name: libtorch-cpu-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-binary-libtorch-pre-cxx11
@ -42,8 +43,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -72,16 +76,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -110,16 +125,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-shared-with-deps-pre-cxx11-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -172,11 +186,11 @@ jobs:
name: libtorch-cpu-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -230,8 +244,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-without-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -260,16 +277,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -298,16 +326,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-shared-without-deps-pre-cxx11-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-without-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -360,11 +387,11 @@ jobs:
name: libtorch-cpu-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -418,8 +445,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-with-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -448,16 +478,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -486,16 +527,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-static-with-deps-pre-cxx11-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-with-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -548,11 +588,11 @@ jobs:
name: libtorch-cpu-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -606,8 +646,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-without-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -636,16 +679,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -674,16 +728,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-static-without-deps-pre-cxx11-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-without-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -736,11 +789,11 @@ jobs:
name: libtorch-cpu-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-binary-wheel
@ -41,8 +42,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -67,16 +71,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -105,11 +120,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_7-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
@ -163,11 +180,11 @@ jobs:
name: wheel-py3_7-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -220,8 +237,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -246,16 +266,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -284,11 +315,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_8-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
steps:
- name: Display EC2 information
@ -342,11 +375,11 @@ jobs:
name: wheel-py3_8-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -399,8 +432,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -425,16 +461,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -463,11 +510,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_9-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
@ -521,11 +570,11 @@ jobs:
name: wheel-py3_9-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -578,8 +627,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -604,16 +656,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -642,11 +705,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_10-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
@ -700,11 +765,11 @@ jobs:
name: wheel-py3_10-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -157,4 +157,5 @@ python setup.py install --cmake && sccache --show-stats && (
sccache --show-stats > stats.txt
python -m tools.stats.upload_sccache_stats stats.txt
sccache --stop-server
rm stats.txt

View File

@ -0,0 +1,23 @@
#include <ATen/core/TensorBase.h>
// Broadcasting utilities for working with TensorBase
namespace at {
namespace internal {
TORCH_API TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size);
} // namespace internal
inline c10::MaybeOwned<TensorBase> expand_size(const TensorBase &self, IntArrayRef size) {
if (size.equals(self.sizes())) {
return c10::MaybeOwned<TensorBase>::borrowed(self);
}
return c10::MaybeOwned<TensorBase>::owned(
at::internal::expand_slow_path(self, size));
}
c10::MaybeOwned<TensorBase> expand_size(TensorBase &&self, IntArrayRef size) = delete;
inline c10::MaybeOwned<TensorBase> expand_inplace(const TensorBase &tensor, const TensorBase &to_expand) {
return expand_size(to_expand, tensor.sizes());
}
c10::MaybeOwned<TensorBase> expand_inplace(const TensorBase &tensor, TensorBase &&to_expand) = delete;
} // namespace at
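Aside, not from this patch: a minimal usage sketch of the expand_size helper above, assuming a built ATen where the header is reachable as <ATen/ExpandBase.h>.
// Illustration only: expand_size borrows when no expansion is needed and
// otherwise owns the result of expand_slow_path.
#include <ATen/ATen.h>
#include <ATen/ExpandBase.h>
void expand_size_example() {
  at::Tensor a = at::ones({2, 3});
  // Sizes already match: the MaybeOwned borrows `a`, nothing is allocated.
  c10::MaybeOwned<at::TensorBase> same = at::expand_size(a, {2, 3});
  // Sizes differ: falls back to expand_slow_path and owns a new TensorBase.
  c10::MaybeOwned<at::TensorBase> grown = at::expand_size(a, {4, 2, 3});
  TORCH_INTERNAL_ASSERT(grown->sizes() == at::IntArrayRef({4, 2, 3}));
  (void)same;
}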

View File

@ -1,8 +1,15 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/ExpandUtils.h>
#include <ATen/ExpandBase.h>
#include <c10/util/irange.h>
namespace at {
namespace internal {
TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size) {
return OptionalTensorRef(self)->expand(size);
}
}
namespace {
// NOTE: are_expandable did a similar check; please keep them in sync if changes are needed

View File

@ -1,3 +1,3 @@
#pragma once
#include <ATen/core/TensorBody.h>
#include <ATen/core/Tensor.h>

View File

@ -1,10 +1,30 @@
#include <ATen/TensorGeometry.h>
#include <ATen/TensorUtils.h>
#include <ATen/ATen.h>
#include <limits>
#include <cstddef>
namespace at {
// See TensorGeometry.h on why this is useful now that we cache is_contiguous.
bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) {
assert(sizes.size() < static_cast<std::size_t>(std::numeric_limits<std::int64_t>::max()));
auto dim = static_cast<std::int64_t>(sizes.size());
int64_t expected_stride = 1;
bool contig_if_nonempty = true;
for (int64_t i = dim - 1; i >= 0; i--) {
if (sizes[i] == 0) {
return true;
}
if (contig_if_nonempty) {
if (sizes[i] != 1 && strides[i] != expected_stride) {
contig_if_nonempty = false;
}
expected_stride *= sizes[i];
}
}
return contig_if_nonempty;
}
bool TensorGeometry::is_contiguous() const {
if (numel_ == 0) {
return true;

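Aside, not from this patch: a small sketch of querying geometry_is_contiguous for a hypothetical layout without constructing a tensor.
// Illustration only.
#include <ATen/TensorGeometry.h>
#include <c10/util/Exception.h>
void geometry_example() {
  // Row-major 2x3 layout: strides {3, 1} -> contiguous.
  bool row_major = at::geometry_is_contiguous({2, 3}, {3, 1});
  // Transposed view (sizes {2, 3}, strides {1, 2}) -> not contiguous.
  bool transposed = at::geometry_is_contiguous({2, 3}, {1, 2});
  TORCH_INTERNAL_ASSERT(row_major && !transposed);
}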
View File

@ -1,10 +1,17 @@
#pragma once
#include <ATen/WrapDimUtils.h>
#include <ATen/core/Tensor.h>
#include <c10/core/WrapDimMinimal.h>
#include <ATen/core/TensorBase.h>
namespace at {
// Return whether the tensor geometry represented by `sizes` and `strides` is contiguous
// Although we cache is_contiguous in tensor now, this is still useful because it
// allows checking if a particular geometry is contiguous without explicitly
// constructing a tensor, e.g., when you want to choose a kernel strategy based
// on whether a subgeometry is contiguous.
TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides);
struct TORCH_API TensorGeometry {
TensorGeometry() : storage_offset_(0) {}
@ -21,7 +28,7 @@ struct TORCH_API TensorGeometry {
numel_ = expected_stride;
}
explicit TensorGeometry(const Tensor& t)
explicit TensorGeometry(const TensorBase& t)
: sizes_(t.sizes().vec())
, strides_(t.strides().vec())
, storage_offset_(t.storage_offset())
@ -32,12 +39,12 @@ struct TORCH_API TensorGeometry {
int64_t dim() const { return sizes_.size(); }
int64_t size(int64_t dim) const {
dim = maybe_wrap_dim(dim, this->dim());
dim = c10::maybe_wrap_dim(dim, this->dim());
return sizes_.at(static_cast<size_t>(dim));
}
IntArrayRef sizes() const { return IntArrayRef{ sizes_ }; }
int64_t stride(int64_t dim) const {
dim = maybe_wrap_dim(dim, this->dim());
dim = c10::maybe_wrap_dim(dim, this->dim());
return strides_.at(static_cast<size_t>(dim));
}
IntArrayRef strides() const { return IntArrayRef{ strides_ }; }

View File

@ -28,7 +28,8 @@ constexpr auto kFunctorchWrappedTensors = DispatchKeySet({
constexpr auto kTensorSubclassLike = kFunctorchWrappedTensors | DispatchKeySet({
DispatchKey::Batched,
DispatchKey::Sparse,
DispatchKey::SparseCPU,
DispatchKey::SparseCUDA,
DispatchKey::SparseCsrCPU,
DispatchKey::SparseCsrCUDA,
DispatchKey::Meta,

View File

@ -264,25 +264,6 @@ void * maybe_data_ptr(const TensorArg& tensor) {
return tensor->defined() ? (void *)tensor->data_ptr() : nullptr;
}
// See TensorUtils.h on why this is useful now that we cache is_contiguous.
bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) {
int64_t dim = sizes.size();
int64_t expected_stride = 1;
bool contig_if_nonempty = true;
for (int64_t i = dim - 1; i >= 0; i--) {
if (sizes[i] == 0) {
return true;
}
if (contig_if_nonempty) {
if (sizes[i] != 1 && strides[i] != expected_stride) {
contig_if_nonempty = false;
}
expected_stride *= sizes[i];
}
}
return contig_if_nonempty;
}
void check_dim_size(
const Tensor& tensor,
int64_t dim,

View File

@ -138,13 +138,6 @@ TORCH_API void checkLayout(CheckedFrom c, at::ArrayRef<Tensor> tensors, at::Layo
TORCH_API void* maybe_data_ptr(const Tensor& tensor);
TORCH_API void* maybe_data_ptr(const TensorArg& tensor);
// Return if the tensor geometry represented by `sizes` and `strides` is contiguous
// Although we cache is_contiguous in tensor now, this is till useful because it
// allows checking if a particular geometry is contiguous without explicitly
// constructing a tensor, e.g., when you want to choose a kernel strategy based
// on whether a subgeometry is contiguous.
TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides);
TORCH_API void check_dim_size(
const Tensor& tensor,
int64_t dim,

View File

@ -91,29 +91,6 @@ std::array<int64_t, N> check_intlist(ArrayRef<int64_t> list, const char * name,
return res;
}
/**
* Utility function to static cast input Generator* to
* the backend generator type (CPU/CUDAGeneratorImpl etc.)
*/
template <typename T>
static inline T * check_generator(c10::optional<Generator> gen) {
TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt");
TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed");
TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'");
return gen->get<T>();
}
/**
* Utility function used in tensor implementations, which
* supplies the default generator to tensors, if an input generator
* is not supplied. The input Generator* is also static casted to
* the backend generator type (CPU/CUDAGeneratorImpl etc.)
*/
template <typename T>
static inline T* get_generator_or_default(const c10::optional<Generator>& gen, const Generator& default_gen) {
return gen.has_value() && gen->defined() ? check_generator<T>(gen) : check_generator<T>(default_gen);
}
using at::detail::check_size_nonnegative;
namespace detail {

View File

@ -138,6 +138,29 @@ Generator make_generator(Args&&... args) {
return Generator(c10::make_intrusive<Impl>(std::forward<Args>(args)...));
}
/**
* Utility function to static cast input Generator* to
* the backend generator type (CPU/CUDAGeneratorImpl etc.)
*/
template <typename T>
static inline T * check_generator(c10::optional<Generator> gen) {
TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt");
TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed");
TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'");
return gen->get<T>();
}
/**
* Utility function used in tensor implementations, which
* supplies the default generator to tensors, if an input generator
* is not supplied. The input Generator* is also static casted to
* the backend generator type (CPU/CUDAGeneratorImpl etc.)
*/
template <typename T>
static inline T* get_generator_or_default(const c10::optional<Generator>& gen, const Generator& default_gen) {
return gen.has_value() && gen->defined() ? check_generator<T>(gen) : check_generator<T>(default_gen);
}
namespace detail {
/**

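Aside, not from this patch: a sketch of how a kernel typically consumes these helpers, assuming the usual ATen CPU generator API (CPUGeneratorImpl, at::detail::getDefaultCPUGenerator).
// Illustration only: resolve an optional Generator argument inside a CPU kernel.
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/core/Generator.h>
#include <mutex>
uint64_t draw_one(c10::optional<at::Generator> gen) {
  auto* cpu_gen = at::get_generator_or_default<at::CPUGeneratorImpl>(
      gen, at::detail::getDefaultCPUGenerator());
  // Generator state is shared, so hold its mutex while sampling.
  std::lock_guard<std::mutex> lock(cpu_gen->mutex_);
  return cpu_gen->random();
}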
View File

@ -4,7 +4,14 @@
namespace {
// TLS saving the state of the include/exclude sets on entry to the dispatcher
// This is set in the pythonTLSSnapshot fallback and used by the Python fallback.
thread_local c10::optional<c10::impl::LocalDispatchKeySet> tls_on_entry;
void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
TORCH_INTERNAL_ASSERT(tls_on_entry.has_value());
c10::impl::ForceDispatchKeyGuard guard(tls_on_entry.value());
// If Python Mode is active, use its PyInterpreter for dispatch
const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state();
if (maybe_python_mode_state) {
@ -42,8 +49,25 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)");
}
void pythonTLSSnapshotFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
// It is ok for the tls to already be set here.
// A CompositeImplicitAutograd function may have been called just before this, so the tls here was never cleared.
// This is also why we don't need an RAII guard to ensure the tls is reset when exceptions happen.
tls_on_entry = c10::impl::tls_local_dispatch_key_set();
op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::PythonTLSSnapshot), stack);
tls_on_entry = c10::nullopt;
}
} // anonymous namespace
TORCH_LIBRARY_IMPL(_, Python, m) {
m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>());
}
TORCH_LIBRARY_IMPL(_, PythonTLSSnapshot, m) {
m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonTLSSnapshotFallback>());
}

View File

@ -8,6 +8,7 @@ void PythonModeTLS::set_state(const std::shared_ptr<TorchDispatchTypeObject>& st
pythonModeState = state;
if (state) {
c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true);
c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, true);
} else {
PythonModeTLS::reset_state();
}
@ -20,6 +21,7 @@ const std::shared_ptr<TorchDispatchTypeObject>& PythonModeTLS::get_state() {
void PythonModeTLS::reset_state() {
pythonModeState.reset((TorchDispatchTypeObject*)nullptr);
c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false);
c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, false);
}
} // namespace impl

View File

@ -4,6 +4,15 @@
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/FunctionalTensorWrapper.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/MethodOperators.h>
#else
#include <ATen/ops/contiguous_ops.h>
#include <ATen/ops/fill_ops.h>
#include <ATen/ops/to_ops.h>
#include <ATen/ops/zero_ops.h>
#endif
#include <iostream>
namespace at {
@ -29,6 +38,18 @@ const TensorBase& TensorBase::zero_() const {
return *this;
}
TensorBase TensorBase::to(
at::TensorOptions options,
bool non_blocking,
bool copy,
c10::optional<at::MemoryFormat> memory_format) const {
Tensor self(*this);
return at::_ops::to_dtype_layout::call(
self, optTypeMetaToScalarType(options.dtype_opt()),
options.layout_opt(), options.device_opt(),
options.pinned_memory_opt(), non_blocking, copy, memory_format);
}
void TensorBase::enforce_invariants() {
if (impl_.get() == nullptr) {
throw std::runtime_error("TensorImpl with nullptr is not supported");

View File

@ -1,6 +1,7 @@
#pragma once
#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>

View File

@ -141,6 +141,8 @@ class TORCH_API TensorBase {
const TensorBase& fill_(const c10::Scalar& scalar) const;
const TensorBase& zero_() const;
TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) const;
bool is_complex() const {
return at::isComplexType(this->scalar_type());
}

View File

@ -6,52 +6,11 @@
namespace c10 {
void DispatchKeyExtractor::setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough) {
// (1) update nonFallthroughKeys_
if (has_fallthrough) {
nonFallthroughKeys_ = nonFallthroughKeys_.remove(k);
} else {
nonFallthroughKeys_ = nonFallthroughKeys_.add(k);
}
// (2) update nonFallthroughKeysPerBackend_
if (isPerBackendFunctionalityKey(toFunctionalityKey(k))) {
// This is a per-backend functionality key.
// We need to figure out what the current backend is,
// and only update the bitset for that backend.
// subtracting 1 because the first backend should have index 0 (CPU),
// But the enum starts with BackendComponent::InvalidBit.
auto backend_idx = static_cast<uint8_t>(toBackendComponent(k)) - 1;
TORCH_INTERNAL_ASSERT(backend_idx >= 0 && static_cast<uint8_t>(backend_idx) < nonFallthroughKeysPerBackend_.size());
if (has_fallthrough) {
nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].remove(k);
} else {
nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].add(k);
}
// Set requiresBitsetPerBackend_ accordingly
for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size() - 1)) {
if (nonFallthroughKeysPerBackend_[i] != nonFallthroughKeysPerBackend_[i+1]) {
requiresBitsetPerBackend_ = true;
return;
}
}
requiresBitsetPerBackend_ = false;
return;
} else {
// Otherwise, if a fallthrough is set for a functionality that isn't per backend,
// Then we update the fallthrough bitset for EVERY backend.
// TODO: we could probably optimize this by only lazily updating these values
// the first time that we see requiresBitsetPerBackend_ = true
// (which should almost never happen)
if (has_fallthrough) {
for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) {
nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].remove(k);
}
} else {
for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) {
nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].add(k);
}
}
}
}
std::string DispatchKeyExtractor::dumpState() const {

View File

@ -156,25 +156,15 @@ public:
}
});
// Keys that are fallthrough should be skipped
if (requiresBitsetPerBackend_) {
auto backend_idx = ks.getBackendIndex();
return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]);
} else {
return impl::computeDispatchKeySet(ks, nonFallthroughKeys_);
}
}
template<class... Args>
DispatchKeySet getDispatchKeySetUnboxed(const Args&... args) const {
auto ks = detail::multi_dispatch_key_set(args...);
// Keys that are fallthrough should be skipped
if (requiresBitsetPerBackend_) {
auto backend_idx = ks.getBackendIndex();
return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]);
} else {
return impl::computeDispatchKeySet(ks, nonFallthroughKeys_);
}
}
void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough);
@ -203,12 +193,7 @@ private:
explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse)
: dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse)
, nonFallthroughKeys_(DispatchKeySet::FULL)
, requiresBitsetPerBackend_(false) {
for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) {
nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL;
}
}
, nonFallthroughKeys_(DispatchKeySet::FULL) {}
// this is a bitset that has ones for each argument index which has to be
// considered for dispatch. This avoids having to iterate over the stack
@ -220,14 +205,8 @@ private:
// fallthrough
c10::utils::bitset dispatch_arg_indices_reverse_;
// Set of functionality keys for which the operator does NOT have fallthrough kernel.
// Set of keys for which the operator does NOT have fallthrough kernel.
DispatchKeySet nonFallthroughKeys_;
// Set of functionality keys for which the operator does NOT have fallthrough kernel, defined PER BACKEND.
// This is only needed if we know that the operator has a different set of fallthroughs defined for some backends.
std::array<DispatchKeySet, num_backends> nonFallthroughKeysPerBackend_;
// Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast path),
// or if we need to fall back to the slower path and check nonFallthroughKeysPerBackend_
bool requiresBitsetPerBackend_;
};
}

View File

@ -267,15 +267,14 @@ void Dispatcher::cleanup(const OperatorHandle& op, const OperatorName& op_name)
RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, KernelFunction kernel, std::string debug) {
std::lock_guard<std::mutex> lock(mutex_);
auto idx = getDispatchTableIndexForDispatchKey(dispatchKey);
TORCH_CHECK(
!backendFallbackKernels_[idx].kernel.isValid(),
!backendFallbackKernels_[static_cast<uint8_t>(dispatchKey)].kernel.isValid(),
"Tried to register multiple backend fallbacks for the same dispatch key ", dispatchKey, "; previous registration ",
backendFallbackKernels_[idx].debug, ", new registration ", debug
backendFallbackKernels_[static_cast<uint8_t>(dispatchKey)].debug, ", new registration ", debug
);
// NB: inferred function schema is always nullptr for fallbacks, as fallbacks
// cannot be unboxed
backendFallbackKernels_[idx] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug));
backendFallbackKernels_[static_cast<uint8_t>(dispatchKey)] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug));
for (auto& op : operators_) {
op.op.updateFallback(*this, dispatchKey);
@ -289,8 +288,7 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker
void Dispatcher::deregisterFallback_(DispatchKey dispatchKey) {
std::lock_guard<std::mutex> lock(mutex_);
auto idx = getDispatchTableIndexForDispatchKey(dispatchKey);
backendFallbackKernels_[idx] = {};
backendFallbackKernels_[static_cast<uint8_t>(dispatchKey)] = {};
for (auto& op : operators_) {
op.op.updateFallback(*this, dispatchKey);

View File

@ -291,7 +291,7 @@ private:
// Map from namespace to debug string (saying, e.g., where the library was defined)
ska::flat_hash_map<std::string, std::string> libraries_;
std::array<impl::AnnotatedKernel, num_runtime_entries> backendFallbackKernels_;
std::array<impl::AnnotatedKernel, static_cast<uint8_t>(DispatchKey::NumDispatchKeys)> backendFallbackKernels_;
std::unique_ptr<detail::RegistrationListenerList> listeners_;
std::mutex mutex_;
@ -531,7 +531,8 @@ C10_DISPATCHER_INLINE_UNLESS_MOBILE Return Dispatcher::call(const TypedOperatorH
detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5
auto dispatchKeySet = op.operatorDef_->op.dispatchKeyExtractor()
.template getDispatchKeySetUnboxed<Args...>(args...);
const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKeySet.highestPriorityTypeId()));
const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet.highestPriorityTypeId());
#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
// By default, when there're no high-frequency or non-sampled callbacks,
// RecordFunction is pre-sampled as a perf optimization;
@ -552,7 +553,7 @@ template<class Return, class... Args>
inline Return Dispatcher::redispatch(const TypedOperatorHandle<Return (Args...)>& op, DispatchKeySet currentDispatchKeySet, Args... args) const {
detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5
// do not use RecordFunction on redispatch
const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet);
const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet.highestPriorityTypeId());
return kernel.template call<Return, Args...>(op, currentDispatchKeySet, std::forward<Args>(args)...);
}
@ -560,7 +561,7 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const
// note: this doesn't need the mutex because write operations on the list keep iterators intact.
const auto& entry = op.operatorDef_->op;
auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack);
const auto& kernel = entry.lookup(dispatchKeySet);
const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId());
#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
bool pre_sampled = false;
if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
@ -592,7 +593,7 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const
inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const {
// note: this doesn't need the mutex because write operations on the list keep iterators intact.
const auto& entry = op.operatorDef_->op;
const auto& kernel = entry.lookup(dispatchKeySet);
const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId());
return kernel.callBoxed(op, dispatchKeySet, stack);
}

View File

@ -283,7 +283,7 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
}
// 3. Backend fallback
auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key);
auto dispatch_ix = static_cast<uint8_t>(dispatch_key);
if (dispatcher.backendFallbackKernels_[dispatch_ix].kernel.isValid()) {
return {dispatcher.backendFallbackKernels_[dispatch_ix], "backend fallback"};
}
@ -299,7 +299,10 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
// or alias keys and their associated keysets).
// This function should be considered a private helper for updateDispatchTable_()
void OperatorEntry::updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) {
const auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key);
const auto dispatch_ix = c10::getDispatchTableIndexForDispatchKey(dispatch_key);
if (C10_UNLIKELY(dispatch_ix == -1)) {
return;
}
dispatchTable_[dispatch_ix] = computeDispatchTableEntry(dispatcher, dispatch_key);
dispatchKeyExtractor_.setOperatorHasFallthroughForKey(dispatch_key, dispatchTable_[dispatch_ix].isFallthrough());
}
@ -326,12 +329,8 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp
}
// Note [Refresh Runtime Autograd entries in dispatchTable_]
// Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3).
// In theory, we should only have to check if the given runtime key has "dense" functionality,
// e.g. DispatchKey::CPU (which is composed of DispatchKey::Dense and BackendComponent::CPUBit).
// However, there are some backends that should be included in this set that don't have the dense key set.
// E.g. DispatchKey::Meta, DispatchKey::ORT.
if (c10::isBackendDispatchKey(dispatch_key)) {
DispatchKey autograd_key = getAutogradKeyFromBackend(toBackendComponent(dispatch_key));
DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key);
updateDispatchTableEntry_(dispatcher, autograd_key);
}
}
@ -358,9 +357,8 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher)
// catchAll. After catchAllKernel_ is removed, Undefined can now get a kernel from either the CompositeExplicitAutograd
// or the CompositeImplicitAutograd alias key so that we don't break support. Ideally isIncludedInAlias(Undefined, CompositeImplicitAutograd)
// should return true, but it returns false because Undefined cannot be represented in a DispatchKeySet.
updateDispatchTable_(dispatcher, DispatchKey::Undefined);
for (auto k : DispatchKeySet(DispatchKeySet::FULL)) {
updateDispatchTable_(dispatcher, k);
for (uint8_t iter = 0; iter != static_cast<uint8_t>(DispatchKey::NumDispatchKeys); ++iter) {
updateDispatchTable_(dispatcher, static_cast<DispatchKey>(iter));
}
}
@ -373,10 +371,9 @@ void OperatorEntry::checkInvariants() const {
for (const auto& kv : kernels_) {
TORCH_INTERNAL_ASSERT(kv.second.size() > 0, dumpState());
}
for (auto k : DispatchKeySet(DispatchKeySet::FULL)) {
auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), k);
auto idx = getDispatchTableIndexForDispatchKey(k);
TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[idx]),
for (uint8_t iter = 0; iter != static_cast<uint8_t>(DispatchKey::NumDispatchKeys); ++iter) {
auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), static_cast<DispatchKey>(iter));
TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[iter]),
"Canonical state\n~~~~~~~~~~~\n", dumpState(), "\n\n"
"Computed table:\n~~~~~~~~~~~\n", dumpComputedTable());
}
@ -387,8 +384,7 @@ std::string OperatorEntry::listAllDispatchKeys() const {
str << "[";
bool has_kernels = false;
for (auto k : DispatchKeySet(DispatchKeySet::FULL)) {
auto iter = getDispatchTableIndexForDispatchKey(k);
for (uint8_t iter = 0; iter != static_cast<uint8_t>(DispatchKey::NumDispatchKeys); ++iter) {
if (!dispatchTable_[iter].isValid()) {
continue;
}
@ -447,12 +443,8 @@ void OperatorEntry::reportError(DispatchKey dispatchKey) const {
// updateDispatchTableFull_ would update the dispatch table to be)
std::string OperatorEntry::dumpComputedTable() const {
std::ostringstream oss;
// Need to handle Undefined separately, because its a runtime key that can't be represented
// in a DispatchKeySet.
std::vector<DispatchKey> runtime_keys = {DispatchKey::Undefined};
for (auto k : DispatchKeySet(DispatchKeySet::FULL)) runtime_keys.push_back(k);
for (auto k : runtime_keys) {
for (uint8_t i = 0; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys); i++) {
auto k = static_cast<DispatchKey>(i);
auto kernel_prov = computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k);
if (kernel_prov.first.kernel.isValid()) {
oss << toString(k) << ": "

View File

@ -173,8 +173,11 @@ public:
[[noreturn]] void reportError(DispatchKey dispatchKey) const;
const KernelFunction& lookup(DispatchKeySet ks) const {
const auto idx = ks.getDispatchTableIndexForDispatchKeySet();
const KernelFunction& lookup(DispatchKey k) const {
const auto idx = getDispatchTableIndexForDispatchKey(k);
if (C10_UNLIKELY(idx == -1)) {
reportError(k);
}
const auto& kernel = dispatchTable_[idx];
// A valid kernel *always* has a boxed kernel and *may* have an
// unboxed kernel. However, we typically do unboxed calls in at::
@ -184,7 +187,7 @@ public:
// in the common case.
if (C10_UNLIKELY(!kernel.isValidUnboxed())) {
if (!kernel.isValid()) {
reportError(ks.highestPriorityTypeId());
reportError(k);
}
}
return kernel;
@ -208,7 +211,7 @@ private:
OperatorName name_;
c10::optional<AnnotatedSchema> schema_;
std::array<KernelFunction, c10::num_runtime_entries> dispatchTable_;
std::array<KernelFunction, c10::getDispatchTableIndexForDispatchKey(DispatchKey::NumDispatchKeys)> dispatchTable_;
DispatchKeyExtractor dispatchKeyExtractor_;
// kernels_ stores all registered kernels for the corresponding dispatch key

View File

@ -45,6 +45,10 @@ namespace c10 {
_(prim, CudaFusionGuard) \
_(prim, FunctionalGraph) \
_(prim, add_optional) \
_(prim, view_copy) \
_(prim, reshape_copy) \
_(prim, squeeze_copy) \
_(prim, unsqueeze_copy) \
_(prim, DifferentiableGraph) \
_(prim, TensorExprGroup) \
_(prim, TensorExprDynamicGroup) \

View File

@ -591,7 +591,7 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) {
void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) {
auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.kernel<decltype(nonautograd_kernel), &nonautograd_kernel>(c10::getAutogradKeyFromBackend(toBackendComponent(key)))
.kernel<decltype(nonautograd_kernel), &nonautograd_kernel>(c10::getAutogradKeyFromBackend(key))
.kernel<decltype(autograd_kernel), &autograd_kernel>(DispatchKey::Autograd));
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
@ -1791,22 +1791,22 @@ TEST(NewOperatorRegistrationTest, dispatchAutogradPrecedence) {
TEST(NewOperatorRegistrationTest, throwsWhenRegisterToBackendMapsToAutogradOther) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
bool fpga_called, math_called = false;
bool sparsecpu_called, math_called = false;
auto m = MAKE_TORCH_LIBRARY(test);
m.def("fn", torch::dispatch(c10::DispatchKey::FPGA, [&](const Tensor& x) { fpga_called = true; return x; }));
m.def("fn", torch::dispatch(c10::DispatchKey::SparseCPU, [&](const Tensor& x) { sparsecpu_called = true; return x; }));
m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; });
auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
ASSERT_TRUE(op.has_value());
{
callOp(*op, dummyTensor(c10::DispatchKey::FPGA));
ASSERT_TRUE(fpga_called);
callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU));
ASSERT_TRUE(sparsecpu_called);
}
{
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(c10::DispatchKey::FPGA, /*requires_grad=*/true));
callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU, /*requires_grad=*/true));
}, "test::fn has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther.");
}
}
@ -1849,15 +1849,18 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) {
}
{
// TODO(#43908): currently this will fall through AutogradPrivateUse1 and then call the catchall kernel
// at AutogradCPU, while backend extenders actually expect the PrivateUse1 kernel to be called.
// This confusing behavior is caused by us registering fallthrough as the backend fallback for
// Autograd keys. Note that users can always work around this by registering the same kernel to
// AutogradPrivateUse1, as shown below, until we support it.
auto op = Dispatcher::singleton().findOp({"test::fn", ""});
ASSERT_TRUE(op.has_value());
catchall_called = false;
privateuse1_called = false;
callOp(*op,
dummyTensor(c10::DispatchKey::PrivateUse1, /*requires_grad=*/true),
dummyTensor(c10::DispatchKey::CPU, /*requires_grad=*/true));
ASSERT_FALSE(catchall_called);
ASSERT_TRUE(privateuse1_called);
ASSERT_TRUE(catchall_called);
}
m.impl("fn", c10::DispatchKey::AutogradPrivateUse1, [&](const Tensor& x, const Tensor& y) { privateuse1_called = true; return x; });
@ -1873,27 +1876,6 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) {
}
}
TEST(NewOperatorRegistrationTest, registerCompositeImplicitAutogradWithCPUKernel_andCallAutogradOtherKernel_callsComposite) {
bool math_called = false;
bool cpu_called = false;
auto m = MAKE_TORCH_LIBRARY(test);
m.def("fn(Tensor dummy) -> Tensor");
m.impl("fn", c10::DispatchKey::CPU, [&](const Tensor& x) { cpu_called = true; return x; });
m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; });
auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
ASSERT_TRUE(op.has_value());
{
math_called = cpu_called = false;
// Meta should redispatch to the AutogradOther backend,
// which the composite kernel should be registered to.
callOp(*op, dummyTensor(c10::DispatchKey::Meta, /*requires_grad=*/true));
ASSERT_TRUE(math_called);
ASSERT_FALSE(cpu_called);
}
}
TEST(NewOperatorRegistrationTest, dispatchMultiple) {
bool cpu_called = false;
bool cuda_called = false;

View File

@ -2,7 +2,7 @@
#include <ATen/cuda/ApplyGridUtils.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/TensorUtils.h>
#include <ATen/core/TensorBase.h>
#include <ATen/ceil_div.h>
#include <ATen/cuda/Atomic.cuh>
#include <ATen/cuda/CUDAContext.h>
@ -378,12 +378,14 @@ kernelPointwiseApply2(detail::TensorInfo<scalar1, IndexType> a,
template <typename scalar1, typename scalar2, int step, typename Op,
int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
inline bool CUDA_tensor_apply2(at::Tensor a,
at::Tensor b,
inline bool CUDA_tensor_apply2(at::TensorBase a,
at::TensorBase b,
const Op op,
TensorArgType aType = TensorArgType::ReadWrite,
TensorArgType bType = TensorArgType::ReadOnly) {
checkDeviceType("CUDA_tensor_apply2", {a, b}, DeviceType::CUDA);
TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(),
"CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got "
"tensors with type ", a.device().type(), " and ", b.device().type());
int64_t totalElements = a.numel();
if (totalElements != b.numel()) {
@ -413,8 +415,8 @@ inline bool CUDA_tensor_apply2(at::Tensor a,
This ensures that each element of the tensor is operated on once and only
once.
*/
Tensor oldA;
Tensor oldB;
TensorBase oldA;
TensorBase oldB;
if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) {
// Must perform in contiguous space
@ -524,8 +526,8 @@ inline bool CUDA_tensor_apply2(at::Tensor a,
template <typename scalar1, typename scalar2, typename Op,
int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
inline bool CUDA_tensor_apply2(at::Tensor a,
at::Tensor b,
inline bool CUDA_tensor_apply2(const at::TensorBase &a,
const at::TensorBase &b,
const Op op,
TensorArgType aType = TensorArgType::ReadWrite,
TensorArgType bType = TensorArgType::ReadOnly) {

View File

@ -1,9 +1,7 @@
#pragma once
#include <c10/core/GeneratorImpl.h>
#include <ATen/core/Generator.h>
#include <ATen/cuda/detail/PhiloxCudaStateRaw.cuh>
#include <ATen/Tensor.h>
#include <ATen/Context.h>
#include <limits>

View File

@ -258,7 +258,7 @@ Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) {
return self;
}
void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src) {
void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src) {
// Called when we are copying into an overlapping index `dst`, but we don't
// care which writer wins. Hacky but it works. This is only used by
// CUDA_tensor_apply2 in case there are write overlaps.

View File

@ -6,6 +6,7 @@ namespace at {
class Tensor;
struct TensorIterator;
class TensorBase;
namespace native {
@ -13,7 +14,7 @@ using copy_fn = void (*)(TensorIterator&, bool non_blocking);
DECLARE_DISPATCH(copy_fn, copy_stub);
TORCH_API void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src);
TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src);
} // namespace native
} // namespace at

View File

@ -1,7 +1,5 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/ExpandUtils.h>
#include <ATen/native/Math.h>
#include <c10/macros/Macros.h>
#include <c10/util/MathConstants.h>

View File

@ -864,8 +864,13 @@ Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid,
}
}
return grid_sampler_2d_cpu_kernel(
kCPU, input, grid, interpolation_mode, padding_mode, align_corners);
auto in_size = input.sizes();
auto grid_size = grid.sizes();
auto output = at::empty(
{in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options());
grid_sampler_2d_cpu_kernel(
kCPU, output, input, grid, interpolation_mode, padding_mode, align_corners);
return output;
}
DEFINE_DISPATCH(grid_sampler_2d_cpu_kernel);
@ -911,8 +916,15 @@ grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, con
}
}
return grid_sampler_2d_backward_cpu_kernel(
kCPU, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask);
Tensor grad_input;
if (output_mask[0]) {
grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
grid_sampler_2d_backward_cpu_kernel(
kCPU, grad_input, grad_grid, grad_output, input, grid,
interpolation_mode, padding_mode, align_corners, output_mask);
return std::make_tuple(std::move(grad_input), std::move(grad_grid));
}
DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel);

View File

@ -1,7 +1,9 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
namespace at { namespace native {

View File

@ -1627,8 +1627,7 @@ Tensor matmul(
Tensor t2 = dim_tensor2 == 1 ? tensor2.unsqueeze(-1) : tensor2;
auto size1 = tensor1.sizes();
auto size2 = t2.sizes();
std::vector<int64_t> output_size;
output_size.insert(output_size.end(), size1.begin(), size1.end() - 1);
DimVector output_size(size1.begin(), size1.end() - 1);
if (dim_tensor2 > 1) {
output_size.push_back(size2[dim_tensor2 - 1]);
}
@ -1660,7 +1659,8 @@ Tensor matmul(
return has_out ? out.set_(res) : res;
}
else {
std::vector<int64_t> shape = tensor2.sizes().slice(0, dim_tensor2 - 2).vec();
c10::IntArrayRef shape_array = tensor2.sizes().slice(0, dim_tensor2 - 2);
DimVector shape(shape_array.begin(), shape_array.end());
shape.push_back(p);
Tensor res = res_T.reshape(shape).contiguous();
@ -1677,29 +1677,29 @@ Tensor matmul(
IntArrayRef batch_tensor2(tensor2.sizes().data(), std::max<int64_t>(dim_tensor2 - 2, 0));
// expand the batch portion (i.e. cut off matrix dimensions and expand rest)
std::vector<int64_t> expand_batch_portion = infer_size(batch_tensor1, batch_tensor2);
DimVector expand_batch_portion = infer_size_dimvector(batch_tensor1, batch_tensor2);
std::vector<int64_t> tensor1_expand_size(expand_batch_portion);
tensor1_expand_size.insert(tensor1_expand_size.end(), {n, m1});
DimVector tensor1_expand_size(expand_batch_portion);
tensor1_expand_size.push_back(n);
tensor1_expand_size.push_back(m1);
std::vector<int64_t> tensor2_expand_size(expand_batch_portion);
tensor2_expand_size.insert(tensor2_expand_size.end(), {m2, p});
DimVector tensor2_expand_size(expand_batch_portion);
tensor2_expand_size.push_back(m2);
tensor2_expand_size.push_back(p);
const int64_t expand_batch_product =
c10::multiply_integers(expand_batch_portion);
std::vector<int64_t> tensor1_bmm_view({expand_batch_product});
tensor1_bmm_view.insert(tensor1_bmm_view.end(), {n, m1});
std::array<int64_t, 3> tensor1_bmm_view = {expand_batch_product, n, m1};
std::vector<int64_t> tensor2_bmm_view({expand_batch_product});
tensor2_bmm_view.insert(tensor2_bmm_view.end(), {m2, p});
std::array<int64_t, 3> tensor2_bmm_view = {expand_batch_product, m2, p};
// flatten expanded batches
Tensor tensor1_expanded = tensor1.expand(tensor1_expand_size).reshape(tensor1_bmm_view);
Tensor tensor2_expanded = tensor2.expand(tensor2_expand_size).reshape(tensor2_bmm_view);
// reshape batches back into result
std::vector<int64_t> output_shape(expand_batch_portion);
DimVector output_shape(expand_batch_portion);
if (dim_tensor1 > 1) {
output_shape.push_back(n);
}
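Annotation, not part of the commit: the matmul hunks above swap std::vector<int64_t> scratch shapes for DimVector and replace the two-step insert() calls with direct construction or push_back. DimVector is a small-buffer vector of int64_t (a c10::SmallVector), so shapes of typical rank never touch the heap. A minimal sketch of the pattern with an illustrative function name:

// Sketch: build an output shape with DimVector instead of std::vector.
#include <ATen/ATen.h>
#include <ATen/DimVector.h>

at::Tensor shape_example(const at::Tensor& t1, const at::Tensor& t2) {
  auto size1 = t1.sizes();
  // Keep every dim of t1 except the last, then append the last dim of t2,
  // mirroring the output_size bookkeeping in matmul above.
  at::DimVector output_size(size1.begin(), size1.end() - 1);
  output_size.push_back(t2.size(-1));
  return at::empty(output_size, t1.options());
}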

View File

@ -45,6 +45,19 @@ namespace native {
DEFINE_DISPATCH(sort_stub);
DEFINE_DISPATCH(topk_stub);
void _fill_indices(const TensorBase &indices, int64_t dim) {
auto ndim = indices.dim();
assert(0 <= dim && dim < ndim);
auto dim_size = indices.size(dim);
auto idx_dim = at::arange(0, dim_size, indices.options().dtype(at::kLong));
auto idx_dim_sizes = std::vector<int64_t>(ndim, 1);
auto idx_dim_strides = std::vector<int64_t>(ndim, 0);
idx_dim_sizes[dim] = dim_size;
idx_dim_strides[dim] = 1;
auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides);
OptionalTensorRef(indices)->copy_(idx_dim_restrided);
}
namespace {
/* Note from TH:

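Annotation, not part of the commit: _fill_indices above fills the indices tensor with 0..dim_size-1 along dim without materializing a full-sized index tensor; it restrides a 1-D arange so every dimension other than dim has stride 0, and the copy broadcasts it. A standalone sketch of the same trick (the 3x4 shape is only an example):

// Sketch: fill a [3, 4] long tensor with 0..3 along dim = 1 via as_strided.
#include <ATen/ATen.h>
#include <vector>

at::Tensor fill_indices_example() {
  const int64_t dim = 1;
  auto indices = at::empty({3, 4}, at::kLong);
  auto dim_size = indices.size(dim);
  auto idx_dim = at::arange(0, dim_size, indices.options());
  // Zero strides replicate the 1-D arange across every other dimension.
  std::vector<int64_t> sizes(indices.dim(), 1), strides(indices.dim(), 0);
  sizes[dim] = dim_size;
  strides[dim] = 1;
  indices.copy_(idx_dim.as_strided(sizes, strides));
  return indices;  // every row reads [0, 1, 2, 3]
}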
View File

@ -1,8 +1,11 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/native/DispatchStub.h>
namespace at {
class TensorBase;
}
namespace at {
namespace native {
@ -14,11 +17,13 @@ enum class QUANTILE_INTERPOLATION_MODE : uint8_t {
NEAREST
};
using sort_fn = void(*)(Tensor& values, Tensor& indices, int64_t dim, bool descending, bool stable);
using topk_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, int64_t, int64_t, bool, bool);
using sort_fn = void(*)(const TensorBase &values, const TensorBase &indices, int64_t dim, bool descending, bool stable);
using topk_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, int64_t, bool, bool);
DECLARE_DISPATCH(sort_fn, sort_stub);
DECLARE_DISPATCH(topk_fn, topk_stub);
void _fill_indices(const TensorBase &indices, int64_t dim);
} // namespace native
} // namespace at

View File

@ -86,92 +86,5 @@ inline void _allocate_or_resize_output_with_indices(
}
}
#ifdef CPU_CAPABILITY
inline namespace CPU_CAPABILITY {
#else
inline namespace DEFAULT {
#endif
// Core topk loop, shared between CPU and QuantizedCPU
template <typename scalar_t, typename accscalar_t>
void topk_impl_loop(
const int64_t mode_values_stride,
const int64_t mode_indices_stride,
const int64_t tmp_values_stride,
const int64_t k,
const int64_t dim_size,
const bool largest,
const bool sorted,
char** data, const int64_t* strides, const int64_t n) {
using elem_t = std::pair<accscalar_t, int64_t>;
std::vector<elem_t> queue(dim_size);
for (const auto i : c10::irange(n)) {
TensorAccessor<scalar_t, 1> mode_values(
reinterpret_cast<scalar_t*>(data[0] + i * strides[0]),
&k, &mode_values_stride);
TensorAccessor<int64_t, 1> mode_indices(
reinterpret_cast<int64_t*>(data[1] + i * strides[1]),
&k, &mode_indices_stride);
TensorAccessor<scalar_t, 1> tmp_values(
reinterpret_cast<scalar_t*>(data[2] + i * strides[2]),
&dim_size, &tmp_values_stride);
auto n = dim_size;
auto use_partial_sort = k * 64 <= n;
for (const auto j : c10::irange(n)) {
queue[j].first = tmp_values[j];
queue[j].second = j;
}
// we want nan to be sorted as top for numpy compatibility
if (use_partial_sort) {
if (largest) {
std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
} else {
std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
}
} else {
if (largest) {
std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
if (sorted) {
std::sort(queue.begin(), queue.begin() + k - 1,
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
}
} else {
std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
if (sorted) {
std::sort(queue.begin(), queue.begin() + k -1,
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
}
}
}
for (const auto j : c10::irange(k)) {
mode_values[j] = queue[j].first;
mode_indices[j] = queue[j].second;
}
}
}
} // namespace CPU_CAPABILITY
} // namespace native
} // namespace at

View File

@ -4,6 +4,7 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/NativeFunctions.h>
#include <ATen/WrapDimUtils.h>
#include <c10/util/irange.h>
#include <algorithm>

View File

@ -0,0 +1,95 @@
#pragma once
#include <ATen/core/TensorAccessor.h>
#include <ATen/NumericUtils.h>
namespace at {
namespace native {
#ifdef CPU_CAPABILITY
inline namespace CPU_CAPABILITY {
#else
inline namespace DEFAULT {
#endif
// Core topk loop, shared between CPU and QuantizedCPU
template <typename scalar_t, typename accscalar_t>
void topk_impl_loop(
const int64_t mode_values_stride,
const int64_t mode_indices_stride,
const int64_t tmp_values_stride,
const int64_t k,
const int64_t dim_size,
const bool largest,
const bool sorted,
char** data, const int64_t* strides, const int64_t n) {
using elem_t = std::pair<accscalar_t, int64_t>;
std::vector<elem_t> queue(dim_size);
for (const auto i : c10::irange(n)) {
TensorAccessor<scalar_t, 1> mode_values(
reinterpret_cast<scalar_t*>(data[0] + i * strides[0]),
&k, &mode_values_stride);
TensorAccessor<int64_t, 1> mode_indices(
reinterpret_cast<int64_t*>(data[1] + i * strides[1]),
&k, &mode_indices_stride);
TensorAccessor<scalar_t, 1> tmp_values(
reinterpret_cast<scalar_t*>(data[2] + i * strides[2]),
&dim_size, &tmp_values_stride);
auto n = dim_size;
auto use_partial_sort = k * 64 <= n;
for (const auto j : c10::irange(n)) {
queue[j].first = tmp_values[j];
queue[j].second = j;
}
// we want nan to be sorted as top for numpy compatibility
if (use_partial_sort) {
if (largest) {
std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
} else {
std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
}
} else {
if (largest) {
std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
if (sorted) {
std::sort(queue.begin(), queue.begin() + k - 1,
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
}
} else {
std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
if (sorted) {
std::sort(queue.begin(), queue.begin() + k -1,
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
}
}
}
for (const auto j : c10::irange(k)) {
mode_values[j] = queue[j].first;
mode_indices[j] = queue[j].second;
}
}
}
} // namespace CPU_CAPABILITY
} // namespace native
} // namespace at
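Annotation, not part of the new header: topk_impl_loop chooses std::partial_sort when k is small relative to the slice (k * 64 <= n) and std::nth_element plus an optional std::sort otherwise, with comparators that rank NaN ahead of every non-NaN value so NaN surfaces as the largest element, as the NumPy-compatibility comment says. A self-contained sketch of that selection logic on a plain vector; the function name and the sorted-flag handling are illustrative:

// Sketch: k largest (value, index) pairs, NaN treated as the largest value.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::pair<float, int64_t>> topk_largest(
    const std::vector<float>& values, int64_t k, bool sorted) {
  using elem_t = std::pair<float, int64_t>;
  const int64_t n = static_cast<int64_t>(values.size());
  std::vector<elem_t> queue(n);
  for (int64_t j = 0; j < n; ++j) {
    queue[j] = {values[j], j};
  }
  auto greater = [](const elem_t& x, const elem_t& y) {
    // NaN sorts in front of every number, matching the kernel's comparators.
    return (std::isnan(x.first) && !std::isnan(y.first)) || x.first > y.first;
  };
  if (k * 64 <= n) {
    // Small k: partial_sort finds and orders the top k in one pass.
    std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), greater);
  } else {
    // Large k: nth_element partitions around position k - 1, then sort only
    // the prefix when the caller asked for sorted output.
    std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), greater);
    if (sorted) {
      std::sort(queue.begin(), queue.begin() + k - 1, greater);
    }
  }
  queue.resize(k);
  return queue;
}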

View File

@ -6,7 +6,7 @@
namespace at {
class Tensor;
struct TensorIterator;
class TensorBase;
struct TensorIteratorBase;
}
@ -73,14 +73,14 @@ DECLARE_DISPATCH(unary_fn, trunc_stub);
DECLARE_DISPATCH(unary_fn, lgamma_stub);
// NB: these are actually defined in Distribution
DECLARE_DISPATCH(void(*)(Tensor&, const Tensor&, c10::optional<Generator>), bernoulli_tensor_stub);
DECLARE_DISPATCH(void(*)(Tensor&, const double, c10::optional<Generator>), bernoulli_scalar_stub);
DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, c10::optional<Generator>), bernoulli_tensor_stub);
DECLARE_DISPATCH(void(*)(const TensorBase&, const double, c10::optional<Generator>), bernoulli_scalar_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), cauchy_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional<Generator>), exponential_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional<Generator>), geometric_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), log_normal_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), uniform_stub);
DECLARE_DISPATCH(void(*)(Tensor&, const double, const double, c10::optional<Generator>), normal_stub);
DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, c10::optional<Generator>), normal_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional<Generator>), random_from_to_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional<Generator>), random_full_64_bits_range_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional<Generator>), random_stub);

View File

@ -14,10 +14,7 @@ namespace native {
namespace {
Tensor gemm_nt(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2)});
auto b_ = b.transpose(1, 0);
auto c_ = at::native::matmul(a_, b_);
return c_.view({a.size(0), a.size(1), b.size(0)});
return at::native::matmul(a, b.t());
}
// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias
@ -45,7 +42,7 @@ std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv(
const scalar_t sqrt_dim_per_head = std::sqrt(static_cast<scalar_t>(dim_per_head));
int64_t grain_size =
std::min(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1);
std::max(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1);
parallel_for(
0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) {
for (auto i : c10::irange(begin, end)) {
@ -56,8 +53,8 @@ std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv(
auto b = i;
using Vec = vec::Vectorized<scalar_t>;
auto V = vec::Vectorized<scalar_t>::size();
// TODO: handle epilogue
for (auto dh = 0; dh < dim_per_head / V; dh += V) {
auto dh = 0;
for (; dh < dim_per_head; dh += V) {
auto d = nh * dim_per_head + dh;
// load
auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]);
@ -79,19 +76,43 @@ std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv(
q_data.store(&q_k_v_data
[0 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
k_data.store(&q_k_v_data
[1 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
v_data.store(&q_k_v_data
[2 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
}
if (dh != dim_per_head) {
for (dh = std::max(0, dh - V); dh < dim_per_head; dh++) {
auto d = nh * dim_per_head + dh;
auto q_bias = qkv_bias_data[d + 0 * D];
auto k_bias = qkv_bias_data[d + 1 * D];
auto v_bias = qkv_bias_data[d + 2 * D];
auto q_data = qkv_data[b * _3D * T + t * _3D + d + 0 * D] + q_bias;
auto k_data = qkv_data[b * _3D * T + t * _3D + d + 1 * D] + k_bias;
auto v_data = qkv_data[b * _3D * T + t * _3D + d + 2 * D] + v_bias;
q_data = q_data / sqrt_dim_per_head;
q_k_v_data[0 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = q_data;
q_k_v_data[1 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = k_data;
q_k_v_data[2 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = v_data;
}
}
}
});
});
@ -110,13 +131,16 @@ Tensor bmm_nt(const Tensor& a, const Tensor& b) {
}
void masked_softmax_dropout(
const Tensor& attn_scores,
Tensor& attn_scores,
const c10::optional<Tensor>& attn_mask) {
auto B = attn_scores.size(0);
auto num_heads = attn_scores.size(1);
auto T = attn_scores.size(2);
if (attn_mask) {
TORCH_CHECK(attn_mask->is_contiguous());
} else {
at::_softmax_out(attn_scores, attn_scores, 3, false);
return;
}
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::Half,
@ -134,9 +158,10 @@ void masked_softmax_dropout(
using Vec = vec::Vectorized<scalar_t>;
auto V = vec::Vectorized<scalar_t>::size();
scalar_t* input_data = attn_scores_data + i * T;
scalar_t* input_data = attn_scores_data + i;
auto max_input = Vec(std::numeric_limits<scalar_t>::lowest());
// TODO: handle epilogue
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
auto v = Vec::loadu(&input_data[t]);
max_input = vec::maximum(max_input, v);
@ -147,6 +172,7 @@ void masked_softmax_dropout(
hmax = std::max(max_input[i], hmax);
}
accscalar_t hsum = 0;
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
auto v = Vec::loadu(&input_data[t]);
// TODO: vectorize in accscalar_t?
@ -155,6 +181,7 @@ void masked_softmax_dropout(
}
}
auto inv_denominator = 1.0 / hsum;
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
Vec v = Vec::loadu(&input_data[t]);
@ -185,6 +212,8 @@ Tensor bmm_nn(const Tensor& a, const Tensor& b) {
Tensor transform_0213(const Tensor& a) {
// TODO: check perf vs dedicated kernel.
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(1));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(3));
return a.permute({0, 2, 1, 3})
.contiguous()
.view({a.size(0), a.size(2), a.size(1) * a.size(3)});
@ -196,6 +225,13 @@ Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) {
return r_.view({a.size(0), a.size(1), r_.size(1)});
}
void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim());
for (auto idx : c10::irange(shape.size())) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]);
}
}
} // namespace
Tensor multi_head_self_attention_cpu(
@ -209,30 +245,63 @@ Tensor multi_head_self_attention_cpu(
// query shape: [B, T, D]
// qkv_weight shape: [3 * D, D]
const auto D = query.sizes()[2];
TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor");
TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor");
TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query");
TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal");
TORCH_CHECK(D % num_head == 0, "D must divide evenly by num_head");
#ifndef NDEBUG
const auto B = query.sizes()[0];
const auto T = query.sizes()[1];
const auto dim_per_head = D / num_head;
#endif
// shape: [B, T, 3 x D]
auto qkv = gemm_nt(query, qkv_weight);
#ifndef NDEBUG
debug_assert_shape(qkv, {B, T, 3 * D});
#endif
// shape: 3 x [B, num_head, T, dim_per_head]
auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
auto q = std::get<0>(q_k_v);
auto k = std::get<1>(q_k_v);
auto v = std::get<2>(q_k_v);
const auto& q = std::get<0>(q_k_v);
const auto& k = std::get<1>(q_k_v);
const auto& v = std::get<2>(q_k_v);
#ifndef NDEBUG
debug_assert_shape(q, {B, num_head, T, dim_per_head});
debug_assert_shape(k, {B, num_head, T, dim_per_head});
debug_assert_shape(v, {B, num_head, T, dim_per_head});
#endif
// shape: [B, num_head, T, T]
auto qkt = bmm_nt(q, k);
#ifndef NDEBUG
debug_assert_shape(qkt, {B, num_head, T, T});
#endif
// shape: [B, num_head, T, T]
masked_softmax_dropout(qkt, mask);
// shape: [B, num_head, T, dim_per_head]
auto attn_ctx = bmm_nn(qkt, v);
#ifndef NDEBUG
debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head});
#endif
// shape: [B, T, D]
auto attn = transform_0213(attn_ctx);
#ifndef NDEBUG
debug_assert_shape(attn, {B, T, D});
#endif
// shape: [B, T, D]
auto proj = gemm_nt_bias(attn, proj_weight, proj_bias);
#ifndef NDEBUG
debug_assert_shape(proj, {B, T, D});
#endif
return proj;
}
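Annotation, not part of the commit: the transform_bias_rescale_qkv hunks above replace the "TODO: handle epilogue" with a scalar tail that finishes the elements the Vectorized loop does not cover, while the masked_softmax_dropout path still guards with TORCH_CHECK(T % V == 0). Below is a generic sketch of the usual vector-main-loop-plus-scalar-epilogue pattern, not a transcription of the kernel's exact loop bounds:

// Sketch: scale a buffer with full vector lanes, then a scalar epilogue.
#include <ATen/cpu/vec/vec.h>
#include <cstdint>

void scale_inplace(float* data, int64_t n, float scale) {
  using Vec = at::vec::Vectorized<float>;
  const int64_t V = Vec::size();
  const Vec vscale(scale);
  int64_t d = 0;
  // Main loop: only whole vector widths.
  for (; d + V <= n; d += V) {
    (Vec::loadu(data + d) * vscale).store(data + d);
  }
  // Epilogue: the remaining n % V elements, handled one at a time.
  for (; d < n; ++d) {
    data[d] *= scale;
  }
}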

View File

@ -1,9 +1,9 @@
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/Dispatch.h>
#include <ATen/Functions.h>
#include <ATen/Generator.h>
#include <ATen/core/DistributionsHelper.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/cpu/DistributionTemplates.h>
#include <ATen/native/UnaryOps.h>
@ -25,22 +25,22 @@ static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma,
templates::cpu::cauchy_kernel(iter, median, sigma, generator);
}
void bernoulli_tensor_kernel(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
templates::cpu::bernoulli_kernel(self, p_, generator);
}
void bernoulli_scalar_kernel_default(Tensor& self, double p, c10::optional<Generator> gen) {
void bernoulli_scalar_kernel_default(const TensorBase &self, double p, c10::optional<Generator> gen) {
CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
templates::cpu::bernoulli_kernel(self, p, generator);
}
#if !AT_MKL_ENABLED()
void bernoulli_scalar_kernel(Tensor& self, double p, c10::optional<Generator> gen) {
void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional<Generator> gen) {
bernoulli_scalar_kernel_default(self, p, gen);
}
#else
void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional<Generator> gen) {
void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional<Generator> gen) {
if (cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) {
CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
int64_t seed;
@ -87,7 +87,7 @@ void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional<Generator> ge
// copy_ if using buffer and non contiguous
if (!contig) {
self.copy_(tmp_int_tensor);
OptionalTensorRef(self)->copy_(tmp_int_tensor);
}
});
} else {
@ -117,7 +117,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optio
templates::cpu::uniform_kernel(iter, from, to, generator);
}
void normal_kernel(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
void normal_kernel(const TensorBase &self, double mean, double std, c10::optional<Generator> gen) {
CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
templates::cpu::normal_kernel(self, mean, std, generator);
}

View File

@ -1,7 +1,8 @@
#pragma once
#include <ATen/Dispatch.h>
#include <ATen/CPUApplyUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandBase.h>
#include <ATen/core/DistributionsHelper.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cpu/Loops.h>
@ -105,7 +106,7 @@ static void normal_fill_16_AVX2(float *data,
}
template<typename RNG>
void normal_fill_AVX2(Tensor& self, const float mean, const float std, RNG generator) {
void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, RNG generator) {
float *data = self.data_ptr<float>();
auto size = self.numel();
std::lock_guard<std::mutex> lock(generator->mutex_);
@ -148,7 +149,7 @@ static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t s
}
template <typename scalar_t, typename RNG>
void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG generator) {
void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) {
scalar_t *data = self.data_ptr<scalar_t>();
auto size = self.numel();
std::lock_guard<std::mutex> lock(generator->mutex_);
@ -172,7 +173,7 @@ void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG gene
}
template<typename RNG>
void normal_kernel(Tensor& self, double mean, double std, RNG generator) {
void normal_kernel(const TensorBase &self, double mean, double std, RNG generator) {
auto size = self.numel();
if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) {
#ifdef CPU_CAPABILITY_AVX2
@ -308,25 +309,25 @@ struct ExponentialKernel {
// ================================================== Bernoulli =======================================================
template<typename RNG>
void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) {
void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generator) {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(generator->mutex_);
using self_t = scalar_t;
auto p_cpu = p_.to(kCPU);
c10::MaybeOwned<Tensor> p = expand_inplace(self, p_cpu);
auto p = expand_inplace(self, p_cpu);
auto iter = TensorIteratorConfig()
.add_output(self)
.add_input(*p)
.check_all_same_dtype(false)
.build();
if (p_.scalar_type() == kDouble) {
if (p->scalar_type() == kDouble) {
cpu_serial_kernel(iter, [&](const double p_val) -> self_t {
at::bernoulli_distribution<double> bernoulli(p_val);
return static_cast<self_t>(bernoulli(generator));
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p_.scalar_type(), "bernoulli_tensor_cpu_p_", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p->scalar_type(), "bernoulli_tensor_cpu_p_", [&] {
using p_t = scalar_t;
cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t {
at::bernoulli_distribution<float> bernoulli(p_val);
@ -338,7 +339,7 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) {
}
template<typename RNG>
void bernoulli_kernel(Tensor& self, double p, RNG generator) {
void bernoulli_kernel(const TensorBase &self, double p, RNG generator) {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(generator->mutex_);
@ -352,10 +353,10 @@ void bernoulli_kernel(Tensor& self, double p, RNG generator) {
template<typename RNG>
struct BernoulliKernel {
void operator()(Tensor& self, double p, c10::optional<Generator> gen) {
void operator()(const TensorBase &self, double p, c10::optional<Generator> gen) {
bernoulli_kernel(self, p, check_generator<RNG>(gen));
}
void operator()(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
void operator()(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
bernoulli_kernel(self, p_, check_generator<RNG>(gen));
}
};

View File

@ -1,11 +1,12 @@
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/TensorUtils.h>
#include <ATen/NativeFunctions.h>
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/GridSampler.h>
#include <ATen/native/cpu/GridSamplerKernel.h>
#include <ATen/cpu/vml.h>
#include <ATen/core/TensorBase.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/TensorGeometry.h>
#include <ATen/TensorIterator.h>
#include <ATen/cpu/vec/vec.h>
#include <c10/util/C++17.h>
#include <c10/util/irange.h>
@ -1146,13 +1147,12 @@ static inline void grid_sample_2d_grid_slice_iterator(
// and backward.
// See NOTE [ Grid Sample CPU Kernels ] for details.
Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode,
int64_t padding_mode, bool align_corners) {
void grid_sampler_2d_cpu_kernel_impl(
const TensorBase &output, const TensorBase &input, const TensorBase &grid,
int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
auto N = input.size(0);
auto H = grid.size(1);
auto W = grid.size(2);
auto output = at::empty({N, input.size(1), H, W}, input.options());
auto spatial_size = H * W;
auto grain_size = spatial_size == 0 ? (N + 1)
: at::divup(at::internal::GRAIN_SIZE, spatial_size * 4 /* 2d * 2 tensors*/);
@ -1207,14 +1207,14 @@ Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid,
});
#undef HANDLE_CASE
#undef HANDLE_INTERP
return output;
}
std::tuple<Tensor, Tensor>
grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_,
const Tensor& input,
const Tensor& grid,
void grid_sampler_2d_backward_cpu_kernel_impl(
const TensorBase &grad_input,
const TensorBase &grad_grid,
const TensorBase &grad_output_,
const TensorBase &input,
const TensorBase &grid,
int64_t interpolation_mode,
int64_t padding_mode,
bool align_corners,
@ -1228,11 +1228,6 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_,
// is always computed.)
auto input_requires_grad = output_mask[0];
Tensor grad_input;
if (input_requires_grad) {
grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto N = input.size(0);
auto spatial_size = grid.size(1) * grid.size(2);
auto grain_size = spatial_size == 0 ? (N + 1)
@ -1315,8 +1310,6 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_,
});
#undef HANDLE_CASE
#undef HANDLE_INTERP
return std::make_tuple(grad_input, grad_grid);
}
}

View File

@ -1,17 +1,33 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/cpu/vml.h>
#include <tuple>
#include <array>
#include <cstdint>
namespace at {
class TensorBase;
}
namespace at { namespace native {
using forward_2d_fn = Tensor(*)(const Tensor &, const Tensor &, int64_t, int64_t, bool);
using backward_2d_fn = std::tuple<Tensor, Tensor>(*)(const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t, bool, std::array<bool,2>);
using forward_2d_fn = void (*) (
const TensorBase &output,
const TensorBase &input,
const TensorBase &grid,
int64_t interpolation_mode,
int64_t padding_mode,
bool align_corners);
using backward_2d_fn = void (*) (
const TensorBase &grad_input,
const TensorBase &grad_grid,
const TensorBase &grad_output,
const TensorBase &input,
const TensorBase &grid,
int64_t interpolation_mode,
int64_t padding_mode,
bool align_corners,
std::array<bool, 2> output_mask);
DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel);
DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel);

View File

@ -1,33 +1,23 @@
#include <ATen/ATen.h>
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/Sorting.h>
#include <ATen/core/TensorBase.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/StridedRandomAccessor.h>
#include <ATen/native/CompositeRandomAccessor.h>
#include <ATen/native/Sorting.h>
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TopKImpl.h>
#include <c10/util/irange.h>
namespace at { namespace native {
namespace {
void _fill_indices(Tensor& indices, int64_t dim) {
auto dim_size = indices.size(dim);
auto idx_dim = at::arange(0, dim_size, indices.options().dtype(at::kLong));
auto idx_dim_sizes = std::vector<int64_t>(indices.dim(), 1);
auto idx_dim_strides = std::vector<int64_t>(indices.dim(), 0);
idx_dim_sizes[dim] = dim_size;
idx_dim_strides[dim] = 1;
auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides);
indices.copy_(idx_dim_restrided);
}
template <typename func_t>
void _dim_apply(
Tensor& values,
Tensor& indices,
const TensorBase &values,
const TensorBase &indices,
int64_t dim,
const std::string& method_name,
const func_t& f) {
@ -95,8 +85,8 @@ struct KeyValueCompDesc {
};
static void sort_kernel(
Tensor& values,
Tensor& indices,
const TensorBase &values,
const TensorBase &indices,
int64_t dim,
bool descending,
bool stable) {
@ -143,9 +133,9 @@ static void sort_kernel(
}
static void topk_kernel(
const Tensor& values,
const Tensor& indices,
const Tensor& self,
const TensorBase &values,
const TensorBase &indices,
const TensorBase &self,
int64_t k,
int64_t dim,
bool largest,

View File

@ -1,4 +1,4 @@
#pragma once
#include <ATen/native/Activation.h>
#include <cstdint>

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/NativeFunctions.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
@ -24,12 +23,12 @@
namespace at { namespace native {
void bernoulli_tensor_kernel(Tensor& self, const Tensor& p_, c10::optional<Generator> gen_) {
void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen_) {
auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
at::native::templates::cuda::bernoulli_kernel(self, p_, generator);
}
void bernoulli_scalar_kernel(Tensor& self, double p, c10::optional<Generator> gen) {
void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional<Generator> gen) {
auto iter = TensorIterator::borrowing_nullary_op(self);
auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
at::native::templates::cuda::bernoulli_kernel(iter, p, generator);

View File

@ -1,30 +1,11 @@
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/NativeFunctions.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/UnaryOps.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/native/cuda/DistributionTemplates.h>
#include <curand.h>
#include <curand_kernel.h>
#include <curand_philox4x32_x.h>
#include <utility>
#include <functional>
#include <ATen/native/Distributions.h>
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/TensorIterator.h>
#include <cstdint>
#include <limits>
#include <utility>
#include <type_traits>
namespace at { namespace native {
void normal_kernel(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
void normal_kernel(const TensorBase &self, double mean, double std, c10::optional<Generator> gen) {
auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
at::native::templates::cuda::normal_kernel(self, mean, std, generator);
}

View File

@ -2,7 +2,7 @@
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/ExpandBase.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/util/Half.h>
@ -430,7 +430,7 @@ void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transfo
// ==================================================== Normal ========================================================
template<typename RNG>
void normal_kernel(Tensor& self, double mean_, double std_, RNG gen) {
void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) {
auto iter = TensorIterator::borrowing_nullary_op(self);
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_cuda", [&] {
using accscalar_t = at::acc_type<scalar_t, true>;
@ -446,7 +446,7 @@ void normal_kernel(Tensor& self, double mean_, double std_, RNG gen) {
template<typename RNG>
struct NormalKernel {
void operator()(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
void operator()(const TensorBase &self, double mean, double std, c10::optional<Generator> gen) {
normal_kernel(self, mean, std, check_generator<RNG>(gen));
}
};
@ -574,7 +574,7 @@ struct CauchyKernel {
template<typename scalar_t, typename prob_t>
void bernoulli_tensor_cuda_kernel(
at::Tensor& ret, const at::Tensor& p,
const TensorBase &ret, const at::TensorBase &p,
PhiloxCudaState philox_args) {
auto functor = [philox_args] __device__(
int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4,
@ -618,7 +618,7 @@ void bernoulli_tensor_cuda_kernel(
}
template<typename RNG>
void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) {
void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) {
PhiloxCudaState rng_engine_inputs;
{
// See Note [Acquire lock when using random generators]
@ -626,14 +626,10 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) {
rng_engine_inputs = gen->philox_cuda_state(10);
}
TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type());
auto p_CUDA = p_.to(kCUDA);
//cast probabilities tensor to double for double `self` tensor, and to `float` for everything else
if (self.dtype() == at::kDouble) {
p_CUDA = p_CUDA.to(at::kDouble);
} else {
p_CUDA = p_CUDA.to(at::kFloat);
}
c10::MaybeOwned<Tensor> p = expand_inplace(self, p_CUDA);
// cast probabilities tensor to double for double `self` tensor, and to `float` for everything else
const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat;
auto p_cuda = p_.to(TensorOptions().device(self.device()).dtype(p_type));
auto p = expand_inplace(self, p_cuda);
AT_DISPATCH_ALL_TYPES_AND3(
at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] {
if (std::is_same<scalar_t, double>::value) {
@ -662,7 +658,7 @@ struct BernoulliKernel {
void operator()(TensorIteratorBase& iter, double p, c10::optional<Generator> gen) {
bernoulli_kernel(iter, p, check_generator<RNG>(gen));
}
void operator()(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
void operator()(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
bernoulli_kernel(self, p_, check_generator<RNG>(gen));
}
};
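Annotation, not part of the commit: the CUDA bernoulli_kernel change above folds the old two-step conversion (move p_ to CUDA, then cast to double or float) into a single .to() that takes its device from self and its dtype from whether self is double. A minimal sketch of that one-hop conversion; the helper name is illustrative:

// Sketch: match the probability tensor to self's device and accumulation
// dtype with a single conversion.
#include <ATen/ATen.h>

at::Tensor prepare_probabilities(const at::Tensor& self, const at::Tensor& p) {
  const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat;
  return p.to(at::TensorOptions().device(self.device()).dtype(p_type));
}

Compared with the removed version, the combined call performs at most one copy.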

View File

@ -1,5 +1,5 @@
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/AccumulateType.h>

View File

@ -0,0 +1,72 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/cuda/GridSampler.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#include <ATen/ops/grid_sampler_2d_backward_native.h>
#include <ATen/ops/grid_sampler_2d_native.h>
#include <ATen/ops/grid_sampler_3d_backward_native.h>
#include <ATen/ops/grid_sampler_3d_native.h>
#include <ATen/ops/zeros_like.h>
#endif
namespace at {
namespace native {
Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
auto in_size = input.sizes();
auto grid_size = grid.sizes();
auto output = at::empty(
{in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options());
launch_grid_sampler_2d_forward_kernel(
output, input, grid, interpolation_mode, padding_mode, align_corners);
return output;
}
Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
auto in_size = input.sizes();
auto grid_size = grid.sizes();
auto output = at::empty(
{in_size[0], in_size[1], grid_size[1], grid_size[2], grid_size[3]},
input.options());
launch_grid_sampler_3d_forward_kernel(
output, input, grid, interpolation_mode, padding_mode, align_corners);
return output;
}
std::tuple<Tensor, Tensor>
grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input,
const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners, std::array<bool, 2> output_mask) {
Tensor grad_input;
if (output_mask[0]) {
grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
launch_grid_sampler_2d_backward_kernel(
grad_input, grad_grid, grad_output, input,
grid, interpolation_mode, padding_mode, align_corners, output_mask);
return std::make_tuple(grad_input, grad_grid);
}
std::tuple<Tensor, Tensor>
grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input,
const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
launch_grid_sampler_3d_backward_kernel(
grad_input, grad_grid, grad_output, input,
grid, interpolation_mode, padding_mode, align_corners);
return std::make_tuple(grad_input, grad_grid);
}
}} // namespace at::native

View File

@ -1,10 +1,13 @@
#include <ATen/ATen.h>
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/cuda/GridSampler.h>
#include <ATen/native/cuda/GridSampler.cuh>
#include <ATen/native/cuda/UpSample.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/KernelUtils.h>
#include <ATen/core/TensorBase.h>
#include <ATen/Dispatch.h>
#include <c10/macros/Macros.h>
namespace at { namespace native {
@ -723,14 +726,12 @@ namespace {
} // namespace
// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
void launch_grid_sampler_2d_forward_kernel(
const TensorBase &output, const TensorBase &input, const TensorBase &grid,
int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
auto N = input.size(0);
auto C = input.size(1);
auto H = grid.size(1);
auto W = grid.size(2);
auto output = at::empty({N, C, H, W}, input.options());
int64_t count = N * H * W;
if (count > 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_cuda", [&] {
@ -760,18 +761,16 @@ Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid,
}
});
}
return output;
}
// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
void launch_grid_sampler_3d_forward_kernel(
const TensorBase &output, const TensorBase &input, const TensorBase &grid,
int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
auto N = input.size(0);
auto D = grid.size(1);
auto H = grid.size(2);
auto W = grid.size(3);
auto output = at::empty({N, input.size(1), D, H, W}, input.options());
int64_t count = N * D * H * W;
if (count > 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_cuda", [&] {
@ -801,15 +800,14 @@ Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid,
}
});
}
return output;
}
// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
std::tuple<Tensor, Tensor>
grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input,
const Tensor& grid, int64_t interpolation_mode,
int64_t padding_mode, bool align_corners,
std::array<bool,2> output_mask) {
void launch_grid_sampler_2d_backward_kernel(
const TensorBase &grad_input, const TensorBase &grad_grid,
const TensorBase &grad_output, const TensorBase &input,
const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners, std::array<bool,2> output_mask) {
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("grid_sampler_2d_backward_cuda");
@ -822,11 +820,6 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input,
// is always computed.)
auto input_requires_grad = output_mask[0];
Tensor grad_input;
if (input_requires_grad) {
grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
int64_t count = N * H * W;
if (count > 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_backward_cuda", [&] {
@ -864,13 +857,13 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input,
}
});
}
return std::make_tuple(grad_input, grad_grid);
}
// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
std::tuple<Tensor, Tensor>
grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input,
const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode,
void launch_grid_sampler_3d_backward_kernel(
const TensorBase &grad_input, const TensorBase &grad_grid,
const TensorBase& grad_output, const TensorBase& input,
const TensorBase& grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
@ -879,8 +872,6 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input,
auto D = grid.size(1);
auto H = grid.size(2);
auto W = grid.size(3);
auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
int64_t count = N * D * H * W;
if (count > 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_backward_cuda", [&] {
@ -916,7 +907,6 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input,
}
});
}
return std::make_tuple(grad_input, grad_grid);
}
}} // namespace at::native

View File

@ -1,5 +1,3 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/cuda/KernelUtils.cuh>
namespace at { namespace native {

View File

@ -0,0 +1,32 @@
#pragma once
#include <array>
#include <cstdint>
namespace at {
class TensorBase;
}
namespace at {
namespace native {
void launch_grid_sampler_2d_forward_kernel(
const TensorBase &output, const TensorBase &input, const TensorBase &grid,
int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
void launch_grid_sampler_3d_forward_kernel(
const TensorBase &output, const TensorBase &input, const TensorBase &grid,
int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
void launch_grid_sampler_2d_backward_kernel(
const TensorBase &grad_input, const TensorBase &grad_grid,
const TensorBase &grad_output, const TensorBase &input,
const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners, std::array<bool, 2> output_mask);
void launch_grid_sampler_3d_backward_kernel(
const TensorBase &grad_input, const TensorBase &grad_grid,
const TensorBase &grad_output, const TensorBase &input,
const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners);
}} // namespace at::native
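Annotation, not part of the new header: declaring the launch_* entry points against TensorBase means this header only needs a forward declaration of the class, and the .cu translation unit that implements them can build under TORCH_ASSERT_NO_OPERATORS, while the new GridSampler.cpp keeps the allocating, operator-facing wrappers. A minimal sketch of such a kernel-facing header with illustrative names:

// my_launcher.h (illustrative): declarations only, no ATen operator headers.
#pragma once
#include <cstdint>

namespace at {
class TensorBase;  // forward declaration is enough for reference parameters
}

namespace example {
// The launcher fills caller-provided tensors and returns nothing, so it never
// allocates and never needs at::empty or at::zeros_like.
void launch_my_forward_kernel(
    const at::TensorBase& output,
    const at::TensorBase& input,
    int64_t mode,
    bool align_corners);
} // namespace example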

View File

@ -905,15 +905,16 @@ Tensor& index_select_out_cuda(
}
Tensor index_select_cuda(const Tensor& self, int64_t dim, const Tensor& index) {
Tensor out;
if (self.is_quantized()){
Tensor out = at::empty({0}, self.options());
at::native::index_select_out_cuda(self, dim, index, out);
return out;
}
Tensor index_select_quantized_cuda(const Tensor& self, int64_t dim, const Tensor& index) {
TORCH_CHECK(
self.qscheme() == kPerTensorAffine,
"Only per_tensor quantized quantized tensors are supported by index_select.")
out = at::empty_quantized({0}, self);
} else {
out = at::empty({0}, self.options());
}
Tensor out = at::empty_quantized({0}, self);
at::native::index_select_out_cuda(self, dim, index, out);
return out;
}

View File

@ -1,15 +1,19 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/ceil_div.h>
#include <ATen/Context.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/Dispatch.h>
#include <ATen/MemoryOverlap.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/Resize.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/diag.h>
#include <ATen/ops/trace_native.h>
#include <ATen/ops/tril_native.h>
#include <ATen/ops/triu_native.h>
#endif
#include <ATen/cuda/CUDAApplyUtils.cuh>

View File

@ -1,7 +1,10 @@
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/core/TensorAccessor.h>
#include <ATen/cuda/Atomic.cuh>
#include <c10/util/ArrayRef.h>
#include <c10/util/Optional.h>
#include <c10/util/SmallVector.h>
#include <math.h>
namespace at {

View File

@ -23,10 +23,7 @@ namespace native {
namespace {
Tensor gemm_nt(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2)});
auto b_ = b.transpose(1, 0);
auto c_ = at::native::matmul(a_, b_);
return c_.view({a.size(0), a.size(1), b.size(0)});
return at::native::matmul(a, b.t());
}
template <typename scalar_t, typename accscalar_t>
@ -209,6 +206,14 @@ Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) {
return r_.view({a.size(0), a.size(1), r_.size(1)});
}
void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim());
for (auto idx : c10::irange(shape.size())) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]);
}
}
} // namespace
Tensor multi_head_self_attention_cuda(
@ -222,29 +227,63 @@ Tensor multi_head_self_attention_cuda(
// query shape: [B, T, D]
// qkv_weight shape: [3 * D, D]
const auto D = query.sizes()[2];
TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor");
TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor");
TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query");
TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal");
TORCH_CHECK(D % num_head == 0, "D must divide evenly by num_head");
#ifndef NDEBUG
const auto B = query.sizes()[0];
const auto T = query.sizes()[1];
const auto dim_per_head = D / num_head;
#endif
// shape: [B, T, 3 x D]
auto qkv = gemm_nt(query, qkv_weight);
#ifndef NDEBUG
debug_assert_shape(qkv, {B, T, 3 * D});
#endif
// shape: 3 x [B, num_head, T, dim_per_head]
auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
auto q = std::get<0>(q_k_v);
auto k = std::get<1>(q_k_v);
auto v = std::get<2>(q_k_v);
const auto& q = std::get<0>(q_k_v);
const auto& k = std::get<1>(q_k_v);
const auto& v = std::get<2>(q_k_v);
#ifndef NDEBUG
debug_assert_shape(q, {B, num_head, T, dim_per_head});
debug_assert_shape(k, {B, num_head, T, dim_per_head});
debug_assert_shape(v, {B, num_head, T, dim_per_head});
#endif
// shape: [B, num_head, T, T]
auto qkt = bmm_nt(q, k);
#ifndef NDEBUG
debug_assert_shape(qkt, {B, num_head, T, T});
#endif
// shape: [B, num_head, T, T]
masked_softmax_dropout(qkt, mask);
// shape: [B, num_head, T, dim_per_head]
auto attn_ctx = bmm_nn(qkt, v);
#ifndef NDEBUG
debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head});
#endif
// shape: [B, T, D]
auto attn = transform_0213(attn_ctx);
#ifndef NDEBUG
debug_assert_shape(attn, {B, T, D});
#endif
// shape: [B, T, D]
auto proj = gemm_nt_bias(attn, proj_weight, proj_bias);
#ifndef NDEBUG
debug_assert_shape(proj, {B, T, D});
#endif
return proj;
}

View File

@ -6061,7 +6061,7 @@
- func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
variants: function, method
- func: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
- func: _scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
variants: function, method
dispatch:
CPU: scatter_reduce_two_cpu
@ -6867,7 +6867,8 @@
dispatch:
CPU: index_select_cpu_
QuantizedCPU: index_select_quantized_cpu_
CUDA, QuantizedCUDA: index_select_cuda
CUDA: index_select_cuda
QuantizedCUDA: index_select_quantized_cuda
SparseCPU: index_select_sparse
SparseCUDA: index_select_sparse

View File

@ -160,10 +160,9 @@ Tensor MakeStridedQTensorCPU(
allocator->allocate(size_bytes),
allocator,
/* resizable = */ true);
constexpr auto quantized_cpu_ks = at::DispatchKeySet(at::DispatchKey::QuantizedCPU);
auto tensor = detail::make_tensor<QTensorImpl>(
storage,
quantized_cpu_ks,
at::DispatchKeySet(at::DispatchKey::QuantizedCPU),
dtype,
quantizer);
get_qtensorimpl(tensor)->set_sizes_and_strides(sizes, strides);

View File

@ -2,7 +2,7 @@
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/native/Activation.h>
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TopKImpl.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/UpSample.h>
#include <ATen/native/cpu/Loops.h>

View File

@ -1,4 +1,5 @@
#include <ATen/ATen.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/native/cpu/Loops.h>
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <ATen/native/TensorIterator.h>

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -44,8 +45,8 @@ static void setupBatchNorm(Fusion* fusion, DataType dtype) {
bias = castOp(DataType::Float, bias);
}
auto momentum_ptr = new Double(kMomentum);
auto eps_ptr = new Double(kEps);
auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm(
input,
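Annotation, not part of the commit: this and the following nvfuser benchmark diffs replace raw new Double(...) / new Int(...) with IrBuilder::create<T>(...), routing node construction through a factory that can register the node with the active fusion container instead of handing raw ownership to the caller. A generic sketch of such a create<T>() factory; the real IrBuilder in torch/csrc/jit/codegen/cuda does more bookkeeping than shown here:

// Sketch: a minimal owning factory in the spirit of IrBuilder::create<T>().
#include <memory>
#include <utility>
#include <vector>

struct Statement {
  virtual ~Statement() = default;
};

struct Double : Statement {
  explicit Double(double v) : value(v) {}
  double value;
};

class Container {
 public:
  // Perfect-forward the constructor arguments and keep ownership here, so the
  // caller gets a raw pointer it never has to delete.
  template <typename T, typename... Args>
  T* create(Args&&... args) {
    owned_.push_back(std::make_unique<T>(std::forward<Args>(args)...));
    return static_cast<T*>(owned_.back().get());
  }

 private:
  std::vector<std::unique_ptr<Statement>> owned_;
};

// Usage: auto* eps_ptr = container.create<Double>(1e-5);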

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -49,7 +50,7 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {
grad_output = castOp(DataType::Float, grad_output);
}
auto eps_ptr = new Double(kEps);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm_backward(
input,

View File

@ -2,6 +2,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -36,7 +37,7 @@ static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) {
fusion->addInput(tv1);
// TODO: should be input
auto d16 = new Double(1.0);
auto d16 = IrBuilder::create<Double>(1.0);
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
@ -47,7 +48,7 @@ static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) {
auto tv3 = add(tv2, tv0);
auto tv10 = softmax(tv3, 3);
auto dropout_tvs = dropout(tv10, new Double(0.9));
auto dropout_tvs = dropout(tv10, IrBuilder::create<Double>(0.9));
auto tv12 = dropout_tvs.mask;
auto tv14 = dropout_tvs.output;
@ -83,9 +84,9 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) {
}
// TODO: should be inputs
auto d32 = new Double(1.0);
auto d32 = IrBuilder::create<Double>(1.0);
// fusion->addInput(d32);
auto d33 = new Double(2.0);
auto d33 = IrBuilder::create<Double>(2.0);
// fusion->addInput(d33);
auto tv4 = mul(tv2, tv3);
@ -252,14 +253,15 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) {
auto tv5 = broadcast(tv4, {true, true, false});
auto tv6 = add(tv3, tv5);
auto dropout_outs = dropout(tv6, new Double(0.9));
auto dropout_outs = dropout(tv6, IrBuilder::create<Double>(0.9));
auto tv8 = dropout_outs.output;
auto tv10 = dropout_outs.mask;
auto tv11 = add(tv10, tv2);
auto layer_norm_outs = layer_norm(tv11, 1, tv0, tv1, new Double(1e-5));
auto layer_norm_outs =
layer_norm(tv11, 1, tv0, tv1, IrBuilder::create<Double>(1e-5));
auto tv14 = layer_norm_outs.output;
auto tv21 = layer_norm_outs.mean;
auto tv26 = layer_norm_outs.invstd;
@ -481,7 +483,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) {
tv1 = castOp(DataType::Float, tv1);
tv8 = castOp(DataType::Float, tv8);
}
auto d36 = mul(new Double(1.0), tv1->axis(2)->extent());
auto d36 = mul(IrBuilder::create<Double>(1.0), tv1->axis(2)->extent());
auto d47 = unaryOp(UnaryOpType::Reciprocal, d36);
auto tv9 = broadcast(tv5, {true, true, false});
@ -583,7 +585,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) {
}
// Uncertain this is the right value, but going for it anyways
auto d34 = div(new Double(1.0), tv0->axis(2)->extent());
auto d34 = div(IrBuilder::create<Double>(1.0), tv0->axis(2)->extent());
auto tv25 = mul(tv21, tv0);
auto tv26 = mul(tv25, d34);

View File

@ -4,6 +4,7 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
@ -41,23 +42,23 @@ static void setupFusion(Fusion* fusion) {
auto t5 = castOp(DataType::Float, t4);
auto t6 = broadcast(t3, {true, true, false});
auto t7 = add(t6, t5);
auto t8 = mul(t7, new Double(k_079));
auto t9 = mul(t7, new Double(k_004));
auto t8 = mul(t7, IrBuilder::create<Double>(k_079));
auto t9 = mul(t7, IrBuilder::create<Double>(k_004));
auto t10 = mul(t9, t7);
auto t11 = add(t10, new Int(1));
auto t11 = add(t10, IrBuilder::create<Int>(1));
auto t12 = mul(t8, t11);
auto t13 = unaryOp(UnaryOpType::Tanh, t12);
auto t14 = mul(t7, new Double(0.5));
auto t14 = mul(t7, IrBuilder::create<Double>(0.5));
auto t15 = mul(t13, t13);
auto t16 = unaryOp(UnaryOpType::Neg, t15);
auto t17 = add(t16, new Int(1));
auto t18 = mul(t7, new Double(k_010));
auto t17 = add(t16, IrBuilder::create<Int>(1));
auto t18 = mul(t7, IrBuilder::create<Double>(k_010));
auto t19 = mul(t18, t7);
auto t20 = add(t19, new Double(k_079));
auto t20 = add(t19, IrBuilder::create<Double>(k_079));
auto t21 = mul(t17, t20);
auto t22 = mul(t14, t21);
auto t23 = add(t13, new Int(1));
auto t24 = mul(t23, new Double(0.5));
auto t23 = add(t13, IrBuilder::create<Int>(1));
auto t24 = mul(t23, IrBuilder::create<Double>(0.5));
auto t25 = add(t22, t24);
auto t26 = mul(t25, t1);

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -129,7 +130,7 @@ static auto getLayerForwardNormRuntime(
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = new Double(kEps);
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -129,7 +130,7 @@ static auto getLayerForwardNormRuntime(
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = new Double(kEps);
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
@ -39,8 +40,8 @@ static void setupInstanceNorm(Fusion* fusion, DataType dtype) {
const bool kTraining = true;
const float kMomentum = 0.1;
const float kEps = 1e-5;
auto momentum_ptr = new Double(kMomentum);
auto eps_ptr = new Double(kEps);
auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto norm = instance_norm(
input,

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -24,7 +25,7 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) {
const int kReductionAxis = 1;
const float kEps = 1e-5;
Double* eps_ptr = new Double(kEps);
Double* eps_ptr = IrBuilder::create<Double>(kEps);
// setup fusion
auto input = makeContigTensor(2, dtype);

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -22,7 +23,7 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
const int kReductionAxis = 1;
Double* eps_ptr = new Double(1e-5);
Double* eps_ptr = IrBuilder::create<Double>(1e-5);
// setup fusion
auto grad_out = makeContigTensor(2, dtype);

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -151,7 +152,7 @@ static auto getLayerForwardNormRuntime(
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = new Double(kEps);
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);

View File

@ -2,6 +2,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -35,7 +36,7 @@ static void setupSoftmaxDropout(
auto attention_scores = makeContigTensor(4, dtype);
auto attention_mask = makeContigTensor(4, dtype);
Double* divisor = new Double();
Double* divisor = IrBuilder::create<Double>();
fusion->addInput(attention_scores);
fusion->addInput(attention_mask);
@ -49,8 +50,8 @@ static void setupSoftmaxDropout(
attention_scores = div(attention_scores, divisor);
attention_scores = add(attention_scores, attention_mask);
auto attention_probs = softmax(attention_scores, kReductionAxis);
auto prob = new Double(kDropoutProbability);
auto scale = new Double(kScale);
auto prob = IrBuilder::create<Double>(kDropoutProbability);
auto scale = IrBuilder::create<Double>(kScale);
auto dropout_results = dropout(attention_probs, prob, scale);
auto output = dropout_results.output;

View File

@ -16,8 +16,8 @@ std::string toString(ReductionParams rparams) {
if (rparams.schedule_3D) {
ss << "3D Schedule // "
<< "Outer Reduction: "
<< (rparams.cross_block_outer_reduce ? "cross block / " : "")
<< (rparams.cross_grid_outer_reduce ? "cross grid / " : "")
<< (rparams.cross_block_outer_reduction ? "cross block / " : "")
<< (rparams.cross_grid_outer_reduction ? "cross grid / " : "")
<< (rparams.split_grid_dim_outer_reduction ? "split grid dim / " : "");
if (rparams.batches_per_block_outer_reduction > 1 ||
rparams.persistent_kernel) {
@ -38,9 +38,9 @@ std::string toString(ReductionParams rparams) {
}
ss << " // Inner Reduction Domain: "
<< (rparams.cross_block_inner_reduce ? "cross block reduction / " : "")
<< (rparams.cross_block_inner_reduction ? "cross block reduction / " : "")
<< (rparams.pad_inner_reduction_to_warp ? "pad to warp / " : "")
<< (rparams.cross_grid_inner_reduce ? "cross grid reduction / " : "");
<< (rparams.cross_grid_inner_reduction ? "cross grid reduction / " : "");
if (rparams.batches_per_block_inner_reduction > 1 ||
rparams.persistent_kernel) {
@ -48,7 +48,7 @@ std::string toString(ReductionParams rparams) {
<< " / ";
}
ss << (rparams.cross_grid_inner_reduce &&
ss << (rparams.cross_grid_inner_reduction &&
rparams.split_grid_dim_inner_reduction
? "split grid dimension / "
: "")

View File

@ -1,47 +1,14 @@
#include <c10/core/DispatchKey.h>
#include <c10/core/DispatchKeySet.h>
#include <unordered_map>
namespace c10 {
const char* toString(BackendComponent t) {
switch (t) {
case BackendComponent::CPUBit:
return "CPUBit";
case BackendComponent::CUDABit:
return "CUDABit";
case BackendComponent::HIPBit:
return "HIPBit";
case BackendComponent::XLABit:
return "XLABit";
case BackendComponent::LazyBit:
return "LazyBit";
case BackendComponent::XPUBit:
return "XPUBit";
case BackendComponent::MLCBit:
return "MLCBit";
case BackendComponent::HPUBit:
return "HPUBit";
case BackendComponent::VEBit:
return "VEBit";
case BackendComponent::PrivateUse1Bit:
return "PrivateUse1Bit";
case BackendComponent::PrivateUse2Bit:
return "PrivateUse2Bit";
case BackendComponent::PrivateUse3Bit:
return "PrivateUse3Bit";
case BackendComponent::InvalidBit:
return "InvalidBit";
default:
return "UNKNOWN_BACKEND_BIT";
}
}
const char* toString(DispatchKey t) {
switch (t) {
case DispatchKey::Undefined:
return "Undefined";
case DispatchKey::CPU:
return "CPU";
case DispatchKey::CUDA:
@ -100,6 +67,8 @@ const char* toString(DispatchKey t) {
case DispatchKey::Python:
return "Python";
case DispatchKey::PythonTLSSnapshot:
return "PythonTLSSnapshot";
case DispatchKey::PrivateUse1:
return "PrivateUse1";
@ -134,6 +103,8 @@ const char* toString(DispatchKey t) {
return "AutogradMLC";
case DispatchKey::AutogradHPU:
return "AutogradHPU";
case DispatchKey::AutogradNestedTensor:
return "AutogradNestedTensor";
case DispatchKey::AutogradPrivateUse1:
return "AutogradPrivateUse1";
case DispatchKey::AutogradPrivateUse2:
@ -142,8 +113,6 @@ const char* toString(DispatchKey t) {
return "AutogradPrivateUse3";
case DispatchKey::AutogradOther:
return "AutogradOther";
case DispatchKey::AutogradNestedTensor:
return "AutogradNestedTensor";
case DispatchKey::ZeroTensor:
return "ZeroTensor";
@ -201,15 +170,6 @@ const char* toString(DispatchKey t) {
case DispatchKey::FuncTorchBatched:
return "FuncTorchBatched";
case DispatchKey::Dense:
return "Dense";
case DispatchKey::Quantized:
return "Quantized";
case DispatchKey::Sparse:
return "Sparse";
case DispatchKey::AutogradFunctionality:
return "AutogradFunctionality";
default:
return "UNKNOWN_TENSOR_TYPE_ID";
}
@ -218,39 +178,79 @@ const char* toString(DispatchKey t) {
std::ostream& operator<<(std::ostream& str, DispatchKey rhs) {
return str << toString(rhs);
}
std::ostream& operator<<(std::ostream& str, BackendComponent rhs) {
return str << toString(rhs);
}
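A short usage sketch (not part of the diff) for the printing helpers added above; the expected strings follow the toString switches.

#include <c10/core/DispatchKey.h>
#include <iostream>

int main() {
  // operator<< for BackendComponent routes through the new toString overload.
  std::cout << c10::BackendComponent::CUDABit << "\n";               // "CUDABit"
  std::cout << c10::toString(c10::DispatchKey::AutogradCPU) << "\n"; // "AutogradCPU"
  return 0;
}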
DispatchKey getAutogradKeyFromBackend(BackendComponent k) {
// We want this to return an autograd key. We're relying on the fact that
// getAutogradRelatedKeySetFromBackend returns an autograd key +
// ADInplaceOrView, and autograd has higher precedence. The core mapping from
// backend -> autograd key lives in `getAutogradRelatedKeySetFromBackend`
// instead of here for performance. `getAutogradRelatedKeySetFromBackend` is a
// hotpath function, and we want to make sure that it doesn't have to
// construct any DispatchKeySets at runtime.
return getAutogradRelatedKeySetFromBackend(k).highestPriorityTypeId();
// for a given backend key, return the associated autograd key.
// for non-backend keys, return AutogradOther as a default.
// Note: it's convenient and fast to return a default here rather than (say)
// returning an optional<DispatchKey>, or throwing. But it makes callers
// responsible for either a) enforcing the invariant that only backend keys
// be passed as arguments, or b) interpreting our return value carefully.
//
DispatchKey getAutogradKeyFromBackend(DispatchKey t) {
switch (t) {
case DispatchKey::CPU:
return DispatchKey::AutogradCPU;
case DispatchKey::XPU:
return DispatchKey::AutogradXPU;
case DispatchKey::CUDA:
return DispatchKey::AutogradCUDA;
case DispatchKey::XLA:
return DispatchKey::AutogradXLA;
case DispatchKey::Lazy:
return DispatchKey::AutogradLazy;
case DispatchKey::MLC:
return DispatchKey::AutogradMLC;
case DispatchKey::HPU:
return DispatchKey::AutogradHPU;
case DispatchKey::NestedTensor:
return DispatchKey::AutogradNestedTensor;
case DispatchKey::PrivateUse1:
return DispatchKey::AutogradPrivateUse1;
case DispatchKey::PrivateUse2:
return DispatchKey::AutogradPrivateUse2;
case DispatchKey::PrivateUse3:
return DispatchKey::AutogradPrivateUse3;
default:
return DispatchKey::AutogradOther;
}
}
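An illustrative check (not part of the diff) of the behavior the comment above describes for the new BackendComponent-based overload; the function name is hypothetical.

#include <c10/core/DispatchKeySet.h>
#include <cassert>

void autograd_key_example() {
  using namespace c10;
  // A backend bit maps to its per-backend autograd runtime key.
  assert(getAutogradKeyFromBackend(BackendComponent::CPUBit) == DispatchKey::AutogradCPU);
  assert(getAutogradKeyFromBackend(BackendComponent::CUDABit) == DispatchKey::AutogradCUDA);
  // Per the comment above, the related keyset also carries ADInplaceOrView,
  // and the autograd key wins because it has higher priority.
  auto ks = getAutogradRelatedKeySetFromBackend(BackendComponent::CPUBit);
  assert(ks.has(DispatchKey::ADInplaceOrView));
  assert(ks.highestPriorityTypeId() == DispatchKey::AutogradCPU);
}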
c10::DispatchKey parseDispatchKey(const std::string& k) {
static std::unordered_map<std::string, c10::DispatchKey> key_map = {
{"Undefined", c10::DispatchKey::Undefined},
{"Dense", c10::DispatchKey::Dense},
{"CPU", c10::DispatchKey::CPU},
{"CUDA", c10::DispatchKey::CUDA},
{"HIP", c10::DispatchKey::HIP},
{"FPGA", c10::DispatchKey::FPGA},
{"ORT", c10::DispatchKey::ORT},
{"XLA", c10::DispatchKey::XLA},
{"MLC", c10::DispatchKey::MLC},
{"Vulkan", c10::DispatchKey::Vulkan},
{"Metal", c10::DispatchKey::Metal},
{"XPU", c10::DispatchKey::XPU},
{"HPU", c10::DispatchKey::HPU},
{"VE", c10::DispatchKey::VE},
{"Lazy", c10::DispatchKey::Lazy},
{"Meta", c10::DispatchKey::Meta},
{"Quantized", c10::DispatchKey::Quantized},
{"QuantizedCPU", c10::DispatchKey::QuantizedCPU},
{"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA},
{"QuantizedXPU", c10::DispatchKey::QuantizedXPU},
{"CustomRNGKeyId", c10::DispatchKey::CustomRNGKeyId},
{"MkldnnCPU", c10::DispatchKey::MkldnnCPU},
{"Sparse", c10::DispatchKey::Sparse},
{"SparseCPU", c10::DispatchKey::SparseCPU},
{"SparseCUDA", c10::DispatchKey::SparseCUDA},
{"SparseHIP", c10::DispatchKey::SparseHIP},
{"SparseXPU", c10::DispatchKey::SparseXPU},
{"SparseVE", c10::DispatchKey::SparseVE},
{"SparseCsrCPU", c10::DispatchKey::SparseCsrCPU},
{"SparseCsrCUDA", c10::DispatchKey::SparseCsrCUDA},
{"NestedTensor", c10::DispatchKey::NestedTensor},
{"PrivateUse1", c10::DispatchKey::PrivateUse1},
{"PrivateUse2", c10::DispatchKey::PrivateUse2},
{"PrivateUse3", c10::DispatchKey::PrivateUse3},
{"BackendSelect", c10::DispatchKey::BackendSelect},
{"Python", c10::DispatchKey::Python},
{"PythonTLSSnapshot", c10::DispatchKey::PythonTLSSnapshot},
{"Named", c10::DispatchKey::Named},
{"Conjugate", c10::DispatchKey::Conjugate},
{"Negative", c10::DispatchKey::Negative},
@ -259,8 +259,17 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
c10::DispatchKey::FuncTorchDynamicLayerBackMode},
{"ADInplaceOrView", c10::DispatchKey::ADInplaceOrView},
{"AutogradOther", c10::DispatchKey::AutogradOther},
{"AutogradFunctionality", c10::DispatchKey::AutogradFunctionality},
{"AutogradCPU", c10::DispatchKey::AutogradCPU},
{"AutogradCUDA", c10::DispatchKey::AutogradCUDA},
{"AutogradXLA", c10::DispatchKey::AutogradXLA},
{"AutogradLazy", c10::DispatchKey::AutogradLazy},
{"AutogradXPU", c10::DispatchKey::AutogradXPU},
{"AutogradMLC", c10::DispatchKey::AutogradMLC},
{"AutogradHPU", c10::DispatchKey::AutogradHPU},
{"AutogradNestedTensor", c10::DispatchKey::AutogradNestedTensor},
{"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1},
{"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2},
{"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3},
{"Tracer", c10::DispatchKey::Tracer},
{"AutocastCPU", c10::DispatchKey::AutocastCPU},
{"AutocastCUDA", c10::DispatchKey::AutocastCUDA},
@ -274,41 +283,6 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
{"TESTING_ONLY_GenericWrapper",
c10::DispatchKey::TESTING_ONLY_GenericWrapper},
{"TESTING_ONLY_GenericMode", c10::DispatchKey::TESTING_ONLY_GenericMode},
{"CPU", c10::DispatchKey::CPU},
{"CUDA", c10::DispatchKey::CUDA},
{"HIP", c10::DispatchKey::HIP},
{"XLA", c10::DispatchKey::XLA},
{"MLC", c10::DispatchKey::MLC},
{"XPU", c10::DispatchKey::XPU},
{"HPU", c10::DispatchKey::HPU},
{"Lazy", c10::DispatchKey::Lazy},
{"NestedTensor", c10::DispatchKey::NestedTensor},
{"PrivateUse1", c10::DispatchKey::PrivateUse1},
{"PrivateUse2", c10::DispatchKey::PrivateUse2},
{"PrivateUse3", c10::DispatchKey::PrivateUse3},
{"QuantizedCPU", c10::DispatchKey::QuantizedCPU},
{"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA},
{"QuantizedXPU", c10::DispatchKey::QuantizedXPU},
{"SparseCPU", c10::DispatchKey::SparseCPU},
{"SparseCUDA", c10::DispatchKey::SparseCUDA},
{"SparseHIP", c10::DispatchKey::SparseHIP},
{"SparseXPU", c10::DispatchKey::SparseXPU},
{"SparseVE", c10::DispatchKey::SparseVE},
{"AutogradCPU", c10::DispatchKey::AutogradCPU},
{"AutogradCUDA", c10::DispatchKey::AutogradCUDA},
{"AutogradXLA", c10::DispatchKey::AutogradXLA},
{"AutogradLazy", c10::DispatchKey::AutogradLazy},
{"AutogradXPU", c10::DispatchKey::AutogradXPU},
{"AutogradMLC", c10::DispatchKey::AutogradMLC},
{"AutogradHPU", c10::DispatchKey::AutogradHPU},
{"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1},
{"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2},
{"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3},
{"Autograd", c10::DispatchKey::Autograd},
{"CompositeImplicitAutograd",
c10::DispatchKey::CompositeImplicitAutograd},

View File

@ -9,98 +9,20 @@
namespace c10 {
// Semantically, each value of BackendComponent identifies a "backend" for our
// dispatch. Some functionalities that we may dispatch to are allowed to
// register different handlers for each backend. The BackendComponent is then
// used to figure out which backend implementation to dispatch to.
// In implementation terms, the backend component identifies a specific "bit" in
// a DispatchKeySet. The bits in the DispatchKeySet are split between the bottom
// ~12 "BackendComponent" bits, while the remaining upper bits are assigned to
// functionalities. When we encounter a functionality bit that is known to be
// customizable per-backend, then we also look at the lower BackendComponent
// bits and take the highest bit to determine which backend's implementation to
// use.
enum class BackendComponent : uint8_t {
// A "backend" is colloquially used to refer to handlers for dispatch
// which actually implement the numerics of an operation in question.
//
// Due to the nature of the enum, these backends are specified in
// an ordered way, but for most backends this order is not semantically
// meaningful (e.g., it's valid to reorder these backends without changing
// semantics). The only situation when backend ordering is meaningful
// is when the backend participates in multiple dispatch with another
// backend; e.g., CPU and CUDA (cuda must have higher priority).
// These keys don't correspond to individual kernels.
// Instead, they represent the backends that are allowed to override specific
// pieces of functionality:
// - dense kernels (e.g. DispatchKey::CPU)
// - sparse kernels (e.g. DispatchKey::SparseCPU)
// - quantized kernels (e.g. DispatchKey::QuantizedCPU)
// - autograd kernels (e.g. DispatchKey::AutogradCPU)
// We reserve space in the runtime operator table for this full cross product
// of
// [backends in this enum] x [keys below that are explicitly marked as having
// per-backend functionality]
InvalidBit = 0,
CPUBit,
CUDABit,
HIPBit,
XLABit,
MLCBit,
XPUBit,
HPUBit,
VEBit,
LazyBit,
PrivateUse1Bit,
PrivateUse2Bit,
PrivateUse3Bit,
// Define an alias to represent end of backend dispatch keys.
// If you add new backend keys after PrivateUse3, please also update it here.
// (But you shouldn't: private use keys should have higher precedence than
// all built-in keys)
EndOfBackendKeys = PrivateUse3Bit,
};
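A minimal sketch (not part of the diff) of the bit assignment described above; it mirrors the DispatchKeySet(BackendComponent) constructor shown further down, so the exact masks are assumptions that follow from that constructor.

#include <c10/core/DispatchKey.h>
#include <cstdint>

// Mirrors DispatchKeySet(BackendComponent): bit (k - 1), with InvalidBit
// mapping to no bits at all.
constexpr uint64_t backend_bit(c10::BackendComponent b) {
  return b == c10::BackendComponent::InvalidBit
      ? 0
      : 1ULL << (static_cast<uint8_t>(b) - 1);
}

static_assert(backend_bit(c10::BackendComponent::CPUBit) == (1ULL << 0),
              "CPU occupies the lowest backend bit");
static_assert(backend_bit(c10::BackendComponent::CUDABit) == (1ULL << 1),
              "CUDA occupies the next bit");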
// Semantically, a dispatch key identifies a possible "level" in our
// dispatch, for which a handler may be registered. Each handler corresponds
// to a type of functionality.
// dispatch, for which a handler may be registered. Traditional
// backends like CPU and CUDA get dispatch keys; however, so do
// "wrapping" layers like Variable (for autograd handling).
//
// In implementation terms, the dispatch key identifies a specific "bit" in a
// DispatchKeySet. Higher bit indexes get handled by dispatching first (because
// we "count leading zeros" when we extract the highest priority dispatch
// key.)
//
// Note [DispatchKey Classification]
// This enum actually contains several types of keys, which are explained
// in more detail further down:
// (1) non-customizable backends (e.g. FPGA)
// (2) non-customizable functionalities (e.g. Functionalize)
// (3) functionalities that are customizable per backend (e.g. Dense, Sparse,
//     AutogradFunctionality)
// (4) per-backend instances of customizable functionalities (e.g. CPU,
//     SparseCPU, AutogradCPU)
// (5) alias keys (e.g. CompositeImplicitAutograd)
//
// Of the categories above, it's important to note:
// (a) which keys are assigned individual bits in a DispatchKeySet
// (b) which keys are assigned individual slots in the runtime operator table
// ("Runtime keys")
//
// (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet.
// (1), (2) and (4) all get their own dedicated slots in the runtime operator
// table.
// See Note [DispatchKeySet Internal Representation] for more details.
//
// NOTE: Keep the list in sync with `DispatchKey` in tools/codegen/model.py
enum class DispatchKey : uint16_t {
enum class DispatchKey : uint8_t {
// ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
// This is not a "real" functionality, but it exists to give us a "nullopt"
// This is not a "real" tensor id, but it exists to give us a "nullopt"
// element we can return for cases when a DispatchKeySet contains no elements.
// You can think a more semantically accurate definition of DispatchKey is:
//
@ -116,31 +38,24 @@ enum class DispatchKey : uint16_t {
// this will get eliminated, but for now it's convenient)
CatchAll = Undefined,
// ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ //
// Every value in the enum (up to EndOfFunctionalityKeys)
// corresponds to an individual "functionality" that can be dispatched to.
// This is represented in the DispatchKeySet by assigning each of these enum
// values
// to each of the remaining (64 - len(BackendComponent)) bits.
// ~~~~~~~~~~~~~~~~~~~~~~~~~~ BACKENDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
// A "backend" is colloquially used to refer to handlers for dispatch
// which actually implement the numerics of an operation in question.
//
// Most of these functionalities have a single handler assigned to them,
// making them "runtime keys".
// That map to a single slot in the runtime operator table.
//
// A few functionalities are allowed to be customizable per backend.
// See [Note: Per-Backend Functionality Dispatch Keys] for details.
// See [Note: Per-Backend Functionality Dispatch Keys]
Dense,
// Below are non-extensible backends.
// These are backends that currently don't have their own overrides for
// Autograd/Sparse/Quantized kernels,
// and we therefore don't waste space in the runtime operator table allocating
// space for them.
// If any of these backends ever need to customize, e.g., Autograd, then we'll
// need to add a DispatchKey::*Bit for them.
// Due to the nature of the enum, these backends are specified in
// an ordered way, but for most backends this order is not semantically
// meaningful (e.g., it's valid to reorder these backends without changing
// semantics). The only situation when backend ordering is meaningful
// is when the backend participates in multiple dispatch with another
// backend; e.g., CPU and SparseCPU (sparse must have
// higher priority).
// Here are backends which you think of as traditionally specifying
// how to implement operations on some device.
CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp
CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp
HIP, // NB: I think this is not actually used, due to Note [Masquerading as
// CUDA]
FPGA, // Xilinx support lives out of tree at
// https://gitlab.com/pytorch-complex/vitis_kernels
@ -152,8 +67,14 @@ enum class DispatchKey : uint16_t {
// - aten/src/ATen/test/extension_backend_test.cpp
ORT,
XLA, // lives out of tree at https://github.com/pytorch/xla
MLC, // lives out of tree at https://github.com/pytorch/MLCompute
Vulkan,
Metal,
XPU, // For out of tree Intel's heterogeneous computing plug-in
HPU, // For out of tree & closed source integration of HPU / Habana
VE, // For out of tree & closed source integration of SX-Aurora / NEC
Lazy, // For lazy tensor backends
// A meta tensor is a tensor without any data associated with it. (They
// have also colloquially been referred to as tensors on the "null" device).
@ -162,8 +83,11 @@ enum class DispatchKey : uint16_t {
// tensor with the output shape and dtype, but wouldn't actually add anything.
Meta,
// See [Note: Per-Backend Functionality Dispatch Keys]
Quantized,
// Here are backends which specify more specialized operators
// based on the dtype of the tensor.
QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp
QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp
QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in
// This backend is to support custom RNGs; it lets you go
// to a different kernel if you pass in a generator that is not a
@ -182,29 +106,31 @@ enum class DispatchKey : uint16_t {
// the corresponding dense tensors, and must be handled before them.
MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp
// NB: not to be confused with MKLDNN, which is Caffe2 only
// See [Note: Per-Backend Functionality Dispatch Keys]
Sparse,
SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp
SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp
SparseHIP, // TODO: I think this is not actually used, due to Note
// [Masquerading as CUDA]
SparseXPU, // For out of tree Intel's heterogeneous computing plug-in
SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC
SparseCsrCPU,
SparseCsrCUDA,
// Note [Non-Customizable Backend Keys]
// Every key above here is considered a "non-customizable backend".
// These are backends that will work correctly with autograd, but
// currently don't require separate implementations
// for autograd, sparse, or quantized kernels.
// Any new backends that don't need to be customized should go above here.
// If an existing backend needs to e.g. override autograd, then we can
// consider promoting it into the "BackendComponent" enum
//
// For all intents and purposes from the perspective of DispatchKeySet,
// "non-customizable backend" keys are treated the same way
// as other functionality keys
EndOfNonCustomizableBackends = SparseCsrCUDA,
NestedTensor, // lives out of tree at https://github.com/pytorch/nestedtensor
// Here are reserved backends for user-defined backends, see Note [Private use
// DispatchKey]
// To see some example about how to use this, check out ORT
PrivateUse1,
PrivateUse2,
PrivateUse3,
// Define an alias key to represent end of backend dispatch keys.
// If you add new backend keys after PrivateUse3, please also update it here.
// (But you shouldn't: private use keys should have higher precedence than
// all built-in keys)
EndOfBackendKeys = PrivateUse3,
// In some situations, it is not immediately obvious what the correct
// backend for a function is, because the function in question doesn't
// have any "tensor" arguments. In this case, a BackendSelect function
@ -307,18 +233,20 @@ enum class DispatchKey : uint16_t {
// AutogradOther key. We can add specific autograd key for those backends
// upon request.
AutogradOther,
// See [Note: Per-Backend Functionality Dispatch Keys]
AutogradFunctionality,
// NestedTensor is an example of something that isn't a "real backend"
// (because it mostly consists of redispatching kernels)
// but it would like to override autograd functionality in C++.
// We can handle cases like this by adding an extra functionality key
// exclusively for handling autograd for NestedTensor.
// lives out of tree at
AutogradCPU,
AutogradCUDA,
AutogradXLA,
AutogradLazy,
AutogradXPU,
AutogradMLC,
AutogradHPU,
AutogradNestedTensor, // lives out of tree at
// https://github.com/pytorch/nestedtensor
AutogradNestedTensor,
// Here are some reserved pre-autograd keys for user-defined backends, see
// Note [Private use DispatchKey]
AutogradPrivateUse1,
AutogradPrivateUse2,
AutogradPrivateUse3,
Tracer,
@ -354,6 +282,11 @@ enum class DispatchKey : uint16_t {
Functionalize,
FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype]
// Used by Python key logic to know the set of tls on entry to the dispatcher
// This kernel assumes it is at the very top of the dispatcher. If you add
// a key above, make sure to update the fallback implementation for this.
PythonTLSSnapshot,
// TESTING: This is intended to be a generic testing tensor type id.
// Don't use it for anything real; its only acceptable use is within a single
// process test. Use it by creating a TensorImpl with this DispatchKey, and
@ -371,100 +304,9 @@ enum class DispatchKey : uint16_t {
TESTING_ONLY_GenericMode,
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
EndOfFunctionalityKeys, // End of functionality keys.
// ~~~~~~~~~~~~~~ "Dense" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~~ //
// Here are backends which you think of as traditionally specifying
// how to implement operations on some device.
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
StartOfDenseBackends,
CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp
CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp
HIP, // NB: I think this is not actually used, due to Note [Masquerading as
// CUDA]
XLA, // lives out of tree at https://github.com/pytorch/xla
MLC, // lives out of tree at https://github.com/pytorch/MLCompute
XPU, // For out of tree Intel's heterogeneous computing plug-in
HPU, // For out of tree & closed source integration of HPU / Habana
VE, // For out of tree & closed source integration of SX-Aurora / NEC
Lazy, // For lazy tensor backends
// Here are reserved backends for user-defined backends, see Note [Private use
// DispatchKey]
// To see some example about how to use this, check out ORT
PrivateUse1,
PrivateUse2,
PrivateUse3,
EndOfDenseBackends = PrivateUse3,
// ~~~~~~~~~~~~~~ "Quantized" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~ //
// keys starting with an _ are not currently used,
// but are needed to ensure that every backend is indexed correctly.
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
StartOfQuantizedBackends,
QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp
QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp
_QuantizedHIP,
_QuantizedXLA,
_QuantizedMLC,
QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in
_QuantizedHPU,
_QuantizedVE,
_QuantizedLazy,
_QuantizedPrivateUse1,
_QuantizedPrivateUse2,
_QuantizedPrivateUse3,
EndOfQuantizedBackends = _QuantizedPrivateUse3,
// ~~~~~~~~~~~~~~ "Sparse" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~ //
// keys starting with an _ are not currently used,
// but are needed to ensure that every backend is indexed correctly.
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
StartOfSparseBackends,
SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp
SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp
SparseHIP, // TODO: I think this is not actually used, due to Note
// [Masquerading as CUDA]
_SparseXLA,
_SparseMLC,
SparseXPU, // For out of tree Intel's heterogeneous computing plug-in
_SparseHPU,
SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC
_SparseLazy,
_SparsePrivateUse1,
_SparsePrivateUse2,
_SparsePrivateUse3,
EndOfSparseBackends = _SparsePrivateUse3,
// ~~~~~~~~~~~~~~ "Autograd" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~ //
// keys starting with an _ are not currently used,
// but are needed to ensure that every backend is indexed correctly.
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
StartOfAutogradBackends,
AutogradCPU,
AutogradCUDA,
_AutogradHIP,
AutogradXLA,
AutogradMLC,
AutogradXPU,
AutogradHPU,
_AutogradVE,
AutogradLazy,
// Here are some reserved pre-autograd keys for user-defined backends, see
// Note [Private use DispatchKey]
AutogradPrivateUse1,
AutogradPrivateUse2,
AutogradPrivateUse3,
EndOfAutogradBackends = AutogradPrivateUse3,
// If we add a new per-backend functionality key that has higher priority
// than Autograd, then this key should be updated.
EndOfRuntimeBackendKeys = EndOfAutogradBackends,
NumDispatchKeys, // Sentinel, end of runtime keys.
// ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ //
// Note [Alias Dispatch Keys]
// Alias dispatch keys are synthetic dispatch keys which map to multiple
// runtime dispatch keys. Alias keys have precedence, but they are always
// lower precedence than runtime keys. You can register a kernel to an
@ -484,7 +326,6 @@ enum class DispatchKey : uint16_t {
// Define an alias key to represent end of alias dispatch keys.
// If you add new alias keys after Autograd, please also update it here.
StartOfAliasKeys = Autograd,
EndOfAliasKeys = CompositeExplicitAutograd, //
// ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
@ -524,83 +365,54 @@ enum class DispatchKey : uint16_t {
// built-in autograd formulas for operators are not appropriate.
static_assert(
(static_cast<uint8_t>(BackendComponent::EndOfBackendKeys) +
static_cast<uint8_t>(DispatchKey::EndOfFunctionalityKeys)) <= 64,
"The BackendComponent and DispatchKey enums (below EndOfFunctionalityKeys)"
" both map to backend and functionality bits"
" into a 64-bit bitmask; you must have less than 64 total entries between them");
// Check if a DispatchKey is an alias mapping to other runtime keys.
constexpr bool isAliasDispatchKey(DispatchKey k) {
return k >= DispatchKey::StartOfAliasKeys && k <= DispatchKey::EndOfAliasKeys;
}
// [Note: Per-Backend Functionality Dispatch Keys]
// Check if a DispatchKey is a per-backend functionality key
// Any functionalities that can be customized per-backend should be added here.
// These keys correspond to functionalities that can be customized individually
// per backend. While they only take up one bit in the `DispatchKeySet` bitset,
// they map to (# backends) slots in the operator table.
// Each of these keys also has a separate set of "runtime keys" in the dispatch
// key enum, per backend, which *do* map to the individual operator table slots.
// For example, the "Sparse" key maps to an individual bit in the
// DispatchKeySet, while `SparseCPU`, `SparseCUDA`, etc all map to individual
// slots in the runtime operator table.
constexpr bool isPerBackendFunctionalityKey(DispatchKey k) {
if (k == DispatchKey::Dense || k == DispatchKey::Quantized ||
k == DispatchKey::Sparse || k == DispatchKey::AutogradFunctionality) {
return true;
} else {
return false;
}
}
// Note that this includes Undefined in the total count.
// BUT EndOfFunctionalityKeys is its own (placeholder) key.
// e.g. Undefined=0, Dense=1, Sparse=2, EndOfFunctionalityKeys=3.
// In the above example, there are 3 total functionality keys.
constexpr uint8_t num_functionality_keys =
static_cast<uint8_t>(DispatchKey::EndOfFunctionalityKeys);
// Note [No More Than 16 Backends]
// Search for this note to find places in the code where the "no more than 16
// backends" invariant is baked in.
static_assert(
static_cast<uint8_t>(BackendComponent::EndOfBackendKeys) <= 16,
"BackendComponent currently only supports <= 16 backends. If we really need to extend this, \
there are a few places where this invariant is baked in");
constexpr uint8_t numPerBackendFunctionalityKeys() {
uint8_t count = 0;
for (uint8_t k = 0; k <= num_functionality_keys; ++k) {
if (isPerBackendFunctionalityKey(static_cast<DispatchKey>(k)))
++count;
}
return count;
}
static_cast<uint8_t>(DispatchKey::NumDispatchKeys) <= 64,
"DispatchKey is used as index into 64-bit bitmask; you must have less than 64 entries");
#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS)
// See [Note: Trimmed Mobile Dispatch Keys]
constexpr uint8_t num_backends = 1; // Only CPU
constexpr uint16_t num_runtime_entries = 8;
/**
* The method below maps the dispatch key in the enum DispatchKey to an
* integer index in the dispatchTable_ array in OperatorEntry. The array
* is trimmed for mobile to reduce peak memory usage since it's
* unnecessary to reserve additional space for dispatch keys that will
* never be used on mobile.
*/
C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) {
switch (dk) {
case DispatchKey::Undefined:
return 0;
case DispatchKey::CPU:
return 1;
case DispatchKey::QuantizedCPU:
return 2;
case DispatchKey::SparseCPU:
return 3;
case DispatchKey::BackendSelect:
return 4;
case DispatchKey::ADInplaceOrView:
return 5;
case DispatchKey::AutogradOther:
return 6;
case DispatchKey::AutogradCPU:
return 7;
case DispatchKey::NumDispatchKeys: // Sentinel, end of runtime keys.
return 8;
default:
return -1;
}
}
#else
constexpr uint8_t num_backends =
static_cast<uint8_t>(BackendComponent::EndOfBackendKeys);
constexpr uint16_t num_runtime_entries = num_functionality_keys +
(numPerBackendFunctionalityKeys() * (num_backends - 1));
/**
* For the server use-case, make this a simple pass-through.
*/
C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) {
return static_cast<int>(dk);
}
#endif
// See Note [No More Than 16 Backends]
constexpr uint16_t full_backend_mask =
(static_cast<uint16_t>(1) << num_backends) - 1;
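Worked out for a non-mobile build (illustrative only, not part of the diff): with the 12 backend bits above and the 4 per-backend functionality keys, the sizing constants relate as follows.

#include <c10/core/DispatchKey.h>

static_assert(c10::num_backends == 12,
              "CPUBit .. PrivateUse3Bit in the BackendComponent enum");
static_assert(c10::numPerBackendFunctionalityKeys() == 4,
              "Dense, Quantized, Sparse, AutogradFunctionality");
static_assert(c10::num_runtime_entries ==
                  c10::num_functionality_keys + 4 * (c10::num_backends - 1),
              "one slot per functionality, plus per-backend copies of the 4 keys above");
static_assert(c10::full_backend_mask == (1u << c10::num_backends) - 1,
              "the low 12 bits of the keyset are backend bits");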
C10_API const char* toString(DispatchKey);
C10_API const char* toString(BackendComponent);
C10_API std::ostream& operator<<(std::ostream&, DispatchKey);
C10_API std::ostream& operator<<(std::ostream&, BackendComponent);
C10_API DispatchKey getAutogradKeyFromBackend(BackendComponent k);
C10_API DispatchKey getAutogradKeyFromBackend(DispatchKey t);
// Parses a string into a dispatch key.
// If the string cannot be correctly parsed, throws an exception.
@ -613,86 +425,10 @@ C10_API c10::DispatchKey parseDispatchKey(const std::string& k);
// torch::dispatch(torch::kCPU, ...) is also valid.
constexpr DispatchKey kAutograd = DispatchKey::Autograd;
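A small usage sketch (illustrative only) for parseDispatchKey and the kAutograd alias declared above; the names round-trip through the string table in DispatchKey.cpp, and unknown names throw.

#include <c10/core/DispatchKey.h>
#include <cassert>

void parse_example() {
  assert(c10::parseDispatchKey("AutogradCPU") == c10::DispatchKey::AutogradCPU);
  assert(c10::parseDispatchKey(c10::toString(c10::DispatchKey::CUDA)) ==
         c10::DispatchKey::CUDA);
  static_assert(c10::kAutograd == c10::DispatchKey::Autograd,
                "kAutograd is just an alias for the Autograd alias key");
}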
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
// This function relies on the invariant that the dispatch keys between
// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend
// in the same order as `BackendComponent`.
constexpr BackendComponent toBackendComponent(DispatchKey k) {
if (k >= DispatchKey::StartOfDenseBackends &&
k <= DispatchKey::EndOfDenseBackends) {
return static_cast<BackendComponent>(
static_cast<uint8_t>(k) -
static_cast<uint8_t>(DispatchKey::StartOfDenseBackends));
} else if (
k >= DispatchKey::StartOfQuantizedBackends &&
k <= DispatchKey::EndOfQuantizedBackends) {
return static_cast<BackendComponent>(
static_cast<uint8_t>(k) -
static_cast<uint8_t>(DispatchKey::StartOfQuantizedBackends));
} else if (
k >= DispatchKey::StartOfSparseBackends &&
k <= DispatchKey::EndOfSparseBackends) {
return static_cast<BackendComponent>(
static_cast<uint8_t>(k) -
static_cast<uint8_t>(DispatchKey::StartOfSparseBackends));
} else if (
k >= DispatchKey::StartOfAutogradBackends &&
k <= DispatchKey::EndOfAutogradBackends) {
return static_cast<BackendComponent>(
static_cast<uint8_t>(k) -
static_cast<uint8_t>(DispatchKey::StartOfAutogradBackends));
} else {
return BackendComponent::InvalidBit;
}
// Check if a DispatchKey is an alias mapping to other runtime keys.
inline bool isAliasDispatchKey(DispatchKey k) {
return k > DispatchKey::NumDispatchKeys && k <= DispatchKey::EndOfAliasKeys;
}
constexpr DispatchKey toFunctionalityKey(DispatchKey k) {
if (k <= DispatchKey::EndOfFunctionalityKeys) {
return k;
} else if (k <= DispatchKey::EndOfDenseBackends) {
return DispatchKey::Dense;
} else if (k <= DispatchKey::EndOfQuantizedBackends) {
return DispatchKey::Quantized;
} else if (k <= DispatchKey::EndOfSparseBackends) {
return DispatchKey::Sparse;
} else if (k <= DispatchKey::EndOfAutogradBackends) {
return DispatchKey::AutogradFunctionality;
} else {
return DispatchKey::Undefined;
}
}
// Given (DispatchKey::Dense, DispatchKey::CUDABit), returns DispatchKey::CUDA
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
// This function relies on the invariant that the dispatch keys between
// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend
// in the same order as `BackendComponent`.
constexpr DispatchKey toRuntimePerBackendFunctionalityKey(
DispatchKey functionality_k,
BackendComponent backend_k) {
if (functionality_k == DispatchKey::Dense) {
return static_cast<DispatchKey>(
static_cast<uint8_t>(DispatchKey::StartOfDenseBackends) +
static_cast<uint8_t>(backend_k));
}
if (functionality_k == DispatchKey::Sparse) {
return static_cast<DispatchKey>(
static_cast<uint8_t>(DispatchKey::StartOfSparseBackends) +
static_cast<uint8_t>(backend_k));
}
if (functionality_k == DispatchKey::Quantized) {
return static_cast<DispatchKey>(
static_cast<uint8_t>(DispatchKey::StartOfQuantizedBackends) +
static_cast<uint8_t>(backend_k));
}
if (functionality_k == DispatchKey::AutogradFunctionality) {
return static_cast<DispatchKey>(
static_cast<uint8_t>(DispatchKey::StartOfAutogradBackends) +
static_cast<uint8_t>(backend_k));
}
return DispatchKey::Undefined;
}
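An illustrative round trip (not part of the diff) through the three mappings above, using the SparseCUDA runtime key.

#include <c10/core/DispatchKey.h>

static_assert(c10::toFunctionalityKey(c10::DispatchKey::SparseCUDA) ==
                  c10::DispatchKey::Sparse,
              "SparseCUDA is a runtime instance of the Sparse functionality");
static_assert(c10::toBackendComponent(c10::DispatchKey::SparseCUDA) ==
                  c10::BackendComponent::CUDABit,
              "for the CUDA backend");
static_assert(c10::toRuntimePerBackendFunctionalityKey(
                  c10::DispatchKey::Sparse, c10::BackendComponent::CUDABit) ==
                  c10::DispatchKey::SparseCUDA,
              "and the (functionality, backend) pair maps back to SparseCUDA");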
} // namespace c10
namespace torch {

View File

@ -1,29 +1,37 @@
#include <c10/core/DispatchKeySet.h>
#include <c10/util/irange.h>
namespace c10 {
// backend_dispatch_keyset includes all dispatch keys that map to backends.
// backend_dispatch_keyset should include all runtime backend keys.
// Alias key DispatchKey::CompositeExplicitAutograd maps to
// backend_dispatch_keyset
constexpr DispatchKeySet backend_dispatch_keyset =
autogradother_backends | DispatchKeySet(DispatchKey::Dense);
// backend_dispatch_keyset NestedTensor has been explicitly removed due to
// incompatibility with some kernels, such as structured kernels, that use the
// DefaultBackend key.
constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends |
DispatchKeySet({
DispatchKey::CPU,
DispatchKey::CUDA,
DispatchKey::XLA,
DispatchKey::Lazy,
DispatchKey::XPU,
DispatchKey::PrivateUse1,
DispatchKey::PrivateUse2,
DispatchKey::PrivateUse3,
DispatchKey::MLC,
DispatchKey::HPU,
DispatchKey::ORT,
DispatchKey::Meta,
});
bool isBackendDispatchKey(DispatchKey t) {
return t != DispatchKey::Undefined
// See Note [No Alias Keys in DispatchKeySet]
&& !isAliasDispatchKey(t)
// Note [NestedTensor Not Included in Backend Keys]
// NestedTensor has been explicitly removed from the "backend keyset" due
// to incompatibility with some kernels, so we don't want it to be
// included in CompositeImplicitAutograd or CompositeExplicitAutograd
// kernels.
&& t != DispatchKey::NestedTensor && backend_dispatch_keyset.has(t);
&& !isAliasDispatchKey(t) && backend_dispatch_keyset.has(t);
}
// math_dispatch_keyset contains all keys in backend_dispatch_keyset and
// autograd_dispatch_keyset Alias key DispatchKey::CompositeImplicitAutograd
// maps to [math_dispatch_keyset x full_backend_mask]
// maps to math_dispatch_keyset.
constexpr DispatchKeySet math_dispatch_keyset =
backend_dispatch_keyset | autograd_dispatch_keyset;
@ -31,12 +39,7 @@ DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) {
TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined);
switch (t) {
case DispatchKey::Autograd:
// See Note [autograd_dispatch_keyset Does Not Include Backend Bits]
// That's why we OR it with a mask of the backend bits here.
// getRuntimeDispatchKeySet() expects to return a keyset of runtime
// dispatch keys, like AutogradCPU, but that requires having backend bits.
return autograd_dispatch_keyset |
DispatchKeySet(DispatchKeySet::RAW, full_backend_mask);
return autograd_dispatch_keyset;
case DispatchKey::CompositeImplicitAutograd:
return math_dispatch_keyset;
case DispatchKey::CompositeExplicitAutograd:
@ -50,13 +53,11 @@ bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k) {
TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined);
switch (t) {
case DispatchKey::Autograd:
return autograd_dispatch_keyset.has(toFunctionalityKey(k));
return autograd_dispatch_keyset.has(k);
case DispatchKey::CompositeImplicitAutograd:
// See Note [NestedTensor Not Included in Backend Keys]
return k != DispatchKey::NestedTensor && math_dispatch_keyset.has(k);
return math_dispatch_keyset.has(k);
case DispatchKey::CompositeExplicitAutograd:
// See Note [NestedTensor Not Included in Backend Keys]
return k != DispatchKey::NestedTensor && backend_dispatch_keyset.has(k);
return backend_dispatch_keyset.has(k);
default:
return t == k;
}
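An illustrative sketch (not part of the diff) of how the alias-key helpers above behave after this change; it assumes autograd_dispatch_keyset carries the AutogradFunctionality bit, as the comments imply.

#include <c10/core/DispatchKeySet.h>
#include <cassert>

void alias_key_example() {
  using namespace c10;
  // Expanding the Autograd alias now ORs in the backend bits, so the result
  // contains concrete runtime keys such as AutogradCPU.
  auto autograd_ks = getRuntimeDispatchKeySet(DispatchKey::Autograd);
  assert(autograd_ks.has(DispatchKey::AutogradCPU));
  // Membership can also be checked without materializing the whole set.
  assert(runtimeDispatchKeySetHas(DispatchKey::Autograd, DispatchKey::AutogradCUDA));
  // Only real runtime backend keys count as backend dispatch keys; aliases don't.
  assert(isBackendDispatchKey(DispatchKey::CPU));
  assert(!isBackendDispatchKey(DispatchKey::CompositeImplicitAutograd));
}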
@ -78,6 +79,8 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
return DispatchKeySet(DispatchKey::MLC);
case DispatchKey::AutogradHPU:
return DispatchKeySet(DispatchKey::HPU);
case DispatchKey::AutogradNestedTensor:
return DispatchKeySet(DispatchKey::NestedTensor);
case DispatchKey::AutogradXPU:
return DispatchKeySet(DispatchKey::XPU);
case DispatchKey::AutogradPrivateUse1:
@ -93,6 +96,23 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
}
}
DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t) {
switch (t) {
case DispatchKey::CPU:
return DispatchKeySet(DispatchKey::AutocastCPU);
case DispatchKey::CUDA:
case DispatchKey::XLA:
return DispatchKeySet(DispatchKey::AutocastCUDA);
default:
return DispatchKeySet();
}
}
DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t) {
return DispatchKeySet(
{DispatchKey::ADInplaceOrView, getAutogradKeyFromBackend(t)});
}
bool isIncludedInAlias(DispatchKey k, DispatchKey alias) {
return k != DispatchKey::Undefined && runtimeDispatchKeySetHas(alias, k);
}
@ -109,167 +129,18 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) {
return os;
}
os << "DispatchKeySet(";
DispatchKey tid;
bool first = true;
for (auto k : ts) {
while ((tid = ts.highestPriorityTypeId()) != DispatchKey::Undefined) {
if (!first) {
os << ", ";
}
os << k;
os << tid;
ts = ts.remove(tid);
first = false;
}
os << ")";
return os;
}
DispatchKeySet::iterator& DispatchKeySet::iterator::operator++() {
TORCH_INTERNAL_ASSERT(next_functionality_ >= num_backends);
TORCH_INTERNAL_ASSERT(next_functionality_ <= iterator::end_iter_mask_val);
TORCH_INTERNAL_ASSERT(next_backend_ <= num_backends);
// Create a masked version of the set representation to ignore previous
// keys that we've iterated through.
uint64_t masked_functionality_bits =
llvm::maskTrailingZeros<uint64_t>(next_functionality_) & *data_ptr_;
uint64_t masked_backend_bits =
llvm::maskTrailingZeros<uint64_t>(next_backend_) & full_backend_mask &
*data_ptr_;
uint64_t first_functionality_idx =
llvm::findFirstSet(masked_functionality_bits);
uint64_t first_backendcomponent_idx = llvm::findFirstSet(masked_backend_bits);
// If there are no keys, set to end iterator value
if (first_functionality_idx == std::numeric_limits<uint64_t>::max() ||
next_functionality_ == iterator::end_iter_mask_val) {
// Set up state to be the same as end()
next_functionality_ = iterator::end_iter_mask_val;
current_dispatchkey_idx_ = iterator::end_iter_key_val;
next_backend_ = 0;
current_backendcomponent_idx_ = iterator::end_iter_key_val;
return *this;
}
// The +1 is because of DispatchKey::Undefined and
// BackendComponent::InvalidBit
auto new_next_functionality = first_functionality_idx + 1;
auto new_backendcomponent_idx = first_backendcomponent_idx + 1;
// and the -num_backends is because the first <num_backends> bits in the
// keyset are not Dispatch Keys.
auto next_dispatchkey_idx = new_next_functionality - num_backends;
// If the current functionality bit is a per-backend bit, we need special
// handling
if (isPerBackendFunctionalityKey(
static_cast<DispatchKey>(next_dispatchkey_idx))) {
// case 1: if the current backend is undefined, then there is no valid
// backend instance of this functionality key so we can skip it.
if (first_backendcomponent_idx == std::numeric_limits<uint64_t>::max()) {
// increment the functionality mask so we skip the current functionality
// bit on the next increment.
next_functionality_ = new_next_functionality;
++(*this);
return *this;
}
// Otherwise, at this point we know what the current backend and
// functionality bits are.
current_dispatchkey_idx_ = next_dispatchkey_idx;
current_backendcomponent_idx_ = new_backendcomponent_idx;
// Next, we need to set up the masks for the next increment.
uint64_t next_backendcomponent_bits =
llvm::maskTrailingZeros<uint64_t>(first_backendcomponent_idx + 1) &
full_backend_mask & *data_ptr_;
uint64_t next_backendcomponent_idx =
llvm::findFirstSet(next_backendcomponent_bits);
if (next_backendcomponent_idx == std::numeric_limits<uint64_t>::max()) {
// case 2: the current backend is valid, but there is not another backend
// in the keyset. In this case, we need to bump the functionality mask and
// reset the backend mask for the next increment
next_functionality_ = new_next_functionality;
next_backend_ = 0;
} else {
// case 3: we have another backend to iterate over. We want to iterate
// over the same functionality bit next time, but a different backend bit.
next_backend_ = first_backendcomponent_idx + 1;
}
} else {
// Functionality bits that aren't per backend are simpler to handle. We can
// ignore the backend bits.
TORCH_INTERNAL_ASSERT(next_backend_ == 0);
current_dispatchkey_idx_ = next_dispatchkey_idx;
next_functionality_ = new_next_functionality;
}
return *this;
}
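A sketch (not part of the diff) of the observable behavior of the iterator above: building-block bits are expanded into concrete runtime keys, lowest bits first, per the trailing-zero masking in operator++.

#include <c10/core/DispatchKeySet.h>
#include <iostream>

void iterate_example() {
  using namespace c10;
  // One per-backend functionality bit plus two backend bits.
  DispatchKeySet ks = DispatchKeySet(DispatchKey::Dense) |
      DispatchKeySet(BackendComponent::CPUBit) |
      DispatchKeySet(BackendComponent::CUDABit);
  for (DispatchKey k : ks) {
    std::cout << k << " "; // expected to print something like "CPU CUDA "
  }
  std::cout << "\n";
}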
std::array<FunctionalityOffsetAndMask, num_functionality_keys>
initializeFunctionalityOffsetsAndMasks() {
std::array<FunctionalityOffsetAndMask, num_functionality_keys>
offsets_and_masks;
// manually set the first entry, which corresponds to Undefined.
offsets_and_masks[0] = FunctionalityOffsetAndMask(0, 0);
// loop through every functionality key (aside from Undefined).
for (const auto functionality_idx : c10::irange(1, num_functionality_keys)) {
// functionality_idx should be Dense -> 1, ...
auto prev_offset_and_mask = offsets_and_masks[functionality_idx - 1];
auto k = static_cast<DispatchKey>(functionality_idx);
#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS)
// [Note: Trimmed Mobile Dispatch Keys]
uint16_t mask = 0;
uint16_t offset = 0;
switch (k) {
case DispatchKey::Undefined:
offset = 0;
case DispatchKey::CPU:
offset = 1;
case DispatchKey::QuantizedCPU:
offset = 2;
case DispatchKey::SparseCPU:
offset = 3;
case DispatchKey::BackendSelect:
offset = 4;
case DispatchKey::ADInplaceOrView:
offset = 5;
case DispatchKey::AutogradOther:
offset = 6;
case DispatchKey::AutogradCPU:
offset = 7;
default:
// All other keys which are unsupported on mobile will get sent
// to the undefined kernel, causing them to error.
offset = 0;
}
offsets_and_masks[functionality_idx] =
FunctionalityOffsetAndMask(offset, 0);
}
#else
// If the previous functionality was not per-backend, then we can just
// increment the previous offset. Otherwise, the next offset =
// previous_offset + num_backends.
auto next_offset = prev_offset_and_mask.offset +
(prev_offset_and_mask.mask == 0 ? 1 : num_backends);
// the mask is used in the runtime index calculation to find the offset of
// the backend. For non-per-backend functionalities, this offset should
// always be 0. Otherwise, we need to get the index of the backend (which we
// can do using a backend mask).
auto next_mask = isPerBackendFunctionalityKey(k) ? full_backend_mask : 0;
offsets_and_masks[functionality_idx] =
FunctionalityOffsetAndMask(next_offset, next_mask);
}
// Sanity check that the computed offset index of the last functionality key
// is correct. This assumes that the highest priority functionality key is not
// per backend.
TORCH_INTERNAL_ASSERT(
offsets_and_masks[num_functionality_keys - 1].offset ==
(num_runtime_entries - 1),
"num_runtime_entries: ",
num_runtime_entries,
"last_offset: ",
offsets_and_masks[num_functionality_keys - 1].offset);
#endif
return offsets_and_masks;
}
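A worked example (illustrative only, non-mobile build) of the first two table entries, which follow directly from the loop above: Undefined is not per-backend and occupies slot 0, and Dense, the first real functionality, starts right after it with the backend mask set.

#include <c10/core/DispatchKeySet.h>
#include <cassert>
#include <cstdint>

void offsets_example() {
  using namespace c10;
  const auto& table = offsetsAndMasks();
  // Undefined: slot 0, no backend mask.
  assert(table[0].offset == 0 && table[0].mask == 0);
  // Dense (functionality index 1) is per-backend, so it masks the backend
  // bits to pick one of num_backends consecutive slots starting at offset 1.
  assert(table[static_cast<uint8_t>(DispatchKey::Dense)].offset == 1);
  assert(table[static_cast<uint8_t>(DispatchKey::Dense)].mask == full_backend_mask);
}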
} // namespace c10

View File

@ -1,4 +1,5 @@
#pragma once
#include <c10/core/DispatchKey.h>
#include <c10/util/Exception.h>
#include <c10/util/Metaprogramming.h>
@ -7,147 +8,29 @@
namespace c10 {
struct FunctionalityOffsetAndMask {
// empty constructor shouldn't be used; only needed to initialize
// the array before populating it.
FunctionalityOffsetAndMask() {}
FunctionalityOffsetAndMask(uint16_t offset, uint16_t mask)
: offset(offset), mask(mask) {}
// This needs to be big enough to cover the size of the operator table.
uint16_t offset;
// See Note [No More Than 16 Backends]
// This mask needs to be big enough to mask all of the backend bits.
// We probably don't ever want to have more than 16 backend bits, so uint16_t
// should be enough.
uint16_t mask;
};
static_assert(
c10::num_runtime_entries < 65536,
"The dispatcher currently only supports up to 2^16 runtime entries");
C10_API std::array<FunctionalityOffsetAndMask, num_functionality_keys>
initializeFunctionalityOffsetsAndMasks();
C10_ALWAYS_INLINE static const std::
array<FunctionalityOffsetAndMask, num_functionality_keys>&
offsetsAndMasks() {
static auto offsets_and_masks_ = initializeFunctionalityOffsetsAndMasks();
return offsets_and_masks_;
}
// A representation of a set of DispatchKeys. A DispatchKeySet contains both
// "functionality" bits and "backend bits", and every tensor holds its own
// DispatchKeySet. The Dispatcher implements multiple dispatch by grabbing the
// keyset on every input tensor, oring them together, and dispatching to a
// specific piece of functionality. The functionality bits are *ordered*. When
// multiple functionality bits are set, we use the highest priority
// functionality. Similarly, multiple backend bits can theoretically be set if
// you call an operator with multiple tensors from different devices (e.g. CPU
// and CUDA), although support for mixed device dispatch is limited (the only
// kernels that gracefully handle mixed device inputs for now are cuda kernels
// that take in a scalar cpu tensor).
// A representation of a set of DispatchKeys. A tensor may have multiple
// tensor type ids, e.g., a Variable tensor can also be a CPU tensor; the
// DispatchKeySet specifies what type ids apply. The internal representation is
// as a 64-bit bit set (this means only 64 tensor type ids are supported).
//
// As mentioned above, DispatchKeys are ordered; thus, we can ask questions like
// "what is the highest priority DispatchKey in the set"? (The set itself is
// not ordered; two sets with the same ids will always have the ids ordered in
// the same way.)
// Note that DispatchKeys are ordered; thus, we can ask questions like "what is
// the highest priority DispatchKey in the set"? (The set itself is not
// ordered; two sets with the same ids will always have the ids ordered in the
// same way.)
//
// Note [DispatchKeySet Internal Representation]
// Internally, dispatch keys are packed into 64-bit DispatchKeySet objects
// that get passed around at runtime.
// However, there isn't necessarily a 1-to-1 mapping between bits in the keyset
// and individual dispatch keys.
// At the moment, there are no nontrivial uses of this set; tensors are always
// singletons. In the near future, this set will represent variable? + tensor
// type id. In the far future, it will be requires grad? + profiling? +
// tracing? + lazy? + tensor type id.
//
// First: why do we have this distinction, and why not map every dispatch key
// directly to a bit? This is mostly because we have several types of
// functionalities that different backends would like to customize. For example,
// we have:
// - "Dense": CPU, CUDA, XLA, ... (~12 keys)
// - "Sparse": SparseCPU, SparseCUDA, ...
// - "Quantized": QuantizedCPU, QuantizedCUDA, QuantizedXLA, ...
// - "Autograd": AutogradCPU, AutogradCUDA, Autograd XLA, ...
// The problem is that the total number of keys grows quadratically with [#
// backends] x [# functionalities], making it very difficult to map each key
// directly to a bit in a bitset without dramatically increasing the size of the
// bitset over time.
// (The difference between variable and requires grad, is that
// there are currently three states a tensor can be:
// 1. Not a variable
// 2. Variable with requires_grad=False
// 3. Variable with requires_grad=True
// Eventually, we want to kill state (1), and only dispatch to autograd
// handling code if one of the inputs requires grad.)
//
// The two enums (BackendComponent and DispatchKey) can be divided roughly into
// 5 categories.
//
// (1) "Building block" keys
// (a) backends: jEverything in the BackendComponent enum (e.g. CPUBit,
// CUDABIt) (b) functionalities: (per-backend) functionality-bit DispatchKeys
// (e.g. AutogradFunctionality, Sparse, Dense)
// (2) "Runtime" keys
// (a) "non-customizable backends" (e.g. FPGA)
// (b) "non-customizable functionalities" (e.g. Functionalize)
// (c) "per-backend instances of customizable functionalities" (e.g. CPU,
// SparseCPU, AutogradCPU)
// (3) "Alias" DispatchKeys (see Note [Alias Dispatch Keys])
//
// (1) Building block keys always correspond to individual bits in a
// DispatchKeySet. They can also be combined in a DispatchKeySet to form actual
// runtime keys. e.g.
// auto dense_cpu_ks = DispatchKeySet({DispatchKey::CPUBit,
// DispatchKey::Dense});
// // The keyset has the runtime dense-cpu key.
// dense_cpu_ks.has(DispatchKey::CPU);
// // And it contains the building block keys too.
// dense_cpu_ks.has(DispatchKey::CPUBit);
// dense_cpu_ks.has(DispatchKey::Dense);
//
// Not every backend and not every functionality counts as a "building block
// key". This is mostly to give us more levers to pull in the design space.
// Backend keys and functionality keys that count as "building blocks" will
// contribute to a full cross product of functionality that can be overridden.
//
// For example, right now we have at least 12 "backend" building blocks (CPU,
// CUDA, XLA, ...) and at least 4 "functionality" building blocks (Dense,
// Sparse, Quantized, AutogradFunctionality, ...). These keys together allow
// every dispatcher operator to be customized in up to 12*4 different ways. Each
// of those requires a slot in the operator table of every dispatcher operator.
// Not every piece of functionality necessarily needs to be customizable
// per-backend, and not every backend necessarily needs to be able to customize
// every type of functionality.
//
//
// (2) Every runtime key corresponds directly to a slot in an operator's runtime
// dispatch table, and you can directly register kernels to a runtime dispatch
// key.
//
// For per-backend functionalities like "Dense" or "AutogradFunctionality",
// you can think of the corresponding runtime dispatch keys as "instances" of
// that functionality, per backend. E.g. "CPU", "CUDA", "XLA", etc. are all
// runtime instances of the "Dense" building block key.
// (2a) and (2b) are represented identically in the DispatchKeySet logic:
// - backend-agnostic functionalities (e.g. FuncTorchBatched) are NOT
// customizable per backend.
// In order to do so, we'd need to promote it to a per-backend functionality
// "building block" key.
// - non-customizable backends (e.g. FPGA) can NOT customize existing
// functionality like Sparse, Autograd, etc.
// In order to do so, we'd need to promote it to a backend "building block"
// key.
//
// In both cases, these keys directly correspond to runtime slots in the
// operator table.
//
//
// (3) "Alias" keys
// See Note [Alias Dispatch Keys]
//
// Final note: for anyone making future changes to the Dispatcher +
// DispatchKeySet internals, there's a closed PR with a basic
// python-implementation of the Dispatcher that might be useful in quickly
// testing out and validating changes. See it at
// https://github.com/pytorch/pytorch/pull/68743
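A compilable variant (illustrative only) of the inline example in the note above; it uses the two single-argument constructors rather than a mixed initializer list, since CPUBit lives in BackendComponent, not DispatchKey.

#include <c10/core/DispatchKeySet.h>
#include <cassert>

void building_block_example() {
  using namespace c10;
  // A runtime key such as CPU sets one functionality bit (Dense) and one
  // backend bit (CPUBit)...
  DispatchKeySet cpu_ks(DispatchKey::CPU);
  // ...so the same runtime key is visible in a set assembled from its
  // building blocks.
  DispatchKeySet assembled =
      DispatchKeySet(DispatchKey::Dense) | DispatchKeySet(BackendComponent::CPUBit);
  assert(cpu_ks.has(DispatchKey::CPU));
  assert(assembled.has(DispatchKey::CPU));
}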
// An undefined tensor is one with an empty tensor type set.
class DispatchKeySet final {
public:
@ -158,146 +41,29 @@ class DispatchKeySet final {
// NB: default constructor representation as zero is MANDATORY as
// use of DispatchKeySet in TLS requires this.
constexpr DispatchKeySet() : repr_(0) {}
constexpr DispatchKeySet(Full)
: repr_((1ULL << (num_backends + num_functionality_keys - 1)) - 1) {}
: repr_(std::numeric_limits<decltype(repr_)>::max()) {}
constexpr DispatchKeySet(FullAfter, DispatchKey t)
// LSB after t are OK, but not t itself.
// "functionalities" have a notion of ordering (e.g. Autograd > Sparse >
// Quantized > Dense). But backends don't really have an ordering.
// Therefore, we're enforcing that FullAfter can only be used on
// "functionality" keys.
: repr_(
(1ULL
<< (num_backends + static_cast<uint8_t>(toFunctionalityKey(t)) -
1)) -
1) {}
: repr_((1ULL << (static_cast<uint8_t>(t) - 1)) - 1) {}
// Public version of DispatchKeySet(uint64_t) API; external users
// must be explicit when they do this!
constexpr DispatchKeySet(Raw, uint64_t x) : repr_(x) {}
constexpr explicit DispatchKeySet(BackendComponent k) {
if (k == BackendComponent::InvalidBit) {
repr_ = 0;
} else {
repr_ = 1ULL << (static_cast<uint8_t>(k) - 1);
}
}
constexpr explicit DispatchKeySet(DispatchKey k) {
if (k == DispatchKey::Undefined) {
// Case 1: handle Undefined specifically
repr_ = 0;
} else if (k <= DispatchKey::EndOfFunctionalityKeys) {
// Case 2: handle "functionality-only" keys
// These keys have a functionality bit set, but no backend bits
// These can technically be either:
// - valid runtime keys (e.g. DispatchKey::AutogradOther,
// DispatchKey::FuncTorchBatched, etc)
// - "building block" keys that aren't actual runtime keys (e.g.
// DispatchKey::Dense or Sparse)
uint64_t functionality_val = 1ULL
<< (num_backends + static_cast<uint8_t>(k) - 1);
repr_ = functionality_val;
} else if (k <= DispatchKey::EndOfRuntimeBackendKeys) {
// Case 3: "runtime" keys that have a functionality bit AND a backend bit.
// First compute which bit to flip for the functionality.
auto functionality_k = toFunctionalityKey(k);
// The - 1 is because Undefined is technically a "functionality" that
// doesn't show up in the bitset. So e.g. Dense is technically the second
// functionality, but the lowest functionality bit.
uint64_t functionality_val = 1ULL
<< (num_backends + static_cast<uint8_t>(functionality_k) - 1);
// then compute which bit to flip for the backend
// Case 4a: handle the runtime instances of "per-backend functionality"
// keys. For example, given DispatchKey::CPU, we should set:
// - the Dense functionality bit
// - the CPUBit backend bit
// first compute which bit to flip for the backend
auto backend_k = toBackendComponent(k);
uint64_t backend_val = backend_k == BackendComponent::InvalidBit
explicit constexpr DispatchKeySet(DispatchKey t)
: repr_(
t == DispatchKey::Undefined
? 0
: 1ULL << (static_cast<uint8_t>(backend_k) - 1);
repr_ = functionality_val + backend_val;
} else {
// At this point, we should have covered every case except for alias keys.
// Technically it would be possible to add alias dispatch keys to a
// DispatchKeySet, but the semantics are a little confusing and this
// currently isn't needed anywhere.
repr_ = 0;
}
}
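// For example (illustrative): DispatchKeySet(DispatchKey::AutogradCUDA) sets
// two bits (the AutogradFunctionality functionality bit and the CUDABit
// backend bit), while DispatchKeySet(DispatchKey::FuncTorchBatched) sets only
// its own functionality bit, since that key is not per-backend.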
constexpr uint64_t keys_to_repr(std::initializer_list<DispatchKey> ks) {
uint64_t repr = 0;
for (auto k : ks) {
repr |= DispatchKeySet(k).repr_;
}
return repr;
}
constexpr uint64_t backend_bits_to_repr(
std::initializer_list<BackendComponent> ks) {
uint64_t repr = 0;
for (auto k : ks) {
repr |= DispatchKeySet(k).repr_;
}
return repr;
}
: 1ULL << (static_cast<uint8_t>(t) - 1)) {}
explicit constexpr DispatchKeySet(std::initializer_list<DispatchKey> ks)
: repr_(keys_to_repr(ks)) {}
explicit constexpr DispatchKeySet(std::initializer_list<BackendComponent> ks)
// Note: for some reason, putting this logic directly in the constructor
// appears to fail to compile on CUDA 10.1.
// See an example internal failure at
// https://www.internalfb.com/intern/skycastle/run/76561193669136035/artifact/actionlog.76561193742069401.stderr
: repr_(backend_bits_to_repr(ks)) {}
: repr_(0) {
for (auto k : ks) {
repr_ |= DispatchKeySet(k).repr_;
}
}
// Test if a DispatchKey is in the set
inline bool has(DispatchKey t) const {
bool inline has(DispatchKey t) const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t != DispatchKey::Undefined);
return has_all(DispatchKeySet(t));
}
constexpr bool has_backend(BackendComponent t) const {
return has_all(DispatchKeySet(t));
}
// Test if a DispatchKey is in the set
// Given a DispatchKeySet of functionality keys and (potentially) backend
// keys, tests if all of them are in the current set.
constexpr bool has_all(DispatchKeySet ks) const {
return static_cast<bool>((repr_ & ks.repr_) == ks.repr_);
}
// Given a DispatchKeySet of functionality keys and (potentially) backend
// keys, tests if any of them are in the current set. This could technically
// be pretty easily implemented using has(). It is strictly a perf
// optimization though. There are many places in the code base where we want
// to test for multiple functionality keys together. HOWEVER, runtime
// per-backend functionality keys aren't allowed to be used with this
// function, because you can end up with weird results. e.g.
// DispatchKeySet(DispatchKey::AutogradCPU).has_any(DispatchKeySet(DispatchKey::CPU))
// would return true.
inline bool has_any(DispatchKeySet ks) const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
// Either there are no backend bits in the input keyset
((ks.repr_ & full_backend_mask) == 0) ||
// or there are no per-backend-functionality bits
// See [Note: Per-Backend Functionality Dispatch Keys]
((ks &
DispatchKeySet({
DispatchKey::Dense,
DispatchKey::Quantized,
DispatchKey::Sparse,
DispatchKey::AutogradFunctionality,
})
.repr_) == 0));
return static_cast<bool>((repr_ & ks.repr_) != 0);
return static_cast<bool>(repr_ & DispatchKeySet(t).repr_);
}
// Test if DispatchKeySet is a superset of ks.
bool isSupersetOf(DispatchKeySet ks) const {
@ -308,64 +74,31 @@ class DispatchKeySet final {
return DispatchKeySet(repr_ | other.repr_);
}
// Perform set intersection
constexpr DispatchKeySet operator&(DispatchKeySet other) const {
DispatchKeySet operator&(DispatchKeySet other) const {
return DispatchKeySet(repr_ & other.repr_);
}
// Compute the set difference self - other,
// but ONLY for the functionality keys.
// Any backend bits set on self will remain unchanged.
// See Note [Removing keys from DispatchKeySet Only Affects Functionality
// Keys]
// Compute the set difference self - other
DispatchKeySet operator-(DispatchKeySet other) const {
return DispatchKeySet(repr_ & (full_backend_mask | ~other.repr_));
return DispatchKeySet(repr_ & ~other.repr_);
}
// Compute self ^ other
constexpr DispatchKeySet operator^(DispatchKeySet other) const {
return DispatchKeySet(repr_ ^ other.repr_);
}
// Perform set equality
bool operator==(DispatchKeySet other) const {
return repr_ == other.repr_;
}
bool operator!=(DispatchKeySet other) const {
return repr_ != other.repr_;
}
// Add a DispatchKey to the DispatchKey set. Does NOT mutate,
// returns the extended DispatchKeySet!
C10_NODISCARD DispatchKeySet add(DispatchKey t) const {
return *this | DispatchKeySet(t);
}
C10_NODISCARD DispatchKeySet add(DispatchKeySet ks) const {
return *this | ks;
}
// Remove a DispatchKey from the DispatchKey set.
// This is generally not an operation you should be doing
// (it's used to implement the printing overload, operator<<)
//
// Note [Removing keys from DispatchKeySet Only Affects Functionality Keys]
// For now, only "functionality bits" are allowed to be removed from the
// keyset; this is specifically needed by the fallthrough key calculation
// logic. Why is removing backend bits problematic? Consider this example:
//
// DispatchKeySet([DispatchKey.CPU, DispatchKey.AutogradCUDA,
// DispatchKey.CUDA]).remove(DispatchKey.AutogradCUDA)
// DispatchKeySet([DispatchKey.CPU,
// DispatchKey.AutogradCUDA]).remove(DispatchKey.AutogradCUDA)
//
// What do we want to happen?
// Technically, we'd like it to be true that after removal,
// the first keyset still has the CUDA dispatch key while the second doesn't.
// Unfortunately there's no way to represent that, because the two keysets are
// represented the same way internally:
//   functionality bits: Autograd, Dense
//   backend bits: CPU, CUDA
//
// Instead, remove(DispatchKey.AutogradCPU) will only remove the "Autograd"
// bit from the bitset.
constexpr DispatchKeySet remove(DispatchKey t) const {
return DispatchKeySet(
repr_ & ~(DispatchKeySet(t).repr_ & ~full_backend_mask));
// Remove a DispatchKey from the DispatchKey set. This is
// generally not an operation you should be doing (it's
// used to implement operator<<)
C10_NODISCARD constexpr DispatchKeySet remove(DispatchKey t) const {
return DispatchKeySet(repr_ & ~DispatchKeySet(t).repr_);
}
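// A minimal sketch of the note above (keys assumed from DispatchKey.h):
//   auto ks = DispatchKeySet({DispatchKey::CPU, DispatchKey::AutogradCPU});
//   auto out = ks.remove(DispatchKey::AutogradCPU);
//   out.has(DispatchKey::CPU);                 // true: Dense + CPU bits kept
//   out.has(DispatchKey::AutogradCPU);         // false: Autograd bit cleared
//   out.has_backend(BackendComponent::CPUBit); // true: backend bits survive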
// Is the set empty? (AKA undefined tensor)
bool empty() const {
@ -374,78 +107,22 @@ class DispatchKeySet final {
uint64_t raw_repr() {
return repr_;
}
DispatchKey highestFunctionalityKey() const {
auto functionality_idx = indexOfHighestBit();
// This means that none of the functionality bits were set.
if (functionality_idx < num_backends)
return DispatchKey::Undefined;
// The first num_backend bits in the keyset don't correspond to real
// dispatch keys.
return static_cast<DispatchKey>(functionality_idx - num_backends);
}
// This is similar to toBackendComponent(DispatchKey), but less restrictive.
// toBackendComponent() errors out if the key that it was passed has no
// backend bits, which is useful for error checking. We need a version of that
// here that can also handle "fake" backends like FPGA, because they need to
// map to the AutogradOther key. For those backends, we return
// BackendComponent::InvalidBit.
BackendComponent highestBackendKey() const {
// mask to mask out functionality bits
auto backend_idx =
DispatchKeySet(repr_ & full_backend_mask).indexOfHighestBit();
// all zeros across the backend bits means that no backend bits are set.
if (backend_idx == 0)
return BackendComponent::InvalidBit;
return static_cast<BackendComponent>(backend_idx);
}
// returns the DispatchKey of highest priority in the set.
// Return the type id in this set with the highest priority (i.e.,
// is the largest in the DispatchKey enum). Intuitively, this
// type id is the one that should handle dispatch (assuming there
// aren't any further exclusions or inclusions).
DispatchKey highestPriorityTypeId() const {
auto functionality_k = highestFunctionalityKey();
if (isPerBackendFunctionalityKey(functionality_k)) {
return toRuntimePerBackendFunctionalityKey(
functionality_k, highestBackendKey());
}
return functionality_k;
// TODO: If I put Undefined as entry 64 and then adjust the
// singleton constructor to shift from the right, we can get rid of the
// subtraction here. It's modestly more complicated to get right so I
// didn't do it for now.
return static_cast<DispatchKey>(64 - llvm::countLeadingZeros(repr_));
}
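// For example (illustrative): for a keyset built from
// {DispatchKey::CUDA, DispatchKey::AutogradCUDA}, the highest functionality
// bit is AutogradFunctionality and the highest backend bit is CUDABit, so
// highestPriorityTypeId() returns DispatchKey::AutogradCUDA.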
// Returns the index of the most-significant bit in the keyset.
// This is used as part of the calculation into the operator table to get:
// - the highest "functionality" bit in the keyset.
// - the highest "backend" bit in the keyset.
uint8_t indexOfHighestBit() const {
return 64 - llvm::countLeadingZeros(repr_);
}
// returns the index in the operator table of the highest priority key in the
// keyset. Note that we could in theory implement this using
// highestPriorityTypeId(), but this code is on a very hot path and we can do it
// faster without it.
uint64_t getDispatchTableIndexForDispatchKeySet() const {
auto functionality_idx =
DispatchKeySet(repr_ >> num_backends).indexOfHighestBit();
auto offset_and_mask = offsetsAndMasks()[functionality_idx];
// Mask the functionality bits out first, then right-shift by 1.
// right-shifting by 1 because everything is zero-indexed.
// E.g. 000001 (CPU) should give us an offset of 0, 000010 (CUDA) should
// give us an offset of 1, etc.
auto backend_idx =
DispatchKeySet((repr_ & offset_and_mask.mask) >> 1).indexOfHighestBit();
return offset_and_mask.offset + backend_idx;
}
// returns the "index" of the highest priority backend in the keyset.
// This is pretty similar to getBackendKey(), but:
// - It's hotpath code (part of the runtime bitset calculation)
// - It returns an integer index, not an enum value
// - Everything is shifted to the right by 1.
// BackendComponent::InvalidBit is technically the lowest enum value,
// but it isn't included in the runtime table. So CPUBit = 1, CUDABit = 2,
// etc.
uint64_t getBackendIndex() const {
return DispatchKeySet((repr_ & full_backend_mask) >> 1).indexOfHighestBit();
DispatchKey highestPriorityBackendTypeId() const {
return (*this &
((1ULL << static_cast<uint8_t>(DispatchKey::EndOfBackendKeys)) - 1))
.highestPriorityTypeId();
}
private:
@ -453,47 +130,42 @@ class DispatchKeySet final {
uint64_t repr_ = 0;
public:
// STL iterator for DispatchKeySet. Iterates through all runtime DispatchKeys
// in the set. The iterator is only invalidated by the destruction of the
// underlying DispatchKeySet as the iterator stores a pointer to the raw
// representation of the DispatchKeySet. Note: When we encounter a per-backend
// functionality (e.g. Dense or Sparse), we will iterate through EVERY backend
// in the keyset, for that functionality. For example, if the next
// functionality key to iterate over is Autograd, and the backend bits in the
// keyset correspond to [BackendComponent::CPUBit, BackendComponent::CUDABit],
// then the next two keys we return will be DispatchKey::AutogradCPU,
// DispatchKey::AutogradCUDA (CPU first because it has lower precedence than
// CUDA in DispatchKey.h).
// STL iterator for DispatchKeySet. Iterates through all DispatchKeys in the
// set. The iterator is only invalidated by the destruction of the underlying
// DispatchKeySet as the iterator stores a pointer to the raw representation
// of the DispatchKeySet.
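// A short sketch of that iteration order (keys assumed from DispatchKey.h):
//   DispatchKeySet ks({DispatchKey::CPU, DispatchKey::AutogradCUDA});
//   // A range-for visits: CPU, CUDA, AutogradCPU, AutogradCUDA
//   for (DispatchKey k : ks) { /* ... */ }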
class iterator {
public:
using self_type = iterator;
using iterator_category = std::input_iterator_tag;
using value_type = DispatchKey;
using difference_type = ptrdiff_t;
// final mask value should mask out the entire keyset
static const uint8_t end_iter_mask_val =
num_backends + num_functionality_keys;
// final key value should be the last DispatchKey
static const uint8_t end_iter_key_val = num_functionality_keys;
// current_dispatchkey_idx_ will iterate through all functionality bits.
// current_backendcomponent_idx_ will iterate through all backend bits.
explicit iterator(
const uint64_t* data_ptr,
uint8_t next_functionality = num_backends,
uint8_t next_backend = 0)
: data_ptr_(data_ptr),
next_functionality_(next_functionality),
next_backend_(next_backend),
// These are in an invalid state at construction time, and set by the
// first increment call
current_dispatchkey_idx_(end_iter_key_val),
current_backendcomponent_idx_(end_iter_key_val) {
explicit iterator(const uint64_t* data_ptr, uint8_t i = 0)
: data_ptr_(data_ptr), i_(i) {
// Go to the first key in the set
++(*this);
}
C10_API self_type& operator++();
self_type& operator++() {
TORCH_INTERNAL_ASSERT(
i_ <= static_cast<uint8_t>(DispatchKey::NumDispatchKeys));
// Create a masked version of the set representation to ignore previous
// keys that we've iterated through.
uint64_t masked_data = llvm::maskTrailingZeros<uint64_t>(i_) & *data_ptr_;
uint64_t firstKeyIndex = llvm::findFirstSet(masked_data);
// If there are no keys, set to end iterator value
if (firstKeyIndex == std::numeric_limits<uint64_t>::max() ||
i_ == static_cast<uint8_t>(DispatchKey::NumDispatchKeys)) {
i_ = static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
return *this;
}
i_ = static_cast<uint8_t>(firstKeyIndex) + 1;
return *this;
}
self_type operator++(int) {
self_type previous_iterator = *this;
@ -502,50 +174,18 @@ class DispatchKeySet final {
}
bool operator==(const self_type& rhs) const {
return next_functionality_ == rhs.next_functionality_ &&
current_dispatchkey_idx_ == rhs.current_dispatchkey_idx_ &&
next_backend_ == rhs.next_backend_ &&
current_backendcomponent_idx_ == rhs.current_backendcomponent_idx_;
return i_ == rhs.i_;
}
bool operator!=(const self_type& rhs) const {
return next_functionality_ != rhs.next_functionality_ ||
current_dispatchkey_idx_ != rhs.current_dispatchkey_idx_ ||
next_backend_ != rhs.next_backend_ ||
current_backendcomponent_idx_ != rhs.current_backendcomponent_idx_;
return i_ != rhs.i_;
}
DispatchKey operator*() const {
auto functionality_key =
static_cast<DispatchKey>(current_dispatchkey_idx_);
if (isPerBackendFunctionalityKey(functionality_key)) {
auto next_key = toRuntimePerBackendFunctionalityKey(
functionality_key,
static_cast<BackendComponent>(current_backendcomponent_idx_));
// We expect all of the Dense, Sparse, Quantized, and Autograd keys to
// be ordered the same way with respect to their backends
TORCH_INTERNAL_ASSERT(
toBackendComponent(next_key) ==
static_cast<BackendComponent>(current_backendcomponent_idx_),
"Tried to map functionality key ",
toString(functionality_key),
" and backend bit ",
toString(
static_cast<BackendComponent>(current_backendcomponent_idx_)),
" to a runtime key, but ended up with ",
toString(next_key),
". This can happen if the order of the backend dispatch keys in DispatchKey.h isn't consistent.",
" Please double check that enum for inconsistencies.");
return next_key;
} else {
return functionality_key;
}
return static_cast<DispatchKey>(i_);
}
private:
const uint64_t* data_ptr_;
uint8_t next_functionality_;
uint8_t next_backend_;
uint8_t current_dispatchkey_idx_;
uint8_t current_backendcomponent_idx_;
uint8_t i_;
};
public:
@ -555,35 +195,31 @@ class DispatchKeySet final {
return iterator(&repr_);
}
// We do not need to iterate beyond EndOfFunctionalityKeys so we will treat
// this as the end iterator.
// We do not need to iterate beyond NumDispatchKeys so we will treat this as
// the end iterator. NumDispatchKeys will always be strictly less than 64.
iterator end() const {
return iterator(&repr_, iterator::end_iter_mask_val);
return iterator(&repr_, static_cast<uint8_t>(DispatchKey::NumDispatchKeys));
}
};
C10_API std::string toString(DispatchKeySet);
C10_API std::ostream& operator<<(std::ostream&, DispatchKeySet);
C10_API inline uint64_t getDispatchTableIndexForDispatchKey(DispatchKey k) {
return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet();
}
// Alias key DispatchKey::Autograd maps to
// (autograd_dispatch_keyset x full_backend_mask)
// autograd_dispatch_keyset should include all runtime autograd keys.
// Alias key DispatchKey::Autograd maps to autograd_dispatch_keyset.
// NB: keys in this set also get associated with CompositeImplicitAutograd
//
// Note [autograd_dispatch_keyset Does Not Include Backend Bits]
// We don't want to include any backend bits (BackendComponent::CPUBit, etc)
// directly in autograd_dispatch_keyset.
// Why? keysets like autograd_dispatch_keyset are commonly used to remove
// autograd keys from a DispatchKeySet throughout the code base. However, you
// are only allowed to remove functionality bits from a keyset, not backend
// bits. See Note [Removing keys from DispatchKeySet Only Affects Functionality
// Keys] for details. To be consistent and avoid confusion, we're explicitly
// setting up autograd_dispatch_keyset to not have any backend bits.
constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
DispatchKey::AutogradFunctionality,
DispatchKey::AutogradCPU,
DispatchKey::AutogradCUDA,
DispatchKey::AutogradXLA,
DispatchKey::AutogradLazy,
DispatchKey::AutogradNestedTensor,
DispatchKey::AutogradMLC,
DispatchKey::AutogradHPU,
DispatchKey::AutogradXPU,
DispatchKey::AutogradPrivateUse1,
DispatchKey::AutogradPrivateUse2,
DispatchKey::AutogradPrivateUse3,
DispatchKey::AutogradOther,
});
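// A minimal sketch of the note above (names taken from this header):
//   auto ks = DispatchKeySet({DispatchKey::CUDA, DispatchKey::AutogradCUDA});
//   auto no_autograd = ks - autograd_dispatch_keyset;
//   no_autograd.has(DispatchKey::CUDA);         // true: backend bits untouched
//   no_autograd.has(DispatchKey::AutogradCUDA); // false: Autograd bit removed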
@ -606,39 +242,27 @@ constexpr DispatchKeySet default_excluded_set = DispatchKeySet({
constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView =
autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView);
constexpr DispatchKeySet python_ks = DispatchKeySet(DispatchKey::Python);
constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse);
constexpr DispatchKeySet sparse_csr_ks =
DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA});
constexpr DispatchKeySet mkldnn_ks = DispatchKeySet(DispatchKey::MkldnnCPU);
// backend dispatch keys that map to DispatchKey::AutogradOther
// NB: keys in this set also get associated with CompositeImplicitAutograd
constexpr DispatchKeySet autogradother_backends =
DispatchKeySet(
// HIP and VE aren't in this list: they now have their own backend bits
// which means that they can now have their own Autograd keys.
// Technically, HIP will now redispatch to its own custom AutogradHIP
// slot in the runtime table.
{DispatchKey::FPGA,
constexpr DispatchKeySet autogradother_backends = DispatchKeySet(
{DispatchKey::HIP,
DispatchKey::VE,
DispatchKey::FPGA,
DispatchKey::ORT,
DispatchKey::Vulkan,
DispatchKey::Metal,
DispatchKey::SparseCsrCPU,
DispatchKey::SparseCsrCUDA,
DispatchKey::QuantizedCPU,
DispatchKey::QuantizedCUDA,
DispatchKey::CustomRNGKeyId,
DispatchKey::MkldnnCPU,
DispatchKey::Meta,
// Sparse and Quantized backends also live here.
DispatchKey::Sparse,
DispatchKey::Quantized})
// Including the backend bits because this keyset is used during op
// registration, which requires looping over all runtime autogradother
// backend keys.
| DispatchKeySet(DispatchKeySet::RAW, full_backend_mask);
DispatchKey::SparseCPU,
DispatchKey::SparseCUDA,
DispatchKey::SparseHIP,
DispatchKey::SparseVE,
DispatchKey::SparseXPU,
DispatchKey::SparseCsrCPU,
DispatchKey::SparseCsrCUDA,
DispatchKey::Meta});
// The set of dispatch keys that come after autograd
// n.b. this relies on the fact that AutogradOther is currently the lowest
@ -668,36 +292,6 @@ constexpr DispatchKeySet after_func_keyset =
// away with it by explicitly removing the key here.
c10::DispatchKey::ADInplaceOrView);
constexpr DispatchKeySet backend_bitset_mask =
DispatchKeySet(DispatchKeySet::RAW, (1ULL << num_backends) - 1);
constexpr auto inplace_or_view_ks =
DispatchKeySet(DispatchKey::ADInplaceOrView);
constexpr auto autograd_cpu_ks = DispatchKeySet(DispatchKey::AutogradCPU);
constexpr auto autograd_xpu_ks = DispatchKeySet(DispatchKey::AutogradXPU);
constexpr auto autograd_cuda_ks = DispatchKeySet(DispatchKey::AutogradCUDA);
constexpr auto autograd_xla_ks = DispatchKeySet(DispatchKey::AutogradXLA);
constexpr auto autograd_lazy_ks = DispatchKeySet(DispatchKey::AutogradLazy);
constexpr auto autograd_mlc_ks = DispatchKeySet(DispatchKey::AutogradMLC);
constexpr auto autograd_hpu_ks = DispatchKeySet(DispatchKey::AutogradHPU);
constexpr auto autograd_privateuse1_ks =
DispatchKeySet(DispatchKey::AutogradPrivateUse1);
constexpr auto autograd_privateuse2_ks =
DispatchKeySet(DispatchKey::AutogradPrivateUse2);
constexpr auto autograd_privateuse3_ks =
DispatchKeySet(DispatchKey::AutogradPrivateUse3);
constexpr auto autograd_other_ks = DispatchKeySet(DispatchKey::AutogradOther);
struct OpTableOffsetAndMask {
uint16_t offset;
uint16_t backend_mask;
};
static_assert(
num_backends <= 16,
"Right now we expect the number of backends not to exceed 16. In the (unlikely) event"
" that this changes, the size of OpTableOffsetAndMask::backend_mask needs to be increased too.");
// true if t is a backend dispatch key
C10_API bool isBackendDispatchKey(DispatchKey t);
@ -713,53 +307,10 @@ C10_API bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k);
C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t);
// Returns a DispatchKeySet of autograd related keys mapped to backend.
// for a given backend key, use the associated autograd key.
// for non-backend keys, use AutogradOther as a default.
// Note: it's convenient and fast to return a default here rather than (say)
// returning an optional<DispatchKey>, or throwing. But it makes callers
// responsible for either a) enforcing the invariant that only backend keys
// be passed as arguments, or b) interpreting our return value carefully.
inline DispatchKeySet getAutogradRelatedKeySetFromBackend(BackendComponent t) {
switch (t) {
case BackendComponent::CPUBit:
return inplace_or_view_ks | autograd_cpu_ks;
case BackendComponent::XPUBit:
return inplace_or_view_ks | autograd_xpu_ks;
case BackendComponent::CUDABit:
return inplace_or_view_ks | autograd_cuda_ks;
case BackendComponent::XLABit:
return inplace_or_view_ks | autograd_xla_ks;
case BackendComponent::LazyBit:
return inplace_or_view_ks | autograd_lazy_ks;
case BackendComponent::MLCBit:
return inplace_or_view_ks | autograd_mlc_ks;
case BackendComponent::HPUBit:
return inplace_or_view_ks | autograd_hpu_ks;
case BackendComponent::PrivateUse1Bit:
return inplace_or_view_ks | autograd_privateuse1_ks;
case BackendComponent::PrivateUse2Bit:
return inplace_or_view_ks | autograd_privateuse2_ks;
case BackendComponent::PrivateUse3Bit:
return inplace_or_view_ks | autograd_privateuse3_ks;
default:
return inplace_or_view_ks | autograd_other_ks;
}
}
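// A minimal sketch of extending a backend keyset with its autograd keys
// (illustrative only):
//   auto ks = DispatchKeySet(DispatchKey::CUDA);
//   ks = ks | getAutogradRelatedKeySetFromBackend(BackendComponent::CUDABit);
//   // ks now also contains DispatchKey::ADInplaceOrView and
//   // DispatchKey::AutogradCUDA.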
C10_API DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t);
// Returns a DispatchKeySet of autocast related keys mapped to backend.
inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) {
constexpr auto autocast_cpu_ks = DispatchKeySet(DispatchKey::AutocastCPU);
constexpr auto autocast_cuda_ks = DispatchKeySet(DispatchKey::AutocastCUDA);
switch (t) {
case BackendComponent::CPUBit:
return autocast_cpu_ks;
case BackendComponent::CUDABit:
case BackendComponent::XLABit:
return autocast_cuda_ks;
default:
return DispatchKeySet();
}
}
C10_API DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t);
// This API exists because we have a use case for checking
// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined)

View File

@ -120,11 +120,11 @@ TensorImpl::TensorImpl(
// [Note: Python key removal]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// In most constructors for TensorImpl, you will see Python key is removed from
// the passed in DispatchKeySet. Why?
// In most constructors for TensorImpl, you will see Python and PythonTLSSnapshot
// keys are removed from the passed in DispatchKeySet. Why?
//
// INVARIANT: Python dispatch key is set iff PyObject for the Tensor has a
// nontrivial __torch_dispatch__ implementation.
// INVARIANT: Python and PythonTLSSnapshot dispatch keys are set iff PyObject for
// the Tensor has a nontrivial __torch_dispatch__ implementation.
//
// When a fresh TensorImpl is created, there is *no* PyObject (this only gets
// initialized lazily at the first point in time the Tensor passes into Python).
@ -132,8 +132,8 @@ TensorImpl::TensorImpl(
//
// In practice, what will happen shortly afterwards is that the TensorImpl
// will get its PyObject initialized by Tensor._make_subclass; at this point
// the Python dispatch key will be set and all is well. The point is to delay
// the dispatch key setting until that point.
// the Python and PythonTLSSnapshot dispatch keys will be set and all is well.
// The point is to delay the dispatch key setting until that point.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
TensorImpl::TensorImpl(
@ -148,7 +148,9 @@ TensorImpl::TensorImpl(
numel_(0),
data_type_(data_type),
device_opt_(storage_.device()),
key_set_(key_set - c10::python_ks) { // See [Note: Python key removal]
key_set_(key_set.remove(
DispatchKey::Python).remove(
DispatchKey::PythonTLSSnapshot)) { // See [Note: Python key removal]
init_bitfields();
// Inference tensor doesn't have version counter.
if (!is_inference()) {
@ -189,12 +191,12 @@ TensorImpl::TensorImpl(
// TODO: be more explicit about the full key set at call sites so we
// don't have to keep recomputing it here
auto k = key_set.highestBackendKey();
DispatchKey k = key_set.highestPriorityBackendTypeId();
key_set = key_set | getAutocastRelatedKeySetFromBackend(k);
// See [Note: Python key removal]
key_set = key_set - c10::python_ks;
key_set =
key_set.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot); // See [Note: Python key removal]
// Inference tensor doesn't have autograd related keys.
if (inference_mode) {
@ -552,7 +554,7 @@ void TensorImpl::copy_tensor_metadata_except_version_counter(
dest_impl->storage_offset_ = src_impl->storage_offset_;
dest_impl->data_type_ = src_impl->data_type_;
dest_impl->device_opt_ = src_impl->device_opt_;
dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python);
dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot);
dest_impl->is_contiguous_ = src_impl->is_contiguous_;
dest_impl->has_contiguity_ = src_impl->has_contiguity_;
dest_impl->is_channels_last_contiguous_ =

View File

@ -838,103 +838,103 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
bool is_sparse() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
return key_set_.has_all(c10::sparse_ks);
return key_set_.has(DispatchKey::SparseCPU) ||
key_set_.has(DispatchKey::SparseCUDA) ||
key_set_.has(DispatchKey::SparseHIP) ||
key_set_.has(DispatchKey::SparseXPU);
}
// Whether a tensor is sparse COO or not. Use is_sparse_csr for checking CSR
// format.
bool is_sparse_csr() const {
return key_set_.has_any(c10::sparse_csr_ks);
return key_set_.has(DispatchKey::SparseCsrCPU) ||
key_set_.has(DispatchKey::SparseCsrCUDA);
}
bool is_quantized() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto quantized_ks = DispatchKeySet(DispatchKey::Quantized);
return key_set_.has_all(quantized_ks);
return key_set_.has(DispatchKey::QuantizedCPU) ||
key_set_.has(DispatchKey::QuantizedCUDA) ||
key_set_.has(DispatchKey::QuantizedXPU);
}
bool is_meta() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto meta_ks = DispatchKeySet(DispatchKey::Meta);
return key_set_.has_all(meta_ks);
return key_set_.has(DispatchKey::Meta);
}
bool is_cpu() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto cpu_bits_ks = DispatchKeySet(BackendComponent::CPUBit) |
DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::MkldnnCPU});
return key_set_.has_any(cpu_bits_ks);
return key_set_.has(DispatchKey::CPU) ||
key_set_.has(DispatchKey::SparseCPU) ||
key_set_.has(DispatchKey::SparseCsrCPU) ||
key_set_.has(DispatchKey::QuantizedCPU) ||
key_set_.has(DispatchKey::MkldnnCPU);
}
bool is_cuda() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto cuda_bits_ks = DispatchKeySet(BackendComponent::CUDABit) |
DispatchKeySet(DispatchKey::SparseCsrCUDA);
return key_set_.has_any(cuda_bits_ks);
return key_set_.has(DispatchKey::CUDA) ||
key_set_.has(DispatchKey::SparseCUDA) ||
key_set_.has(DispatchKey::SparseCsrCUDA) ||
key_set_.has(DispatchKey::QuantizedCUDA);
}
bool is_xpu() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto xpu_ks = DispatchKeySet(BackendComponent::XPUBit);
return key_set_.has_all(xpu_ks);
return key_set_.has(DispatchKey::XPU) ||
key_set_.has(DispatchKey::SparseXPU) ||
key_set_.has(DispatchKey::QuantizedXPU);
}
bool is_xla() const {
constexpr auto xla_ks = DispatchKeySet(BackendComponent::XLABit);
return key_set_.has_all(xla_ks);
return key_set_.has(DispatchKey::XLA);
}
bool is_hpu() const {
constexpr auto hpu_ks = DispatchKeySet(BackendComponent::HPUBit);
return key_set_.has_all(hpu_ks);
return key_set_.has(DispatchKey::HPU);
}
bool is_lazy() const {
constexpr auto lazy_ks = DispatchKeySet(BackendComponent::LazyBit);
return key_set_.has_all(lazy_ks);
return key_set_.has(DispatchKey::Lazy);
}
bool is_hip() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto hip_ks = DispatchKeySet(BackendComponent::HIPBit);
return key_set_.has_all(hip_ks);
return key_set_.has(DispatchKey::HIP) ||
key_set_.has(DispatchKey::SparseHIP);
}
bool is_ve() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto ve_ks = DispatchKeySet(BackendComponent::VEBit);
return key_set_.has_all(ve_ks);
return key_set_.has(DispatchKey::VE) || key_set_.has(DispatchKey::SparseVE);
}
bool is_mkldnn() const {
return key_set_.has_all(c10::mkldnn_ks);
return key_set_.has(DispatchKey::MkldnnCPU);
}
bool is_vulkan() const {
constexpr auto vulkan_ks = DispatchKeySet(DispatchKey::Vulkan);
return key_set_.has_all(vulkan_ks);
return key_set_.has(DispatchKey::Vulkan);
}
bool is_metal() const {
constexpr auto metal_ks = DispatchKeySet(DispatchKey::Metal);
return key_set_.has_all(metal_ks);
return key_set_.has(DispatchKey::Metal);
}
bool is_mlc() const {
constexpr auto mls_ks = DispatchKeySet(DispatchKey::MLC);
return key_set_.has_all(mls_ks);
return key_set_.has(DispatchKey::MLC);
}
bool is_ort() const {
constexpr auto ort_ks = DispatchKeySet(DispatchKey::ORT);
return key_set_.has_all(ort_ks);
return key_set_.has(DispatchKey::ORT);
}
// TODO: remove this once we don't automatically enable Autograd dispatch
@ -950,8 +950,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
// Invariant:
// Inference tensor has version_counter_.enabled() == false
bool is_inference() {
bool no_ADInplaceOrView = !key_set_.has_any(c10::inplace_or_view_ks);
bool no_Autograd = !key_set_.has_any(c10::autograd_dispatch_keyset);
bool no_ADInplaceOrView = !key_set_.has(c10::DispatchKey::ADInplaceOrView);
bool no_Autograd = (key_set_ & c10::autograd_dispatch_keyset).empty();
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
no_ADInplaceOrView == no_Autograd,
"ADInplaceOrView and Autograd keys must be on/off at the same time.");
@ -972,22 +972,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
Layout layout() const {
// NB: This method is not virtual and avoids dispatches for perf.
// strided is also the most common layout type, so we check for
// strided case first.
// This keyset must also be kept in sync with the logic in
// is_sparse() / is_sparse_csr() / is_mkldnn()
constexpr auto sparse_and_sparsecsr_and_mkldnn_ks =
c10::sparse_ks | c10::sparse_csr_ks | c10::mkldnn_ks;
if (!key_set_.has_any(sparse_and_sparsecsr_and_mkldnn_ks)) {
return kStrided;
} else if (is_sparse()) {
if (is_sparse()) {
return kSparse;
} else if (is_sparse_csr()) {
return kSparseCsr;
} else {
TORCH_INTERNAL_ASSERT(
is_mkldnn(), "There is an error in the layout calculation logic.");
} else if (is_mkldnn()) {
return kMkldnn;
} else {
return kStrided;
}
}
@ -1073,8 +1065,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
* Whether or not the imaginary part of the tensor should be negated
*/
inline bool is_conj() const {
constexpr auto conjugate_ks = DispatchKeySet(DispatchKey::Conjugate);
return key_set_.has_all(conjugate_ks);
return key_set_.has(DispatchKey::Conjugate);
}
/**
@ -1094,8 +1085,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
* Whether or not the tensor is a zerotensor
*/
inline bool _is_zerotensor() const {
constexpr auto zerotensor_ks = DispatchKeySet(DispatchKey::ZeroTensor);
return key_set_.has_all(zerotensor_ks);
return key_set_.has(DispatchKey::ZeroTensor);
}
/**
@ -1115,8 +1105,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
* Whether or not the tensor should be negated
*/
inline bool is_neg() const {
constexpr auto negative_ks = DispatchKeySet(DispatchKey::Negative);
return key_set_.has_all(negative_ks);
return key_set_.has(DispatchKey::Negative);
}
/**
@ -1487,14 +1476,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
void set_python_dispatch(bool k) {
if (k) {
key_set_ = key_set_.add(c10::python_ks);
key_set_ = key_set_.add(DispatchKey::Python).add(DispatchKey::PythonTLSSnapshot);
} else {
key_set_ = key_set_ - c10::python_ks;
key_set_ = key_set_.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot);
}
}
bool is_python_dispatch() const {
return key_set_.has_all(c10::python_ks);
return key_set_.has(DispatchKey::Python);
}
/**
@ -1559,22 +1548,13 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
*/
inline bool has_compatible_shallow_copy_type(DispatchKeySet from) {
auto is_dense = [](DispatchKeySet ts) {
constexpr auto dense_backends = DispatchKeySet(
{BackendComponent::CPUBit,
BackendComponent::CUDABit,
BackendComponent::HIPBit,
BackendComponent::XPUBit});
constexpr auto dense_k = DispatchKeySet(DispatchKey::Dense);
return ts.has_any(dense_k) && ts.has_any(dense_backends);
return ts.has(DispatchKey::CPU) || ts.has(DispatchKey::CUDA) ||
ts.has(DispatchKey::HIP) || ts.has(DispatchKey::XPU);
};
auto is_sparse = [](DispatchKeySet ts) {
constexpr auto sparse_backends = DispatchKeySet(
{BackendComponent::CPUBit,
BackendComponent::CUDABit,
BackendComponent::HIPBit,
BackendComponent::XPUBit});
constexpr auto sparse_k = DispatchKeySet(DispatchKey::Sparse);
return ts.has_any(sparse_k) && ts.has_any(sparse_backends);
return ts.has(DispatchKey::SparseCPU) ||
ts.has(DispatchKey::SparseCUDA) || ts.has(DispatchKey::SparseHIP) ||
ts.has(DispatchKey::SparseXPU);
};
return (key_set_ == from) || (is_dense(key_set_) && is_dense(from)) ||
(is_sparse(key_set_) && is_sparse(from));

View File

@ -117,6 +117,20 @@ class C10_API ExcludeDispatchKeyGuard {
DispatchKeySet exclude_;
};
struct C10_API ForceDispatchKeyGuard {
public:
ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) :
saved_keyset_(c10::impl::tls_local_dispatch_key_set()) {
c10::impl::_force_tls_local_dispatch_key_set(key_set);
}
~ForceDispatchKeyGuard() {
c10::impl::_force_tls_local_dispatch_key_set(saved_keyset_);
}
private:
c10::impl::LocalDispatchKeySet saved_keyset_;
};
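// A minimal usage sketch (illustrative; the guard simply swaps in the given
// TLS state and restores the previous one on destruction):
//   {
//     auto tls = c10::impl::tls_local_dispatch_key_set();
//     // ...modify `tls` as needed before forcing it...
//     c10::impl::ForceDispatchKeyGuard guard(tls);
//     // TLS dispatch state is pinned to `tls` within this scope.
//   } // previous TLS state restored here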
// Non-RAII API for manipulating the thread-local dispatch state.
// Please prefer the RAII API. The non-RAII API may be useful when
// the included/excluded state of a given DispatchKey must span

View File

@ -3,163 +3,25 @@
#include <unordered_set>
#include <c10/core/DispatchKeySet.h>
#include <c10/util/irange.h>
using namespace c10;
// This test exists not to be comprehensive, but to more clearly show
// what the semantics of DispatchKeySet are.
TEST(DispatchKeySet, ShowSemantics) {
// the "CPU" dispatch key is an instance of a per-backend-functionality key.
// It corresponds to "dense" functionality, "CPU" backend.
// This means that it gets a dense functionality bit, and a cpu backend bit
// set.
auto undefined_set = DispatchKeySet();
auto dense_cpu_set = DispatchKeySet(DispatchKey::CPU);
ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense));
ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit));
ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU));
auto dense_lazy_set = DispatchKeySet(DispatchKey::Lazy);
ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Dense));
ASSERT_TRUE(dense_lazy_set.has_backend(BackendComponent::LazyBit));
ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Lazy));
// You can think of "Dense/Sparse", and "CPUBit/CUDABit", as "building block"
// dispatch keys. You are allowed to directly create keysets out of them!
auto dense_cpu_set_from_building_blocks = DispatchKeySet(DispatchKey::Dense) |
DispatchKeySet(BackendComponent::CPUBit);
ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense));
ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit));
ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU));
ASSERT_EQ(dense_cpu_set, dense_cpu_set_from_building_blocks);
// Similarly, the AutogradCUDA key gets 2 bits in the keyset:
// The "Autograd" functionality bit, and the "CUDA" backend bit
auto autograd_cuda = DispatchKeySet(DispatchKey::AutogradCUDA);
ASSERT_TRUE(autograd_cuda.has(DispatchKey::AutogradFunctionality));
ASSERT_TRUE(autograd_cuda.has_backend(BackendComponent::CUDABit));
// Because DispatchKeySet uses a condensed internal representation, you cannot
// use it to represent the FULL cross product of backends and functionalities.
// For example:
auto autograd_dense_cpu_cuda = DispatchKeySet(
{DispatchKey::AutogradFunctionality,
DispatchKey::Dense,
DispatchKey::CUDA,
DispatchKey::CPU});
auto fpga = DispatchKeySet(DispatchKey::FPGA);
auto fpga_and_cpu = DispatchKeySet({DispatchKey::FPGA, DispatchKey::CPU});
// this keyset has all of the building block keys:
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradFunctionality));
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::Dense));
ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CUDABit));
ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CPUBit));
// and it also has the "runtime" keys that correspond to the full
// cross-product of functionality
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU));
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU));
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CPU));
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CUDA));
// This means that there's no way to represent a keyset with, say, only
// Autograd CUDA + Dense CPU. Instead, you should think of a keyset as
// inheriting the full set of functionalities + backends of its keys. This
// means that the below keysets are all indistinguishable from each other.
ASSERT_EQ(
autograd_dense_cpu_cuda,
DispatchKeySet(
{DispatchKey::AutogradCUDA,
DispatchKey::AutogradCPU,
DispatchKey::CUDA,
DispatchKey::CPU}));
ASSERT_EQ(
autograd_dense_cpu_cuda,
DispatchKeySet({DispatchKey::AutogradCUDA, DispatchKey::CPU}));
ASSERT_EQ(
autograd_dense_cpu_cuda,
DispatchKeySet({DispatchKey::CUDA, DispatchKey::AutogradCPU}));
// ~~~~~~~~~~ DispatchKeySet iterators ~~~~~~~~~~~
// Iterators allow you to iterate individually through the DispatchKey's in a
// DispatchKeySet
auto empty_set = DispatchKeySet();
auto t1 = empty_set.begin();
auto t2 = empty_set.end();
ASSERT_EQ(*empty_set.begin(), *empty_set.end());
// However, only keys that correspond to actual runtime indices of kernels in
// the operator table show up when you iterate through a keyset. i.e.
// DispatchKey::Dense, and BackendComponent::CPUBit won't show up in an
// iterator.
auto dense_cpu_iter = dense_cpu_set.begin();
ASSERT_EQ(*dense_cpu_iter++, DispatchKey::CPU);
ASSERT_EQ(*dense_cpu_iter, *dense_cpu_set.end());
auto autograd_dense_cpu_cuda_iter = autograd_dense_cpu_cuda.begin();
ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CPU);
ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CUDA);
ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCPU);
ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCUDA);
ASSERT_EQ(*autograd_dense_cpu_cuda_iter, *autograd_dense_cpu_cuda.end());
// But other "functionality bits" that are not defined per-backend DO get
// their own slots in the operator table.
auto mixed_keyset = DispatchKeySet(BackendComponent::CPUBit) |
DispatchKeySet(
{DispatchKey::FPGA, // runtime key
DispatchKey::Functionalize, // runtime key
DispatchKey::Dense}); // NOT a runtime key
auto mixed_iter = mixed_keyset.begin();
ASSERT_EQ(*mixed_iter++, DispatchKey::CPU);
ASSERT_EQ(*mixed_iter++, DispatchKey::FPGA);
ASSERT_EQ(*mixed_iter++, DispatchKey::Functionalize);
ASSERT_EQ(*mixed_iter, *mixed_keyset.end());
}
TEST(DispatchKeySet, Empty) {
DispatchKeySet empty_set;
for (uint8_t i = 0;
i <= static_cast<uint8_t>(DispatchKey::EndOfRuntimeBackendKeys);
for (uint8_t i = 1; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
i++) {
auto tid = static_cast<DispatchKey>(i);
if (tid == DispatchKey::Undefined)
continue;
ASSERT_FALSE(empty_set.has(tid));
}
ASSERT_TRUE(empty_set.empty());
DispatchKeySet empty_set2;
ASSERT_TRUE(empty_set == empty_set2);
ASSERT_EQ(empty_set.highestPriorityTypeId(), DispatchKey::Undefined);
}
// This covers all keys that correspond to a single backend bit, e.g.
// BackendComponent::CPUBit. Even though these are NOT runtime keys, we still
// allow adding them directly to a keyset
TEST(DispatchKeySet, SingletonBackendComponent) {
for (const auto i : c10::irange(1, num_backends)) {
auto tid = static_cast<DispatchKey>(i);
DispatchKeySet sing(tid);
ASSERT_EQ(sing, sing);
ASSERT_EQ(sing, DispatchKeySet().add(tid));
ASSERT_EQ(sing, sing.add(tid));
ASSERT_EQ(sing, sing | sing);
ASSERT_FALSE(sing.empty());
ASSERT_TRUE(sing.has(tid));
}
}
// This covers all keys that correspond to a single functionality bit:
// - runtime, not-per-backend functionality keys, e.g.
// DispatchKey::FuncTorchBatched
// - runtime, "fake backend" keys, e.g. DispatchKey::FPGA
// - NOT-runtime, per-backend functionality keys, e.g. DispatchKey::Dense
// Even though it's not a runtime key, we still allow adding it directly to a
// keyset.
// DispatchKey::
TEST(DispatchKeySet, SingletonFunctionalityKeys) {
for (const auto i : c10::irange(1, num_functionality_keys)) {
TEST(DispatchKeySet, Singleton) {
for (uint8_t i = 1; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
i++) {
auto tid = static_cast<DispatchKey>(i);
DispatchKeySet sing(tid);
ASSERT_EQ(sing, sing);
@ -168,145 +30,47 @@ TEST(DispatchKeySet, SingletonFunctionalityKeys) {
ASSERT_EQ(sing, sing | sing);
ASSERT_FALSE(sing.empty());
ASSERT_TRUE(sing.has(tid));
ASSERT_EQ(sing.highestPriorityTypeId(), tid);
ASSERT_EQ(sing.remove(tid), DispatchKeySet());
}
}
// This covers runtime keys that are per-backend,
// and take up more than one bit in a DispatchKeySet. They take up one
// functionality bit + one backend bit. e.g. CPU, CUDA, SparseCPU, SparseCUDA,
// AutogradCPU, AutogradCUDA
TEST(DispatchKeySet, SingletonPerBackendFunctionalityKeys) {
for (uint8_t i = static_cast<uint8_t>(DispatchKey::StartOfDenseBackends);
i <= static_cast<uint8_t>(DispatchKey::EndOfRuntimeBackendKeys);
i++) {
auto tid = static_cast<DispatchKey>(i);
// Skip these because they aren't real keys.
if (tid == DispatchKey::StartOfDenseBackends ||
tid == DispatchKey::StartOfSparseBackends ||
tid == DispatchKey::StartOfQuantizedBackends ||
tid == DispatchKey::StartOfAutogradBackends) {
continue;
}
DispatchKeySet sing(tid);
ASSERT_EQ(sing, sing);
ASSERT_EQ(sing, DispatchKeySet().add(tid));
ASSERT_EQ(sing, sing.add(tid));
ASSERT_EQ(sing, sing | sing);
ASSERT_FALSE(sing.empty());
ASSERT_TRUE(sing.has(tid));
auto functionality_key = toFunctionalityKey(tid);
auto backend_key = toBackendComponent(tid);
// These two sets should be equivalent:
// DispatchKeySet(DispatchKey::CPU)
// DispatchKeySet({DispatchKey::Dense, BackendComponent::CPUBit})
auto expected_ks =
DispatchKeySet(functionality_key) | DispatchKeySet(backend_key);
ASSERT_EQ(sing, expected_ks);
// These two sets should be equivalent:
// DispatchKeySet(DispatchKey::CPU).remove(DispatchKey::Dense)
// DispatchKeySet(BackendComponent::CPUBit)
expected_ks = DispatchKeySet(toBackendComponent(tid));
ASSERT_EQ(sing.remove(tid), expected_ks);
}
}
TEST(DispatchKeySet, DoubletonPerBackend) {
for (uint8_t i = static_cast<uint8_t>(DispatchKey::StartOfDenseBackends);
i <= static_cast<uint8_t>(DispatchKey::EndOfRuntimeBackendKeys);
TEST(DispatchKeySet, Doubleton) {
for (uint8_t i = 1; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
i++) {
for (uint8_t j = i + 1;
j <= static_cast<uint8_t>(DispatchKey::EndOfRuntimeBackendKeys);
j < static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
j++) {
ASSERT_LT(i, j);
auto tid1 = static_cast<DispatchKey>(i);
auto tid2 = static_cast<DispatchKey>(j);
// Skip these because they aren't real keys.
if (tid1 == DispatchKey::StartOfDenseBackends ||
tid1 == DispatchKey::StartOfSparseBackends ||
tid1 == DispatchKey::StartOfQuantizedBackends ||
tid1 == DispatchKey::StartOfAutogradBackends)
continue;
if (tid2 == DispatchKey::StartOfDenseBackends ||
tid2 == DispatchKey::StartOfSparseBackends ||
tid2 == DispatchKey::StartOfQuantizedBackends ||
tid2 == DispatchKey::StartOfAutogradBackends)
continue;
auto backend1 = toBackendComponent(tid1);
auto backend2 = toBackendComponent(tid2);
auto functionality1 = toFunctionalityKey(tid1);
auto functionality2 = toFunctionalityKey(tid2);
auto combined = DispatchKeySet({tid1, tid2});
// The combined set has the backend bits
ASSERT_TRUE(combined.has_backend(backend1));
ASSERT_TRUE(combined.has_backend(backend2));
// and it has the functionality bits
ASSERT_TRUE(combined.has(functionality1));
ASSERT_TRUE(combined.has(functionality2));
// and it has the original two runtime keys
ASSERT_TRUE(combined.has(tid1));
ASSERT_TRUE(combined.has(tid2));
// Add all of the keys in the keyset to a real set
std::unordered_set<DispatchKey> visited_keys;
auto iter = combined.begin();
while (*iter != *combined.end()) {
visited_keys.insert(*iter);
++iter;
}
std::unordered_set<DispatchKey> expected_keys;
expected_keys.insert(
toRuntimePerBackendFunctionalityKey(functionality1, backend1));
expected_keys.insert(
toRuntimePerBackendFunctionalityKey(functionality1, backend2));
expected_keys.insert(
toRuntimePerBackendFunctionalityKey(functionality2, backend1));
expected_keys.insert(
toRuntimePerBackendFunctionalityKey(functionality2, backend2));
ASSERT_EQ(expected_keys, visited_keys);
if (backend1 == backend2 || functionality1 == functionality2) {
// We have two runtime keys, with either the same backend or the same
// per-backend functionalities. E.g. {AutogradCUDA, CUDA} or
// {AutogradCPU, AutogradCUDA} There should be 2 total runtime keys in
// this set.
ASSERT_EQ(2, visited_keys.size());
} else {
// since i and j are different keys, they should not have the same
// functionality and backend
ASSERT_TRUE(backend1 != backend2 && functionality1 != functionality2);
// We have two runtime keys, that have different backends + per-backend
// functionalities. So we should expect the full cross product of
// runtime keys to be in the set. e.g. if i = AutogradCUDA, and j = CPU,
// then combined = {AutogradCUDA, AutogradCPU, CUDA, CPU}
ASSERT_EQ(4, visited_keys.size());
}
auto doub = DispatchKeySet(tid1).add(tid2);
ASSERT_EQ(doub, DispatchKeySet(tid1) | DispatchKeySet(tid2));
ASSERT_TRUE(doub.has(tid1));
ASSERT_TRUE(doub.has(tid2));
ASSERT_EQ(doub.highestPriorityTypeId(), tid2); // relies on i < j
}
}
}
TEST(DispatchKeySet, Full) {
DispatchKeySet full(DispatchKeySet::FULL);
for (const auto i : c10::irange(1, num_functionality_keys)) {
for (uint8_t i = 1; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
i++) {
auto tid = static_cast<DispatchKey>(i);
ASSERT_TRUE(full.has(tid));
}
ASSERT_FALSE(full.has(DispatchKey::EndOfFunctionalityKeys));
}
TEST(DispatchKeySet, IteratorBasicOps) {
DispatchKeySet empty_set;
DispatchKeySet full_set(DispatchKeySet::FULL);
DispatchKeySet mutated_set = empty_set.add(DispatchKey::CPU);
DispatchKeySet mutated_set = empty_set.add(static_cast<DispatchKey>(1));
// Constructor + Comparison
ASSERT_EQ(*empty_set.begin(), DispatchKey::EndOfFunctionalityKeys);
ASSERT_EQ(*empty_set.end(), DispatchKey::EndOfFunctionalityKeys);
ASSERT_EQ(*mutated_set.begin(), DispatchKey::CPU);
ASSERT_EQ(*empty_set.begin(), DispatchKey::NumDispatchKeys);
ASSERT_EQ(*empty_set.end(), DispatchKey::NumDispatchKeys);
ASSERT_EQ(*mutated_set.begin(), static_cast<DispatchKey>(1));
ASSERT_TRUE(empty_set.begin() == empty_set.end());
ASSERT_TRUE(full_set.begin() != full_set.end());
@ -326,37 +90,16 @@ TEST(DispatchKeySet, IteratorEmpty) {
ASSERT_EQ(i, 0);
}
TEST(DispatchKeySet, IteratorCrossProduct) {
// The iterator should return all runtime keys in the set,
// including the cross product of {backends} x {functionalities}
auto ks =
DispatchKeySet({BackendComponent::CPUBit, BackendComponent::CUDABit}) |
DispatchKeySet(
{DispatchKey::Dense,
DispatchKey::FPGA,
DispatchKey::AutogradFunctionality});
auto iter = ks.begin();
// iterate through dense backends first.
ASSERT_EQ(DispatchKey::CPU, *(iter++));
ASSERT_EQ(DispatchKey::CUDA, *(iter++));
// FPGA doesn't have a backend bit, so it isn't included in the cross product.
ASSERT_EQ(DispatchKey::FPGA, *(iter++));
// iterate through the autograd keys last.
ASSERT_EQ(DispatchKey::AutogradCPU, *(iter++));
ASSERT_EQ(DispatchKey::AutogradCUDA, *(iter++));
}
TEST(DispatchKeySet, IteratorFull) {
DispatchKeySet full_set(DispatchKeySet::FULL);
uint8_t i = 0;
for (const auto& it : full_set) {
i++;
ASSERT_TRUE(it == static_cast<DispatchKey>(i));
ASSERT_TRUE(it != DispatchKey::NumDispatchKeys);
}
// Total # of runtime entries includes an entry for DispatchKey::Undefined,
// which is not included when iterating through the DispatchKeySet.
ASSERT_EQ(i, num_runtime_entries - 1);
ASSERT_EQ(i, static_cast<uint8_t>(DispatchKey::NumDispatchKeys) - 1);
}
TEST(DispatchKeySet, IteratorRangeFull) {
@ -365,61 +108,41 @@ TEST(DispatchKeySet, IteratorRangeFull) {
for (DispatchKey dispatch_key : full_set) {
i++;
ASSERT_TRUE(dispatch_key == static_cast<DispatchKey>(i));
}
// Total # of runtime entries includes an entry for DispatchKey::Undefined,
// which is not included when iterating through the DispatchKeySet.
ASSERT_EQ(i, num_runtime_entries - 1);
ASSERT_EQ(i, static_cast<uint8_t>(DispatchKey::NumDispatchKeys) - 1);
}
TEST(DispatchKeySet, SpecificKeys) {
DispatchKeySet keyset({
static_cast<DispatchKey>(0), // Undefined should be ignored
static_cast<DispatchKey>(4),
static_cast<DispatchKey>(10),
static_cast<DispatchKey>(15),
});
std::unordered_set<DispatchKey> visited_keys;
for (DispatchKey key : keyset) {
visited_keys.insert(key);
}
ASSERT_EQ(visited_keys.size(), 3);
ASSERT_TRUE(
visited_keys.find(static_cast<DispatchKey>(4)) != visited_keys.end());
ASSERT_TRUE(
visited_keys.find(static_cast<DispatchKey>(10)) != visited_keys.end());
ASSERT_TRUE(
visited_keys.find(static_cast<DispatchKey>(15)) != visited_keys.end());
}
TEST(DispatchKeySet, FailAtEndIterator) {
DispatchKeySet full_set(DispatchKeySet::FULL);
uint64_t raw_repr = full_set.raw_repr();
// doesn't throw
DispatchKeySet::iterator(&raw_repr, num_backends + num_functionality_keys);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
EXPECT_THROW(
DispatchKeySet::iterator(
&raw_repr, num_backends + num_functionality_keys + 1),
&raw_repr, static_cast<uint8_t>(DispatchKey::NumDispatchKeys) + 1),
c10::Error);
}
TEST(DispatchKeySet, TestKeyOrderingInvariants) {
for (uint8_t i = static_cast<uint8_t>(DispatchKey::StartOfDenseBackends);
i <= static_cast<uint8_t>(DispatchKey::EndOfRuntimeBackendKeys);
i++) {
auto k = static_cast<DispatchKey>(i);
// Note [The Ordering of Per-Backend Dispatch Keys Matters!]
// The DispatchKey enum includes all of the runtime keys for
// Dense/Sparse/Quantized/Autograd, (e.g. CPU, CUDA, SparseCPU, SparseCUDA,
// AutogradCPU, AutogradCUDA, etc). And we expect the ordering of those keys
// to be the same as the ordering of the backends in the `BackendComponent`
// enum. This makes several utilities in `DispatchKey.h` and
// `DispatchKeySet.h` significantly easier to implement. The purpose of the
// test is to assert (through CI) that this invariant is maintained.
//
// The only way that we can really check this invariant is by
// comparing the string names of each enum.
// We only really care about the ordering for "real" keys that are actually
// used, which we expect to be able to print properly. This saves us from
// having to enumerate the full set of possible runtime keys in
// DispatchKey::toString(). It also relies on toString() being implemented
// correctly.
auto functionality_str = std::string(toString(k));
if (functionality_str == "UNKNOWN_TENSOR_TYPE_ID")
continue;
auto computed_backend_k = toBackendComponent(k);
auto computed_backend_str = std::string(toString(computed_backend_k));
// Strip, e.g., the "Bit" from "CPUBit"
computed_backend_str =
computed_backend_str.substr(0, computed_backend_str.size() - 3);
ASSERT_TRUE(
functionality_str.find(computed_backend_str) != std::string::npos)
<< "DispatchKey invariant broken! Found a key that is not ordered correctly"
<< " with its backend bit. key = " << toString(k) << ", " << k
<< ", computed backend = " << toString(computed_backend_k);
}
}

View File

@ -12,7 +12,7 @@
// C10
// - Move file to `c10` namespace.
// - Remove macro use in line 478 because the nvcc device compiler cannot handle
// it it.
// it.
// - Revise constructor logic so that it is 1) consistent with c++ 17 standard
// documented here in (8):
// https://en.cppreference.com/w/cpp/utility/optional/optional, and 2) able to

View File

@ -15,13 +15,12 @@ bool DropoutOp<float, CPUContext>::RunOnDevice() {
return true;
} else {
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
float scale = 1. / (1. - ratio_);
float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);
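// e.g. with ratio_ = 0.75 the surviving activations are scaled by
// 1 / (1 - 0.75) = 4 so the expected value matches eval mode; with
// ratio_ = 1.0 everything is dropped, so the scale is forced to 0 rather
// than dividing by zero.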
// mask=true means keep, and mask=false means not keep, so we will
// generate probability depending on 1-ratio.
at::bernoulli_distribution<double> dist(1. - ratio_);
const float* Xdata = X.data<float>();
float* Ydata = Y->template mutable_data<float>();
auto mask = Output(1, X.sizes(), at::dtype<bool>());
bool* mask_data = mask->template mutable_data<bool>();
auto* gen = context_.RandGenerator();
@ -52,7 +51,7 @@ bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
const bool* mask_data = mask.data<bool>();
float* dXdata = dX->template mutable_data<float>();
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
float scale = 1. / (1. - ratio_);
float scale = ratio_ >= 1.0 ? 0.0:1. / (1. - ratio_);
for (int i = 0; i < dY.numel(); ++i) {
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
dXdata[i] = dYdata[i] * mask_data[i] * scale;
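The guarded scale in the two hunks above avoids a division by zero when ratio_ == 1. A minimal NumPy sketch of the same inverted-dropout scaling (my own illustration, not the Caffe2 code):

import numpy as np

def dropout_forward(x, ratio, seed=0):
    # Inverted dropout: kept activations are scaled by 1 / (1 - ratio) so the
    # expected value is unchanged at train time. When ratio == 1 everything is
    # dropped, so the scale is forced to 0 instead of dividing by zero.
    scale = 0.0 if ratio >= 1.0 else 1.0 / (1.0 - ratio)
    mask = np.random.default_rng(seed).random(x.shape) < (1.0 - ratio)  # True = keep
    return x * mask * scale, mask

y, mask = dropout_forward(np.ones((2, 3), dtype=np.float32), ratio=1.0)
assert not y.any()  # with ratio=1 the output is deterministically all zeros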

View File

@ -19,7 +19,6 @@ class DropoutOp final : public Operator<Context> {
is_test_(
this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
CAFFE_ENFORCE_GE(ratio_, 0);
CAFFE_ENFORCE_LT(ratio_, 1);
}
bool RunOnDevice() override;
@ -41,7 +40,6 @@ class DropoutGradientOp final : public Operator<Context> {
is_test_(
this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
CAFFE_ENFORCE_GE(ratio_, 0);
CAFFE_ENFORCE_LT(ratio_, 1);
}
bool RunOnDevice() override;

View File

@ -74,3 +74,35 @@ class TestDropout(serial.SerializedTestCase):
gc, op, [X], reference_dropout_ratio0,
# Don't check the mask with cuDNN because it's packed data
outputs_to_check=None if engine != 'CUDNN' else [0])
@given(X=hu.tensor(),
in_place=st.booleans(),
output_mask=st.booleans(),
engine=st.sampled_from(["", "CUDNN"]),
**hu.gcs)
@settings(deadline=10000)
def test_dropout_ratio1(self, X, in_place, output_mask, engine, gc, dc):
"""Test with ratio=0 for a deterministic reference impl."""
if in_place:
# Skip if trying in-place on GPU
assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP})
# If in-place on CPU, don't compare with GPU
dc = dc[:1]
is_test = not output_mask
op = core.CreateOperator("Dropout", ["X"],
["X" if in_place else "Y"] +
(["mask"] if output_mask else []),
ratio=1.0, engine=engine,
is_test=is_test)
self.assertDeviceChecks(dc, op, [X], [0])
if not is_test:
self.assertGradientChecks(gc, op, [X], 0, [0])
def reference_dropout_ratio1(x):
return (x,) if is_test else (np.zeros(x.shape, dtype=np.float), np.zeros(x.shape, dtype=np.bool))
self.assertReferenceChecks(
gc, op, [X], reference_dropout_ratio1,
# Don't check the mask with cuDNN because it's packed data
outputs_to_check=None if engine != 'CUDNN' else [0])

View File

@ -385,7 +385,7 @@ class TestSequenceOps(serial.SerializedTestCase):
["shrunk_data"])
def op_ref(data, indices):
unique_indices = np.unique(indices)
unique_indices = np.unique(indices) if len(indices)>0 else np.array([],dtype=np.int64)
sorted_indices = np.sort(unique_indices)
shrunk_data = np.delete(data, sorted_indices, axis=0)
return (shrunk_data,)
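The len(indices) > 0 guard above is presumably needed because np.unique of an empty Python list comes back as a float64 array, which is not a valid index dtype for np.delete; a quick sketch of the difference (my own illustration, not from the PR):

import numpy as np

data = np.arange(12.0).reshape(4, 3)

empty = np.unique([])                # dtype is float64, not an integer type
safe = np.array([], dtype=np.int64)  # what the guarded branch produces instead

# np.delete with an empty *integer* index array is a clean no-op:
assert np.array_equal(np.delete(data, np.sort(safe), axis=0), data)

# whereas recent NumPy versions reject a float64 index array
# (older versions only emitted a deprecation warning):
try:
    np.delete(data, empty, axis=0)
except IndexError:
    pass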

View File

@ -110,22 +110,28 @@ constexpr uint64_t kMinProducedFileFormatVersion = 0x3L;
// 0x2L: (Comment missing)
// 0x3L: (Comment missing)
// 0x4L: (update) Added schema to function tuple. Forward-compatible change.
// 0x5L: (update) Update bytecode is sharing constant tensor files from torchscript, and only serialize
// extra tensors that are not in the torchscript constant table. Also update tensor storage schema adapting
// to the unify format, the root key of tensor storage is updated from {index} to
// {the_pointer_value_the_tensor.storage}, for example: `140245072983168.storage`
// Forward-compatibility change.
// 0x6L: Implicit operator versioning using the number of specified arguments.
// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845
// for details.
// 0x7L: Enable support for operators with default arguments plus out arguments.
constexpr uint64_t kProducedBytecodeVersion = 0x7L;
// 0x5L: (update) Bytecode now shares constant tensor files with TorchScript
// and only serializes extra tensors that are not in the TorchScript constant
// table. Also updates the tensor storage schema to the unified format: the
// root key of tensor storage changes from {index} to
// {the_pointer_value_the_tensor.storage}, for example
// `140245072983168.storage`. Forward-compatibility change.
// 0x6L: Implicit operator versioning using the number of specified arguments.
// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 for
// details.
// 0x7L: Enable support for operators with default arguments plus out
// arguments.
// 0x8L: Emit promoted operators as instructions
constexpr uint64_t kProducedBytecodeVersion = 0x8L;
// static_assert(
// kProducedBytecodeVersion >= kProducedFileFormatVersion,
// "kProducedBytecodeVersion must be higher or equal to
// kProducedFileFormatVersion.");
// Introduce kMinSupportedBytecodeVersion and kMaxSupportedBytecodeVersion
// for limited backward/forward compatibility support of bytecode. If
// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion (in loader),
// we should support this model_version. For example, we provide a wrapper to
// handle an updated operator.
// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion
// (in loader), we should support this model_version. For example, we provide a
// wrapper to handle an updated operator.
constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L;
constexpr uint64_t kMaxSupportedBytecodeVersion = 0x8L;
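The supported-version window described above amounts to a simple range check in the loader; a minimal Python sketch (my own, mirroring the constants in the header but not the actual loader code):

K_MIN_SUPPORTED_BYTECODE_VERSION = 0x3
K_MAX_SUPPORTED_BYTECODE_VERSION = 0x8

def check_bytecode_version(model_version: int) -> None:
    # A runtime only loads models whose bytecode version it knows how to
    # interpret, possibly through wrappers/adapters for older operators.
    if not (K_MIN_SUPPORTED_BYTECODE_VERSION
            <= model_version
            <= K_MAX_SUPPORTED_BYTECODE_VERSION):
        raise RuntimeError(
            f"Unsupported bytecode version {model_version:#x}; this runtime "
            f"supports {K_MIN_SUPPORTED_BYTECODE_VERSION:#x}.."
            f"{K_MAX_SUPPORTED_BYTECODE_VERSION:#x}")

check_bytecode_version(0x7)    # fine
# check_bytecode_version(0x9)  # would raise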

View File

@ -243,7 +243,6 @@ coverage_missing_automodule = [
"torch.fft",
"torch.for_onnx",
"torch.fx.experimental",
"torch.fx.experimental.fx_acc",
"torch.fx.experimental.unification",
"torch.fx.experimental.unification.multipledispatch",
"torch.fx.passes",
@ -326,14 +325,11 @@ release = 'master'
# Customized html_title here.
# Default is " ".join(project, release, "documentation") if not set
if RELEASE:
# remove hash (start with 'a') from version number if any
version_end = torch_version.find('a')
if version_end == -1:
html_title = " ".join((project, torch_version, "documentation"))
version = torch_version
else:
html_title = " ".join((project, torch_version[:version_end], "documentation"))
version = torch_version[:version_end]
# Turn 1.11.0aHASH into 1.11
# Note: the release candidates should no longer have the aHASH suffix, but in any
# case we wish to leave only major.minor, even for rc builds.
version = '.'.join(torch_version.split('.')[:2])
html_title = " ".join((project, version, "documentation"))
release = version
# The language for content autogenerated by Sphinx. Refer to documentation
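As a quick sanity check of the simplified major.minor computation above (the input strings are made-up examples):

for torch_version in ("1.11.0a0+git1234567", "1.11.0", "1.12.0rc1"):
    version = '.'.join(torch_version.split('.')[:2])
    print(torch_version, "->", version)  # -> 1.11, 1.11, 1.12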

View File

@ -355,7 +355,7 @@ Extending :mod:`torch` with a :class:`Tensor`-like type
.. note:: This functionality is inspired by the NumPy ``__array_function__``
protocol. See `the NumPy documentation
<https://docs.scipy.org/doc/numpy/user/basics.dispatch.html#basics-dispatch>`_
<https://numpy.org/doc/stable/user/basics.dispatch.html#basics-dispatch>`_
and `NEP-0018
<https://numpy.org/neps/nep-0018-array-function-protocol.html>`_ for
more details.
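Below is a minimal sketch of the __torch_function__ hook on a Tensor-like wrapper that this note refers to; the WrappedTensor class and its unwrap logic are my own illustration (and it ignores the types argument for brevity), not the example used elsewhere on this page:

import torch

class WrappedTensor:
    """Illustrative Tensor-like type; stores a real Tensor internally."""
    def __init__(self, data):
        self.data = torch.as_tensor(data)

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        unwrap = lambda x: x.data if isinstance(x, WrappedTensor) else x
        out = func(*map(unwrap, args), **{k: unwrap(v) for k, v in kwargs.items()})
        return WrappedTensor(out) if isinstance(out, torch.Tensor) else out

w = WrappedTensor([1.0, 2.0])
result = torch.add(w, torch.ones(2))  # dispatches to WrappedTensor.__torch_function__
assert isinstance(result, WrappedTensor)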

View File

@ -72,7 +72,12 @@ class BackendWithCompiler : public PyTorchBackendInterface {
return true;
}
// Since the actual compilation is done AOT,
// Since the actual compilation is done AOT for this backend, compile just
// forwards everything along. In a non-toy setup this could grab information
// from the runtime that might be relevant to execution, such as build flags,
// the resolution of the device's camera, or basically any runtime-specific
// information that wouldn't be available server-side, where preprocess is
// called.
c10::impl::GenericDict compile(
c10::IValue processed,
c10::impl::GenericDict method_compile_spec) override {
@ -86,8 +91,14 @@ class BackendWithCompiler : public PyTorchBackendInterface {
return c10::impl::toGenericDict(handles);
}
// Function that actually executes the model in the backend. Here there is
// nothing to dispatch to, so the backend is implemented locally within
// execute and it only supports add, subtract, and constant. In a non-toy
// backend you can imagine how this function could be used to actually
// dispatch the inputs to the relevant backend/device.
c10::impl::GenericList execute(
c10::IValue handle,
c10::IValue
handle, // example: [('prim::Constant#1', 14), ('aten::add', 15)]
c10::impl::GenericList inputs) override {
TORCH_INTERNAL_ASSERT(inputs.size() == 2);
c10::IValue val0 = inputs[0];
@ -107,6 +118,7 @@ class BackendWithCompiler : public PyTorchBackendInterface {
auto start_time_us = torch::profiler::impl::getTime() / 1000;
try {
if (instruction.rfind("prim::Constant", 0) == 0) {
// 15 is the length of 'prim::Constant#'; the constant value comes after it
TORCH_CHECK(
instruction.size() > 15,
"Constant value is expected in ",

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff