Merge remote-tracking branch 'upstream/viable/strict' into mkl-spmmd

Ivan Yashchuk 2022-02-23 09:58:31 +00:00
commit f77783c374
133 changed files with 6069 additions and 2660 deletions

.circleci/config.yml generated
View File

@ -847,6 +847,7 @@ jobs:
<<: *binary_mac_params
macos:
xcode: "12.0"
resource_class: "large"
steps:
# See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
- checkout

View File

@ -161,6 +161,7 @@
<<: *binary_mac_params
macos:
xcode: "12.0"
resource_class: "large"
steps:
# See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
- checkout

View File

@ -1,48 +1,54 @@
[
{
- "name": "ONNX exporter",
- "patterns": [
- "torch/onnx/**",
- "torch/csrc/jit/passes/onnx/**",
- "torch/csrc/jit/passes/onnx.*",
- "test/onnx/**",
- "docs/source/onnx.rst",
- "torch/csrc/jit/serialization/export.*",
- "torch/csrc/jit/serialization/onnx.*",
- "torch/_C/__init__.pyi.in",
- "torch/csrc/onnx/**"
+ "name": "ONNX exporter",
+ "patterns": [
+ "torch/onnx/**",
+ "torch/csrc/jit/passes/onnx/**",
+ "torch/csrc/jit/passes/onnx.*",
+ "test/onnx/**",
+ "docs/source/onnx.rst",
+ "torch/csrc/jit/serialization/export.*",
+ "torch/csrc/jit/serialization/onnx.*",
+ "torch/_C/__init__.pyi.in",
+ "torch/csrc/onnx/**"
],
- "approved_by": ["BowenBao", "garymm"],
- "mandatory_app_id": 12274
+ "approved_by": ["BowenBao", "garymm"],
+ "mandatory_app_id": 12274
},
{
- "name": "NVFuser",
- "patterns": ["torch/csrc/jit/codegen/fuser/cuda/**", "torch/csrc/jit/codegen/cuda/**", "benchmarks/cpp/nvfuser/**"],
- "approved_by": ["csarofeen", "ngimel"],
- "mandatory_app_id": 12274
+ "name": "NVFuser",
+ "patterns": ["torch/csrc/jit/codegen/fuser/cuda/**", "torch/csrc/jit/codegen/cuda/**", "benchmarks/cpp/nvfuser/**"],
+ "approved_by": ["csarofeen", "ngimel"],
+ "mandatory_app_id": 12274
},
{
- "name": "OSS CI",
- "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**", "tools/**"],
- "approved_by": ["seemethere", "malfet", "suo", "janeyx99", "ezyang"],
- "mandatory_app_id": 12274
+ "name": "OSS CI",
+ "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**", "tools/**"],
+ "approved_by": ["janeyx99", "ezyang"],
+ "mandatory_app_id": 12274
},
{
"name": "Documentation",
"patterns": ["docs/**", "torch/*docs.py"],
- "approved_by": ["mruberry", "ngimel", "albanD", "janeyx99"],
+ "approved_by": ["mruberry", "ngimel", "janeyx99"],
"mandatory_app_id": 12274
},
{
"name": "Android",
"patterns": ["android/**"],
- "approved_by": ["linbinyu", "kit1980", "IvanKobzarev", "malfet"],
+ "approved_by": ["linbinyu", "kit1980", "IvanKobzarev"],
"mandatory_app_id": 12274
},
{
"name": "iOS",
"patterns": ["ios/**"],
- "approved_by": ["linbinyu", "kit1980", "xta0", "malfet", "hanton"],
+ "approved_by": ["linbinyu", "kit1980", "xta0", "hanton"],
"mandatory_app_id": 12274
},
+ {
+ "name": "superuser",
+ "patterns": ["*"],
+ "approved_by": ["albanD", "jbschlosser", "suo", "osalpekar", "malfet", "seemethere"],
+ "mandatory_app_id": 12274
+ }
]

View File

@ -6,6 +6,10 @@
{%- set squid_no_proxy = "localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%}
{%- set timeout_minutes = 240 -%}
# NOTE: If testing pytorch/builder changes you can change this variable to change what pytorch/builder reference
# the binary builds will check out
{%- set builder_branch = "main" -%}
{%- macro concurrency(build_environment) -%}
concurrency:
group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -191,7 +195,9 @@ concurrency:
- name: Checkout !{{ 'PyTorch' if repository == "pytorch/pytorch" else repository }}
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
{%- if checkout_pr_head %}
{%- if branch %}
ref: !{{ branch }}
{%- elif checkout_pr_head %}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
{%- endif %}
{%- if deep_clone %}
@ -202,9 +208,6 @@ concurrency:
{%- if repository != "pytorch/pytorch" %}
repository: !{{ repository }}
{%- endif %}
{%- if branch %}
ref: !{{ branch }}
{%- endif %}
{%- if directory %}
path: !{{ directory }}
{%- endif %}
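
Context for the template change above: builder_branch is defined once in this common template and passed as branch=common.builder_branch wherever the checkout macro clones pytorch/builder, so the generated workflows pin the builder repository to main instead of whatever ref checkout_pr_head would otherwise select. A rough sketch of what such a macro call expands to in the generated YAML, pieced together from the generated workflow files later in this commit (the exact expansion depends on the full macro body):

- name: Checkout pytorch/builder
  uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
  with:
    ref: main                  # from builder_branch
    submodules: recursive
    repository: pytorch/builder
    path: builder
- name: Clean pytorch/builder checkout
  run: |
    # Remove any artifacts from the previous checkouts
    git clean -fxd
  working-directory: builder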

View File

@ -53,7 +53,7 @@ jobs:
steps:
!{{ common.setup_ec2_linux() }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", checkout_pr_head=False) }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
{%- if config["gpu_arch_type"] == 'cuda' and config["gpu_arch_version"].startswith('11') %}
- name: Set BUILD_SPLIT_CUDA
run: |
@ -119,16 +119,8 @@ jobs:
with:
name: !{{ config["build_name"] }}
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
with:
path: pytorch
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
with:
repository: pytorch/builder
path: builder
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
{%- if config["gpu_arch_type"] == "cuda" %}
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
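
The binary-build templates now consume common.builder_branch when cloning pytorch/builder, so the builder ref is controlled in one place rather than in per-job actions/checkout@v2 steps. Per the NOTE added to the common template, someone testing pytorch/builder changes would point builder_branch at their own branch before regenerating the workflows; a minimal sketch, with a hypothetical branch name:

{# in the common template shown above; for local testing only, not part of this commit #}
{%- set builder_branch = "my-builder-test-branch" -%}

Every generated "Checkout pytorch/builder" step would then check out that ref instead of main.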

View File

@ -80,7 +80,7 @@ jobs:
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |

View File

@ -60,16 +60,8 @@ jobs:
steps:
!{{ common.setup_ec2_windows() }}
!{{ set_runner_specific_vars() }}
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
with:
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
- name: Populate binary env
shell: bash
run: |
@ -104,16 +96,8 @@ jobs:
with:
name: !{{ config["build_name"] }}
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
with:
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
- name: Populate binary env
shell: bash
run: |

View File

@ -111,6 +111,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -248,16 +249,29 @@ jobs:
with:
name: conda-py3_7-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -502,6 +516,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -640,16 +655,29 @@ jobs:
with:
name: conda-py3_7-cuda10_2
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -900,6 +928,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1041,16 +1070,29 @@ jobs:
with:
name: conda-py3_7-cuda11_1
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -1301,6 +1343,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1442,16 +1485,29 @@ jobs:
with:
name: conda-py3_7-cuda11_3
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -1702,6 +1758,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1843,16 +1900,29 @@ jobs:
with:
name: conda-py3_7-cuda11_5
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2102,6 +2172,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2239,16 +2310,29 @@ jobs:
with:
name: conda-py3_8-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -2493,6 +2577,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2631,16 +2716,29 @@ jobs:
with:
name: conda-py3_8-cuda10_2
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2891,6 +2989,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3032,16 +3131,29 @@ jobs:
with:
name: conda-py3_8-cuda11_1
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3292,6 +3404,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3433,16 +3546,29 @@ jobs:
with:
name: conda-py3_8-cuda11_3
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3693,6 +3819,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3834,16 +3961,29 @@ jobs:
with:
name: conda-py3_8-cuda11_5
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4093,6 +4233,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4230,16 +4371,29 @@ jobs:
with:
name: conda-py3_9-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -4484,6 +4638,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4622,16 +4777,29 @@ jobs:
with:
name: conda-py3_9-cuda10_2
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4882,6 +5050,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5023,16 +5192,29 @@ jobs:
with:
name: conda-py3_9-cuda11_1
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5283,6 +5465,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5424,16 +5607,29 @@ jobs:
with:
name: conda-py3_9-cuda11_3
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5684,6 +5880,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5825,16 +6022,29 @@ jobs:
with:
name: conda-py3_9-cuda11_5
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6084,6 +6294,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6221,16 +6432,29 @@ jobs:
with:
name: conda-py3_10-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -6475,6 +6699,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6613,16 +6838,29 @@ jobs:
with:
name: conda-py3_10-cuda10_2
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6873,6 +7111,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7014,16 +7253,29 @@ jobs:
with:
name: conda-py3_10-cuda11_1
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7274,6 +7526,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7415,16 +7668,29 @@ jobs:
with:
name: conda-py3_10-cuda11_3
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7675,6 +7941,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7816,16 +8083,29 @@ jobs:
with:
name: conda-py3_10-cuda11_5
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |

View File

@ -112,6 +112,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -250,16 +251,29 @@ jobs:
with:
name: libtorch-cpu-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -505,6 +519,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -643,16 +658,29 @@ jobs:
with:
name: libtorch-cpu-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -898,6 +926,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1036,16 +1065,29 @@ jobs:
with:
name: libtorch-cpu-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -1291,6 +1333,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1429,16 +1472,29 @@ jobs:
with:
name: libtorch-cpu-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -1685,6 +1741,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1824,16 +1881,29 @@ jobs:
with:
name: libtorch-cuda10_2-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2086,6 +2156,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2225,16 +2296,29 @@ jobs:
with:
name: libtorch-cuda10_2-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2487,6 +2571,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2626,16 +2711,29 @@ jobs:
with:
name: libtorch-cuda10_2-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2888,6 +2986,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3027,16 +3126,29 @@ jobs:
with:
name: libtorch-cuda10_2-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3289,6 +3401,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3431,16 +3544,29 @@ jobs:
with:
name: libtorch-cuda11_1-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3693,6 +3819,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3835,16 +3962,29 @@ jobs:
with:
name: libtorch-cuda11_1-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4097,6 +4237,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4239,16 +4380,29 @@ jobs:
with:
name: libtorch-cuda11_1-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4501,6 +4655,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4643,16 +4798,29 @@ jobs:
with:
name: libtorch-cuda11_1-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4905,6 +5073,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5047,16 +5216,29 @@ jobs:
with:
name: libtorch-cuda11_3-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5309,6 +5491,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5451,16 +5634,29 @@ jobs:
with:
name: libtorch-cuda11_3-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5713,6 +5909,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5855,16 +6052,29 @@ jobs:
with:
name: libtorch-cuda11_3-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6117,6 +6327,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6259,16 +6470,29 @@ jobs:
with:
name: libtorch-cuda11_3-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6521,6 +6745,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6663,16 +6888,29 @@ jobs:
with:
name: libtorch-cuda11_5-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6925,6 +7163,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7067,16 +7306,29 @@ jobs:
with:
name: libtorch-cuda11_5-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7329,6 +7581,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7471,16 +7724,29 @@ jobs:
with:
name: libtorch-cuda11_5-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7733,6 +7999,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7875,16 +8142,29 @@ jobs:
with:
name: libtorch-cuda11_5-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |

View File

@ -112,6 +112,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -250,16 +251,29 @@ jobs:
with:
name: libtorch-cpu-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -505,6 +519,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -643,16 +658,29 @@ jobs:
with:
name: libtorch-cpu-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -898,6 +926,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1036,16 +1065,29 @@ jobs:
with:
name: libtorch-cpu-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -1291,6 +1333,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1429,16 +1472,29 @@ jobs:
with:
name: libtorch-cpu-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -1685,6 +1741,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1824,16 +1881,29 @@ jobs:
with:
name: libtorch-cuda10_2-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2086,6 +2156,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2225,16 +2296,29 @@ jobs:
with:
name: libtorch-cuda10_2-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2487,6 +2571,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2626,16 +2711,29 @@ jobs:
with:
name: libtorch-cuda10_2-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2888,6 +2986,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3027,16 +3126,29 @@ jobs:
with:
name: libtorch-cuda10_2-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3289,6 +3401,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3431,16 +3544,29 @@ jobs:
with:
name: libtorch-cuda11_1-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3693,6 +3819,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3835,16 +3962,29 @@ jobs:
with:
name: libtorch-cuda11_1-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4097,6 +4237,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4239,16 +4380,29 @@ jobs:
with:
name: libtorch-cuda11_1-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4501,6 +4655,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4643,16 +4798,29 @@ jobs:
with:
name: libtorch-cuda11_1-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4905,6 +5073,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5047,16 +5216,29 @@ jobs:
with:
name: libtorch-cuda11_3-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5309,6 +5491,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5451,16 +5634,29 @@ jobs:
with:
name: libtorch-cuda11_3-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5713,6 +5909,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5855,16 +6052,29 @@ jobs:
with:
name: libtorch-cuda11_3-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6117,6 +6327,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6259,16 +6470,29 @@ jobs:
with:
name: libtorch-cuda11_3-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6521,6 +6745,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6663,16 +6888,29 @@ jobs:
with:
name: libtorch-cuda11_5-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6925,6 +7163,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7067,16 +7306,29 @@ jobs:
with:
name: libtorch-cuda11_5-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7329,6 +7581,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7471,16 +7724,29 @@ jobs:
with:
name: libtorch-cuda11_5-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7733,6 +7999,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7875,16 +8142,29 @@ jobs:
with:
name: libtorch-cuda11_5-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |

File diff suppressed because it is too large

View File

@ -87,7 +87,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -284,7 +284,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -481,7 +481,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -87,7 +87,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -284,7 +284,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -481,7 +481,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -678,7 +678,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -85,7 +85,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -282,7 +282,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -479,7 +479,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -676,7 +676,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -90,7 +90,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -293,7 +293,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -496,7 +496,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -699,7 +699,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -90,7 +90,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -293,7 +293,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -496,7 +496,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -699,7 +699,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -85,7 +85,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -282,7 +282,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -479,7 +479,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -676,7 +676,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -14,9 +14,16 @@ repositories {
jcenter()
}
# lite interpreter build
dependencies {
implementation 'org.pytorch:pytorch_android:1.6.0'
implementation 'org.pytorch:pytorch_android_torchvision:1.6.0'
implementation 'org.pytorch:pytorch_android_lite:1.10.0'
implementation 'org.pytorch:pytorch_android_torchvision_lite:1.10.0'
}
# full jit build
dependencies {
implementation 'org.pytorch:pytorch_android:1.10.0'
implementation 'org.pytorch:pytorch_android_torchvision:1.10.0'
}
```
@ -32,6 +39,15 @@ repositories {
}
}
# lite interpreter build
dependencies {
...
implementation 'org.pytorch:pytorch_android_lite:1.12.0-SNAPSHOT'
implementation 'org.pytorch:pytorch_android_torchvision_lite:1.12.0-SNAPSHOT'
...
}
# full jit build
dependencies {
...
implementation 'org.pytorch:pytorch_android:1.12.0-SNAPSHOT'
@ -68,7 +84,7 @@ They are specified as environment variables:
`ANDROID_HOME` - path to [Android SDK](https://developer.android.com/studio/command-line/sdkmanager.html)
`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk)
`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk). It's recommended to use NDK 21.x.
`GRADLE_HOME` - path to [gradle](https://gradle.org/releases/)
@ -133,7 +149,7 @@ android {
}
dependencies {
extractForNativeBuild('org.pytorch:pytorch_android:1.6.0')
extractForNativeBuild('org.pytorch:pytorch_android:1.10.0')
}
task extractAARForNativeBuild {

View File

@ -29,7 +29,8 @@ check_gradle() {
}
parse_abis_list() {
ABIS_LIST="x86"
# sync with https://github.com/pytorch/pytorch/blob/0ca0e02685a9d033ac4f04e2fa5c8ba6dbc5ae50/android/gradle.properties#L1
ABIS_LIST="armeabi-v7a,arm64-v8a,x86,x86_64"
CUSTOM_ABIS_LIST=false
if [ $# -gt 0 ]; then
ABIS_LIST=$1

View File

@ -50,7 +50,17 @@ android {
}
androidTest {
java {
exclude 'org/pytorch/PytorchHostTests.java'
if(System.env.BUILD_LITE_INTERPRETER == '0') {
println 'Build test for full jit (pytorch_jni)'
exclude 'org/pytorch/PytorchHostTests.java'
exclude 'org/pytorch/PytorchLiteInstrumentedTests.java'
exclude 'org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java'
} else {
println 'Build test for lite interpreter (pytorch_jni_lite)'
exclude 'org/pytorch/PytorchHostTests.java'
exclude 'org/pytorch/PytorchInstrumentedTests.java'
exclude 'org/pytorch/suite/PytorchInstrumentedTestSuite.java'
}
}
}
}

View File

@ -1,4 +1,6 @@
import torch
from torch import Tensor
from typing import Dict, List, Tuple, Optional
OUTPUT_DIR = "src/androidTest/assets/"
@ -7,7 +9,8 @@ def scriptAndSave(module, fileName):
script_module = torch.jit.script(module)
print(script_module.graph)
outputFileName = OUTPUT_DIR + fileName
script_module.save(outputFileName)
# note that the lite interpreter model can also be used in full JIT
script_module._save_for_lite_interpreter(outputFileName)
print("Saved to " + outputFileName)
print('=' * 80)
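For context on this change: the test-asset generator now saves models in the lite-interpreter format, which (per the comment above) the full JIT can still load. A minimal standalone sketch of the same export flow; the `AddOne` module and the `add_one.ptl` path are hypothetical, while `torch.jit.script` and `_save_for_lite_interpreter` are the calls used in the diff:

```python
import torch

class AddOne(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + 1

# Script the module, then save it in the lite-interpreter format.
# The resulting file loads under the lite interpreter (LiteModuleLoader
# on Android) and under the full JIT as well.
scripted = torch.jit.script(AddOne())
scripted._save_for_lite_interpreter("add_one.ptl")
```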

View File

@ -25,6 +25,7 @@ sourceSets {
java {
srcDir '../src/androidTest/java'
exclude '**/PytorchInstrumented*'
exclude '**/PytorchLiteInstrumented*'
}
resources.srcDirs = ["../src/androidTest/assets"]
}

View File

@ -10,7 +10,11 @@ import java.util.Objects;
public class PytorchHostTests extends PytorchTestBase {
@Override
protected String assetFilePath(String assetName) throws IOException {
protected Module loadModel(String path) throws IOException {
return Module.load(assetFilePath(path));
}
private String assetFilePath(String assetName) throws IOException {
Path tempFile = Files.createTempFile("test", ".pt");
try (InputStream resource =
Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream("test.pt"))) {

View File

@ -14,7 +14,11 @@ import org.junit.runner.RunWith;
public class PytorchInstrumentedTests extends PytorchTestBase {
@Override
protected String assetFilePath(String assetName) throws IOException {
protected Module loadModel(String path) throws IOException {
return Module.load(assetFilePath(path));
}
private String assetFilePath(String assetName) throws IOException {
final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext();
File file = new File(appContext.getFilesDir(), assetName);
if (file.exists() && file.length() > 0) {
@ -35,4 +39,5 @@ public class PytorchInstrumentedTests extends PytorchTestBase {
throw e;
}
}
}

View File

@ -0,0 +1,46 @@
package org.pytorch;
import android.content.Context;
import androidx.test.InstrumentationRegistry;
import androidx.test.runner.AndroidJUnit4;
import org.junit.runner.RunWith;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
@RunWith(AndroidJUnit4.class)
public class PytorchLiteInstrumentedTests extends PytorchTestBase {
@Override
protected Module loadModel(String path) throws IOException {
return LiteModuleLoader.load(assetFilePath(path));
}
private String assetFilePath(String assetName) throws IOException {
final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext();
File file = new File(appContext.getFilesDir(), assetName);
if (file.exists() && file.length() > 0) {
return file.getAbsolutePath();
}
try (InputStream is = appContext.getAssets().open(assetName)) {
try (OutputStream os = new FileOutputStream(file)) {
byte[] buffer = new byte[4 * 1024];
int read;
while ((read = is.read(buffer)) != -1) {
os.write(buffer, 0, read);
}
os.flush();
}
return file.getAbsolutePath();
} catch (IOException e) {
throw e;
}
}
}

View File

@ -16,7 +16,7 @@ public abstract class PytorchTestBase {
@Test
public void testForwardNull() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue input = IValue.from(Tensor.fromBlob(Tensor.allocateByteBuffer(1), new long[] {1}));
assertTrue(input.isTensor());
final IValue output = module.forward(input);
@ -25,7 +25,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqBool() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
for (boolean value : new boolean[] {false, true}) {
final IValue input = IValue.from(value);
assertTrue(input.isBool());
@ -38,7 +38,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqInt() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
for (long value : new long[] {Long.MIN_VALUE, -1024, -1, 0, 1, 1024, Long.MAX_VALUE}) {
final IValue input = IValue.from(value);
assertTrue(input.isLong());
@ -51,7 +51,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqFloat() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
double[] values =
new double[] {
-Double.MAX_VALUE,
@ -86,7 +86,7 @@ public abstract class PytorchTestBase {
}
final Tensor inputTensor = Tensor.fromBlob(inputTensorData, inputTensorShape);
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue input = IValue.from(inputTensor);
assertTrue(input.isTensor());
assertTrue(inputTensor == input.toTensor());
@ -103,7 +103,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqDictIntKeyIntValue() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final Map<Long, IValue> inputMap = new HashMap<>();
inputMap.put(Long.MIN_VALUE, IValue.from(-Long.MIN_VALUE));
@ -127,7 +127,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqDictStrKeyIntValue() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final Map<String, IValue> inputMap = new HashMap<>();
inputMap.put("long_min_value", IValue.from(Long.MIN_VALUE));
@ -151,7 +151,7 @@ public abstract class PytorchTestBase {
@Test
public void testListIntSumReturnTuple() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
for (int n : new int[] {0, 1, 128}) {
long[] a = new long[n];
@ -178,7 +178,7 @@ public abstract class PytorchTestBase {
@Test
public void testOptionalIntIsNone() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
assertFalse(module.runMethod("optionalIntIsNone", IValue.from(1l)).toBool());
assertTrue(module.runMethod("optionalIntIsNone", IValue.optionalNull()).toBool());
@ -186,7 +186,7 @@ public abstract class PytorchTestBase {
@Test
public void testIntEq0None() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
assertTrue(module.runMethod("intEq0None", IValue.from(0l)).isNull());
assertTrue(module.runMethod("intEq0None", IValue.from(1l)).toLong() == 1l);
@ -194,7 +194,7 @@ public abstract class PytorchTestBase {
@Test(expected = IllegalArgumentException.class)
public void testRunUndefinedMethod() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
module.runMethod("test_undefined_method_throws_exception");
}
@ -241,7 +241,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqString() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
String[] values =
new String[] {
"smoketest",
@ -260,7 +260,7 @@ public abstract class PytorchTestBase {
@Test
public void testStr3Concat() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
String[] values =
new String[] {
"smoketest",
@ -281,7 +281,7 @@ public abstract class PytorchTestBase {
@Test
public void testEmptyShape() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final long someNumber = 43;
final IValue input = IValue.from(Tensor.fromBlob(new long[] {someNumber}, new long[] {}));
final IValue output = module.runMethod("newEmptyShapeWithItem", input);
@ -293,7 +293,7 @@ public abstract class PytorchTestBase {
@Test
public void testAliasWithOffset() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue output = module.runMethod("testAliasWithOffset");
assertTrue(output.isTensorList());
Tensor[] tensors = output.toTensorList();
@ -303,7 +303,7 @@ public abstract class PytorchTestBase {
@Test
public void testNonContiguous() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue output = module.runMethod("testNonContiguous");
assertTrue(output.isTensor());
Tensor value = output.toTensor();
@ -316,7 +316,7 @@ public abstract class PytorchTestBase {
long[] inputShape = new long[] {1, 3, 2, 2};
long[] data = new long[] {1, 11, 101, 2, 12, 102, 3, 13, 103, 4, 14, 104};
Tensor inputNHWC = Tensor.fromBlob(data, inputShape, MemoryFormat.CHANNELS_LAST);
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue outputNCHW = module.runMethod("contiguous", IValue.from(inputNHWC));
assertIValueTensor(
outputNCHW,
@ -334,7 +334,7 @@ public abstract class PytorchTestBase {
long[] dataNHWDC = new long[] {1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16};
Tensor inputNHWDC = Tensor.fromBlob(dataNHWDC, shape, MemoryFormat.CHANNELS_LAST_3D);
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue outputNCHWD = module.runMethod("contiguous", IValue.from(inputNHWDC));
assertIValueTensor(outputNCHWD, MemoryFormat.CONTIGUOUS, shape, dataNCHWD);
@ -358,7 +358,7 @@ public abstract class PytorchTestBase {
long[] dataWeightOHWI = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1};
Tensor wNHWC = Tensor.fromBlob(dataWeightOHWI, weightShape, MemoryFormat.CHANNELS_LAST);
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue outputNCHW =
module.runMethod("conv2d", IValue.from(inputNCHW), IValue.from(wNCHW), IValue.from(false));
@ -389,5 +389,5 @@ public abstract class PytorchTestBase {
assertArrayEquals(expectedData, t.getDataAsLongArray());
}
protected abstract String assetFilePath(String assetName) throws IOException;
protected abstract Module loadModel(String assetName) throws IOException;
}

View File

@ -0,0 +1,9 @@
package org.pytorch.suite;
import org.junit.runner.RunWith;
import org.junit.runners.Suite;
import org.pytorch.PytorchLiteInstrumentedTests;
@RunWith(Suite.class)
@Suite.SuiteClasses({PytorchLiteInstrumentedTests.class})
public class PytorchLiteInstrumentedTestSuite {}

View File

@ -2,10 +2,18 @@
Provides the implementations of CUDA BLAS function templates.
*/
#include <ATen/ATen.h>
#include <ATen/cuda/CUDABlas.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/util/irange.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/macros/Export.h>
#include <c10/util/irange.h>
// cublasLt was introduced in CUDA 10.1, but we enable it only for 11.1, which
// also added bf16 support
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
#include <cublasLt.h>
#endif
#define CUDABLAS_POSINT_CHECK(FD, X) \
TORCH_CHECK( \
@ -540,6 +548,256 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
namespace {
// Following the pattern of CuSparseDescriptor
// Defined here for now because this is the only place the cublas_lt interface
// is used, but it can be moved to a header once the cublas_lt interface is
// used in multiple places.
template <typename T, cublasStatus_t (*destructor)(T*)>
struct CuBlasLtDeleter {
void operator()(T* x) {
if (x != nullptr) {
TORCH_CUDABLAS_CHECK(destructor(x));
}
}
};
template <typename T, cublasStatus_t (*destructor)(T*)>
class CuBlasLtDescriptor {
public:
T* descriptor() const {
return descriptor_.get();
}
T* descriptor() {
return descriptor_.get();
}
protected:
std::unique_ptr<T, CuBlasLtDeleter<T, destructor>> descriptor_;
};
class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
cublasLtMatmulDescOpaque_t,
&cublasLtMatmulDescDestroy> {
public:
CuBlasLtMatmulDescriptor(
cublasComputeType_t compute_type,
cudaDataType_t scale_type) {
cublasLtMatmulDesc_t raw_descriptor = nullptr;
TORCH_CUDABLAS_CHECK(
cublasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type));
descriptor_.reset(raw_descriptor);
}
};
class CuBlasLtMatrixLayout : public CuBlasLtDescriptor<
cublasLtMatrixLayoutOpaque_t,
&cublasLtMatrixLayoutDestroy> {
public:
CuBlasLtMatrixLayout(
cudaDataType_t type,
uint64_t rows,
uint64_t cols,
int64_t ld) {
cublasLtMatrixLayout_t raw_descriptor = nullptr;
TORCH_CUDABLAS_CHECK(
cublasLtMatrixLayoutCreate(&raw_descriptor, type, rows, cols, ld));
descriptor_.reset(raw_descriptor);
}
};
class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
cublasLtMatmulPreferenceOpaque_t,
&cublasLtMatmulPreferenceDestroy> {
public:
CuBlasLtMatmulPreference() {
cublasLtMatmulPreference_t raw_descriptor = nullptr;
TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceCreate(&raw_descriptor));
descriptor_.reset(raw_descriptor);
}
};
} // namespace
template <typename Dtype>
void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<Dtype> alpha_val,
const Dtype* mat1_ptr,
int64_t mat1_ld,
const Dtype* mat2_ptr,
int64_t mat2_ld,
const Dtype* bias,
Dtype* result_ptr,
int64_t result_ld) {
using opmath_t = at::opmath_type<Dtype>;
opmath_t beta_val = 0; // bias is added in epilogue
cudaDataType_t abcType = CUDA_R_32F;
cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
cudaDataType_t scaleType = CUDA_R_32F;
if (std::is_same<Dtype, double>::value) {
abcType = CUDA_R_64F;
computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F;
} else if (std::is_same<Dtype, float>::value) {
if (at::globalContext().allowTF32CuBLAS()) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
}
abcType = CUDA_R_32F;
} else if (std::is_same<Dtype, at::Half>::value) {
abcType = CUDA_R_16F;
} else if (std::is_same<Dtype, at::BFloat16>::value) {
abcType = CUDA_R_16BF;
}
CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
cublasOperation_t transa = transpose_mat1 ? CUBLAS_OP_T : CUBLAS_OP_N;
TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
computeDesc.descriptor(),
CUBLASLT_MATMUL_DESC_TRANSA,
&transa,
sizeof(transa)));
cublasOperation_t transb = transpose_mat2 ? CUBLAS_OP_T : CUBLAS_OP_N;
TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
computeDesc.descriptor(),
CUBLASLT_MATMUL_DESC_TRANSB,
&transb,
sizeof(transb)));
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS;
TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
computeDesc.descriptor(),
CUBLASLT_MATMUL_DESC_EPILOGUE,
&epilogue,
sizeof(epilogue)));
TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
computeDesc.descriptor(),
CUBLASLT_MATMUL_DESC_BIAS_POINTER,
&bias,
sizeof(Dtype*)));
CuBlasLtMatrixLayout Adesc(
abcType, transpose_mat1 ? k : m, transpose_mat1 ? m : k, mat1_ld);
CuBlasLtMatrixLayout Bdesc(
abcType, transpose_mat2 ? n : k, transpose_mat2 ? k : n, mat2_ld);
CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld);
CuBlasLtMatmulPreference preference;
size_t workspaceSize = 0;
TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(
preference.descriptor(),
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&workspaceSize,
sizeof(workspaceSize)));
auto workspace = at::empty(
{static_cast<int64_t>(workspaceSize)},
at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte));
cublasLtMatmulHeuristicResult_t heuristicResult = {};
int returnedResult = 0;
cublasLtHandle_t ltHandle =
reinterpret_cast<cublasLtHandle_t>(at::cuda::getCurrentCUDABlasHandle());
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
Adesc.descriptor(),
Bdesc.descriptor(),
Cdesc.descriptor(),
Cdesc.descriptor(),
preference.descriptor(),
1,
&heuristicResult,
&returnedResult));
if (returnedResult == 0) {
TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED);
}
TORCH_CUDABLAS_CHECK(cublasLtMatmul(
ltHandle,
computeDesc.descriptor(),
&alpha_val,
mat1_ptr,
Adesc.descriptor(),
mat2_ptr,
Bdesc.descriptor(),
&beta_val,
result_ptr,
Cdesc.descriptor(),
result_ptr,
Cdesc.descriptor(),
&heuristicResult.algo,
workspace.data_ptr(),
workspaceSize,
at::cuda::getCurrentCUDAStream()));
}
template void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<double> alpha_val,
const double* mat1_ptr,
int64_t mat1_ld,
const double* mat2_ptr,
int64_t mat2_ld,
const double* bias,
double* result_ptr,
int64_t result_ld);
template void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<float> alpha_val,
const float* mat1_ptr,
int64_t mat1_ld,
const float* mat2_ptr,
int64_t mat2_ld,
const float* bias,
float* result_ptr,
int64_t result_ld);
template void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<at::Half> alpha_val,
const at::Half* mat1_ptr,
int64_t mat1_ld,
const at::Half* mat2_ptr,
int64_t mat2_ld,
const at::Half* bias,
at::Half* result_ptr,
int64_t result_ld);
template void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<at::BFloat16> alpha_val,
const at::BFloat16* mat1_ptr,
int64_t mat1_ld,
const at::BFloat16* mat2_ptr,
int64_t mat2_ld,
const at::BFloat16* bias,
at::BFloat16* result_ptr,
int64_t result_ld);
#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
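In tensor terms, the `gemm_and_bias` path above computes `alpha * op(mat1) @ op(mat2)` and lets cuBLASLt add the bias vector in the matmul epilogue (`beta` is fixed to 0, so the destination's previous contents are ignored). A rough PyTorch-level reference of the result, deliberately glossing over cuBLAS's column-major layout and leading-dimension bookkeeping; the function name here is ours, not part of the API:

```python
import torch

def gemm_and_bias_reference(mat1, mat2, bias,
                            transpose_mat1=False, transpose_mat2=False,
                            alpha=1.0):
    # op(x) mirrors CUBLAS_OP_N / CUBLAS_OP_T.
    a = mat1.t() if transpose_mat1 else mat1
    b = mat2.t() if transpose_mat2 else mat2
    # beta == 0: the output is overwritten; the bias is added in the epilogue.
    return alpha * (a @ b) + bias
```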
template <>
void trsm<float>(CUDABLAS_TRSM_ARGTYPES(float)) {
TORCH_CUDABLAS_CHECK(cublasStrsm(

View File

@ -70,6 +70,24 @@ template <>
void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
#endif
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
template <typename Dtype>
void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<Dtype> alpha_val,
const Dtype* mat1_ptr,
int64_t mat1_ld,
const Dtype* mat2_ptr,
int64_t mat2_ld,
const Dtype* bias,
Dtype* result_ptr,
int64_t result_ld);
#endif
#define CUDABLAS_BGEMM_ARGTYPES(Dtype) \
char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type<Dtype> alpha, \
const Dtype *a, int64_t lda, int64_t stridea, \

View File

@ -1,339 +0,0 @@
#include <type_traits>
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
#include <ATen/cpu/vec/vec256/vec256.h>
namespace at {
namespace native {
namespace {
Tensor gemm_nt(const Tensor& a, const Tensor& b) {
return at::native::matmul(a, b.t());
}
template <typename scalar_t>
void transform_bias_rescale_qkv_inner_loop(
int64_t B,
int64_t T,
int64_t _3D,
int64_t D,
int64_t num_head,
int64_t dim_per_head,
scalar_t* qkv_data,
scalar_t* qkv_bias_data,
scalar_t* q_k_v_data,
scalar_t sqrt_dim_per_head,
int64_t begin,
int64_t end) {
for (auto i : c10::irange(begin, end)) {
auto t = i % T;
i /= T;
auto nh = i % num_head;
i /= num_head;
auto b = i;
using Vec = vec::Vectorized<scalar_t>;
auto V = vec::Vectorized<scalar_t>::size();
auto dh = 0;
auto d = nh * dim_per_head;
for (; dh + V <= dim_per_head; dh += V, d += V) {
// load
auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]);
auto k_bias_data = Vec::loadu(&qkv_bias_data[d + 1 * D]);
auto v_bias_data = Vec::loadu(&qkv_bias_data[d + 2 * D]);
auto q_data =
Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 0 * D]) +
q_bias_data;
auto k_data =
Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 1 * D]) +
k_bias_data;
auto v_data =
Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 2 * D]) +
v_bias_data;
q_data = q_data / Vec(sqrt_dim_per_head);
q_data.store(&q_k_v_data
[0 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
k_data.store(&q_k_v_data
[1 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
v_data.store(&q_k_v_data
[2 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
}
for (; dh < dim_per_head; dh++) {
auto d = nh * dim_per_head + dh;
auto q_bias = qkv_bias_data[d + 0 * D];
auto k_bias = qkv_bias_data[d + 1 * D];
auto v_bias = qkv_bias_data[d + 2 * D];
auto q_data = qkv_data[b * _3D * T + t * _3D + d + 0 * D] + q_bias;
auto k_data = qkv_data[b * _3D * T + t * _3D + d + 1 * D] + k_bias;
auto v_data = qkv_data[b * _3D * T + t * _3D + d + 2 * D] + v_bias;
q_data = q_data / sqrt_dim_per_head;
q_k_v_data[0 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = q_data;
q_k_v_data[1 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = k_data;
q_k_v_data[2 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = v_data;
}
}
}
// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias
std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv(
const Tensor& qkv,
const Tensor& qkv_bias,
const int64_t num_head) {
auto B = qkv.size(0);
auto T = qkv.size(1);
auto _3D = qkv.size(2);
auto D = _3D / 3;
TORCH_CHECK(D % num_head == 0);
TORCH_CHECK(_3D % 3 == 0);
const auto dim_per_head = D / num_head;
auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(q_k_v.is_contiguous());
const auto qkv_contig = qkv.expect_contiguous();
const auto qkv_bias_contig = qkv_bias.expect_contiguous();
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::Half,
ScalarType::BFloat16,
qkv.scalar_type(),
"transform_bias_rescale_qkv",
[&] {
scalar_t* qkv_data = qkv_contig->data_ptr<scalar_t>();
scalar_t* qkv_bias_data = qkv_bias_contig->data_ptr<scalar_t>();
scalar_t* q_k_v_data = q_k_v.data_ptr<scalar_t>();
const scalar_t sqrt_dim_per_head = std::sqrt(static_cast<scalar_t>(dim_per_head));
int64_t grain_size =
std::max(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1);
parallel_for(
0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) {
transform_bias_rescale_qkv_inner_loop(B, T, _3D, D, num_head, dim_per_head, qkv_data, qkv_bias_data, q_k_v_data, sqrt_dim_per_head, begin, end);
});
});
auto q_k_v_s =
at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(q_k_v_s.size() == 3);
return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]);
}
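For readers skimming this (removed) CPU kernel: the vectorized inner loop implements exactly what the comment above `transform_bias_rescale_qkv` states. It adds the fused QKV bias, splits the result into per-head Q/K/V tensors of shape (B, num_head, T, dim_per_head), and rescales Q by 1/sqrt(dim_per_head). A compact PyTorch sketch of the same transform, offered as a reference rather than a drop-in replacement for the kernel:

```python
import math
import torch

def transform_bias_rescale_qkv_reference(qkv, qkv_bias, num_head):
    # qkv: (B, T, 3*D), qkv_bias: (3*D,), as in the C++ code above.
    B, T, three_D = qkv.shape
    D = three_D // 3
    dim_per_head = D // num_head
    q, k, v = (qkv + qkv_bias).chunk(3, dim=-1)

    def split_heads(x):
        # (B, T, D) -> (B, num_head, T, dim_per_head)
        return x.reshape(B, T, num_head, dim_per_head).permute(0, 2, 1, 3)

    q = split_heads(q) / math.sqrt(dim_per_head)
    return q, split_heads(k), split_heads(v)
```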
Tensor bmm_nt(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)});
auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)});
auto bt_ = b_.transpose(2, 1);
// TODO: are these a single call to cublas batched matmul?
auto c_ = at::matmul(a_, bt_);
return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)});
}
void masked_softmax_dropout(
Tensor& attn_scores,
const c10::optional<Tensor>& attn_mask) {
auto B = attn_scores.size(0);
auto num_heads = attn_scores.size(1);
auto T = attn_scores.size(2);
if (attn_mask) {
TORCH_CHECK(attn_mask->is_contiguous());
} else {
at::_softmax_out(attn_scores, attn_scores, 3, false);
return;
}
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::Half,
ScalarType::BFloat16,
attn_scores.scalar_type(),
"masked_softmax_dropout",
[&] {
using accscalar_t = acc_type<scalar_t, false>;
// TODO: proper implementation with masking.
scalar_t* attn_scores_data = attn_scores.data_ptr<scalar_t>();
int64_t grain_size = std::min(internal::GRAIN_SIZE / T, (int64_t)1);
parallel_for(
0, B * num_heads * T, grain_size, [&](int64_t begin, int64_t end) {
for (const auto i : c10::irange(begin, end)) {
using Vec = vec::Vectorized<scalar_t>;
auto V = vec::Vectorized<scalar_t>::size();
scalar_t* input_data = attn_scores_data + i * T;
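// Softmax over each length-T row in three passes: row max, sum of exp, then normalization.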
auto max_input = Vec(std::numeric_limits<scalar_t>::lowest());
// TODO: handle epilogue
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
auto v = Vec::loadu(&input_data[t]);
max_input = vec::maximum(max_input, v);
}
auto hmax = std::numeric_limits<scalar_t>::lowest();
for (auto i = 0; i < V; ++i) {
hmax = std::max(max_input[i], hmax);
}
accscalar_t hsum = 0;
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
auto v = Vec::loadu(&input_data[t]);
// TODO: vectorize in accscalar_t?
for (auto i = 0; i < V; ++i) {
hsum += std::exp(static_cast<accscalar_t>(v[i]) - hmax);
}
}
auto inv_denominator = 1.0 / hsum;
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
Vec v = Vec::loadu(&input_data[t]);
// TODO: vectorize in accscalar_t?
// TODO: this faster solution does not work on the Android build
/*
for (auto i = 0; i < V; ++i) {
v[i] = static_cast<scalar_t>(std::exp(static_cast<accscalar_t>(v[i]) - hmax) * inv_denominator);
}
v.store(&input_data[t]);
*/
for (auto i = 0; i < V; ++i) {
input_data[t + i] = static_cast<scalar_t>(std::exp(static_cast<accscalar_t>(v[i]) - hmax) * inv_denominator);
}
}
}
});
});
}
Tensor bmm_nn(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)});
auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)});
// TODO: could these be a single call to cublas batched matmul?
auto c_ = at::matmul(a_, b_);
return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)});
}
Tensor transform_0213(const Tensor& a) {
// TODO: check perf vs dedicated kernel.
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(1));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(3));
return a.permute({0, 2, 1, 3})
.contiguous()
.view({a.size(0), a.size(2), a.size(1) * a.size(3)});
}
Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2)});
auto r_ = at::native::linear(a_, b, c);
return r_.view({a.size(0), a.size(1), r_.size(1)});
}
void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim());
for (auto idx : c10::irange(shape.size())) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]);
}
}
} // namespace
std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv_op_cpu(
const Tensor& qkv,
const Tensor& qkv_bias,
const int64_t num_head) {
auto result = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
return std::make_tuple(std::get<0>(result).clone(), std::get<1>(result).clone(), std::get<2>(result).clone());
}
Tensor multi_head_self_attention_cpu(
const Tensor& query,
const Tensor& qkv_weight,
const Tensor& qkv_bias,
const Tensor& proj_weight,
const Tensor& proj_bias,
const int64_t num_head,
const c10::optional<Tensor>& mask) {
// query shape: [B, T, D]
// qkv_weight shape: [3 * D, D]
const auto D = query.sizes()[2];
TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor");
TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor");
TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query");
TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal");
TORCH_CHECK(qkv_bias.dim() == 1, "expected 1-dimensional qkv_bias, got ", qkv_bias.dim(), "-D tensor");
TORCH_CHECK(qkv_bias.sizes()[0] == 3 * D, "expected qkv_bias first dim to be 3x last dim of query");
TORCH_CHECK(D % num_head == 0, "D must be divisible by num_head");
#ifndef NDEBUG
const auto B = query.sizes()[0];
const auto T = query.sizes()[1];
const auto dim_per_head = D / num_head;
#endif
// shape: [B, T, 3 x D]
auto qkv = gemm_nt(query, qkv_weight);
#ifndef NDEBUG
debug_assert_shape(qkv, {B, T, 3 * D});
#endif
// shape: 3 x [B, num_head, T, dim_per_head]
auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
const auto& q = std::get<0>(q_k_v);
const auto& k = std::get<1>(q_k_v);
const auto& v = std::get<2>(q_k_v);
#ifndef NDEBUG
debug_assert_shape(q, {B, num_head, T, dim_per_head});
debug_assert_shape(k, {B, num_head, T, dim_per_head});
debug_assert_shape(v, {B, num_head, T, dim_per_head});
#endif
// shape: [B, num_head, T, T]
auto qkt = bmm_nt(q, k);
#ifndef NDEBUG
debug_assert_shape(qkt, {B, num_head, T, T});
#endif
// shape: [B, num_head, T, T]
masked_softmax_dropout(qkt, mask);
// shape: [B, num_head, T, dim_per_head]
auto attn_ctx = bmm_nn(qkt, v);
#ifndef NDEBUG
debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head});
#endif
// shape: [B, T, D]
auto attn = transform_0213(attn_ctx);
#ifndef NDEBUG
debug_assert_shape(attn, {B, T, D});
#endif
// shape: [B, T, D]
auto proj = gemm_nt_bias(attn, proj_weight, proj_bias);
#ifndef NDEBUG
debug_assert_shape(proj, {B, T, D});
#endif
return proj;
}
} // namespace native
} // namespace at

View File

@ -102,9 +102,27 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
IntArrayRef mat1_sizes = mat1.sizes();
IntArrayRef mat2_sizes = mat2.sizes();
IntArrayRef self__sizes;
bool useLtInterface = false;
at::ScalarType scalar_type = self.scalar_type();
c10::MaybeOwned<Tensor> self_;
if (&result != &self) {
self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm");
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
// Strangely, if mat2 has only 1 row or column, we get a
// CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
// The condition self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
// restricts the Lt interface to the case where self is a bias vector.
useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 &&
result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] &&
self.is_contiguous() &&
(scalar_type == at::ScalarType::Double ||
scalar_type == at::ScalarType::Float ||
scalar_type == at::ScalarType::Half ||
scalar_type == at::ScalarType::BFloat16) &&
mat2_sizes[0] > 1 && mat2_sizes[1] > 1;
#endif
if (!useLtInterface) {
self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm");
}
self__sizes = self_->sizes();
} else {
self_ = c10::MaybeOwned<Tensor>::borrowed(self);
@ -115,8 +133,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
}
if (&result != &self) {
at::native::resize_output(result, self__sizes);
if (beta.toComplexDouble() != 0.0) {
at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]});
if (beta.toComplexDouble() != 0.0 && !useLtInterface) {
at::native::copy_(result, *self_);
}
}
@ -147,7 +165,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
int64_t mat1_ld = mat1_->stride((transpose_mat1 == transpose_result) ? 1 : 0);
int64_t mat2_ld = mat2_->stride((transpose_mat2 == transpose_result) ? 1 : 0);
int64_t result_ld = result_->stride(transpose_result ? 0 : 1);
at::ScalarType scalar_type = self_->scalar_type();
if (mat1.numel() == 0) {
// By definition, when beta==0, values in self should be ignored. nans and infs
@ -170,24 +187,61 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result_->is_conj());
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t alpha_val = alpha.to<opmath_t>();
opmath_t beta_val = beta.to<opmath_t>();
scalar_t* mat1_ptr = mat1_->data_ptr<scalar_t>();
scalar_t* mat2_ptr = mat2_->data_ptr<scalar_t>();
scalar_t* result_ptr = result_->data_ptr<scalar_t>();
at::cuda::blas::gemm<scalar_t>(
transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n',
transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n',
m, n, k,
alpha_val,
mat1_ptr, mat1_ld,
mat2_ptr, mat2_ld,
beta_val,
result_ptr, result_ld
);
});
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
if (useLtInterface) {
AT_DISPATCH_FLOATING_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
scalar_type,
"addmm_cuda_lt",
[&] {
at::cuda::blas::gemm_and_bias<scalar_t>(
transpose_mat1,
transpose_mat2,
m,
n,
k,
alpha.to<at::opmath_type<scalar_t>>(),
mat1_->data_ptr<scalar_t>(),
mat1_ld,
mat2_->data_ptr<scalar_t>(),
mat2_ld,
self.data_ptr<scalar_t>(),
result_->data_ptr<scalar_t>(),
result_ld);
});
} else
#endif
{
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
scalar_type,
"addmm_cuda",
[&] {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t alpha_val = alpha.to<opmath_t>();
opmath_t beta_val = beta.to<opmath_t>();
scalar_t* mat1_ptr = mat1_->data_ptr<scalar_t>();
scalar_t* mat2_ptr = mat2_->data_ptr<scalar_t>();
scalar_t* result_ptr = result_->data_ptr<scalar_t>();
at::cuda::blas::gemm<scalar_t>(
transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n',
transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n',
m,
n,
k,
alpha_val,
mat1_ptr,
mat1_ld,
mat2_ptr,
mat2_ld,
beta_val,
result_ptr,
result_ld);
});
}
if (!result.is_same(*result_)) {
result.copy_(*result_);
}

View File

@ -4,89 +4,9 @@
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/cuda/PinnedMemoryAllocator.h>
#if AT_MAGMA_ENABLED()
#include <magma_types.h>
#include <magma_v2.h>
#endif
namespace at {
namespace native {
#if AT_MAGMA_ENABLED()
// RAII for a MAGMA Queue
struct MAGMAQueue {
// The default constructor is deleted: constructing without a device would
// end up destroying a queue that was never initialized.
MAGMAQueue() = delete;
// Constructor
explicit MAGMAQueue(int64_t device_id) {
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
// MAGMA operations are numerically sensitive, so TF32 should be off
// regardless of the global flag.
TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode));
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
magma_queue_create_from_cuda(
device_id,
at::cuda::getCurrentCUDAStream(),
handle,
at::cuda::getCurrentCUDASparseHandle(),
&magma_queue_);
}
// Getter
magma_queue_t get_queue() const { return magma_queue_; }
// Destructor
~MAGMAQueue() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
// We manually set the math mode to CUBLAS_DEFAULT_MATH in the constructor,
// so restore the original math mode here.
cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_);
cublasSetMathMode(handle, original_math_mode);
#endif
magma_queue_destroy(magma_queue_);
}
private:
magma_queue_t magma_queue_;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
cublasMath_t original_math_mode;
#endif
};
static inline magma_int_t magma_int_cast(int64_t value, const char* varname) {
auto result = static_cast<magma_int_t>(value);
if (static_cast<int64_t>(result) != value) {
AT_ERROR("magma: The value of ", varname, "(", (long long)value,
") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)");
}
return result;
}
// MAGMA functions that don't take a magma_queue_t aren't stream safe
// Work around this by synchronizing with the default stream
struct MagmaStreamSyncGuard {
MagmaStreamSyncGuard() {
auto stream = at::cuda::getCurrentCUDAStream();
if (stream != at::cuda::getDefaultCUDAStream()) {
at::cuda::stream_synchronize(stream);
}
}
~MagmaStreamSyncGuard() noexcept(false) {
auto default_stream = at::cuda::getDefaultCUDAStream();
if (at::cuda::getCurrentCUDAStream() != default_stream) {
at::cuda::stream_synchronize(default_stream);
}
}
};
#endif
static inline int cuda_int_cast(int64_t value, const char* varname) {
auto result = static_cast<int>(value);
TORCH_CHECK(static_cast<int64_t>(result) == value,

View File

@ -1,342 +0,0 @@
#include <type_traits>
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/TensorAccessor.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/detail/KernelUtils.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/cuda/MemoryAccess.cuh>
#include <ATen/native/cuda/block_reduce.cuh>
#include <ATen/native/cuda/PersistentSoftmax.cuh>
#include <c10/cuda/CUDAMathCompat.h>
namespace at {
namespace native {
namespace {
Tensor gemm_nt(const Tensor& a, const Tensor& b) {
return at::native::matmul(a, b.t());
}
static constexpr int TRANSFORM_BIAS_RESCALE_VEC = 4;
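// Width of each vectorized load/store (in elements) used by the kernel below.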
template <typename scalar_t, typename accscalar_t, bool assume_aligned>
__global__ void transform_bias_rescale_qkv_kernel(
// [B, T, 3 * D]
const PackedTensorAccessor64<scalar_t, 3, RestrictPtrTraits> qkv,
// [3 * D]
const PackedTensorAccessor64<scalar_t, 1, RestrictPtrTraits> qkv_bias,
// [3, B, NH, T, DH]
PackedTensorAccessor64<scalar_t, 5, RestrictPtrTraits> q_k_v) {
// One warp per DH, so launch B * NH * T warps.
auto NH = q_k_v.size(2);
auto T = q_k_v.size(3);
auto DH = q_k_v.size(4);
auto t = blockIdx.x % T;
auto b = blockIdx.x / T;
auto D = NH * DH;
const scalar_t sqrt_dim_per_head = std::sqrt(static_cast<scalar_t>(DH));
if (assume_aligned) {
constexpr int VEC = TRANSFORM_BIAS_RESCALE_VEC;
using LoadT = memory::aligned_vector<scalar_t, VEC>;
for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) {
auto d = d_v * VEC;
auto nh = d / DH;
auto dh = d % DH;
scalar_t qkv_bias_q[VEC];
scalar_t qkv_bias_k[VEC];
scalar_t qkv_bias_v[VEC];
scalar_t qkv_q[VEC];
scalar_t qkv_k[VEC];
scalar_t qkv_v[VEC];
// Here we require D % VEC == 0 for these vectorized loads.
*reinterpret_cast<LoadT*>(&qkv_bias_q) =
*reinterpret_cast<const LoadT*>(&qkv_bias[d + 0 * D]);
*reinterpret_cast<LoadT*>(&qkv_bias_k) =
*reinterpret_cast<const LoadT*>(&qkv_bias[d + 1 * D]);
*reinterpret_cast<LoadT*>(&qkv_bias_v) =
*reinterpret_cast<const LoadT*>(&qkv_bias[d + 2 * D]);
*reinterpret_cast<LoadT*>(&qkv_q) =
*reinterpret_cast<const LoadT*>(&qkv[b][t][d + 0 * D]);
*reinterpret_cast<LoadT*>(&qkv_k) =
*reinterpret_cast<const LoadT*>(&qkv[b][t][d + 1 * D]);
*reinterpret_cast<LoadT*>(&qkv_v) =
*reinterpret_cast<const LoadT*>(&qkv[b][t][d + 2 * D]);
#pragma unroll
// TODO: specialize for float2half2/half2float2?
for (auto ii = 0; ii < VEC; ++ii) {
qkv_q[ii] = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_q[ii]) +
static_cast<accscalar_t>(qkv_bias_q[ii])) /
static_cast<accscalar_t>(sqrt_dim_per_head));
qkv_k[ii] = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_k[ii]) +
static_cast<accscalar_t>(qkv_bias_k[ii])));
qkv_v[ii] = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_v[ii]) +
static_cast<accscalar_t>(qkv_bias_v[ii])));
}
// Here we require DH % VEC == 0 for these vectorized stores.
*reinterpret_cast<LoadT*>(&q_k_v[0][b][nh][t][dh]) =
*reinterpret_cast<const LoadT*>(&qkv_q);
*reinterpret_cast<LoadT*>(&q_k_v[1][b][nh][t][dh]) =
*reinterpret_cast<const LoadT*>(&qkv_k);
*reinterpret_cast<LoadT*>(&q_k_v[2][b][nh][t][dh]) =
*reinterpret_cast<const LoadT*>(&qkv_v);
}
} else {
// Same as above, but we can't vectorize memory access.
for (int32_t d = threadIdx.x; d < D; d += blockDim.x) {
auto nh = d / DH;
auto dh = d % DH;
scalar_t qkv_bias_q = qkv_bias[d + 0 * D];
scalar_t qkv_bias_k = qkv_bias[d + 1 * D];
scalar_t qkv_bias_v = qkv_bias[d + 2 * D];
scalar_t qkv_q = qkv[b][t][d + 0 * D];
scalar_t qkv_k = qkv[b][t][d + 1 * D];
scalar_t qkv_v = qkv[b][t][d + 2 * D];
qkv_q = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_q) +
static_cast<accscalar_t>(qkv_bias_q)) /
static_cast<accscalar_t>(sqrt_dim_per_head));
qkv_k = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_k) +
static_cast<accscalar_t>(qkv_bias_k)));
qkv_v = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_v) +
static_cast<accscalar_t>(qkv_bias_v)));
q_k_v[0][b][nh][t][dh] = qkv_q;
q_k_v[1][b][nh][t][dh] = qkv_k;
q_k_v[2][b][nh][t][dh] = qkv_v;
}
}
}
// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias
std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv(
const Tensor& qkv,
const Tensor& qkv_bias,
const int64_t num_head) {
auto B = qkv.size(0);
auto T = qkv.size(1);
auto _3D = qkv.size(2);
auto D = _3D / 3;
TORCH_CHECK(D % num_head == 0);
const auto dim_per_head = D / num_head;
auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options());
#define CALL_KERNEL(assume_aligned) \
transform_bias_rescale_qkv_kernel<scalar_t, accscalar_t, assume_aligned> \
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>( \
qkv.packed_accessor64<scalar_t, 3, RestrictPtrTraits>(), \
qkv_bias.packed_accessor64<scalar_t, 1, RestrictPtrTraits>(), \
q_k_v.packed_accessor64<scalar_t, 5, RestrictPtrTraits>())
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::Half,
ScalarType::BFloat16,
qkv.scalar_type(),
"transform_bias_rescale_qkv",
[&] {
using accscalar_t = acc_type<scalar_t, true>;
auto threads = std::max(std::min<int32_t>(1024, D / TRANSFORM_BIAS_RESCALE_VEC), 1);
auto blocks = B * T;
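// One block per (b, t) position; threads in the block stride over the
// D = num_head * dim_per_head feature dimension.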
if (dim_per_head % TRANSFORM_BIAS_RESCALE_VEC == 0) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
D % TRANSFORM_BIAS_RESCALE_VEC == 0,
"D = num_heads * dim_per_head, so we should have dim_per_head % "
"TRANSFORM_BIAS_RESCALE_VEC == 0 => "
"D % TRANSFORM_BIAS_RESCALE_VEC == 0");
CALL_KERNEL(true);
} else {
CALL_KERNEL(false);
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
#undef CALL_KERNEL
auto q_k_v_s =
at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0);
return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]);
}
Tensor bmm_nt(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)});
auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)});
auto bt_ = b_.transpose(2, 1);
// TODO: could these be a single call to cublas batched matmul?
auto c_ = at::matmul(a_, bt_);
return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)});
}
template <typename T>
__inline__ __device__ T WarpReduceMax(T val) {
#pragma unroll
for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) {
val = std::max(val, WARP_SHFL_DOWN(val, offset));
}
return val;
}
template <typename T>
__inline__ __device__ T WarpReduceSum(T val) {
#pragma unroll
for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) {
val += WARP_SHFL_DOWN(val, offset);
}
return val;
}
void masked_softmax_dropout(
const Tensor& attn_scores,
const c10::optional<Tensor>& attn_mask) {
auto B = attn_scores.size(0);
auto num_heads = attn_scores.size(1);
auto T = attn_scores.size(2);
if (attn_mask) {
TORCH_CHECK(attn_mask->is_contiguous());
}
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::Half,
ScalarType::BFloat16,
attn_scores.scalar_type(),
"masked_softmax_dropout",
[&] {
using accscalar_t = acc_type<scalar_t, true>;
// TODO: proper implementation with masking.
dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, false, false>(
attn_scores.data_ptr<scalar_t>(),
attn_scores.data_ptr<scalar_t>(),
T,
T,
B * num_heads * T
);
});
}
Tensor bmm_nn(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)});
auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)});
// TODO: could these be a single call to cublas batched matmul?
auto c_ = at::matmul(a_, b_);
return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)});
}
Tensor transform_0213(const Tensor& a) {
// TODO: check perf vs dedicated kernel.
return a.permute({0, 2, 1, 3})
.contiguous()
.view({a.size(0), a.size(2), a.size(1) * a.size(3)});
}
Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2)});
auto r_ = at::native::linear(a_, b, c);
return r_.view({a.size(0), a.size(1), r_.size(1)});
}
void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim());
for (auto idx : c10::irange(shape.size())) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]);
}
}
} // namespace
std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv_op_cuda(
const Tensor& qkv,
const Tensor& qkv_bias,
const int64_t num_head) {
auto result = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
return std::make_tuple(std::get<0>(result).clone(), std::get<1>(result).clone(), std::get<2>(result).clone());
}
Tensor multi_head_self_attention_cuda(
const Tensor& query,
const Tensor& qkv_weight,
const Tensor& qkv_bias,
const Tensor& proj_weight,
const Tensor& proj_bias,
const int64_t num_head,
const c10::optional<Tensor>& mask) {
// query shape: [B, T, D]
// qkv_weight shape: [3 * D, D]
const auto D = query.sizes()[2];
TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor");
TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor");
TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query");
TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal");
TORCH_CHECK(D % num_head == 0, "D must be divisible by num_head");
#ifndef NDEBUG
const auto B = query.sizes()[0];
const auto T = query.sizes()[1];
const auto dim_per_head = D / num_head;
#endif
// shape: [B, T, 3 x D]
auto qkv = gemm_nt(query, qkv_weight);
#ifndef NDEBUG
debug_assert_shape(qkv, {B, T, 3 * D});
#endif
// shape: 3 x [B, num_head, T, dim_per_head]
auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
const auto& q = std::get<0>(q_k_v);
const auto& k = std::get<1>(q_k_v);
const auto& v = std::get<2>(q_k_v);
#ifndef NDEBUG
debug_assert_shape(q, {B, num_head, T, dim_per_head});
debug_assert_shape(k, {B, num_head, T, dim_per_head});
debug_assert_shape(v, {B, num_head, T, dim_per_head});
#endif
// shape: [B, num_head, T, T]
auto qkt = bmm_nt(q, k);
#ifndef NDEBUG
debug_assert_shape(qkt, {B, num_head, T, T});
#endif
// shape: [B, num_head, T, T]
masked_softmax_dropout(qkt, mask);
// shape: [B, num_head, T, dim_per_head]
auto attn_ctx = bmm_nn(qkt, v);
#ifndef NDEBUG
debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head});
#endif
// shape: [B, T, D]
auto attn = transform_0213(attn_ctx);
#ifndef NDEBUG
debug_assert_shape(attn, {B, T, D});
#endif
// shape: [B, T, D]
auto proj = gemm_nt_bias(attn, proj_weight, proj_bias);
#ifndef NDEBUG
debug_assert_shape(proj, {B, T, D});
#endif
return proj;
}
} // namespace native
} // namespace at

View File

@ -13,6 +13,7 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/BatchLinearAlgebra.h>
#include <ATen/native/cuda/linalg/BatchLinearAlgebraLib.h>
#include <ATen/native/cuda/linalg/MagmaUtils.h>
#include <ATen/native/cpu/zmath.h>
#if AT_MAGMA_ENABLED()

View File

@ -0,0 +1,88 @@
#pragma once
#include <ATen/cuda/CUDAConfig.h>
#if AT_MAGMA_ENABLED()
#include <magma_types.h>
#include <magma_v2.h>
#endif
namespace at {
namespace native {
#if AT_MAGMA_ENABLED()
// RAII for a MAGMA Queue
struct MAGMAQueue {
// The default constructor is deleted: constructing without a device would
// end up destroying a queue that was never initialized.
MAGMAQueue() = delete;
// Constructor
explicit MAGMAQueue(int64_t device_id) {
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
// MAGMA operations are numerically sensitive, so TF32 should be off
// regardless of the global flag.
TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode));
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
magma_queue_create_from_cuda(
device_id,
at::cuda::getCurrentCUDAStream(),
handle,
at::cuda::getCurrentCUDASparseHandle(),
&magma_queue_);
}
// Getter
magma_queue_t get_queue() const { return magma_queue_; }
// Destructor
~MAGMAQueue() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
// We manually set the math mode to CUBLAS_DEFAULT_MATH in the constructor,
// so restore the original math mode here.
cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_);
cublasSetMathMode(handle, original_math_mode);
#endif
magma_queue_destroy(magma_queue_);
}
private:
magma_queue_t magma_queue_;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
cublasMath_t original_math_mode;
#endif
};
static inline magma_int_t magma_int_cast(int64_t value, const char* varname) {
auto result = static_cast<magma_int_t>(value);
if (static_cast<int64_t>(result) != value) {
AT_ERROR("magma: The value of ", varname, "(", (long long)value,
") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)");
}
return result;
}
// MAGMA functions that don't take a magma_queue_t aren't stream safe
// Work around this by synchronizing with the default stream
struct MagmaStreamSyncGuard {
MagmaStreamSyncGuard() {
auto stream = at::cuda::getCurrentCUDAStream();
if (stream != at::cuda::getDefaultCUDAStream()) {
at::cuda::stream_synchronize(stream);
}
}
~MagmaStreamSyncGuard() noexcept(false) {
auto default_stream = at::cuda::getDefaultCUDAStream();
if (at::cuda::getCurrentCUDAStream() != default_stream) {
at::cuda::stream_synchronize(default_stream);
}
}
};
#endif
} // namespace native
} // namespace at

View File

@ -2549,16 +2549,6 @@
CUDA: layer_norm_cuda
CompositeImplicitAutograd: math_native_layer_norm
- func: _native_multi_head_self_attention(Tensor query, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, int num_head, Tensor? mask=None) -> Tensor
dispatch:
CPU: multi_head_self_attention_cpu
CUDA: multi_head_self_attention_cuda
- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_head) -> (Tensor, Tensor, Tensor)
dispatch:
CPU: transform_bias_rescale_qkv_op_cpu
CUDA: transform_bias_rescale_qkv_op_cuda
- func: native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
dispatch:
CPU: layer_norm_backward_cpu
@ -6066,7 +6056,7 @@
- func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
variants: function, method
- func: _scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
- func: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
variants: function, method
dispatch:
CPU: scatter_reduce_two_cpu

View File

@ -18,6 +18,10 @@ void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
if (all(lessThan(pos, uBlock.size.xyz))) {
imageStore(uOutput, pos, tanh(texelFetch(uInput, pos, 0)));
const vec4 intex = texelFetch(uInput, pos, 0);
imageStore(
uOutput,
pos,
tanh(clamp(intex, -15.0, 15.0)));
}
}

View File

@ -17,6 +17,10 @@ void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
if (all(lessThan(pos, uBlock.size.xyz))) {
imageStore(uOutput, pos, tanh(imageLoad(uOutput, pos)));
const vec4 intex = imageLoad(uOutput, pos);
imageStore(
uOutput,
pos,
tanh(clamp(intex, -15.0, 15.0)));
}
}

View File

@ -322,6 +322,13 @@ Tensor add_tensor(
const Tensor& self_arg,
const Tensor& other_arg,
const Scalar& alpha) {
if (other_arg.sizes().size() == 0) {
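// A zero-dim `other` is treated as a scalar and routed to the scalar kernel (add_scalar).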
return arithmetic_scalar(
self_arg,
other_arg.item<float>(),
c10::optional<Scalar>(alpha.to<float>()),
VK_KERNEL(add_scalar));
}
return arithmetic_tensor(
self_arg, other_arg, c10::optional<Scalar>(alpha), VK_KERNEL(add));
}
@ -354,6 +361,13 @@ Tensor sub_tensor(
const Tensor& self_arg,
const Tensor& other_arg,
const Scalar& alpha) {
if (other_arg.sizes().size() == 0) {
return arithmetic_scalar(
self_arg,
other_arg.item<float>(),
c10::optional<Scalar>(-1 * alpha.to<float>()),
VK_KERNEL(add_scalar));
}
return arithmetic_tensor(
self_arg, other_arg, c10::optional<Scalar>(alpha), VK_KERNEL(sub));
}
@ -374,6 +388,13 @@ Tensor& mul_scalar_(Tensor& self, const Scalar& other) {
}
Tensor mul_tensor(const Tensor& self_arg, const Tensor& other_arg) {
if (other_arg.sizes().size() == 0) {
return arithmetic_scalar(
self_arg,
other_arg.item<float>(),
c10::optional<Scalar>(),
VK_KERNEL(mul_scalar));
}
return arithmetic_tensor(
self_arg, other_arg, c10::optional<Scalar>(), VK_KERNEL(mul));
}
@ -400,6 +421,13 @@ Tensor& div_scalar_(Tensor& self, const Scalar& other) {
}
Tensor div_tensor(const Tensor& self_arg, const Tensor& other_arg) {
if (other_arg.sizes().size() == 0) {
return arithmetic_scalar(
self_arg,
1.0 / other_arg.item<float>(),
c10::optional<Scalar>(),
VK_KERNEL(mul_scalar));
}
return arithmetic_tensor(
self_arg, other_arg, c10::optional<Scalar>(), VK_KERNEL(div));
}

View File

@ -1551,7 +1551,7 @@ TEST(VulkanAPITest, tanh) {
return;
}
const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
const auto in_vulkan = in_cpu.vulkan();
const auto out_cpu = at::tanh(in_cpu);
@ -1570,7 +1570,7 @@ TEST(VulkanAPITest, tanh_) {
return;
}
auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
auto vulkan = cpu.vulkan();
at::tanh_(cpu);

View File

@ -35,6 +35,58 @@ static void cat_op_channel_perf(benchmark::State& state) {
}
}
static void gru_op_perf(benchmark::State& state) {
// Guard
if (!at::is_vulkan_available()) {
return;
}
// Arrange
const int H_in = static_cast<int>(state.range(0)); // input_size
const int H_out = static_cast<int>(state.range(1)); // hidden_size
const int num_layers = static_cast<int>(state.range(2));
const double gru_dropout = .0;
const bool has_biases = true;
const bool train = false;
const bool bidirectional = false;
const bool batch_first = true;
const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat));
const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat));
c10::List<at::Tensor> weight_ih_l; // shape (3 * hidden_size, input_size)
c10::List<at::Tensor> weight_hh_l; // shape (3 * hidden_size, hidden_size)
c10::List<at::Tensor> bias_ih_l; // shape (3 * hidden_size)
c10::List<at::Tensor> bias_hh_l; // shape (3 * hidden_size)
for (int i = 0; i < num_layers; ++i) {
weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat)));
weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
}
// put this guard here to run inference instead of training
// to avoid the following error:
// C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend.
// If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present.
c10::InferenceMode mode;
// Act
while (state.KeepRunning()) {
// weights/biases should always be on CPU.
const auto out_vulkan = at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
auto vulkan_output = std::get<0>(out_vulkan);
auto vulkan_hidden = std::get<1>(out_vulkan);
// to avoid out-of-memory issues, release resources by waiting and flushing all GPU operations
at::native::vulkan::api::context()->wait(vulkan_output);
at::native::vulkan::api::context()->wait(vulkan_hidden);
at::native::vulkan::api::context()->flush();
}
}
static void CommonBenchmarkSettings(benchmark::internal::Benchmark* b) {
b->Unit(benchmark::kMillisecond);
b->ArgNames({"N", "C", "H", "W"});
@ -48,6 +100,7 @@ BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iter
BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(5000)->Args({3, 4, 221, 193}); // small multiple of 4 channels
BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(5000)->Args({3, 3, 221, 193}); // small non-multiple of 4 channels
BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(3)->Iterations(1000)->Args({3, 40, 221, 193}); // big multiple of 4 channels (multi-thread)
BENCHMARK(gru_op_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(1000)->Args({384, 384, 2}); // McLaren Model inputs
BENCHMARK_MAIN();
#endif /* USE_VULKAN_API */

View File

@ -0,0 +1,83 @@
#include <gtest/gtest.h>
#include <torch/csrc/jit/runtime/static/impl.h>
#include <torch/torch.h>
#include "test_utils.h"
using namespace torch;
using namespace torch::jit;
using namespace torch::jit::test;
TEST(CpuFusion, Simple) {
const auto simple_script = R"JIT(
def forward(self, a, b):
return (a + b).relu().tanh()
)JIT";
Module m("module");
m.define(simple_script);
StaticModuleOptions opts; // start with the defaults.
opts.enable_tensorexpr_fusion = true;
auto input1 = at::randn({2, 3});
auto input2 = at::ones({2, 3});
auto smodule = StaticModule(m, /* is_frozen */ false, opts, {input1, input2});
StaticRuntime runtime(smodule);
// Test with sample inputs
{
auto actual = runtime({input1, input2}, {});
auto expect = at::tanh(at::relu(input1 + input2));
EXPECT_TRUE(at::allclose(expect, actual.toTensor()));
}
// Test with different inputs
{
auto new_input1 = at::randn({5, 14});
auto new_input2 = at::randn({5, 14});
auto actual = runtime({new_input1, new_input2}, {});
auto expect = at::tanh(at::relu(new_input1 + new_input2));
EXPECT_TRUE(at::allclose(expect, actual.toTensor()));
}
}
TEST(CpuFusion, FallbackGraph) {
const auto simple_script = R"JIT(
def forward(self, a, b):
return (a + b).relu().tanh()
)JIT";
Module m("module");
m.define(simple_script);
StaticModuleOptions opts; // start with the defaults.
opts.enable_tensorexpr_fusion = true;
auto sample_input1 = at::randn({2, 3});
auto sample_input2 = at::ones({2, 3});
auto smodule = StaticModule(
m, /* is_frozen */ false, opts, {sample_input1, sample_input2});
StaticRuntime runtime(smodule);
// The sample inputs above were contiguous. Now, use a strided input
// to trigger running the fallback graph.
{
auto input1 = at::narrow(at::randn({2, 6}), 1, 0, 3);
auto input2 = at::ones({2, 3});
auto expect = at::tanh(at::relu(input1 + input2));
auto actual = runtime({input1, input2}, {});
EXPECT_TRUE(at::allclose(expect, actual.toTensor()));
}
// Test with strided inputs of different size.
{
auto input1 = at::narrow(at::randn({10, 30}), 1, 0, 25);
auto input2 = at::randn({10, 25});
auto expect = at::tanh(at::relu(input1 + input2));
auto actual = runtime({input1, input2}, {});
EXPECT_TRUE(at::allclose(expect, actual.toTensor()));
}
}

View File

@ -180,35 +180,48 @@ class vkRunner final : public Runner<T> {
virtual c10::IValue run(
T& module,
const std::vector<c10::IValue>& inputs) override {
// Upload the input tensor(s) to GPU memory.
inputs_.clear();
inputs_.reserve(inputs.size());
for (const auto& input : inputs) {
if (input.isTensor()) {
inputs_.emplace_back(input.toTensor().vulkan());
}
else if (input.isList()) {
const c10::List<c10::IValue> input_as_list = input.toList();
c10::List<at::Tensor> input_vk_list;
input_vk_list.reserve(input_as_list.size());
for (int i=0; i < input_as_list.size(); ++i) {
const c10::IValue element = input_as_list.get(i);
if (element.isTensor()) {
input_vk_list.emplace_back(element.toTensor().vulkan());
}
else {
CAFFE_THROW("Input of type c10::List must only contain Tensors!");
}
if (inputs_.size() == 0) {
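// inputs_ is cached across runs, so this upload happens only on the first call.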
// Upload the input tensor(s) to GPU memory.
inputs_.clear();
inputs_.reserve(inputs.size());
for (const auto& input : inputs) {
if (input.isTensor()) {
inputs_.emplace_back(at::rand(input.toTensor().sizes()).vulkan());
}
else if (input.isTensorList()) {
const c10::List<at::Tensor> input_as_list = input.toTensorList();
c10::List<at::Tensor> input_vk_list;
input_vk_list.reserve(input_as_list.size());
for (int i=0; i < input_as_list.size(); ++i) {
const at::Tensor element = input_as_list.get(i);
input_vk_list.emplace_back(at::rand(element.sizes()).vulkan());
}
inputs_.emplace_back(c10::IValue(input_vk_list));
}
else {
CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::TensorList!");
}
inputs_.emplace_back(c10::IValue(input_vk_list));
}
else {
CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::List!");
}
}
// Run, and download the output tensor to system memory.
return module.forward(inputs_).toTensor().cpu();
c10::IValue output = module.forward(inputs_);
if (output.isTensor()) {
return output.toTensor().cpu();
}
else if (output.isTensorList()) {
return output.toTensorList().get(0).cpu();
}
else if (output.isList()) {
return output.toList().get(0).toTensor().cpu();
}
else if (output.isTuple()) {
return output.toTuple()->elements()[0].toTensor().cpu();
}
else {
CAFFE_THROW("Outputs must only be either c10::Tensor or c10::TensorList!");
};
}
private:

View File

@ -44,7 +44,7 @@ class BisectPercentileOp final : public Operator<Context> {
pct_upper_.size(),
"Feature (raw) data and upper bound dimension should match.");
n_features = pct_lens_.size();
index.reserve(n_features + 1);
index.resize(n_features + 1);
index[0] = 0;
for (int i = 1; i <= n_features; ++i) {
index[i] = index[i - 1] + pct_lens_[i - 1];
@ -115,13 +115,10 @@ class BisectPercentileOp final : public Operator<Context> {
int lo,
int hi,
float val) {
int mid;
bool low_cond, high_cond;
while (lo < hi) {
mid = (lo + hi) >> 1;
low_cond = (data[mid] <= val);
high_cond = (val < data[mid + 1]);
const auto mid = lo + (hi - lo) / 2;
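// Computing the midpoint as lo + (hi - lo) / 2 avoids overflow when lo + hi would exceed INT_MAX.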
const bool low_cond = (data[mid] <= val);
const bool high_cond = (val < data[mid + 1]);
if (low_cond && high_cond) {
return mid;
} else if (!low_cond) {

View File

@ -1,13 +1,16 @@
import errno
import os
import shutil
import tempfile
import unittest
from collections import namedtuple
from typing import List
import caffe2.python.hypothesis_test_util as htu
import hypothesis.strategies as st
import numpy as np
import torch
from torch import Tensor
from caffe2.proto import caffe2_pb2
from caffe2.python import core, test_util, workspace, model_helper, brew
from hypothesis import given, settings
@ -783,8 +786,7 @@ class MyModule(torch.jit.ScriptModule):
return x + y + z
@torch.jit.script_method
def multi_input_tensor_list(self, tensor_list): # pyre-ignore: PT type annotations
# type: (List[Tensor]) -> Tensor
def multi_input_tensor_list(self, tensor_list: List[Tensor]) -> Tensor:
return tensor_list[0] + tensor_list[1] + tensor_list[2]
@torch.jit.script_method

View File

@ -115,11 +115,13 @@ constexpr uint64_t kMinProducedFileFormatVersion = 0x3L;
// torchscript constant table. Also update tensor storage schema adapting to
// the unify format, the root key of tensor storage is updated from {index} to
// {the_pointer_value_the_tensor.storage}, for example:
// `140245072983168.storage` Forward-compatibility change. 0x6L: Implicit
// opereator versioning using number of specified argument. Refer to the
// summary of https://github.com/pytorch/pytorch/pull/56845 for details. 0x7L:
// Enable support for operators with default arguments plus out arguments.
// 0x8L: Emit promoted operators as instructions
// `140245072983168.storage` Forward-compatibility change.
// 0x6L: Implicit operator versioning using the number of specified arguments.
// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 for details.
// 0x7L: Enable support for operators with default arguments plus out arguments.
// See https://github.com/pytorch/pytorch/pull/63651 for details.
// 0x8L: Emit promoted operators as instructions.
// See https://github.com/pytorch/pytorch/pull/71662 for details
constexpr uint64_t kProducedBytecodeVersion = 0x8L;
// static_assert(

View File

@ -593,6 +593,7 @@ Tensor class reference
Tensor.scatter_
Tensor.scatter_add_
Tensor.scatter_add
Tensor.scatter_reduce
Tensor.select
Tensor.select_scatter
Tensor.set_

View File

@ -118,6 +118,7 @@ Indexing, Slicing, Joining, Mutating Ops
select_scatter
slice_scatter
scatter_add
scatter_reduce
split
squeeze
stack

View File

@ -5,6 +5,7 @@
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
#include <torch/csrc/jit/runtime/interpreter.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <sstream>
@ -350,5 +351,52 @@ TEST(TEFuserPass, FuserPass_WhereList) {
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
TEST(TEFuserPass, DynamicShapeFusion) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%0 : Float(10, 5, strides=[5, 1], device=cpu),
%1 : Float(10, 5, strides=[5, 1], device=cpu)):
%2 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%0, %1)
%3 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%2, %1)
return (%3))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(
g,
/* min_group_size = */ 2,
/* add_composed_op = */ true,
/* fuse_to_dynamic_shapes = */ true);
Code code(g, "");
testing::FileCheck()
.check("prim::TensorExprDynamicGroup_")
->check("prim::TensorExprDynamicGuard")
->check("prim::TensorExprGroup_")
->run(*g);
auto run_and_compare = [&](const std::vector<at::Tensor>& inputs) {
TORCH_INTERNAL_ASSERT(inputs.size() == 2);
auto ref = at::mul(at::mul(inputs[0], inputs[1]), inputs[1]);
InterpreterState interp(code);
Stack stack(inputs.begin(), inputs.end());
interp.run(stack);
at::Tensor out = pop(stack).toTensor();
ASSERT_TRUE(at::allclose(out, ref));
};
std::vector<at::Tensor> inputs = {at::rand({10, 5}), at::rand({10, 5})};
run_and_compare(inputs);
std::vector<at::Tensor> inputs2 = {at::rand({20, 5}), at::rand({20, 5})};
run_and_compare(inputs2);
std::vector<at::Tensor> inputs3 = {at::rand({25, 60}), at::rand({25, 60})};
run_and_compare(inputs3);
}
} // namespace jit
} // namespace torch

View File

@ -33,7 +33,7 @@ from torch.distributed.algorithms.join import Join, Joinable, JoinHook
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.distributed.optim.zero_redundancy_optimizer import _broadcast_object
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD
from torch.optim import SGD, AdamW
from torch.testing._internal import common_distributed, common_utils
from torch.testing._internal.common_utils import (
TEST_WITH_ASAN,
@ -249,27 +249,54 @@ class TestZeroRedundancyOptimizerSingleRank(TestZeroRedundancyOptimizer):
def test_constructor(self):
"""Check the robustness of the ZeroRedundancyOptimizer constructor by
passing different values for `params`"""
passing different values for the ``params`` argument."""
self.dist_init(self.rank)
m = torch.nn.Linear(1, 1)
# (input, expected error)
inputs = [
m = torch.nn.Sequential(
torch.nn.Linear(5, 10),
torch.nn.Linear(10, 10),
torch.nn.Linear(10, 10),
)
# Test various constructor inputs in the form: (input, expected error)
ctor_inputs = [
([], ValueError), # empty parameter list
(torch.randn(1), TypeError), # non-iterable: `torch.Tensor`
(1.2, TypeError), # non-iterable: `float`
([{"params": m.parameters()}], TypeError), # iterable of dict
(list(m.parameters()) + [42], TypeError), # iterable containing non-`torch.Tensor`
([
{"params": [l.weight for l in m]},
{"params": [l.bias for l in m]},
], None), # iterable of dict
(list(m.parameters()) + [42], TypeError), # iterable containing invalid type
(m.parameters(), None), # `params` as a generator
(list(m.parameters()), None) # `params` as a list
]
for input, error in inputs:
if (error):
for ctor_input, error in ctor_inputs:
if error:
with self.assertRaises(error):
ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1)
ZeroRedundancyOptimizer(ctor_input, optimizer_class=SGD, lr=0.01)
else:
ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1)
ZeroRedundancyOptimizer(ctor_input, optimizer_class=SGD, lr=0.01)
# Test constructing with multiple parameter groups more thoroughly
weight_decay = 0.01
lr = 0.01
betas = (0.9, 0.999)
eps = 1e-8
params = [
{"params": [l.weight for l in m], "weight_decay": 0.},
{"params": [l.bias for l in m], "weight_decay": weight_decay},
]
o = ZeroRedundancyOptimizer(
params, optimizer_class=AdamW,
lr=lr, betas=betas, eps=eps,
)
assert len(o.param_groups) == 2, \
f"Expected 2 ZeRO param groups, but got {len(o.param_groups)}"
assert len(o.optim.param_groups) == 2, \
"Expected 2 local optimizer param groups, but got " \
f"{len(o.optim.param_groups)}"
def test_same_dense_param_type(self):
"""Check that ZeroRedundancyOptimizer raises an exception if the input
@ -459,7 +486,76 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer):
all_trainable()
some_trainable()
@common_distributed.skip_if_no_gpu
def test_multiple_param_groups(self):
"""
Tests parity between constructing ZeRO with multiple parameter groups
upfront versus adding parameter groups to ZeRO after construction
versus a non-sharded optimizer.
"""
self.dist_init(self.rank)
model1 = torch.nn.Sequential(
torch.nn.Linear(5, 10),
torch.nn.Linear(10, 10),
torch.nn.Linear(10, 5),
)
model2 = copy.deepcopy(model1)
model3 = copy.deepcopy(model1)
model1 = model1.to(self.device)
model2 = model2.to(self.device)
model3 = model3.to(self.device)
batch_size = 8
num_iters = 3
inputs = [
torch.randn(batch_size, 5).to(self.device) for _ in range(num_iters)
]
wd = 0.01
lr = 0.01
# Construct `optim1` with both parameter groups upfront
optim1 = ZeroRedundancyOptimizer(
[
{"params": [l.weight for l in model1], "weight_decay": 0.},
{"params": [l.bias for l in model1], "weight_decay": wd},
],
optimizer_class=AdamW, lr=lr,
)
# Construct `optim2` by adding the second parameter after
optim2 = ZeroRedundancyOptimizer(
[l.weight for l in model2],
optimizer_class=AdamW, lr=lr, weight_decay=0.,
)
optim2.add_param_group(
{"params": [l.bias for l in model2], "weight_decay": wd}
)
# Construct `optim3` as a non-sharded optimizer
optim3 = AdamW(
[
{"params": [l.weight for l in model3], "weight_decay": 0.},
{"params": [l.bias for l in model3], "weight_decay": wd},
], lr=lr,
)
# Check parity over a few iterations
for iter in range(num_iters):
for model, optim in (
(model1, optim1), (model2, optim2), (model3, optim3),
):
optim.zero_grad()
out = model(inputs[iter])
loss = out.sum()
loss.backward()
optim.step()
for layer1, layer2, layer3 in zip(model1, model2, model3):
assert torch.allclose(layer1.weight, layer2.weight)
assert torch.allclose(layer1.weight, layer3.weight)
assert torch.allclose(layer1.bias, layer2.bias)
assert torch.allclose(layer1.bias, layer3.bias)
@common_distributed.skip_if_lt_x_gpu(2)
@common_distributed.skip_if_rocm
def test_collect_shards(self):
""" Check the state consolidation mechanism, and the state dict exposed by ZeroRedundancyOptimizer"""
self.dist_init(self.rank)

View File

@ -106,7 +106,8 @@ ALLOW_LIST = [
("aten::_scatter_reduce", datetime.date(2022, 1, 31)),
("aten::native_multi_head_self_attention", datetime.date(9999, 1, 1)),
("aten::_native_multi_head_self_attention", datetime.date(9999, 1, 1)),
("aten::scatter_reduce.two", datetime.date(2022, 3, 15)),
("aten::_transform_bias_rescale_qkv", datetime.date(9999, 1, 1)),
("aten::_scatter_reduce.two", datetime.date(9999, 1, 1)),
]
ALLOW_LIST_COMPILED = [

View File

@ -41,6 +41,7 @@ from collections import OrderedDict
from torch.nn.utils.rnn import PackedSequence
from torch.onnx import CheckerError, register_custom_op_symbolic, unregister_custom_op_symbolic
from torch.onnx.symbolic_helper import _unimplemented
from torch.onnx.utils import unpack_quantized_tensor
def flatten_tuples(elem):
@ -108,9 +109,16 @@ def inline_flatten_list(inputs, res_list):
return res_list
def unpack_to_numpy(value):
value_unpacked = []
for value_ in value:
value_unpacked.extend(unpack_quantized_tensor(value_))
value_final = [to_numpy(v) for v in value_unpacked]
return value_final
def run_ort(ort_sess, input):
input = flatten_tuples(input)
input = to_numpy(input)
input = unpack_to_numpy(flatten_tuples(input))
ort_inputs = dict((ort_sess.get_inputs()[i].name, input) for i, input in enumerate(input))
ort_outs = ort_sess.run(None, ort_inputs)
return inline_flatten_list(ort_outs, [])
@ -118,7 +126,7 @@ def run_ort(ort_sess, input):
def ort_compare_with_pytorch(ort_outs, output, rtol, atol):
output, _ = torch.jit._flatten(output)
outputs = [to_numpy(outp) for outp in output]
outputs = unpack_to_numpy(output)
# compare onnxruntime and PyTorch results
assert len(outputs) == len(ort_outs), "number of outputs differ"
@ -5895,7 +5903,24 @@ class TestONNXRuntime(unittest.TestCase):
return torch.pixel_shuffle(x, upscale_factor=2)
x = torch.randn(2, 16, 4, 3, requires_grad=True)
y = torch.randn(4, 32, 8, 4, requires_grad=True)
self.run_test(PixelShuffle(), x)
self.run_test(PixelShuffle(), x, input_names=["x"],
dynamic_axes={"x": [0, 1, 2, 3]},
test_with_inputs=[y])
@skipIfUnsupportedMinOpsetVersion(9)
def test_pixel_unshuffle(self):
class PixelUnshuffle(torch.nn.Module):
def forward(self, x):
return torch.pixel_unshuffle(x, downscale_factor=2)
x = torch.randn(2, 16, 4, 6, requires_grad=True)
y = torch.randn(4, 32, 8, 4, requires_grad=True)
self.run_test(PixelUnshuffle(), x)
self.run_test(PixelUnshuffle(), x, input_names=["x"],
dynamic_axes={"x": [0, 1, 2, 3]},
test_with_inputs=[y])
@skipIfUnsupportedMinOpsetVersion(9)
def test_reciprocal(self):
@ -6924,6 +6949,128 @@ class TestONNXRuntime(unittest.TestCase):
x = torch.randn(2, 3, 5, 5)
self.run_test(Det(), x)
def test_linalg_norm(self):
class LinalgSingleDimModel(torch.nn.Module):
def __init__(self, ord_val):
super(LinalgSingleDimModel, self).__init__()
self.ord = ord_val
def forward(self, x):
return torch.linalg.norm(x, ord=self.ord, dim=1)
x = torch.randn(2, 3, 5, 5)
self.run_test(LinalgSingleDimModel(None), x)
self.run_test(LinalgSingleDimModel(2), x)
self.run_test(LinalgSingleDimModel(float('inf')), x)
self.run_test(LinalgSingleDimModel(-float('inf')), x)
self.run_test(LinalgSingleDimModel(-4), x)
self.run_test(LinalgSingleDimModel(1.5), x)
class LinalgMultiDimModel(torch.nn.Module):
def __init__(self, ord_val):
super(LinalgMultiDimModel, self).__init__()
self.ord = ord_val
def forward(self, x):
return torch.linalg.norm(x, ord=self.ord, dim=(0, 2))
x = torch.randn(2, 3, 5, 5)
self.run_test(LinalgMultiDimModel('fro'), x)
self.run_test(LinalgMultiDimModel(float('inf')), x)
self.run_test(LinalgMultiDimModel(-float('inf')), x)
self.run_test(LinalgMultiDimModel(1), x)
self.run_test(LinalgMultiDimModel(-1), x)
class LinalgNoDimNoOrdModel(torch.nn.Module):
def forward(self, x):
return torch.linalg.norm(x)
x = torch.randn(2, 3, 5, 5)
self.run_test(LinalgNoDimNoOrdModel(), x)
y = torch.randn(2, 3)
self.run_test(LinalgNoDimNoOrdModel(), y)
z = torch.randn(2)
self.run_test(LinalgNoDimNoOrdModel(), z)
class LinalgNoDim1DModel(torch.nn.Module):
def __init__(self, ord_val):
super(LinalgNoDim1DModel, self).__init__()
self.ord = ord_val
def forward(self, x):
return torch.linalg.norm(x, ord=self.ord)
x = torch.randn(2)
self.run_test(LinalgNoDim1DModel(None), x)
self.run_test(LinalgNoDim1DModel(2), x)
self.run_test(LinalgNoDim1DModel(float('inf')), x)
self.run_test(LinalgNoDim1DModel(-float('inf')), x)
self.run_test(LinalgNoDim1DModel(-4), x)
self.run_test(LinalgNoDim1DModel(1.5), x)
class LinalgNoDim2DModel(torch.nn.Module):
def __init__(self, ord_val):
super(LinalgNoDim2DModel, self).__init__()
self.ord = ord_val
def forward(self, x):
return torch.linalg.norm(x, ord=self.ord)
x = torch.randn(2, 3)
self.run_test(LinalgNoDim2DModel('fro'), x)
self.run_test(LinalgNoDim2DModel(float('inf')), x)
self.run_test(LinalgNoDim2DModel(-float('inf')), x)
self.run_test(LinalgNoDim2DModel(1), x)
self.run_test(LinalgNoDim2DModel(-1), x)
@skipIfUnsupportedMinOpsetVersion(11)
def test_linalg_vector_norm_zero(self):
class LinalgVectorNormModel(torch.nn.Module):
def __init__(self, ord_val):
super(LinalgVectorNormModel, self).__init__()
self.ord = ord_val
def forward(self, x):
return torch.linalg.vector_norm(x, ord=self.ord)
x = torch.randn(2, 3, 5, 5)
self.run_test(LinalgVectorNormModel(0), x)
def test_linalg_vector_norm(self):
class LinalgVectorNormModel(torch.nn.Module):
def __init__(self, ord_val, dim_info):
super(LinalgVectorNormModel, self).__init__()
self.ord = ord_val
self.dim, self.keepdim = dim_info
def forward(self, x):
return torch.linalg.vector_norm(x, ord=self.ord, dim=self.dim, keepdim=self.keepdim)
x = torch.randn(2, 3, 5, 5)
ord_options = [2, float('inf'), -float('inf'), -4, 1.5]
dim_options = [(None, False), (1, False), ((1, 2), False), ((1, 2), True)]
for ord_val in ord_options:
for dim_info in dim_options:
self.run_test(LinalgVectorNormModel(ord_val, dim_info), x)
def test_linalg_matrix_norm(self):
class LinalgMatrixNormModel(torch.nn.Module):
def __init__(self, ord_val, dim_val=(-2, -1), keepdim_val=False):
super(LinalgMatrixNormModel, self).__init__()
self.ord = ord_val
self.dim = dim_val
self.keepdim = keepdim_val
def forward(self, x):
return torch.linalg.matrix_norm(x, ord=self.ord, dim=self.dim, keepdim=self.keepdim)
x = torch.randn(2, 3, 5, 5)
ord_options = ['fro', float('inf'), -float('inf'), 1, -1]
for ord_val in ord_options:
self.run_test(LinalgMatrixNormModel(ord_val), x)
self.run_test(LinalgMatrixNormModel(ord_val, (0, 2)), x)
self.run_test(LinalgMatrixNormModel(ord_val, (0, 2), True), x)
# This test checks that the output scalar type in the ONNX graph is not null
# https://github.com/pytorch/pytorch/issues/28607
@skipIfUnsupportedMinOpsetVersion(10)
@ -10256,6 +10403,18 @@ class TestONNXRuntime(unittest.TestCase):
loaded_model = onnx.load_from_string(f.getvalue())
self.assertEqual(loaded_model.graph.output[0].type.tensor_type.shape.dim[1].dim_value, 128)
@skipIfUnsupportedMinOpsetVersion(10)
def test_quantized_linear(self):
model = torch.nn.quantized.Linear(1, 2)
input = torch.rand(1, 1)
input_tensor = torch.quantize_per_tensor(input, 1, 0, torch.quint8)
# Currently, we need to convert the model to a ScriptModule before export.
# The reason is that PackedParams contains int (not tensor).
# Then it fails when the exporter calls _trace_and_get_graph_from_model().
# TODO: https://msdata.visualstudio.com/Vienna/_workitems/edit/1547858
self.run_test(torch.jit.trace(model, input_tensor), (input_tensor,))
self.run_test(torch.jit.script(model), (input_tensor,))
def make_test(name, base, layer, bidirectional, initial_state,
variable_length, dropout, script_test_min_opset_version,
**extra_kwargs):

View File

@ -114,5 +114,42 @@ class TestONNXShapeInference(unittest.TestCase):
slice = g.op("Slice", input, start_input, end, axis, step)
self.run_test(g, slice.node(), expect_tensor(None, shape=(None, None)))
def test_broadcast_matmul(self):
g = self.create_empty_graph()
constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2))
constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1))
shape = g.op("MatMul", constant, constant_2)
self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 5, 1, 1)))
# test when first input is of rank 1
g = self.create_empty_graph()
constant = self.insert_tensor_constant(g, torch.ones(2))
constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1))
shape = g.op("MatMul", constant, constant_2)
self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 1, 1)))
# test when second input is of rank 1
g = self.create_empty_graph()
constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2))
constant_2 = self.insert_tensor_constant(g, torch.ones(2))
shape = g.op("MatMul", constant, constant_2)
self.run_test(g, shape.node(), expect_tensor("Float", shape=(5, 1)))
# test when both inputs are of rank 1
g = self.create_empty_graph()
constant = self.insert_tensor_constant(g, torch.ones(2))
constant_2 = self.insert_tensor_constant(g, torch.ones(2))
shape = g.op("MatMul", constant, constant_2)
self.run_test(g, shape.node(), expect_tensor("Float", shape=()))
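As a quick cross-check (a hand-worked sketch, not part of the test file), the expected shapes above mirror eager-mode torch.matmul broadcasting:
import torch
print(torch.matmul(torch.ones(5, 1, 2), torch.ones(3, 1, 2, 1)).shape)  # torch.Size([3, 5, 1, 1])
print(torch.matmul(torch.ones(2), torch.ones(3, 1, 2, 1)).shape)        # torch.Size([3, 1, 1])
print(torch.matmul(torch.ones(5, 1, 2), torch.ones(2)).shape)           # torch.Size([5, 1])
print(torch.matmul(torch.ones(2), torch.ones(2)).shape)                 # torch.Size([])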
def test_expand(self):
g = self.create_empty_graph()
input = g.addInput()
constant = self.insert_tensor_constant(g, torch.ones(2, 4))
input.setType(constant.type().with_sizes([None, None]))
shape = g.op("Shape", input)
expand = g.op("Expand", constant, shape)
self.run_test(g, expand.node(), expect_tensor("Float", shape=(None, None)))
if __name__ == '__main__':
unittest.main()

View File

@ -853,9 +853,10 @@ class TestQuantizeDBR(QuantizeDBRTestCase):
qconfig = torch.quantization.default_qconfig
self._test_auto_tracing(model_fp32, qconfig, (torch.randn(1, 1, 2, 2),))
@unittest.skip('this depends on unsupported syntax detection, currently disabled')
def test_vovnet_sequential(self):
# We cannot quantize SequentialAppendList directly because
# AutoQuantizationStateModuleDict would appear in self.items.
# However, we can wrap it and quantize the wrapper.
class SequentialAppendList(nn.Sequential):
def __init__(self, *args):
super(SequentialAppendList, self).__init__(*args)
@ -870,7 +871,16 @@ class TestQuantizeDBR(QuantizeDBRTestCase):
x = torch.cat(concat_list, dim=1)
return x
m = SequentialAppendList(torch.nn.Conv2d(1, 1, 1)).eval()
class Wrapper(nn.Module):
def __init__(self, *args):
super().__init__()
self.append_list = SequentialAppendList(*args)
def forward(self, x):
x = self.append_list(x)
return x
m = Wrapper(torch.nn.Conv2d(1, 1, 1)).eval()
qconfig = torch.quantization.default_qconfig
self._test_auto_tracing(m, qconfig, (torch.randn(1, 1, 1, 1),))
@ -922,10 +932,11 @@ class TestQuantizeDBR(QuantizeDBRTestCase):
model_fp32, qconfig, (torch.randn(1, 1, 2, 2),),
fuse_modules=False)
# this is broken because AutoQuantizationState appears in self.items
@unittest.skip('TODO fix this')
def test_module_calls_items(self):
class M(torch.nn.ModuleDict):
# We cannot quantize M1 directly because
# AutoQuantizationStateModuleDict would appear in self.items.
# However, we can wrap it and quantize the wrapper.
class M1(torch.nn.ModuleDict):
def __init__(self):
super().__init__()
for i in range(2):
@ -938,10 +949,22 @@ class TestQuantizeDBR(QuantizeDBRTestCase):
layers.append(layer(x))
return torch.cat(layers, dim=1)
model_fp32 = M().eval()
class M2(torch.nn.Module):
def __init__(self):
super().__init__()
self.m1 = M1()
def forward(self, x):
x = self.m1(x)
return x
model_fp32 = M2().eval()
qconfig = torch.quantization.default_qconfig
self._test_auto_tracing(
model_fp32, qconfig, (torch.randn(1, 1, 2, 2),))
model_fp32, qconfig, (torch.randn(1, 1, 2, 2),),
# TODO(future PR): implement observer sharing for torch.cat
# in DBR quant, to ensure that numerical behavior matches
do_fx_comparison=False)
def test_subclass_of_quantizeable_module(self):
"""

View File

@ -3,7 +3,6 @@
import torch
import torch.nn as nn
import torch.nn.quantized as nnq
import torch.nn.quantized._reference as nnqr
from torch.nn.utils.rnn import PackedSequence
from torch.ao.quantization import (
quantize,
@ -75,130 +74,6 @@ import unittest
import numpy as np
class TestQuantizeEagerOps(QuantizationTestCase):
def _test_reference_module_impl(self,
float_module_class,
quantized_module_class,
extra_module_kwargs,
input_size):
class M(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv = float_module_class(**extra_module_kwargs)
self.quant = QuantStub()
self.dequant = DeQuantStub()
def forward(self, x):
x = self.quant(x)
x = self.conv(x)
x = self.dequant(x)
return x
class RefM(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv = float_module_class(**extra_module_kwargs)
self.quant1 = QuantStub()
self.dequant1 = DeQuantStub()
self.quant2 = QuantStub()
self.dequant2 = DeQuantStub()
def forward(self, x):
x = self.quant1(x)
x = self.dequant1(x)
x = self.conv(x)
x = self.quant2(x)
x = self.dequant2(x)
return x
qengine = 'fbgemm'
with override_quantized_engine(qengine):
data = torch.randn(*input_size, dtype=torch.float)
original_m = M()
original_ref_m = RefM()
original_ref_m.conv.weight = torch.nn.Parameter(original_m.conv.weight.detach())
original_ref_m.conv.bias = torch.nn.Parameter(original_m.conv.bias.detach())
original_m.qconfig = torch.quantization.default_qconfig
m = prepare(original_m)
# calibration
m(data)
m = convert(m)
# check if the module is properly quantized
self.assertEqual(type(m.quant), nnq.Quantize)
self.assertEqual(type(m.conv), quantized_module_class)
self.assertEqual(type(m.dequant), nnq.DeQuantize)
res = m(data)
# quantize the reference model
original_ref_m.eval()
original_ref_m.qconfig = torch.quantization.default_qconfig
ref_m = prepare(original_ref_m)
ref_m(data)
reference_module_mapping = {
QuantStub: nnq.Quantize,
DeQuantStub: nnq.DeQuantize,
nn.Conv1d: nnqr.Conv1d,
nn.Conv2d: nnqr.Conv2d,
nn.Conv3d: nnqr.Conv3d,
nn.ConvTranspose1d: nnqr.ConvTranspose1d,
nn.ConvTranspose2d: nnqr.ConvTranspose2d,
nn.ConvTranspose3d: nnqr.ConvTranspose3d,
}
ref_m = convert(ref_m, mapping=reference_module_mapping)
ref_res = ref_m(data)
self.assertEqual(res, ref_res)
def test_conv_1d(self):
self._test_reference_module_impl(
nn.Conv1d,
nnq.Conv1d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 1)
)
def test_conv_2d(self):
self._test_reference_module_impl(
nn.Conv2d,
nnq.Conv2d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 10, 10)
)
def test_conv_3d(self):
self._test_reference_module_impl(
nn.Conv3d,
nnq.Conv3d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 10, 10, 10)
)
def test_conv_transpose_1d(self):
self._test_reference_module_impl(
nn.ConvTranspose1d,
nnq.ConvTranspose1d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 1)
)
def test_conv_transpose_2d(self):
self._test_reference_module_impl(
nn.ConvTranspose2d,
nnq.ConvTranspose2d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 10, 10)
)
def test_conv_transpose_3d(self):
self._test_reference_module_impl(
nn.ConvTranspose3d,
nnq.ConvTranspose3d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 10, 10, 10)
)
def _test_activation_op_impl(
self, float_module_class, quantized_module_class, extra_module_kwargs):
""" Implementation for testing common activation ops like leaky relu

View File

@ -1,5 +1,6 @@
# Owner(s): ["oncall: quantization"]
import copy
import math
import torch
import torch.nn as nn
@ -10,6 +11,7 @@ from torch.nn.modules.utils import _pair
import torch.nn.quantized as nnq
import torch.nn.quantized.dynamic as nnqd
import torch.nn.qat as nnqat
import torch.nn.intrinsic.qat as nniqat
import torch.nn.qat.dynamic as nnqatd
from torch.ao.quantization import (
prepare,
@ -984,6 +986,43 @@ class TestQuantizeEagerQATNumerics(QuantizationTestCase):
qat_op_optim.step()
qat_ref_op_optim.step()
@override_qengines
def test_linear_bn_numerics(self):
qengine = torch.backends.quantized.engine
m_ref = nn.Sequential(
nn.Linear(4, 4),
nn.BatchNorm1d(4),
)
m_ref_copy = copy.deepcopy(m_ref)
m_ref_copy = torch.ao.quantization.fuse_modules_qat(m_ref_copy, [['0', '1']])
qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine)
m_ref_copy[0].qconfig = qconfig
m = nniqat.LinearBn1d.from_float(m_ref_copy[0])
# without fake_quants, fused QAT module should match fp32 module
m.apply(torch.quantization.disable_fake_quant)
data = torch.randn(4, 4)
r1 = m_ref(data)
r2 = m(data)
self.assertTrue(torch.allclose(r1, r2))
@override_qengines
def test_linear_bn_workflow(self):
qengine = torch.backends.quantized.engine
m = nn.Sequential(
QuantStub(),
nn.Linear(4, 4),
nn.BatchNorm1d(4),
)
data = torch.randn(4, 4)
m.qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine)
m = torch.ao.quantization.fuse_modules_qat(m, [['1', '2']])
mp = prepare_qat(m)
mp(data)
mq = convert(mp)
self.assertTrue(type(mq[1]) == nnq.Linear)
self.assertTrue(type(mq[2]) == nn.Identity)
if __name__ == '__main__':
raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
"\tpython test/test_quantization.py TESTNAME\n\n"

View File

@ -17533,51 +17533,12 @@ class TestNNDeviceType(NNTestCase):
)
self.assertEqual(output_non_contig, output_contig)
@onlyCUDA
@dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long)))
def test_embedding_bag_bfloat16(self, device, dtypes):
self._test_EmbeddingBag(device, 'sum', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True)
self._test_EmbeddingBag(device, 'mean', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True)
@dtypesIfCUDA(torch.float)
@dtypes(torch.float)
def test_transform_bias_rescale_qkv(self, device, dtype):
# TODO: debug CPU test failure with settings (48, 4, 16, 8) and add that mode
tests = [
(64, 4, 16, 8),
# dim_per_head = 12 does not divide evenly by CPU vectorization length of 8
(24, 2, 4, 2),
# Make sure CUDA can handle small input sizes
(2, 2, 2, 2),
# dim_per_head = 6 does not divide evenly by CUDA vectorization length of 4, causes alignment issues
(24, 4, 4, 2)
]
for (embed_dim, num_heads, sl, bs) in tests:
x = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype) * 10
qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype)
with torch.no_grad():
(q, k, v) = torch._transform_bias_rescale_qkv(x @ qkv.weight.t(), qkv.bias, num_head=num_heads)
def simple_transform_bias_rescale_qkv(qkv, bias):
(q, k, v) = torch.split(qkv, embed_dim, dim=-1)
(q_bias, k_bias, v_bias) = torch.split(bias, embed_dim, dim=-1)
return tuple(
x.reshape((sl, bs, num_heads, embed_dim // num_heads)).transpose(2, 1)
for x in (
(q + q_bias) / math.sqrt(embed_dim // num_heads),
(k + k_bias),
(v + v_bias)
)
)
correct_q, correct_k, correct_v = simple_transform_bias_rescale_qkv(x @ qkv.weight.t(), qkv.bias)
self.assertEqual(q.size(), correct_q.size())
self.assertTrue(torch.allclose(q, correct_q))
self.assertTrue(torch.allclose(k, correct_k))
self.assertTrue(torch.allclose(v, correct_v))
@onlyCUDA
@dtypes(torch.half, torch.float, torch.double)
def test_multihead_attention_dtype(self, device, dtype):

View File

@ -5773,7 +5773,7 @@ class TestTorch(TestCase):
for reduce in reduces:
for dim in range(len(shape)):
output = input._scatter_reduce(dim, index, reduce, output_size=output_size)
output = input.scatter_reduce(dim, index, reduce, output_size=output_size)
# Check that output is of the correct size
output_shape = copy.copy(shape)
@ -5807,16 +5807,16 @@ class TestTorch(TestCase):
self.assertTrue(torch.allclose(output, expected))
with self.assertRaisesRegex(RuntimeError, "Expected `dim` to be in range -3 to 2"):
torch._scatter_reduce(input, 4, index, "sum")
torch.scatter_reduce(input, 4, index, "sum")
with self.assertRaisesRegex(RuntimeError, "Shape mismatch"):
index2 = torch.randint(0, output_size, (10, ), dtype=torch.long, device=device)
torch._scatter_reduce(input, 0, index2, "sum")
torch.scatter_reduce(input, 0, index2, "sum")
with self.assertRaisesRegex(RuntimeError, "Expected `index` values to be in range 0 to 2"):
input2 = torch.randn(10, dtype=dtype, device=device)
index2 = torch.tensor([0, 1, 0, 1, 2, 3, 3, 4, 4, 3])
torch._scatter_reduce(input2, 0, index2, "sum", output_size=2)
torch.scatter_reduce(input2, 0, index2, "sum", output_size=2)
def test_structseq_repr(self):
a = torch.arange(250).reshape(5, 5, 10)

2
third_party/fbgemm vendored

@ -1 +1 @@
Subproject commit 365abe3ee878b2592e9a33f937d96df0048d99dd
Subproject commit ab3ca6647d3f4be25423c5f997256a8a219fb762

View File

@ -2595,6 +2595,6 @@
- name: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
output_differentiability: [False]
- name: _scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
- name: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
self: scatter_reduce_backward(grad, self, dim, index, reduce, result)
index: non_differentiable

View File

@ -1176,7 +1176,6 @@ aten_native_source_non_codegen_list = [
"aten/src/ATen/native/quantized/library.cpp",
"aten/src/ATen/quantized/QTensorImpl.cpp",
"aten/src/ATen/quantized/Quantizer.cpp",
"aten/src/ATen/native/attention.cpp",
"aten/src/ATen/native/Activation.cpp",
"aten/src/ATen/native/AdaptiveAveragePooling.cpp",
"aten/src/ATen/native/AdaptiveAveragePooling3d.cpp",

View File

@ -1 +1 @@
21ca53c291a88b53dac85751b7a0203ca610ac94b7adaff3c092cf30df4168f2
e1c8b97b919541a99e0a355df5c3f9e8abebc64259dbee6f8c68e1ef90582856

View File

@ -1 +1 @@
5fde7bccf65032da297dfb1f18e4a95e96e278fa397e9dcaf364dfe23ec46353
1485a242a96c737ba7cdd9f259114f2201accdb46d87ac7a8650b1a814cd4d4d

View File

@ -193,50 +193,45 @@ In multiline mode, each line next includes the name of a CircleCI job,
followed by the time of the specified test in that job at that commit.
Example:
$ tools/stats/test_history.py --mode=multiline --ref=594a66 --sha-length=8 --test=test_set_dir \
--job pytorch_linux_xenial_py3_6_gcc5_4_test --job pytorch_linux_xenial_py3_6_gcc7_test
2021-02-10 11:13:34Z 594a66d7 pytorch_linux_xenial_py3_6_gcc5_4_test 0.36s
2021-02-10 11:13:34Z 594a66d7 pytorch_linux_xenial_py3_6_gcc7_test 0.573s errored
2021-02-10 10:13:25Z 9c0caf03 pytorch_linux_xenial_py3_6_gcc5_4_test 0.819s
2021-02-10 10:13:25Z 9c0caf03 pytorch_linux_xenial_py3_6_gcc7_test 0.449s
2021-02-10 10:09:14Z 602434bc pytorch_linux_xenial_py3_6_gcc5_4_test 0.361s
2021-02-10 10:09:14Z 602434bc pytorch_linux_xenial_py3_6_gcc7_test 0.454s
2021-02-10 10:09:10Z 2e35fe95 (no reports in S3)
2021-02-10 10:09:07Z ff73be7e (no reports in S3)
2021-02-10 10:05:39Z 74082f0d (no reports in S3)
2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc5_4_test 0.414s
2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc5_4_test 0.476s
2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc7_test 0.377s
2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc7_test 0.326s
$ tools/stats/test_history.py --mode=multiline --ref=86a961af879 --sha-length=8 \
--test=test_composite_compliance_dot_cpu_float32 \
--job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1
2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc5.4-test-default1 0.001s
2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc7-test-default1 0.001s
2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc5.4-test-default1 0.001s
2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc7-test-default1 0.001s
2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc5.4-test-default1 0.001s
2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc7-test-default1 0.001s
2022-02-18 13:14:56Z e73eaffd (no reports in S3)
2022-02-18 06:29:12Z 710f12f5 linux-xenial-py3.7-gcc5.4-test-default1 0.001s
Another multiline example, this time with the --all flag:
$ tools/stats/test_history.py --mode=multiline --all --ref=321b9 --delta=12 --sha-length=8 \
--test=test_qr_square_many_batched_complex_cuda
2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2 424.284s
2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test 0.006s skipped
2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test 402.572s
2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test 287.164s
2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2 436.732s
2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test 0.006s skipped
2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test 407.616s
2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test 287.044s
$ tools/stats/test_history.py --mode=multiline --all --ref=86a961af879 --delta=12 --sha-length=8 \
--test=test_composite_compliance_dot_cuda_float32
2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-default1 0.001s skipped
2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-slow1 0.001s skipped
2022-02-18 03:49:46Z 69389fb5 linux-xenial-cuda11.3-py3.7-gcc7-test-default1 0.001s skipped
2022-02-18 03:49:46Z 69389fb5 periodic-linux-bionic-cuda11.5-py3.7-gcc7-test-default1 0.001s skipped
2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test-default1 0.001s skipped
2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test-default1 0.001s skipped
In columns mode, the name of the job isn't printed, but the order of the
columns is guaranteed to match the order of the jobs passed on the
command line. Example:
$ tools/stats/test_history.py --mode=columns --ref=3cf783 --sha-length=8 --test=test_set_dir \
--job pytorch_linux_xenial_py3_6_gcc5_4_test --job pytorch_linux_xenial_py3_6_gcc7_test
2021-02-10 12:18:50Z 3cf78395 0.644s 0.312s
2021-02-10 11:13:34Z 594a66d7 0.360s errored
2021-02-10 10:13:25Z 9c0caf03 0.819s 0.449s
2021-02-10 10:09:14Z 602434bc 0.361s 0.454s
2021-02-10 10:09:10Z 2e35fe95
2021-02-10 10:09:07Z ff73be7e
2021-02-10 10:05:39Z 74082f0d
2021-02-10 07:42:29Z 0620c96f 0.414s 0.377s (2 job re-runs omitted)
2021-02-10 07:27:53Z 33afb5f1 0.381s 0.294s
$ tools/stats/test_history.py --mode=columns --ref=86a961af879 --sha-length=8 \
--test=test_composite_compliance_dot_cpu_float32 \
--job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1
2022-02-18 15:47:37Z 86a961af 0.001s 0.001s
2022-02-18 15:12:34Z f5e201e4 0.001s 0.001s
2022-02-18 13:14:56Z 1c0df265 0.001s 0.001s
2022-02-18 13:14:56Z e73eaffd
2022-02-18 06:29:12Z 710f12f5 0.001s 0.001s
2022-02-18 05:20:30Z 51b04f27 0.001s 0.001s
2022-02-18 03:49:46Z 69389fb5 0.001s 0.001s
2022-02-18 00:19:12Z 056b6260 0.001s 0.001s
2022-02-17 23:58:32Z 39fb7714 0.001s 0.001s
Minor note: in columns mode, a blank cell means that no report was found
in S3, while the word "absent" means that a report was found but the

View File

@ -53,6 +53,7 @@ def parse_description(description: str) -> List[Example]:
return examples
@unittest.skip("Skipping as this test is fragile, issue #73083")
class TestTestHistory(unittest.TestCase):
maxDiff = None

View File

@ -326,7 +326,8 @@ def _jit_pass_onnx_remove_print(graph: Graph) -> None: ...
def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ...
def _jit_pass_onnx_unpack_quantized_weights(
graph: Graph,
paramsDict: Dict[str, IValue]
paramsDict: Dict[str, IValue],
caffe2: _bool
) -> Dict[str, IValue]: ...
def _jit_pass_onnx_quantization_insert_permutes(
graph: Graph,
@ -409,7 +410,7 @@ def _import_ir_module_from_package(
) -> ScriptModule: ...
def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ...
def _check_onnx_proto(proto: str) -> None: ...
def _check_onnx_proto(proto: str, full_check: _bool = False) -> None: ...
def _propagate_and_assign_input_shapes(
graph: Graph,
inputs: Tuple[Tensor, ...],

View File

@ -3374,6 +3374,12 @@ Example::
""".format(**reproducibility_notes))
add_docstr_all('scatter_reduce', r"""
scatter_reduce(input, dim, index, reduce, *, output_size=None) -> Tensor
See :func:`torch.scatter_reduce`
""")
add_docstr_all('select',
r"""
select(dim, index) -> Tensor

View File

@ -8547,6 +8547,59 @@ scatter_add(input, dim, index, src) -> Tensor
Out-of-place version of :meth:`torch.Tensor.scatter_add_`
""")
add_docstr(torch.scatter_reduce, r"""
scatter_reduce(input, dim, index, reduce, *, output_size=None) -> Tensor
Reduces all values from the :attr:`input` tensor to the indices specified in
the :attr:`index` tensor. For each value in :attr:`input`, its output index is
specified by its index in :attr:`input` for ``dimension != dim`` and by the
corresponding value in :attr:`index` for ``dimension = dim``.
The applied reduction for non-unique indices is defined via the :attr:`reduce`
argument (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`).
For non-existing indices, the output will be filled with the identity of the
applied reduction (1 for :obj:`"prod"` and 0 otherwise).
It is also required that ``index.size(d) == input.size(d)`` for all dimensions ``d``.
Moreover, if :attr:`output_size` is defined, the values of :attr:`index` must be
between ``0`` and ``output_size - 1`` inclusive.
For a 3-D tensor with :obj:`reduce="sum"`, the output is given as::
out[index[i][j][k]][j][k] += input[i][j][k] # if dim == 0
out[i][index[i][j][k]][k] += input[i][j][k] # if dim == 1
out[i][j][index[i][j][k]] += input[i][j][k] # if dim == 2
Note:
This out-of-place operation is similar to the in-place versions of
:meth:`~torch.Tensor.scatter_` and :meth:`~torch.Tensor.scatter_add_`,
in which the output tensor is automatically created according to the
maximum values in :attr:`index` and filled based on the identity of the
applied reduction.
Note:
{forward_reproducibility_note}
Args:
input (Tensor): the input tensor
dim (int): the axis along which to index
index (LongTensor): the indices of elements to scatter and reduce.
reduce (str): the reduction operation to apply for non-unique indices
(:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`)
output_size (int, optional): the size of the output at dimension :attr:`dim`.
If set to :obj:`None`, will get automatically inferred according to
:obj:`index.max() + 1`
Example::
>>> input = torch.tensor([1, 2, 3, 4, 5, 6])
>>> index = torch.tensor([0, 1, 0, 1, 2, 1])
>>> torch.scatter_reduce(input, 0, index, reduce="sum", output_size=3)
tensor([4, 12, 5])
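When :attr:`output_size` is :obj:`None`, it is inferred as ``index.max() + 1``;
continuing the example above (result worked out by hand, shown only as an illustration)::
    >>> torch.scatter_reduce(input, 0, index, reduce="amax")
    tensor([3, 6, 5])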
""".format(**reproducibility_notes))
add_docstr(torch.select,
r"""
select(input, dim, index) -> Tensor

View File

@ -80,6 +80,7 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]:
nnqatd.Linear,
nnqd.Linear,
nniqat.LinearReLU,
nniqat.LinearBn1d,
nn.modules.linear.NonDynamicallyQuantizableLinear,
]),
# linear functionals
@ -572,6 +573,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
nniqat.ConvReLU2d,
nniqat.ConvReLU3d,
nniqat.LinearReLU,
nniqat.LinearBn1d,
nniqd.LinearReLU,
])

View File

@ -14,6 +14,8 @@ from .utils import (
get_torch_function_hook_type,
get_module_hook_type,
OpQuantizeabilityType,
AutoQuantizationStateModuleDict,
get_fqn_valid_for_module_dict_key,
)
from .model_utils import (
pack_weights_for_functionals,
@ -350,6 +352,8 @@ def add_auto_observation(
for _, child_child in child.named_modules():
leaves.add(child_child)
self._fqn_to_auto_quant_state_map = AutoQuantizationStateModuleDict()
for fqn, v in named_modules:
# fqn is the global FQN, i.e. 'foo.bar.baz'
@ -366,14 +370,39 @@ def add_auto_observation(
if v is self:
# for the top level module only, specify input
# and output dtypes
v._auto_quant_state = AutoQuantizationState(
auto_quant_state = AutoQuantizationState(
qconfig_dict, fqn,
input_dtypes, output_dtypes)
pass
else:
v._auto_quant_state = AutoQuantizationState(
auto_quant_state = AutoQuantizationState(
qconfig_dict, fqn)
# The code below registers the auto_quant_state object
# of the child in the module hierarchy of the parent,
# and adds the auto_quant_state object to the child
# with a raw __setattr__, without registering it in
# the module hierarchy of the child.
# This is solving the problem of both storing extra state
# (observers) as well as not modifying the meaning of user
# code in child modules which iterates over all module
# children.
#
# This narrows down the issue of dynamically adding
# children to only affect the top level module and not
# the children.
# On the parent, register this module in the FQN map
fqn_to_use_for_key = \
get_fqn_valid_for_module_dict_key(fqn)
self._fqn_to_auto_quant_state_map[fqn_to_use_for_key] = \
auto_quant_state
# On the child, manually set the attribute without
# going through the `torch.nn.Module.__setattr__`
# function, to prevent this object from appearing in
# the child's module hierarchy.
object.__setattr__(
v, '_auto_quant_state', auto_quant_state)
global_op_idx[0] = 0
output = super().__call__(*new_args, **new_kwargs)
@ -688,6 +717,6 @@ def add_auto_convert(module : torch.nn.Module) -> torch.nn.Module:
# checking the fix into `torch.nn.Sequential` to avoid the patch.
def _nn_sequential_patched_forward(cls, input):
for module in cls:
if not isinstance(module, AutoQuantizationState):
if not isinstance(module, AutoQuantizationStateModuleDict):
input = module(input)
return input
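A standalone sketch (not part of this diff) of the attribute trick described in the registration comment above: a normal assignment registers a module as a child, while a raw object.__setattr__ keeps it out of the child's module hierarchy.
import torch

class State(torch.nn.Module):
    pass

host = torch.nn.Linear(2, 2)
state = State()

host.extra = state                          # registered: visible to code iterating over children
print(list(dict(host.named_children())))    # ['extra']

del host.extra
object.__setattr__(host, 'extra', state)    # raw set: hidden from the module hierarchy
print(list(dict(host.named_children())))    # []
print(host.extra is state)                  # True, still reachable as a plain attribute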

View File

@ -8,7 +8,10 @@ import torch
import torch.fx
from .mappings import conv_ops
from .quantization_state import AutoQuantizationState
from .utils import get_packable_arg_idxs
from .utils import (
get_packable_arg_idxs,
AutoQuantizationStateModuleDict,
)
class AllModuleTracer(torch.fx.Tracer):
"""
@ -207,7 +210,7 @@ class AllModuleTracer(torch.fx.Tracer):
# class.
# TODO(future): remove the hack
def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args : Tuple[Any, ...], kwargs : Dict[str, Any]) -> Any:
if isinstance(m, AutoQuantizationState):
if isinstance(m, AutoQuantizationStateModuleDict):
return args[0]
return super().call_module(m, forward, args, kwargs)

View File

@ -583,10 +583,9 @@ def get_torch_function_hook_type(
# the direct __dict__ accesses are for performance, because
# the default `torch.nn.Module.__getattr__` has overhead.
parent_module_has_qstate = parent_module is not None and \
'_modules' in parent_module.__dict__ and \
'_auto_quant_state' in parent_module.__dict__['_modules']
'_auto_quant_state' in parent_module.__dict__
needs_op_hooks = parent_module_has_qstate and \
parent_module.__dict__['_modules']['_auto_quant_state'].cur_op_needs_hooks(func) # type: ignore[union-attr, operator]
parent_module.__dict__['_auto_quant_state'].cur_op_needs_hooks(func) # type: ignore[union-attr, operator]
if needs_op_hooks:
return HookType.OP_HOOKS
@ -608,17 +607,15 @@ def get_module_hook_type(
if cached_hook_type is not None:
return cached_hook_type
parent_module_has_qstate = parent_module is not None and \
'_modules' in parent_module.__dict__ and \
'_auto_quant_state' in parent_module.__dict__['_modules']
'_auto_quant_state' in parent_module.__dict__
needs_op_hooks = parent_module_has_qstate and \
parent_module.__dict__['_modules']['_auto_quant_state'].cur_op_needs_hooks(cur_module) # type: ignore[union-attr, operator]
parent_module.__dict__['_auto_quant_state'].cur_op_needs_hooks(cur_module) # type: ignore[union-attr, operator]
# We need IO hooks if
# * we are calling forward on a module (always True here)
# * that module has quant state
# * that module does not need op hooks for the parent
needs_io_hooks = (
'_modules' in cur_module.__dict__ and
'_auto_quant_state' in cur_module.__dict__['_modules'] and
'_auto_quant_state' in cur_module.__dict__ and
(not needs_op_hooks)
)
needs_arg_dequants = parent_module_has_qstate and not needs_op_hooks
@ -727,3 +724,18 @@ def get_cur_qconfig(
qconfig_dict, cur_op_type, cur_fqn, global_qconfig)
return qconfig
# We store quantization state for all children on the top level module in a
# ModuleDict. In order to properly special case this module from other
# ModuleDict instances, we create a marker class for it.
class AutoQuantizationStateModuleDict(torch.nn.ModuleDict):
pass
def get_fqn_valid_for_module_dict_key(fqn: str) -> str:
"""
Modifies `fqn` to make it a valid key to a ModuleDict.
"""
if fqn == '':
fqn = ' '
return fqn.replace('.', ':')
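A small illustration of why the rewrite above is needed (assuming only stock torch.nn.ModuleDict behavior): keys containing dots are rejected, while ':'-separated keys are accepted.
import torch

d = torch.nn.ModuleDict()
d['features:0:conv'] = torch.nn.Conv2d(1, 1, 1)      # ':'-separated key is accepted
try:
    d['features.0.conv'] = torch.nn.Conv2d(1, 1, 1)  # dotted FQN is rejected
except KeyError as exc:
    print(exc)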

View File

@ -82,6 +82,8 @@ def prepare(model, qconfig_dict, example_inputs, inplace=False, allow_list=None,
for v in parents_to_delete_auto_quant_state:
del v._auto_quant_state
del model._fqn_to_auto_quant_state_map
# the model hierarchy might have changed during fusion, so we
# have to delete the cached module hook types
for k, v in model.named_modules():

View File

@ -114,7 +114,12 @@ def fuse_linear_bn(is_qat, linear, bn):
if is_qat:
# TODO: remove the assert later
assert linear.training, "qat is only supported when linear.training is True currently"
raise Exception("Fusing Linear+BatchNorm not yet supported in training.")
assert bn.num_features == linear.out_features,\
"Output features of Linear must match num_features of BatchNorm1d"
assert bn.affine, "Only support fusing BatchNorm1d with affine set to True"
assert bn.track_running_stats,\
"Only support fusing BatchNorm1d with tracking_running_stats set to True"
return nni.LinearBn1d(linear, bn)
else:
return nn.utils.fusion.fuse_linear_bn_eval(linear, bn)
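For the eval-mode branch above, a minimal usage sketch (relying only on the fuse_linear_bn_eval helper referenced here): the fused Linear should reproduce Linear followed by BatchNorm1d.
import torch

linear = torch.nn.Linear(4, 4).eval()
bn = torch.nn.BatchNorm1d(4).eval()
fused = torch.nn.utils.fusion.fuse_linear_bn_eval(linear, bn)

x = torch.randn(2, 4)
print(torch.allclose(fused(x), bn(linear(x)), atol=1e-6))  # True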

View File

@ -77,6 +77,7 @@ DEFAULT_STATIC_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = {
nniqat.ConvReLU2d: nniq.ConvReLU2d,
nniqat.ConvReLU3d: nniq.ConvReLU3d,
nniqat.LinearReLU: nniq.LinearReLU,
nniqat.LinearBn1d: nnq.Linear,
# QAT modules:
nnqat.Linear: nnq.Linear,
nnqat.Conv2d: nnq.Conv2d,
@ -99,6 +100,7 @@ DEFAULT_QAT_MODULE_MAPPINGS : Dict[Callable, Any] = {
nni.ConvReLU2d: nniqat.ConvReLU2d,
nni.ConvReLU3d: nniqat.ConvReLU3d,
nni.LinearReLU: nniqat.LinearReLU,
nni.LinearBn1d: nniqat.LinearBn1d,
}
# Default map for swapping dynamic modules

View File

@ -16,7 +16,7 @@ from torch.ao.quantization.quantization_mappings import (
_has_special_act_post_process,
_get_special_act_post_process,
)
from .utils import get_qparam_dict
from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper
from torch.ao.quantization.qconfig import (
add_module_to_qconfig_obs_ctr,
@ -565,15 +565,7 @@ def swap_module(mod, mapping, custom_module_class_mapping):
new_mod = custom_module_class_mapping[type(mod)].from_observed(mod)
swapped = True
elif type(mod) in mapping:
qmod = mapping[type(mod)]
if hasattr(qmod, '_IS_REFERENCE') and qmod._IS_REFERENCE:
assert mod.qconfig is not None
weight_post_process = mod.qconfig.weight()
weight_post_process(mod.weight)
weight_qparams = get_qparam_dict(weight_post_process)
new_mod = qmod.from_float(mod, weight_qparams)
else:
new_mod = qmod.from_float(mod)
new_mod = mapping[type(mod)].from_float(mod)
swapped = True
if swapped:

View File

@ -589,11 +589,10 @@ PyObject *THPModule_supportedQEngines(PyObject *_unused, PyObject *noargs)
{
auto qengines = at::globalContext().supportedQEngines();
auto list = THPObjectPtr(PyList_New(qengines.size()));
if (!list) return nullptr;
for (const auto i : c10::irange(qengines.size())) {
PyObject *i64 = THPUtils_packInt64(static_cast<int>(qengines[i]));
if (!i64) {
throw python_error();
}
if (!i64) return nullptr;
PyList_SET_ITEM(list.get(), i, i64);
}
return list.release();
@ -607,22 +606,18 @@ PyObject *THPModule_isEnabledXNNPACK(PyObject *_unused, PyObject *noargs)
PyObject *THPModule_setDefaultMobileCPUAllocator(PyObject *_unused, PyObject *noargs)
{
try {
at::globalContext().setDefaultMobileCPUAllocator();
} catch (c10::Error& e) {
THPUtils_setError(e.what());
}
HANDLE_TH_ERRORS
at::globalContext().setDefaultMobileCPUAllocator();
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
PyObject *THPModule_unsetDefaultMobileCPUAllocator(PyObject *_unused, PyObject *noargs)
{
try {
at::globalContext().unsetDefaultMobileCPUAllocator();
} catch (c10::Error& e) {
THPUtils_setError(e.what());
}
HANDLE_TH_ERRORS
at::globalContext().unsetDefaultMobileCPUAllocator();
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
static PyObject * THPModule_vmapmode_increment_nesting(PyObject* _unused, PyObject *arg) {

View File

@ -155,6 +155,19 @@ struct OpEventData {
torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr;
};
struct MemoryEventData {
int64_t start_time;
void* ptr;
int64_t alloc_size;
int64_t total_allocated;
int64_t total_reserved;
uint64_t threadID;
torch::profiler::impl::kineto::DeviceAndResource kineto_info;
c10::DeviceType device_type;
c10::DeviceIndex device_index;
};
static_assert(std::is_pod<MemoryEventData>::value, "Non-POD member of MemoryEventData.");
// Assumption: Total threads number will not exceed 2^16-1, and total ops will
// not exceed 2^48 -1.
static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) {
@ -204,29 +217,16 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
int64_t total_reserved,
c10::Device device) override {
if (config_.profile_memory && config_.state != ProfilerState::Disabled) {
std::lock_guard<std::mutex> guard(state_mutex_);
auto start_time = getTimeUs();
if (cpu_trace_) {
torch::profiler::impl::kineto::recordThreadInfo();
cpu_trace_.addMemoryUsageActivity(
kMemoryEventName,
torch::profiler::impl::kineto::kineto_ids(),
start_time,
device,
ptr,
alloc_size,
total_allocated,
total_reserved);
}
kineto_events_.emplace_back();
auto& evt = kineto_events_.back();
evt.name(kMemoryEventName)
.startUs(start_time)
.deviceIndex(device.index())
.deviceType(device.type())
.nBytes(alloc_size)
.startThreadId(at::RecordFunction::currentThreadId());
memory_events_.push_back(
{getTimeUs(),
ptr,
alloc_size,
total_allocated,
total_reserved,
at::RecordFunction::currentThreadId(),
torch::profiler::impl::kineto::kineto_ids(),
device.type(),
device.index()});
}
}
@ -264,6 +264,28 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
void materializeOpEvents() {
std::lock_guard<std::mutex> guard(state_mutex_);
for (const auto& e : memory_events_) {
cpu_trace_.addMemoryUsageActivity(
kMemoryEventName,
e.kineto_info,
e.start_time,
c10::Device(e.device_type, e.device_index),
e.ptr,
e.alloc_size,
e.total_allocated,
e.total_reserved);
kineto_events_.emplace_back();
auto& evt = kineto_events_.back();
evt.name(kMemoryEventName)
.startUs(e.start_time)
.deviceIndex(e.device_index)
.deviceType(e.device_type)
.nBytes(e.alloc_size)
.startThreadId(e.threadID);
}
for (const auto& e : op_events_) {
if (e.end_us_ < e.start_us_) {
// We initialize end_us_ to the smallest int64_t, so this means that
@ -406,7 +428,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
py_event_indices_{
{ nullptr,
std::string("null") }};
for (size_t i = 0; i < py_events.size(); i++) {
for (const auto i : c10::irange(py_events.size())) {
py_event_indices_.insert({py_events[i].get(), std::to_string(i)});
}
@ -585,6 +607,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
uint64_t start_time_;
std::set<torch::profiler::impl::ActivityType> activities_;
std::deque<OpEventData> op_events_;
std::deque<MemoryEventData> memory_events_;
torch::profiler::impl::kineto::TraceWrapper cpu_trace_;
std::vector<KinetoEvent> kineto_events_;
// Optional, if event post-processing is enabled.

View File

@ -833,8 +833,7 @@ void gather(
if (cur_rank == root)
{
for (int r = 0; r < numranks; r++)
{
for (const auto r : c10::irange(numranks)) {
if (r != root) {
auto* recvbuff = reinterpret_cast<char*>(outputs[r].data_ptr());
NCCL_CHECK(ncclRecv(recvbuff, count, type, r, comm, stream));
@ -874,8 +873,7 @@ void scatter(
NCCL_CHECK(ncclGroupStart());
if (cur_rank == root)
{
for (int r = 0; r < numranks; r++)
{
for (const auto r : c10::irange(numranks)) {
if (r != root) {
size_t send_count = inputs[r].numel();
auto send_type = to_nccl_data_type(inputs[r]);

View File

@ -10,6 +10,7 @@
#include <iostream>
#include <vector>
#include <c10/util/irange.h>
#include <fmt/format.h>
#define ERROR(msg_fmt, ...) \
@ -47,7 +48,7 @@ int main(int argc, const char** argv) {
auto program_headers = (Elf64_Phdr*)(data + header->e_phoff);
auto n_program_headers = header->e_phnum;
Elf64_Dyn* dynamic = nullptr;
for (size_t i = 0; i < n_program_headers; ++i) {
for (const auto i : c10::irange(n_program_headers)) {
const Elf64_Phdr* phdr = &program_headers[i];
if (phdr->p_type == PT_DYNAMIC) {
dynamic = reinterpret_cast<Elf64_Dyn*>(data + phdr->p_offset);

View File

@ -650,11 +650,13 @@ Example::
.def(
"get",
[](::c10d::Store& store, const std::string& key) -> py::bytes {
auto value = store.get(key);
auto value = [&]() {
py::gil_scoped_release guard;
return store.get(key);
}();
return py::bytes(
reinterpret_cast<char*>(value.data()), value.size());
},
py::call_guard<py::gil_scoped_release>(),
R"(
Retrieves the value associated with the given ``key`` in the store. If ``key`` is not
present in the store, the function will wait for ``timeout``, which is defined

View File

@ -147,7 +147,7 @@ c10::optional<at::Tensor> runTorchSlice_opset10(
return c10::nullopt;
}
auto axes_a = inputTensorValues[3].accessor<int64_t, 1>();
axes.reserve(inputTensorValues[3].sizes()[0]);
axes.resize(inputTensorValues[3].sizes()[0]);
// ONNX slice accepts negative axis, fix this for aten op
for (const auto i : c10::irange(inputTensorValues[3].sizes()[0])) {
axes[i] = axes_a[i] < 0 ? axes_a[i] + inputTensorValues[0].sizes().size()

View File

@ -61,5 +61,12 @@ Node* transformToONNXConcatNode(
bool need_new_input,
int opset_version);
class ScalarTypeHashFunction {
public:
size_t operator()(const c10::ScalarType& type) const {
return static_cast<size_t>(type);
}
};
} // namespace jit
} // namespace torch

View File

@ -761,6 +761,25 @@ static void fuseListConstructListUnpack(Block* b) {
}
}
// https://github.com/pytorch/pytorch/wiki/PyTorch-ONNX-exporter#quantized-model-export
static void eraseTupleConstruct(Block* block) {
size_t index = 0;
// TupleConstruct is generated from the symbolics in quantized domain, and
// consumed by other quantized operators. Any remaining TupleConstruct should
// be at the output of the blocks.
for (auto* output : block->outputs()) {
auto output_node = output->node();
if (output_node->kind() == prim::TupleConstruct) {
block->eraseOutput(index);
size_t input_index = 0;
for (auto* input : output_node->inputs()) {
block->insertOutput(index + (input_index++), input);
}
}
index++;
}
}
void removeMaxPoolUnusedOutput(Block* b) {
for (auto it = b->nodes().begin(), end = b->nodes().end(); it != end; ++it) {
auto n = *it;
@ -1025,6 +1044,7 @@ void PeepholeOptimizeONNX(
fuseListConstructListUnpack(graph->block());
fuseLogSoftmaxNllLoss(graph->block());
eraseListConstruct(graph->block(), opset_version);
eraseTupleConstruct(graph->block());
EliminateDeadCode(
graph->block(),
true,

View File

@ -1,6 +1,7 @@
#include <c10/util/irange.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/onnx/helper.h>
#include <torch/csrc/jit/passes/onnx/scalar_type_analysis.h>
namespace torch {
@ -11,13 +12,6 @@ using namespace ::c10::onnx;
}
namespace {
class ScalarTypeHashFunction {
public:
size_t operator()(const c10::ScalarType& type) const {
return static_cast<size_t>(type);
}
};
const int ONNX_OPSET_14 = 14;
static const std::unordered_map<c10::ScalarType, int, ScalarTypeHashFunction>

View File

@ -702,54 +702,59 @@ void SetShapeValueFromListConstructNode(Node* lc_node) {
}
}
std::vector<::c10::ShapeSymbol> Broadcast(const std::vector<::c10::ShapeSymbol> &input_shape_value_0,
const std::vector<::c10::ShapeSymbol> &input_shape_value_1) {
size_t rank_0 = input_shape_value_0.size();
size_t rank_1 = input_shape_value_1.size();
size_t rank_max = std::max(rank_0, rank_1);
size_t rank_min = std::min(rank_0, rank_1);
std::vector<::c10::ShapeSymbol> final_shape;
final_shape.reserve(rank_max);
for (auto idx = 0; idx < rank_max; idx++) {
final_shape.emplace_back(::c10::ShapeSymbol::newSymbol());
}
for (auto idx = 0; idx < rank_min; idx++) {
const c10::ShapeSymbol& ss_shape_0 =
input_shape_value_0[rank_0 - 1 - idx];
const c10::ShapeSymbol& ss_shape_1 =
input_shape_value_1[rank_1 - 1 - idx];
bool is_static_0 = ss_shape_0.is_static();
bool is_static_1 = ss_shape_1.is_static();
if (is_static_0 && is_static_1) {
int64_t static_0_sz = ss_shape_0.static_size();
int64_t static_1_sz = ss_shape_1.static_size();
final_shape[rank_max - 1 - idx] = ::c10::ShapeSymbol::fromStaticSize(
std::max(static_0_sz, static_1_sz));
} else if (!is_static_0 && !is_static_1) {
if (ss_shape_0.value() == ss_shape_1.value()) {
final_shape[rank_max - 1 - idx] = ss_shape_0;
}
}
}
if (rank_0 < rank_1) {
for (size_t idx = rank_min; idx < rank_max; idx++) {
size_t shape_idx = rank_max - 1 - idx;
final_shape[shape_idx] = input_shape_value_1[shape_idx];
}
} else {
for (size_t idx = rank_min; idx < rank_max; idx++) {
size_t shape_idx = rank_max - 1 - idx;
final_shape[shape_idx] = input_shape_value_0[shape_idx];
}
}
return final_shape;
}
void ProcessBroadcastNode(Node* n) {
TORCH_INTERNAL_ASSERT(n->inputs().size() == 2);
if (ConstantValueMap::HasShape(n->input(0)->debugName()) &&
ConstantValueMap::HasShape(n->input(1)->debugName())) {
auto input_shape_0 = ConstantValueMap::GetShape(n->input(0)->debugName());
auto input_shape_value_0 = input_shape_0.value().sizes();
auto input_shape_value_0 = input_shape_0.value().sizes().value();
auto input_shape_1 = ConstantValueMap::GetShape(n->input(1)->debugName());
auto input_shape_value_1 = input_shape_1.value().sizes();
size_t rank_0 = input_shape_value_0.value().size();
size_t rank_1 = input_shape_value_1.value().size();
size_t rank_max = std::max(rank_0, rank_1);
size_t rank_min = std::min(rank_0, rank_1);
std::vector<::c10::ShapeSymbol> final_shape;
final_shape.reserve(rank_max);
for (auto idx = 0; idx < rank_max; idx++) {
final_shape.emplace_back(::c10::ShapeSymbol::newSymbol());
}
for (auto idx = 0; idx < rank_min; idx++) {
const c10::ShapeSymbol& ss_shape_0 =
input_shape_value_0.value()[rank_0 - 1 - idx];
const c10::ShapeSymbol& ss_shape_1 =
input_shape_value_1.value()[rank_1 - 1 - idx];
bool is_static_0 = ss_shape_0.is_static();
bool is_static_1 = ss_shape_1.is_static();
if (is_static_0 && is_static_1) {
int64_t static_0_sz = ss_shape_0.static_size();
int64_t static_1_sz = ss_shape_1.static_size();
final_shape[rank_max - 1 - idx] = ::c10::ShapeSymbol::fromStaticSize(
std::max(static_0_sz, static_1_sz));
} else if (!is_static_0 && !is_static_1) {
if (ss_shape_0.value() == ss_shape_1.value()) {
final_shape[rank_max - 1 - idx] = ss_shape_0;
}
}
}
if (rank_0 < rank_1) {
for (auto idx = rank_min; idx < rank_max; idx++) {
auto shape_idx = rank_max - 1 - idx;
final_shape[shape_idx] = input_shape_value_1.value()[shape_idx];
}
} else {
for (auto idx = rank_min; idx < rank_max; idx++) {
auto shape_idx = rank_max - 1 - idx;
final_shape[shape_idx] = input_shape_value_0.value()[shape_idx];
}
}
auto input_shape_value_1 = input_shape_1.value().sizes().value();
auto final_shape = Broadcast(input_shape_value_0, input_shape_value_1);
UpdateShape(n->output(0), c10::SymbolicShape(final_shape));
}
}
@ -857,6 +862,8 @@ void ProcessMatMulNode(Node* n) {
auto input_shape_value_1 = input_shape_1.sizes().value();
size_t rank_0 = input_shape_value_0.size();
size_t rank_1 = input_shape_value_1.size();
// Handle inputs of rank 1 just like numpy.matmul:
// https://numpy.org/doc/stable/reference/generated/numpy.matmul.html
auto is_rank_0_1 = false;
if (rank_0 == 1) {
input_shape_value_0.insert(
@ -870,25 +877,20 @@ void ProcessMatMulNode(Node* n) {
rank_1 = 2;
is_rank_1_1 = true;
}
size_t rank = std::max(rank_0, rank_1);
std::vector<::c10::ShapeSymbol> final_shape;
final_shape.reserve(rank);
if (rank_0 >= rank_1) {
for (auto idx = 0; idx < rank_0 - 2; idx++) {
final_shape.emplace_back(input_shape_value_0[idx]);
}
} else {
for (auto idx = 0; idx < rank_1 - 2; idx++) {
final_shape.emplace_back(input_shape_value_1[idx]);
}
// Per https://pytorch.org/docs/stable/generated/torch.matmul.html
// the broadcasting logic only applies to the batch dimensions, and not the matrix dimensions
// so we remove the matrix dimensions which are the last 2 dimensions before broadcasting
auto final_shape = Broadcast(
std::vector<::c10::ShapeSymbol>(input_shape_value_0.begin(), input_shape_value_0.end() - 2),
std::vector<::c10::ShapeSymbol>(input_shape_value_1.begin(), input_shape_value_1.end() - 2)
);
// add the last 2 dimensions back, unless they did not exist in the first place and were inserted by this function
// Then apply [n,k]X[k,m]=[n,m], where n=input_shape_value_0[rank_0 - 2], m=input_shape_value_1[rank_1 - 1]
if (!is_rank_0_1) {
final_shape.emplace_back(input_shape_value_0[rank_0 - 2]);
}
final_shape.emplace_back(input_shape_value_0[rank_0 - 2]);
final_shape.emplace_back(input_shape_value_1[rank_1 - 1]);
if (is_rank_0_1) {
final_shape.erase(final_shape.begin());
}
if (is_rank_1_1) {
final_shape.pop_back();
if (!is_rank_1_1) {
final_shape.emplace_back(input_shape_value_1[rank_1 - 1]);
}
UpdateShape(n->output(0), c10::SymbolicShape(final_shape));
}
@ -1374,6 +1376,8 @@ void ComputeConstant(Node* n, int opset_version) {
if (input0_shape_size.has_value()) {
auto input0_shape_value = input0_shape_size.value();
if (ConstantValueMap::HasValue(n->input(1)->debugName())) {
// When value of `shape` is statically known,
// output shape can be computed.
auto shape_temp = ConstantValueMap::GetValueInto1DInt64Vector(
n->input(1)->debugName());
auto final_shape =
@ -1381,6 +1385,23 @@ void ComputeConstant(Node* n, int opset_version) {
if (final_shape.has_value()) {
UpdateShape(n->output(), final_shape.value());
}
} else if (
auto expand_shape =
ConstantValueMap::GetShapeInto1DInt64VectorWithOneUnknown(
n->input(1)->debugName())) {
// When shape of `shape` is statically known,
// output rank can be computed.
TORCH_INTERNAL_ASSERT(
expand_shape.value().size() == 1,
"`Shape` input to `Expand` should be a 1-D tensor. Instead got rank ",
expand_shape.value().size());
if (expand_shape.value()[0] > 0) {
std::vector<c10::ShapeSymbol> final_shape;
for (const auto i : c10::irange(expand_shape.value()[0])) {
final_shape.emplace_back(c10::ShapeSymbol::newSymbol());
}
UpdateShape(n->output(), c10::SymbolicShape(final_shape));
}
}
}
}

View File

@ -9,12 +9,9 @@
#include <torch/csrc/jit/passes/onnx/helper.h>
#include <torch/csrc/jit/passes/subgraph_rewrite.h>
#ifndef AT_PER_OPERATOR_HEADERS
// TODO: Switch to per operator headers after
// https://github.com/pytorch/pytorch/pull/68693 is merged
#include <ATen/Functions.h>
#else
#include <ATen/ops/quantize_per_tensor.h>
#include <ATen/ops/zeros.h>
#endif
#include <stack>
@ -104,7 +101,7 @@ double getScaleFromInput(Node* input_node) {
input_name);
}
Node* CreateQuantizedWeights(
Node* CreateQuantizedWeightsCaffe2(
std::string data,
std::shared_ptr<Graph>& graph,
std::vector<int64_t> shapes,
@ -118,7 +115,7 @@ Node* CreateQuantizedWeights(
return const_node;
}
Node* CreateQuantizedBias(
Node* CreateQuantizedBiasCaffe2(
std::vector<int64_t> data,
std::shared_ptr<Graph>& graph,
std::vector<int64_t> shapes,
@ -132,6 +129,62 @@ Node* CreateQuantizedBias(
return const_node;
}
std::vector<Node*> CreateQuantizedWeights(
std::vector<float> data,
std::shared_ptr<Graph>& graph,
std::vector<int64_t> shapes,
float scale,
int64_t zero_point) {
Node* const_node_1 = graph->create(prim::Constant);
auto const_value =
at::from_blob(data.data(), c10::IntArrayRef(shapes), at::kFloat)
.to(at::kCPU);
auto options = c10::TensorOptions().dtype(at::kFloat).device(at::kCPU);
at::Tensor const_value_copy = at::empty(c10::IntArrayRef(shapes), options);
const_value_copy.copy_(const_value);
const_node_1->t_(Symbol::attr("value"), const_value_copy);
Node* const_node_2 = graph->create(prim::Constant);
std::vector<float> scale_v{scale};
std::vector<int64_t> scale_shapes{1};
auto const_shape =
at::from_blob(scale_v.data(), c10::IntArrayRef(scale_shapes), at::kFloat)
.to(at::kCPU);
at::Tensor const_shape_copy =
at::empty(c10::IntArrayRef(scale_shapes), options);
const_shape_copy.copy_(const_shape);
const_node_2->t_(Symbol::attr("value"), const_shape_copy);
Node* const_node_3 = graph->create(prim::Constant);
std::vector<int64_t> zero_point_v{zero_point};
std::vector<int64_t> zero_shapes{1};
auto const_zero =
at::from_blob(
zero_point_v.data(), c10::IntArrayRef(zero_shapes), at::kInt)
.to(at::kCPU);
at::Tensor const_zero_copy =
at::empty(c10::IntArrayRef(zero_shapes), options);
const_zero_copy.copy_(const_zero);
const_node_3->t_(Symbol::attr("value"), const_zero_copy);
return {const_node_1, const_node_2, const_node_3};
}
Node* CreateQuantizedBias(
std::vector<float> data,
std::shared_ptr<Graph>& graph,
std::vector<int64_t> shapes) {
Node* const_node_1 = graph->create(prim::Constant);
auto const_bias =
at::from_blob(data.data(), c10::IntArrayRef(shapes), at::kFloat)
.to(at::kCPU);
auto options = c10::TensorOptions().dtype(at::kFloat).device(at::kCPU);
at::Tensor const_bias_copy = at::empty(c10::IntArrayRef(shapes), options);
const_bias_copy.copy_(const_bias);
const_node_1->t_(Symbol::attr("value"), const_bias_copy);
return const_node_1;
}
Node* createIntTuple(
const std::vector<int64_t>& is,
std::shared_ptr<Graph>& graph) {
@ -158,7 +211,8 @@ void unpackQuantizedWeightsHelper(
std::map<std::string, IValue>& paramsDict,
const std::string& pattern,
const std::string& unpack_fn,
QuantizedParamsType params_type) {
QuantizedParamsType params_type,
bool caffe2 = true) {
Graph pattern_graph;
std::unordered_map<std::string, Value*> vmap;
parseIR(pattern, &pattern_graph, vmap);
@ -368,26 +422,47 @@ void unpackQuantizedWeightsHelper(
const int64_t weight_zp = unpacked_weight.q_zero_point() + 128;
const int64_t wt_numel = unpacked_weight.numel();
// Create caffe2::Int8GivenTensorFill node
std::ostringstream os;
for (const auto i : c10::irange(wt_numel)) {
os << static_cast<char>(inp_data[i] + 128);
if (caffe2) {
// Create caffe2::Int8GivenTensorFill node
std::ostringstream os;
for (const auto i : c10::irange(wt_numel)) {
os << static_cast<char>(inp_data[i] + 128);
}
Node* c2_weight = CreateQuantizedWeightsCaffe2(
os.str(), graph, wt_sizes, unpacked_weight.q_scale(), weight_zp);
graph->setInsertPoint(qlinear_node);
c2_weight->insertBefore(qlinear_node);
qlinear_node->insertInput(1, c2_weight->output());
} else {
std::vector<float> unpacked_weight_values;
unpacked_weight_values.reserve(unpacked_weight.numel());
auto unpacked_weight_data =
reinterpret_cast<int8_t*>(unpacked_weight.data_ptr<c10::qint8>());
for (const auto i : c10::irange(unpacked_weight.numel())) {
unpacked_weight_values.push_back(
static_cast<float>(unpacked_weight_data[i]));
}
std::vector<Node*> c2_weight = CreateQuantizedWeights(
unpacked_weight_values,
graph,
wt_sizes,
static_cast<float>(unpacked_weight.q_scale()),
weight_zp);
graph->setInsertPoint(qlinear_node);
c2_weight[0]->insertBefore(qlinear_node);
qlinear_node->insertInput(1, c2_weight[0]->output());
c2_weight[1]->insertBefore(qlinear_node);
qlinear_node->insertInput(2, c2_weight[1]->output());
c2_weight[2]->insertBefore(qlinear_node);
qlinear_node->insertInput(3, c2_weight[2]->output());
}
Node* c2_weight = CreateQuantizedWeights(
os.str(), graph, wt_sizes, unpacked_weight.q_scale(), weight_zp);
graph->setInsertPoint(qlinear_node);
c2_weight->insertBefore(qlinear_node);
qlinear_node->insertInput(1, c2_weight->output());
// Add bias
at::Tensor original_bias;
if (bias.has_value()) {
original_bias = bias.value();
original_bias.set_requires_grad(false);
} else {
// Caffe2 ops always expect bias tensor so if not present create empty
// tensor.
int64_t bias_size = unpacked_weight.size(0);
original_bias =
at::zeros(bias_size, unpacked_weight.options().dtype(at::kFloat));
@ -402,24 +477,41 @@ void unpackQuantizedWeightsHelper(
input_val->type()->str());
auto input_node = match_vmap.at(vmap.at("r"))->node()->inputs()[0]->node();
auto input_scale = getScaleFromInput(input_node);
auto q_bias = at::quantize_per_tensor(
original_bias, weight_scale * input_scale, 0, at::kQInt32);
at::Tensor q_bias;
std::vector<int64_t> bias_values;
bias_values.reserve(q_bias.numel());
auto bias_data = (int32_t*)q_bias.data_ptr<c10::qint32>();
for (const auto i : c10::irange(q_bias.numel())) {
bias_values.push_back(bias_data[i]);
if (caffe2) {
auto input_scale = getScaleFromInput(input_node);
q_bias = at::quantize_per_tensor(
original_bias, weight_scale * input_scale, 0, at::kQInt32);
std::vector<int64_t> bias_values;
bias_values.reserve(q_bias.numel());
auto bias_data = (int32_t*)q_bias.data_ptr<c10::qint32>();
for (const auto i : c10::irange(q_bias.numel())) {
bias_values.push_back(bias_data[i]);
}
Node* c2_bias = CreateQuantizedBiasCaffe2(
bias_values,
graph,
q_bias.sizes().vec(),
q_bias.q_scale(),
q_bias.q_zero_point());
c2_bias->insertBefore(qlinear_node);
qlinear_node->insertInput(2, c2_bias->output());
} else {
std::vector<float> bias_values(original_bias.numel());
auto bias_data = original_bias.data_ptr<float>();
for (const auto i : c10::irange(original_bias.numel())) {
bias_values[i] = bias_data[i];
}
Node* bias =
CreateQuantizedBias(bias_values, graph, original_bias.sizes().vec());
bias->insertBefore(qlinear_node);
// For quantized_linear inputs, the order is input, weight, bias, ....
// We unpack weight into 3 values. Then it is
// input, weight_value, weight_scale, weight_zero_point, bias, ...
// Therefore bias is at location 4.
qlinear_node->insertInput(4, bias->output());
}
Node* c2_bias = CreateQuantizedBias(
bias_values,
graph,
q_bias.sizes().vec(),
q_bias.q_scale(),
q_bias.q_zero_point());
c2_bias->insertBefore(qlinear_node);
qlinear_node->insertInput(2, c2_bias->output());
// add conv arguments: stride, padding, dilation, groups
if (stride.has_value() && padding.has_value() && dilation.has_value() &&
@ -444,9 +536,59 @@ void unpackQuantizedWeightsHelper(
eraseUnusedValuesFromMap(valsToParamsMap);
}
}
static std::
unordered_map<c10::ScalarType, c10::ScalarType, ScalarTypeHashFunction>
qTypeToValType = {
{c10::ScalarType::QInt8, c10::ScalarType::Char},
{c10::ScalarType::QUInt8, c10::ScalarType::Byte},
{c10::ScalarType::QInt32, c10::ScalarType::Int},
{c10::ScalarType::QUInt4x2, c10::ScalarType::Byte},
};
// Unpack quantized tensor inputs into {value, scale, zero_point},
// then create a prim::TupleConstruct node based on these three values.
void UnpackQuantizedTensorInputs(std::shared_ptr<Graph>& graph) {
for (size_t index = 0; index < graph->inputs().size();) {
auto g_input = graph->inputs()[index];
TensorTypePtr shape_type = g_input->type()->cast<TensorType>();
if (!shape_type || !shape_type->scalarType().has_value()) {
index++;
continue;
}
auto scalar_type = shape_type->scalarType().value();
if (qTypeToValType.find(scalar_type) == qTypeToValType.end()) {
index++;
continue;
}
std::string input_name = g_input->debugName();
auto input_value =
graph->insertInput(index, input_name + "_value")
->setType(shape_type->withScalarType(qTypeToValType[scalar_type]));
// scale and zero_point types can be found at torch/include/ATen/Operators.h
auto input_scale =
graph->insertInput(index + 1, input_name + "_scale")
->setType(TensorType::create(
at::kDouble, at::kCPU, 0, /*requires_grad=*/c10::nullopt));
auto input_zero_point =
graph->insertInput(index + 2, input_name + "_zero_point")
->setType(TensorType::create(
at::kLong, at::kCPU, 0, /*requires_grad=*/c10::nullopt));
std::vector<Value*> converted{input_value, input_scale, input_zero_point};
auto input_tuple =
graph->prependNode(graph->createTuple(converted))->output();
g_input->replaceAllUsesWith(input_tuple);
// Erase the original quantized tensor input.
graph->eraseInput(index + converted.size());
index += 3;
}
}
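For reference, the {value, scale, zero_point} triple unpacked above corresponds to the standard per-tensor quantization attributes on the Python side (a hedged illustration, not exporter code):
import torch

q = torch.quantize_per_tensor(torch.rand(2, 2), scale=0.1, zero_point=0, dtype=torch.quint8)
print(q.int_repr())      # the underlying integer values
print(q.q_scale())       # 0.1
print(q.q_zero_point())  # 0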
// https://github.com/pytorch/pytorch/wiki/PyTorch-ONNX-exporter#quantized-model-export
void UnpackQuantizedWeights(
std::shared_ptr<Graph>& graph,
std::map<std::string, IValue>& paramsDict) {
std::map<std::string, IValue>& paramsDict,
bool caffe2) {
std::string qlinear = R"(
graph(%input, %packed_weight, %w_scale, %w_zero_point):
%r = quantized::linear(%input, %packed_weight, %w_scale, %w_zero_point)
@ -472,31 +614,36 @@ void UnpackQuantizedWeights(
paramsDict,
qlinear,
"quantized::linear_unpack",
QuantizedParamsType::LINEAR);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv2d,
"quantized::conv2d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv2d_relu,
"quantized::conv2d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv3d,
"quantized::conv3d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv3d_relu,
"quantized::conv3d_unpack",
QuantizedParamsType::CONV);
QuantizedParamsType::LINEAR,
caffe2);
if (caffe2) {
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv2d,
"quantized::conv2d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv2d_relu,
"quantized::conv2d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv3d,
"quantized::conv3d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv3d_relu,
"quantized::conv3d_unpack",
QuantizedParamsType::CONV);
} else {
UnpackQuantizedTensorInputs(graph);
}
GRAPH_DUMP("After UnpackQuantizedWeights: ", graph);
}

View File

@ -2,6 +2,7 @@
#include <torch/csrc/jit/api/module.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/onnx/onnx.h>
#include <memory>
@ -10,7 +11,8 @@ namespace jit {
TORCH_API void UnpackQuantizedWeights(
std::shared_ptr<Graph>& graph,
std::map<std::string, IValue>& paramsDict);
std::map<std::string, IValue>& paramsDict,
bool caffe2);
TORCH_API void insertPermutes(
std::shared_ptr<Graph>& graph,
std::map<std::string, IValue>& paramsDict);

View File

@ -11,6 +11,7 @@
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
#include <torch/csrc/jit/passes/utils/subgraph_utils.h>
#include <torch/csrc/jit/runtime/custom_operator.h>
#include <torch/csrc/jit/runtime/graph_iterator.h>
#include <torch/csrc/jit/runtime/jit_trace.h>
#include <torch/csrc/jit/runtime/static/impl.h>
#include <torch/csrc/jit/runtime/static/ops.h>
@ -322,6 +323,17 @@ void createFusionGroups(Block* block, AliasDb* aliasDb, size_t min_size) {
inlineSmallFusionGroups(block, min_size);
}
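// prim::FallbackGraph nodes are (presumably) the unspecialized fallback
// subgraphs left behind when fusing with dynamic shapes; unmerging them
// inlines their bodies back into the owning graph before it is cloned into
// the original graph in performTensorExprFusion below.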
void inlineFallbackGraphs(std::shared_ptr<Graph> graph) {
DepthFirstGraphNodeIterator it(graph);
Node* n = nullptr;
while ((n = it.next()) != nullptr) {
if (n->kind() == prim::FallbackGraph) {
SubgraphUtils::unmergeSubgraph(n);
}
}
}
void performTensorExprFusion(
std::shared_ptr<Graph> graph,
std::vector<IValue> sample_inputs) {
@ -335,6 +347,7 @@ void performTensorExprFusion(
/*min_group_size*/ 2,
/*add_composed_op*/ false,
/*fuse_to_dynamic_shapes*/ true);
inlineFallbackGraphs(traced_graph);
graph->block()->clear();
graph->block()->cloneFrom(traced_graph->block(), nullptr);
GRAPH_DUMP("Graph after fusion: ", graph);

View File

@ -157,10 +157,10 @@ void OptimizeGraph(
// TODO: we can avoid this guard by moving operations
// to exposed folders.
#ifdef FBCODE_CAFFE2
if (opts.use_copy_variants) {
if (opts.use_copy_variants && !opts.enable_tensorexpr_fusion) {
ReplaceWithCopy(graph);
}
if (opts.use_maybe_copy_variants) {
if (opts.use_maybe_copy_variants && !opts.enable_tensorexpr_fusion) {
ReplaceWithMaybeCopy(graph);
}
FuseListUnpack(graph);

View File

@ -166,11 +166,18 @@ struct TORCH_API StaticModuleOptions {
bool manage_output_tensors{false};
// Gates the ReplaceWithCopy pass, which replaces ops that
// sometimes alias their outputs with out variants that
// always copy (so the output may participate in memory planning)
// always copy (so the output may participate in memory planning).
// Since replacing with copies is done after TensorExpr fusion, the
// resulting graph does not conform to the assumptions made in the fuser.
// So, even if this flag is turned on, the ReplaceWithCopy pass will not
// be executed if TensorExpr fusion is enabled.
bool use_copy_variants{true};
// Gates the ReplaceWithMaybeCopy pass, which replaces ops that
// sometimes alias their outputs with subgraphs that include an out
// variant.
// For the same reason as `use_copy_variants`, the ReplaceWithMaybeCopy pass
// will not be executed if TensorExpr fusion is enabled, even if this flag
// is turned on.
bool use_maybe_copy_variants{true};
// enable TensorExpr fusion of ops at model loading time
bool enable_tensorexpr_fusion{false};
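
For illustration, a minimal configuration sketch using the fields documented above; the torch::jit namespace and construction style are assumptions:

// Sketch only: enabling TensorExpr fusion for a static module.
torch::jit::StaticModuleOptions opts;
opts.enable_tensorexpr_fusion = true;  // fuse ops at model loading time
// With fusion on, these two flags are effectively ignored: the
// ReplaceWithCopy / ReplaceWithMaybeCopy passes are skipped so the fused
// graph keeps the aliasing assumptions the fuser relies on.
opts.use_copy_variants = true;
opts.use_maybe_copy_variants = true;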

View File

@ -21,6 +21,7 @@
#include <atomic>
#include <onnx/checker.h>
#include <onnx/shape_inference/implementation.h>
#include <onnx/onnx_pb.h>
#include <onnx/proto_utils.h>
@ -1248,13 +1249,18 @@ std::string serialize_model_proto_to_string(
return model_proto->SerializeAsString();
}
void check_onnx_proto(const std::string& proto_string) {
void check_onnx_proto(const std::string& proto_string, bool full_check) {
onnx::ModelProto model;
if (!ParseProtoFromBytes(&model, proto_string.c_str(), proto_string.size())) {
throw std::runtime_error("Invalid ONNX proto string.");
}
onnx::checker::check_model(model);
if (full_check) {
onnx::shape_inference::InferShapes(model);
}
}
} // namespace jit
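
A usage sketch for the extended check; the wrapper function is an assumption, and proto_string is taken to be the output of serialize_model_proto_to_string above:

// Sketch only: structural validation vs. full validation with shape inference.
void validateExportedModel(const std::string& proto_string) {
  torch::jit::check_onnx_proto(proto_string);                       // checker only
  torch::jit::check_onnx_proto(proto_string, /*full_check=*/true);  // also runs InferShapes
}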

View File

@ -61,7 +61,7 @@ export_onnx(
TORCH_API std::string serialize_model_proto_to_string(
const std::shared_ptr<::ONNX_NAMESPACE::ModelProto>& model_proto);
TORCH_API void check_onnx_proto(const std::string& proto_string);
TORCH_API void check_onnx_proto(const std::string& proto_string, bool full_check=false);
// Serializer for both old-style and unified format TorchScript serialization
class TORCH_API ScriptModuleSerializer {
@ -85,9 +85,6 @@ class TORCH_API ScriptModuleSerializer {
void convertNamedType(const c10::NamedTypePtr& class_type);
void convertTypes(const at::NamedTypePtr& root_type);
void writeExtraFiles(const Module& module, const ExtraFilesMap& extra_files);
void writeMobileMetadata(
const Module& module,
const ExtraFilesMap& extra_files);
void writeByteCode(const Module& module, bool save_mobile_debug_info);
void writeArchive(
const IValue& value,

View File

@ -946,6 +946,10 @@ std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::
VLOG(3) << "Executing IR graph hash " << HashToString(hash)
<< " on device " << async->device << " done!";
TORCH_CHECK(async->tensors_data.size() == results.size(),
"Expected number of outputs does not match TorchScript Stack size: ",
async->tensors_data.size(), " != ", results.size());
for (const auto i : c10::irange(results.size())) {
if (async->tensors_data[i] != nullptr) {
async->tensors_data[i]->Assign(*results[i]);

View File

@ -3,6 +3,7 @@
#include <c10/core/ScalarType.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>
#include <torch/csrc/lazy/core/tensor_util.h>
namespace torch {
@ -144,7 +145,7 @@ void LTCTensorImpl::setup_size_properties() {
// We can't call empty_tensor_restride(c10::MemoryFormat::Contiguous) given we override sizes() too.
std::vector<int64_t> updated_strides;
updated_strides = ComputeArrayStrides(shape.Get().sizes());
for (int i = 0; i < updated_strides.size(); i++) {
for (const auto i : c10::irange(updated_strides.size())) {
sizes_and_strides_.stride_at_unchecked(i) = updated_strides[i];
}
generation_ = generation;

View File

@ -1,3 +1,4 @@
#include <c10/util/irange.h>
#include <torch/csrc/lazy/core/view_ops/squeeze.h>
#include <torch/csrc/lazy/ts_backend/ts_lowering_context.h>
@ -9,7 +10,7 @@ namespace lazy {
std::vector<int64_t> BuildSqueezedDimensions(c10::ArrayRef<int64_t> dimensions,
int64_t squeeze_dim) {
std::vector<int64_t> output_dimensions;
for (int64_t i = 0; i < dimensions.size(); ++i) {
for (const auto i : c10::irange(dimensions.size())) {
int64_t dim = dimensions[i];
if (dim != 1 || (i != squeeze_dim && squeeze_dim >= 0)) {
output_dimensions.push_back(dim);
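
A quick worked example of the filter above (illustrative expectations only, derived by hand from the condition in the loop):

// dimensions = {2, 1, 3}, squeeze_dim = 1   ->  {2, 3}   (the size-1 dim at the squeeze index is dropped)
// dimensions = {1, 4},    squeeze_dim = 1   ->  {1, 4}   (index 1 is not size 1; the size-1 dim at index 0 is kept)
// dimensions = {2, 1, 1}, squeeze_dim = -1  ->  {2}      (a negative squeeze_dim drops every size-1 dim)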

Some files were not shown because too many files have changed in this diff.