Update on "fix torch::deploy OSS CI: previous the tests for torch::deploy will be skipped"

We match the build environment against linux-xenial-cuda11.3 in .jenkins/pytorch/build.sh but against linux-xenial-cuda11.1 in .jenkins/pytorch/test.sh.

I suspect this mismatch may be causing the torch::deploy tests not to run. Sending this PR out so I can verify in CI.
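Concretely, here is a minimal Python sketch of the failure mode (the real .jenkins scripts are bash; the helper name and environment strings below are hypothetical, for illustration only): both scripts gate work on a glob match against BUILD_ENVIRONMENT, so if build.sh and test.sh key on different CUDA version strings, the torch::deploy suite is silently skipped rather than failing loudly.

from fnmatch import fnmatch

def should_run_torch_deploy(build_environment: str, pattern: str) -> bool:
    # Mirrors a bash gate such as: [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-cuda11.3* ]]
    # (hypothetical helper, not the actual CI code)
    return fnmatch(build_environment, "*" + pattern + "*")

build_env = "linux-xenial-cuda11.3-py3.6-gcc7"  # assumed name the build side is keyed on
should_run_torch_deploy(build_env, "linux-xenial-cuda11.3")  # True  -> deploy tests run
should_run_torch_deploy(build_env, "linux-xenial-cuda11.1")  # False -> deploy tests silently skipped

The .jenkins/pytorch/test.sh hunk later in this diff also loosens the gate from *linux-xenial-cuda11.3*-test1* to *linux-xenial-cuda11.3*, so the torch::deploy tests run on the first shard.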

Differential Revision: [D31939631](https://our.internmc.facebook.com/intern/diff/D31939631)

[ghstack-poisoned]
Shunting Zhang 2021-10-29 15:04:12 -07:00
commit f3e739463d
431 changed files with 11706 additions and 3391 deletions

View File

@ -4,11 +4,8 @@ from cimodel.lib.miniutils import quote
from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN
# TODO: make this generated from a matrix rather than just a static list
# NOTE: All hardcoded docker image builds have been migrated to GHA
IMAGE_NAMES = [
"pytorch-linux-bionic-rocm4.1-py3.6",
"pytorch-linux-bionic-rocm4.2-py3.6",
"pytorch-linux-bionic-rocm4.3.1-py3.6",
]
# This entry should be an element from the list above

.circleci/config.yml (generated)
View File

@ -6506,15 +6506,6 @@ workflows:
when: << pipeline.parameters.run_binary_tests >>
build:
jobs:
- docker_build_job:
name: "docker-pytorch-linux-bionic-rocm4.1-py3.6"
image_name: "pytorch-linux-bionic-rocm4.1-py3.6"
- docker_build_job:
name: "docker-pytorch-linux-bionic-rocm4.2-py3.6"
image_name: "pytorch-linux-bionic-rocm4.2-py3.6"
- docker_build_job:
name: "docker-pytorch-linux-bionic-rocm4.3.1-py3.6"
image_name: "pytorch-linux-bionic-rocm4.3.1-py3.6"
- pytorch_linux_build:
name: pytorch_linux_xenial_py3_6_gcc5_4_build
requires:

View File

@ -140,8 +140,6 @@ def generate_required_docker_images(items):
def gen_build_workflows_tree():
build_workflows_functions = [
# For rocm images, which don't have a circleci job equivalent
cimodel.data.simple.docker_definitions.get_workflow_jobs,
pytorch_build_definitions.get_workflow_jobs,
cimodel.data.simple.macos_definitions.get_workflow_jobs,
cimodel.data.simple.android_definitions.get_workflow_jobs,

View File

@ -9,7 +9,6 @@
"linux-bionic-cuda10.2-py3.9-gcc7",
"linux-bionic-py3.6-clang9",
"linux-vulkan-bionic-py3.6-clang9",
"linux-xenial-cuda10.2-py3.6-gcc7",
"linux-xenial-cuda11.3-py3.6-gcc7",
"linux-xenial-py3-clang5-mobile-build",
"linux-xenial-py3-clang5-mobile-code-analysis",
@ -47,7 +46,6 @@
"libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
"libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
"linux-bionic-cuda10.2-py3.9-gcc7",
"linux-xenial-cuda10.2-py3.6-gcc7",
"linux-xenial-cuda11.3-py3.6-gcc7",
"periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
"periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck",
@ -82,7 +80,6 @@
"linux-bionic-cuda10.2-py3.9-gcc7",
"linux-bionic-py3.6-clang9",
"linux-vulkan-bionic-py3.6-clang9",
"linux-xenial-cuda10.2-py3.6-gcc7",
"linux-xenial-cuda11.3-py3.6-gcc7",
"linux-xenial-py3-clang5-mobile-build",
"linux-xenial-py3-clang5-mobile-code-analysis",
@ -121,7 +118,6 @@
],
"ciflow/slow": [
"linux-bionic-cuda10.2-py3.9-gcc7",
"linux-xenial-cuda10.2-py3.6-gcc7",
"periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck"
],
"ciflow/slow-gradcheck": [

View File

@ -15,6 +15,7 @@
# os: linux
# max_available: 20
# disk_size: 50
# is_ephemeral: true
runner_types:
linux.2xlarge:
@ -27,16 +28,19 @@ runner_types:
os: linux
max_available: 125
disk_size: 150
is_ephemeral: false
linux.4xlarge.nvidia.gpu:
instance_type: g3.4xlarge
os: linux
max_available: 125
disk_size: 150
is_ephemeral: false
linux.16xlarge.nvidia.gpu:
instance_type: g3.16xlarge
os: linux
max_available: 10
disk_size: 150
is_ephemeral: false
windows.4xlarge:
instance_type: c5d.4xlarge
os: windows

View File

@ -17,6 +17,7 @@ DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
GITHUB_DIR = Path(__file__).resolve().parent.parent
WINDOWS_CPU_TEST_RUNNER = "windows.4xlarge"
# contains 1 gpu
WINDOWS_CUDA_TEST_RUNNER = "windows.8xlarge.nvidia.gpu"
WINDOWS_RUNNERS = {
WINDOWS_CPU_TEST_RUNNER,
@ -24,7 +25,8 @@ WINDOWS_RUNNERS = {
}
LINUX_CPU_TEST_RUNNER = "linux.2xlarge"
LINUX_CUDA_TEST_RUNNER = "linux.8xlarge.nvidia.gpu"
# contains 1 gpu
LINUX_CUDA_TEST_RUNNER = "linux.4xlarge.nvidia.gpu"
LINUX_RUNNERS = {
LINUX_CPU_TEST_RUNNER,
LINUX_CUDA_TEST_RUNNER,
@ -411,17 +413,6 @@ LINUX_WORKFLOWS = [
build_environment="linux-bionic-cuda10.2-py3.9-gcc7",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
test_runner_type=LINUX_CUDA_TEST_RUNNER,
num_test_shards=2,
ciflow_config=CIFlowConfig(
run_on_canary=True,
labels={LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA}
),
),
CIWorkflow(
arch="linux",
build_environment="linux-xenial-cuda10.2-py3.6-gcc7",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
test_runner_type=LINUX_CUDA_TEST_RUNNER,
enable_jit_legacy_test=1,
enable_multigpu_test=1,
enable_nogpu_no_avx_test=1,
@ -429,7 +420,8 @@ LINUX_WORKFLOWS = [
enable_slow_test=1,
num_test_shards=2,
ciflow_config=CIFlowConfig(
labels=set([LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]),
run_on_canary=True,
labels={LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA}
),
),
CIWorkflow(
@ -539,14 +531,23 @@ BAZEL_WORKFLOWS = [
),
]
DOCKER_IMAGES = {
f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9", # for pytorch/xla
f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.1-py3.6", # for rocm
f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.2-py3.6", # for rocm
f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.6", # for rocm
}
DOCKER_IMAGES.update({
workflow.docker_image_base
for workflow in [*LINUX_WORKFLOWS, *BAZEL_WORKFLOWS]
if workflow.docker_image_base
})
DOCKER_WORKFLOWS = [
DockerWorkflow(
build_environment="docker-builds",
docker_images=sorted({
workflow.docker_image_base
for workflow in [*LINUX_WORKFLOWS, *BAZEL_WORKFLOWS]
if workflow.docker_image_base
}),
docker_images=sorted(DOCKER_IMAGES),
# Run weekly to ensure they can build
is_scheduled="1 * */7 * *",
),

View File

@ -16,7 +16,7 @@ from typing_extensions import TypedDict
BUILD_ENVIRONMENT = os.getenv('BUILD_ENVIRONMENT')
assert BUILD_ENVIRONMENT is not None
class Config(TypedDict):
num_shards: int
@ -70,6 +70,7 @@ def main() -> None:
if not run_as_if_on_trunk() and NUM_TEST_SHARDS_ON_PULL_REQUEST:
NUM_TEST_SHARDS = int(NUM_TEST_SHARDS_ON_PULL_REQUEST)
MULTIGPU_RUNNER_TYPE = os.getenv('MULTIGPU_RUNNER_TYPE')
DISTRIBUTED_GPU_RUNNER_TYPE = os.getenv('DISTRIBUTED_GPU_RUNNER_TYPE', TEST_RUNNER_TYPE)
NOGPU_RUNNER_TYPE = os.getenv('NOGPU_RUNNER_TYPE')
configs: Dict[str, Config] = {}
if os.getenv('ENABLE_JIT_LEGACY_TEST'):
@ -84,7 +85,10 @@ def main() -> None:
if os.getenv('ENABLE_FORCE_ON_CPU_TEST'):
configs['force_on_cpu'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE}
if os.getenv('ENABLE_DISTRIBUTED_TEST'):
configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
configs['distributed'] = {
'num_shards': 1,
'runner': DISTRIBUTED_GPU_RUNNER_TYPE if "cuda" in str(BUILD_ENVIRONMENT) else TEST_RUNNER_TYPE
}
if os.getenv('ENABLE_SLOW_TEST'):
configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
if os.getenv('ENABLE_DOCS_TEST'):

View File

@ -21,7 +21,7 @@ on:
steps:
!{{ common.setup_ec2_linux() }}
!{{ common.checkout_pytorch("recursive") }}
!{{ common.calculate_docker_image() }}
!{{ common.calculate_docker_image(false) }}
- name: Pull Docker image
run: |
!{{ common.pull_docker("${DOCKER_IMAGE}") }}

View File

@ -183,7 +183,7 @@ concurrency:
run: echo "${LABELS}"
{%- endmacro -%}
{%- macro calculate_docker_image() -%}
{%- macro calculate_docker_image(always_rebuild) -%}
- name: Calculate docker image tag
id: calculate-tag
run: |
@ -198,6 +198,7 @@ concurrency:
BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
run: |
set -x
{%- if not always_rebuild %}
# Check if image already exists, if it does then skip building it
if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
exit 0
@ -222,6 +223,7 @@ concurrency:
echo " contact the PyTorch team to restore the original images"
exit 1
fi
{%- endif %}
echo ::set-output name=rebuild::yes
- name: Build and push docker image
if: ${{ steps.check.outputs.rebuild }}

View File

@ -7,6 +7,7 @@ name: !{{ build_environment }}
{%- endblock %}
on:
workflow_dispatch:
pull_request:
types: [opened, synchronize, reopened]
paths:
@ -35,7 +36,7 @@ jobs:
steps:
!{{ common.setup_ec2_linux() }}
!{{ common.checkout_pytorch("recursive") }}
!{{ common.calculate_docker_image() }}
!{{ common.calculate_docker_image(true) }}
- name: Pull Docker image
run: |
!{{ common.pull_docker("${DOCKER_IMAGE}") }}

View File

@ -54,7 +54,7 @@ jobs:
steps:
!{{ common.setup_ec2_linux() }}
!{{ common.checkout_pytorch("recursive") }}
!{{ common.calculate_docker_image() }}
!{{ common.calculate_docker_image(false) }}
- name: Pull Docker image
run: |
!{{ common.pull_docker("${DOCKER_IMAGE}") }}
@ -152,6 +152,7 @@ jobs:
ENABLE_NOARCH_TEST: !{{ enable_noarch_test }}
NUM_TEST_SHARDS: !{{ num_test_shards }}
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -4,6 +4,7 @@
name: docker-builds
on:
workflow_dispatch:
pull_request:
types: [opened, synchronize, reopened]
paths:
@ -22,10 +23,18 @@ jobs:
strategy:
matrix:
include:
- docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9'
docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9'
- docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7'
docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7'
- docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9'
docker_image_short_name: 'pytorch-linux-bionic-py3.6-clang9'
- docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.1-py3.6'
docker_image_short_name: 'pytorch-linux-bionic-rocm4.1-py3.6'
- docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.2-py3.6'
docker_image_short_name: 'pytorch-linux-bionic-rocm4.2-py3.6'
- docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.6'
docker_image_short_name: 'pytorch-linux-bionic-rocm4.3.1-py3.6'
- docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7'
docker_image_short_name: 'pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7'
- docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7'
@ -110,30 +119,6 @@ jobs:
BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
run: |
set -x
# Check if image already exists, if it does then skip building it
if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
exit 0
fi
if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
# if we're on the base branch then use the parent commit
MERGE_BASE=$(git rev-parse HEAD~)
else
# otherwise we're on a PR, so use the most recent base commit
MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
fi
# Covers the case where a previous tag doesn't exist for the tree
# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
exit 1
fi
PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
# If no image exists but the hash is the same as the previous hash then we should error out here
if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
echo " contact the PyTorch team to restore the original images"
exit 1
fi
echo ::set-output name=rebuild::yes
- name: Build and push docker image
if: ${{ steps.check.outputs.rebuild }}

View File

@ -260,19 +260,20 @@ jobs:
runs-on: ubuntu-18.04
needs: [ciflow_should_run]
env:
TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
ENABLE_DISTRIBUTED_TEST: 1
ENABLE_JIT_LEGACY_TEST: ''
ENABLE_MULTIGPU_TEST: ''
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_JIT_LEGACY_TEST: 1
ENABLE_MULTIGPU_TEST: 1
ENABLE_NOGPU_NO_AVX_TEST: 1
ENABLE_NOGPU_NO_AVX2_TEST: 1
ENABLE_SLOW_TEST: 1
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
ENABLE_XLA_TEST: ''
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -273,6 +273,7 @@ jobs:
ENABLE_NOARCH_TEST: 1
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -273,6 +273,7 @@ jobs:
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 1
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -1,517 +0,0 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-xenial-cuda10.2-py3.6-gcc7
on:
pull_request:
types: [opened, synchronize, reopened, unassigned]
push:
branches:
- master
- release/*
workflow_dispatch:
env:
BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
IS_GHA: 1
# This is used for the phase of adding wheel tests only, will be removed once completed
IN_WHEEL_TEST: 1
# Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_DEFAULT_REGION: us-east-1
CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
concurrency:
group: linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
ciflow_should_run:
runs-on: ubuntu-18.04
env:
IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }}
LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow') }}
LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
if: ${{ (github.repository == 'pytorch/pytorch') && (
(github.event_name == 'push') ||
(github.event_name == 'schedule') ||
(contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) ||
(false))
}}
steps:
- name: noop
run: echo running ciflow_should_run
- name: print labels
run: echo "${LABELS}"
build:
runs-on: linux.2xlarge
needs: [ciflow_should_run]
env:
JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-build
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
- name: Log in to ECR
env:
AWS_RETRY_MODE: standard
AWS_MAX_ATTEMPTS: 5
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Chown workspace
env:
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${ALPINE_IMAGE}"
# Ensure the working directory gets chowned back to the current user
docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE:?}/*"
rm -f ~/.ssh/authorized_keys
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
# deep clone, to allow use of git merge-base
fetch-depth: 0
submodules: recursive
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"
echo "::set-output name=docker_tag::${DOCKER_TAG}"
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
- name: Check if image should be built
id: check
env:
BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
run: |
set -x
# Check if image already exists, if it does then skip building it
if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
exit 0
fi
if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
# if we're on the base branch then use the parent commit
MERGE_BASE=$(git rev-parse HEAD~)
else
# otherwise we're on a PR, so use the most recent base commit
MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
fi
# Covers the case where a previous tag doesn't exist for the tree
# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
exit 1
fi
PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
# If no image exists but the hash is the same as the previous hash then we should error out here
if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
echo " contact the PyTorch team to restore the original images"
exit 1
fi
echo ::set-output name=rebuild::yes
- name: Build and push docker image
if: ${{ steps.check.outputs.rebuild }}
env:
DOCKER_SKIP_S3_UPLOAD: 1
working-directory: .circleci/docker
run: |
export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
./build_docker.sh
- name: Pull Docker image
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${DOCKER_IMAGE}"
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Build
env:
CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
run: |
# detached container should get cleaned up by teardown_ec2_linux
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e JOB_BASE_NAME \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e IS_GHA \
-e CIRCLE_PR_NUMBER \
-e CIRCLE_SHA1 \
-e CIRCLE_BRANCH \
-e GITHUB_RUN_ID \
-e SCCACHE_BUCKET \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
-e PR_LABELS \
-e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--detach \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
- name: Display and upload binary build size statistics (Click Me)
# temporary hack: set CIRCLE_* vars, until we update
# tools/stats/print_test_stats.py to natively support GitHub Actions
env:
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
run: |
COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
export COMMIT_TIME
pip3 install requests==2.26 boto3==1.16.34
python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Archive artifacts into zip
run: |
zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
- uses: seemethere/upload-artifact-s3@v3
name: Store PyTorch Build Artifacts on S3
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 14
if-no-files-found: error
path:
artifacts.zip
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
run: .github/scripts/wait_for_ssh_to_drain.sh
- name: Chown workspace
if: always()
env:
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Kill containers, clean up images
if: always()
run: |
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
# Prune all of the docker images
docker system prune -af
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
run: .github/scripts/wait_for_ssh_to_drain.sh
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
generate-test-matrix:
runs-on: ubuntu-18.04
needs: [ciflow_should_run]
env:
TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
ENABLE_DISTRIBUTED_TEST: 1
ENABLE_JIT_LEGACY_TEST: 1
ENABLE_MULTIGPU_TEST: 1
ENABLE_NOGPU_NO_AVX_TEST: 1
ENABLE_NOGPU_NO_AVX2_TEST: 1
ENABLE_SLOW_TEST: 1
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
ENABLE_XLA_TEST: ''
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
container:
image: python:3.9
steps:
- name: Install dependencies
run: pip install typing-extensions==3.10
- name: Clone pytorch/pytorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
- name: Generating test matrix
id: set-matrix
run: .github/scripts/generate_pytorch_test_matrix.py
test:
needs: [build, generate-test-matrix, ciflow_should_run]
strategy:
matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
env:
DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-test
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
- name: Log in to ECR
env:
AWS_RETRY_MODE: standard
AWS_MAX_ATTEMPTS: 5
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Chown workspace
env:
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${ALPINE_IMAGE}"
# Ensure the working directory gets chowned back to the current user
docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE:?}/*"
rm -f ~/.ssh/authorized_keys
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
# deep clone, to allow use of git merge-base
fetch-depth: 0
submodules: recursive
- name: Pull Docker image
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${DOCKER_IMAGE}"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
run: |
bash .github/scripts/install_nvidia_utils_linux.sh
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
unzip -o artifacts.zip
- name: Output disk space left
run: |
sudo df -H
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Test
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
set -x
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
TEST_COMMAND=.jenkins/caffe2/test.sh
else
TEST_COMMAND=.jenkins/pytorch/test.sh
fi
# detached container should get cleaned up by teardown_ec2_linux
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086
container_name=$(docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e PR_NUMBER \
-e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-e GITHUB_ACTIONS \
-e IN_CI \
-e IS_GHA \
-e CIRCLE_BRANCH \
-e CIRCLE_SHA1 \
-e CIRCLE_PR_NUMBER \
-e AWS_DEFAULT_REGION \
-e IN_WHEEL_TEST \
-e SHARD_NUMBER \
-e JOB_BASE_NAME \
-e TEST_CONFIG \
-e NUM_TEST_SHARDS \
-e PYTORCH_IGNORE_DISABLED_ISSUES \
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--detach \
--name="${container_name}" \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
- name: Chown workspace
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Install render_test_results dependencies
if: always()
shell: bash
run: |
python3 -m pip install junitparser==2.1.1 rich==10.9.0
- name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
if: always()
shell: bash
# Encoding is weird on windows, just try to default to utf-8 if possible
env:
PYTHONIOENCODING: "utf-8"
run: |
python3 tools/render_junit.py test/
- name: Zip test reports for upload
if: always()
env:
FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
run: |
# Remove any previous test reports if they exist
rm -f test-reports-*.zip
zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
- uses: seemethere/upload-artifact-s3@v3
name: Store Test Reports on S3
if: always()
with:
retention-days: 14
if-no-files-found: error
path:
test-reports-*.zip
- name: Display and upload test statistics (Click Me)
if: always()
# temporary hack: set CIRCLE_* vars, until we update
# tools/stats/print_test_stats.py to natively support GitHub Actions
env:
AWS_DEFAULT_REGION: us-east-1
CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-test
CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
shell: bash
run: |
python3 -m pip install -r requirements.txt
python3 -m pip install boto3==1.16.34
python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
run: .github/scripts/wait_for_ssh_to_drain.sh
- name: Chown workspace
if: always()
env:
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Kill containers, clean up images
if: always()
run: |
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
# Prune all of the docker images
docker system prune -af

View File

@ -260,7 +260,7 @@ jobs:
runs-on: ubuntu-18.04
needs: [ciflow_should_run]
env:
TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
ENABLE_DISTRIBUTED_TEST: 1
ENABLE_JIT_LEGACY_TEST: ''
ENABLE_MULTIGPU_TEST: ''
@ -273,6 +273,7 @@ jobs:
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -273,6 +273,7 @@ jobs:
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -273,6 +273,7 @@ jobs:
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -273,6 +273,7 @@ jobs:
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -273,6 +273,7 @@ jobs:
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -273,6 +273,7 @@ jobs:
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 1
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -258,7 +258,7 @@ jobs:
runs-on: ubuntu-18.04
needs: [ciflow_should_run]
env:
TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
ENABLE_DISTRIBUTED_TEST: ''
ENABLE_JIT_LEGACY_TEST: ''
ENABLE_MULTIGPU_TEST: ''
@ -271,6 +271,7 @@ jobs:
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -258,7 +258,7 @@ jobs:
runs-on: ubuntu-18.04
needs: [ciflow_should_run]
env:
TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
ENABLE_DISTRIBUTED_TEST: 1
ENABLE_JIT_LEGACY_TEST: ''
ENABLE_MULTIGPU_TEST: ''
@ -271,6 +271,7 @@ jobs:
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:

View File

@ -105,7 +105,7 @@ jobs:
- name: Ensure all test files have header containing ownership information
if: always()
run: |
(! git grep -L "# Owner(s): \[" test/distributed/**/test_*.py) || (printf "The above test files are missing a comment header with ownership information; please add the following line\n\n# Owner(s): [\"<owner: label>\"]\n\nto the top of each test file. The owner should be an existing pytorch/pytorch label."; false)
(! git grep -L "# Owner(s): \[" ./test/distributed/**/test_*.py || (printf "The above test files are missing a comment header with ownership information; please add the following line\n\n# Owner(s): [\"<owner: label>\"]\n\nto the top of each test file. The owner should be an existing pytorch/pytorch label."; false))
clang-format:
runs-on: ubuntu-18.04

View File

@ -181,9 +181,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# JIT C++ extensions require ninja, so put it into PATH.
export PATH="/var/lib/jenkins/.local/bin:$PATH"
if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then
pip install -q --user flatbuffers==2.0
wget https://ortpypackage.blob.core.windows.net/ort-nightly/ort_nightly-1.8.0.dev202107131-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
pip install -q --user ort_nightly-1.8.0.dev202107131-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
pip install -q --user flatbuffers==2.0 onnxruntime==1.9.0
fi
"$ROOT_DIR/scripts/onnx/test.sh"
fi

View File

@ -523,7 +523,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
# TODO: run some C++ tests
echo "no-op at the moment"
elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || ("${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1) ]]; then
if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-cuda11.3*-test1* ]]; then
if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-cuda11.3* ]]; then
echo "Testing torch::deploy"
test_torch_deploy
fi
test_without_numpy

View File

@ -127,6 +127,11 @@ genrule(
"aten/src/ATen/Declarations.yaml",
"aten/src/ATen/RegisterBackendSelect.cpp",
"aten/src/ATen/RegisterCPU.cpp",
"aten/src/ATen/RegisterFunctionalization_0.cpp",
"aten/src/ATen/RegisterFunctionalization_1.cpp",
"aten/src/ATen/RegisterFunctionalization_2.cpp",
"aten/src/ATen/RegisterFunctionalization_3.cpp",
# "aten/src/ATen/RegisterFunctionalizationEverything.cpp",
"aten/src/ATen/RegisterMkldnnCPU.cpp",
"aten/src/ATen/RegisterQuantizedCPU.cpp",
"aten/src/ATen/RegisterSparseCPU.cpp",
@ -143,6 +148,7 @@ genrule(
"aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h",
"aten/src/ATen/CompositeImplicitAutogradFunctions.h",
"aten/src/ATen/CompositeImplicitAutogradFunctions_inl.h",
"aten/src/ATen/FunctionalInverses.h",
"aten/src/ATen/Functions.h",
"aten/src/ATen/Functions.cpp",
"aten/src/ATen/RedispatchFunctions.h",

View File

@ -11,6 +11,7 @@
/torch/nn/ @albanD @jbschlosser
/torch/optim/ @albanD
/test/test_public_bindings.py @albanD
/docs/source/conf.py @albanD
/aten/src/ATen/native/native_functions.yaml @ezyang
# Tensorpipe RPC Agent.

View File

@ -1026,6 +1026,8 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
m.impl("size.int", static_cast<int64_t (*)(const Tensor&, int64_t)>(native::size));
m.impl("_add_batch_dim", native::_add_batch_dim);
m.impl("_remove_batch_dim", native::_remove_batch_dim);
m.impl("_make_dual", native::_make_dual);
m.impl("is_same_size", native::is_same_size);
m.impl("sum.dim_IntList", sum_batching_rule);
m.impl("is_complex", native::is_complex);

View File

@ -0,0 +1,234 @@
#include <ATen/FunctionalInverses.h>
#include <ATen/ATen.h>
#include <ATen/ExpandUtils.h>
namespace at {
namespace functionalization {
// This logic is similar to autograd code for view backwards calls.
// We can't easily share it though, because (eventually) these functions
// will all call `permute/unsqueeze_copy()` instead of `permute/unsqueeze`.
Tensor permute_inverse(const Tensor& self, IntArrayRef dims) {
// invert the permutation
auto ndims = dims.size();
std::vector<int64_t> dims_(ndims);
for(const auto i : c10::irange(ndims)) {
dims_[at::maybe_wrap_dim(dims[i], ndims)] = i;
}
return self.permute(dims_);
}
Tensor unsqueeze_to(const Tensor & self, IntArrayRef sizes) {
auto result = self;
int64_t nDims = sizes.size();
for(const auto dim : c10::irange(nDims)) {
if (sizes[dim] == 1) {
result = result.unsqueeze(dim);
}
}
return result;
}
Tensor unsqueeze_to(const Tensor & self, int64_t dim, IntArrayRef sizes) {
dim = at::maybe_wrap_dim(dim, sizes.size());
// in NumPy it's not an error to unsqueeze a scalar, but we still need to avoid
// unsqueezing in the backward.
if (sizes.size() > 0 && sizes[dim] == 1) {
return self.unsqueeze(dim);
}
return self;
}
// Note [Functionalization Pass: View Inverses].
// This file contains the implementation of each "view inverse".
// These aren't really true inverses in the mathematical sense: each view inverse describes how to undo
// the original view (although it takes in different arguments).
//
// E.g. Below is an example of a program that has alias operations removed, and the role that view inverses play:
//
// normal program with views and mutations:
// view1 = input1.view_op(args...)
// view1.add_(1) (perform a mutation on the view, which should also modify input)
// version of the program with no aliasing, that instead uses view_inverse functions:
// view_copy1 = input1.view_copy_op(args...)
// view_copy1.add_(1) (perform a mutation on view_copy1. At this point, input1 is NOT modified)
// x = view_op_inverse(input1, view_copy1, args...)
//
// at this point, input1 and x should be equal
//
// Note that input1 is also passed as an argument to view_op_inverse in the above example.
// This isn't actually required for most view operators: it's only required for view ops
// where you can't figure out what the size of the base tensor is given just the view tensor and arguments.
// Examples are slice/select/scatter/squeeze/as_strided.
// We happen to be passing in the base tensor in all cases, mostly to make the codegen simpler.
// But you'll see below that the "base" argument is ignored by most view_inverse implementations.
// ----------------------------------------------------------
// Implementations of each view_inverse() function are below.
// One of these needs to be implemented for every existing non-composite view operator.
// The codegen automatically generates the corresponding function declaration.
// ----------------------------------------------------------
Tensor FunctionalInverses::_fw_primal_inverse(const at::Tensor& base, const at::Tensor& mutated_view, int64_t level) {
TORCH_INTERNAL_ASSERT(false, "Attempted to call _fw_primal() during the functionalization pass. For now, this is not supported.");
return Tensor();
}
Tensor FunctionalInverses::view_as_real_inverse(const Tensor& base, const Tensor& mutated_view) {
return at::view_as_complex(mutated_view);
}
Tensor FunctionalInverses::view_as_complex_inverse(const Tensor& base, const Tensor& mutated_view) {
return at::view_as_real(mutated_view.resolve_conj());
}
Tensor FunctionalInverses::_conj_inverse(const Tensor& base, const Tensor& mutated_view) {
return mutated_view.conj();
}
Tensor FunctionalInverses::_neg_view_inverse(const Tensor& base, const Tensor& mutated_view) {
return mutated_view.neg();
}
Tensor FunctionalInverses::as_strided_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset) {
TORCH_INTERNAL_ASSERT(false, "as_strided has not been implemented in the functionalization pass yet");
return Tensor();
}
Tensor FunctionalInverses::diagonal_inverse(const Tensor& base, const Tensor& mutated_view, int64_t offset, int64_t dim1, int64_t dim2) {
return base.diagonal_scatter(mutated_view, offset, dim1, dim2);
}
Tensor FunctionalInverses::expand_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, bool implicit) {
return at::sum_to(mutated_view, base.sizes());
}
Tensor FunctionalInverses::permute_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef dims) {
return at::functionalization::permute_inverse(mutated_view, dims);
}
Tensor FunctionalInverses::_reshape_alias_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride) {
// Note that I'm directly calling reshape(), and ignoring the strides.
// _reshape_alias() isn't available from user code, and is an implementation detail of reshape().
// Specifically, passing in the strides directly can get us into trouble in cases like:
// b = a[0]; c = b.reshape(...); c.add_(1); print(a)
// When we eventually run the _reshape_alias_inverse() call here, if we were to pass in both sizes and strides,
// The call would fail because `mutated_view` doesn't have enough bytes of storage.
return mutated_view.reshape(base.sizes());
}
Tensor FunctionalInverses::select_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim, int64_t index) {
return base.select_scatter(mutated_view, dim, index);
}
Tensor FunctionalInverses::detach_inverse(const Tensor& base, const Tensor& mutated_view) {
// the functionalization pass doesn't care about autograd metadata - as a view, I think detach() is just an identity function
return mutated_view;
}
Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) {
return base.slice_scatter(mutated_view, dim, start, end, step);
}
Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, int64_t split_size, int64_t dim) {
// It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can.
// For functionalization, we only have one of the tensors from the TensorList outputted by split(), and we want to layer it
// on top of the base tensor.
// For autograd, we have all of the tensors outputted by split() and we just want to stack them.
dim = at::maybe_wrap_dim(dim, base.sizes().size());
auto dim_size = base.size(dim);
auto start = mutated_view_idx * split_size;
auto end = start + split_size;
if (end > dim_size) end = dim_size;
return base.slice_scatter(mutated_view, dim, start, end, 1);
}
Tensor FunctionalInverses::split_with_sizes_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, at::IntArrayRef split_sizes, int64_t dim) {
dim = at::maybe_wrap_dim(dim, base.sizes().size());
auto dim_size = base.size(dim);
int64_t start = 0;
for (auto i = 0; i < mutated_view_idx; ++i) {
start += split_sizes[i];
}
auto end = start + split_sizes[mutated_view_idx];
if (end > dim_size) end = dim_size;
return base.slice_scatter(mutated_view, dim, start, end, 1);
}
Tensor FunctionalInverses::squeeze_inverse(const Tensor& base, const Tensor& mutated_view) {
return unsqueeze_to(mutated_view, base.sizes());
}
Tensor FunctionalInverses::squeeze_dim_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim) {
return unsqueeze_to(mutated_view, dim, base.sizes());
}
Tensor FunctionalInverses::t_inverse(const Tensor& base, const Tensor& mutated_view) {
return mutated_view.t();
}
Tensor FunctionalInverses::transpose_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim0, int64_t dim1) {
return mutated_view.transpose(dim0, dim1);
}
Tensor FunctionalInverses::unsqueeze_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim) {
return mutated_view.squeeze(dim);
}
Tensor FunctionalInverses::_indices_inverse(const Tensor& base, const Tensor& mutated_view) {
TORCH_INTERNAL_ASSERT(false, "Attempted to call _indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
return Tensor();
}
Tensor FunctionalInverses::_values_inverse(const Tensor& base, const Tensor& mutated_view) {
TORCH_INTERNAL_ASSERT(false, "Attempted to call _values() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
return Tensor();
}
Tensor FunctionalInverses::indices_inverse(const Tensor& base, const Tensor& mutated_view) {
TORCH_INTERNAL_ASSERT(false, "Attempted to call indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
return Tensor();
}
Tensor FunctionalInverses::values_inverse(const Tensor& base, const Tensor& mutated_view) {
TORCH_INTERNAL_ASSERT(false, "Attempted to call values() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
return Tensor();
}
Tensor FunctionalInverses::crow_indices_inverse(const at::Tensor& base, const at::Tensor& mutated_view) {
TORCH_INTERNAL_ASSERT(false, "Attempted to call crow_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
return Tensor();
}
Tensor FunctionalInverses::col_indices_inverse(const at::Tensor& base, const at::Tensor& mutated_view) {
TORCH_INTERNAL_ASSERT(false, "Attempted to call col_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
return Tensor();
}
Tensor FunctionalInverses::unbind_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, int64_t dim) {
dim = at::maybe_wrap_dim(dim, base.sizes().size());
return base.select_scatter(mutated_view, dim, mutated_view_idx);
}
Tensor FunctionalInverses::view_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size) {
return mutated_view.view(base.sizes());
}
Tensor FunctionalInverses::view_dtype_inverse(const Tensor& base, const Tensor& mutated_view, at::ScalarType dtype) {
return mutated_view.view(base.scalar_type());
}
Tensor FunctionalInverses::unfold_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dimension, int64_t size, int64_t step) {
// I think autograd and the functionalization pass want the exact same thing here, but need to test to confirm.
return unfold_backward(mutated_view, base.sizes(), dimension, size, step);
}
Tensor FunctionalInverses::alias_inverse(const Tensor& base, const Tensor& mutated_view) {
return mutated_view;
}
} // functionalization
} // at

View File

@ -0,0 +1,117 @@
#include <ATen/FunctionalStorageImpl.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <c10/util/Exception.h>
#include <vector>
namespace at {
namespace functionalization {
ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
if (out_idx == this->out_index) return *this;
return ViewMeta(forward_fn, reverse_fn, out_idx);
}
Alias::Alias(const at::Tensor& base) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base));
base_ = base;
}
const at::Tensor& Alias::base() const {
return base_;
}
void Alias::add_update(const at::Tensor& updated_val, const std::vector<ViewMeta>& metas) {
updates_.push_back({updated_val, metas});
generation_++;
}
// Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the Alias object.
// We start out with <original_base> and <mutated_view>, and our goal is to end up with <mutated_base>.
// Consider this program:
//
// base = ...
// a = base.view1()
// b = a.view2()
// c = b.view3()
// c.add_(3)
//
// Then the functionalization pass will queue an update as follows:
//
// update.new_val = c # the updated value of c
// update.view_metas = [view1_meta, view2_meta, view3_meta]
//
// Syncing any of a, b or c will eventually call apply_update() on the alias, and the following will run:
//
// tmp_values = [base, a, b] # NB: c is not necessary
// t = update.new_val
// t = view3_inverse(b, t, 0) # 0 is output index, these are all single output views so it's 0
// t = view2_inverse(a, t, 0)
// t = view1_inverse(base, t, 0) # t now represents the updated alias.
// alias.base_ = t
const Tensor apply_update(const Alias::Update& update, const Tensor& base) {
at::Tensor t = update.new_val;
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
std::vector<at::Tensor> tmp_values({base});
for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
// NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
// All of these ops require additional information to recover the sizes of the original tensor.
// If we need to, we could probably apply this optimization and only bother computing tmp_values
// for those necessary view ops.
tmp_values.push_back(std::move(next_view));
}
for(int i = update.view_metas.size()-1; i >= 0; --i) {
int64_t out_idx = update.view_metas[i].out_index;
// Each view inverse is implemented in ViewInverses.cpp.
t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
}
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
return t;
}
void Alias::apply_updates() {
// N.B.: none of the tensors used in this function should be FunctionalTensorWrappers at this point.
// The only reason we currently need the TLS exclude guard here is because of functorch's DynamicLayer stack.
// It adds the Functionalize key into TLS before redispatching to the functionalization kernels,
// which means that we need to explicitly exclude it here before doing any other work underneath the pass.
at::AutoDispatchSkipFunctionalize guard;
for (auto& update_data: updates_) {
base_ = apply_update(update_data, base_);
}
updates_.clear();
}
FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& value)
: c10::StorageImpl(
c10::StorageImpl::use_byte_size_t(),
value.numel() * value.dtype().itemsize(),
DataPtr{nullptr, value.device()},
// Using a null allocator, since FunctionalTensorImpl's aren't resizeable.
nullptr,
/*resizeable=*/false
),
alias_(Alias(value))
{}
void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& view_metas) {
alias_.add_update(updated_val, view_metas);
}
void FunctionalStorageImpl::apply_updates() {
alias_.apply_updates();
}
const Tensor& FunctionalStorageImpl::base() {
return alias_.base();
}
size_t FunctionalStorageImpl::generation() const {
return alias_.generation();
}
} // namespace functionalization
} // namespace at

View File

@ -0,0 +1,110 @@
#pragma once
#include <ATen/Tensor.h>
namespace at {
namespace functionalization {
// See Note [Functionalization Pass In Core]
// ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta on b that looks like:
//
// ViewMeta(
// [<captures>](const Tensor& base, int64_t mutated_view_idx) {
// return base.view1(...);
// },
// [<captures>](const at::Tensor& base, const at::Tensor& mutated_view, int64_t mutated_view_idx) -> at::Tensor {
// return at::functionalization::impl::view1_inverse(base, mutated_view, ...);
// }
//
// The forward_fn lambda describes how to replay view1 on a tensor.
//
// The reverse_fn lambda describes, given a tensor that is already a view, how to get the corresponding base tensor.
// See Note [Functionalization Pass: View Inverses] for details.
struct ViewMeta {
ViewMeta(
std::function<Tensor(const Tensor&, int64_t)> forward,
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
int64_t out_idx = 0) :
forward_fn(forward),
reverse_fn(reverse),
out_index(out_idx)
{}
std::function<Tensor(const Tensor&, int64_t)> forward_fn;
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
// See Note [out_idx in ViewMeta]
int64_t out_index;
// Returns a copy of the current ViewMeta, if out_idx matches the current out_index.
// Otherwise, returns a new ViewMeta with the same forward/reverse functions, but a new out index.
ViewMeta to_out_idx(int64_t out_idx);
};
// Alias represents the state shared by (potentially multiple) views of the same tensor.
// For example, in the following code:
//
// b = a.view1(...)
// c = b.view2(...)
// b.add_(1)
// --> alias.add_update(b, {view1_meta})
//
// The call to add_(1) will result in a call to alias.add_update(b, {view1_meta}), queueing up
// the mutation from b onto the alias.
// Later, suppose c is used in an expression (e.g. you try to print c, or pass it to an operator).
// Doing so will involve "syncing" c.
// First we apply any pending updates to the alias, and then we regenerate c
// by replaying its views off of the updated alias. E.g:
//
// print(str(c))
// --> c.sync_()
// --> alias.apply_updates() // after this, the alias will be updated to reflect the mutation to b
class Alias {
public:
struct Update {
const at::Tensor& new_val;
const std::vector<ViewMeta>& view_metas;
};
explicit Alias(const at::Tensor& base);
const at::Tensor& base() const;
size_t generation() const { return generation_; }
void add_update(const at::Tensor& updated_val, const std::vector<ViewMeta>& metas);
void apply_updates();
private:
// NB: base_ should always point to a tensor BELOW the current functionalization layer.
// This is mainly to avoid reference cycles.
// e.g. given `b = a.view(...)`
// Both a.storage_ and b.storage_ are a FunctionalStorageImpl containing an Alias, which contains a Tensor `base_`.
// In this case (where a and b are FunctionalTensorWrappers), base_ should point not to a, but to a's unwrapped value, `a.value_`.
// See Note [Functionalization: Alias Removal] for a diagram that shows this visually.
at::Tensor base_;
std::vector<Update> updates_;
// generation_ gets incremented every time a mutation is queued onto the alias.
// It is used to determine if a given tensor is "up to date", or if it needs to be regenerated from the alias.
size_t generation_ = 0;
};
// FunctionalStorageImpl is a subclass of StorageImpl used by the functionalization pass.
// It has no underlying data (similar to meta storage).
// It also knows how to reflect mutations to tensors in the absence of a valid data pointer.
// It does this by separately storing an Alias object, which knows how to reflect mutations
// that may have happened to views of the original tensor.
struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
explicit FunctionalStorageImpl(const Tensor& value);
void add_update(const Tensor& updated_val, const std::vector<ViewMeta>& view_metas);
void apply_updates();
const Tensor& base();
size_t generation() const;
~FunctionalStorageImpl() override = default;
private:
at::functionalization::Alias alias_;
};
} // namespace functionalization
} // namespace at

View File

@ -0,0 +1,377 @@
#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/FunctionalInverses.h>
#include <ATen/TensorUtils.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
namespace at {
void FunctionalTensorWrapper::set_constructor_metadata() {
TORCH_INTERNAL_ASSERT(value_.defined());
// Note: "level" is a concept that we don't know how to compute in core.
// For now I'm retroactively setting this in functorch,
// but once Open Multiple Dispatch lands we should be able to calculate this in core.
level_ = -1;
// shallow_copy_from overwrites the storage and dispatch keyset...
auto functional_storage = storage_;
shallow_copy_from(value_.getIntrusivePtr());
storage_ = functional_storage;
storage_access_should_throw_ = false;
key_set_ = c10::DispatchKeySet(c10::DispatchKey::Functionalize) | value_.key_set();
}
FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& value)
: c10::TensorImpl(
c10::Storage(c10::make_intrusive<functionalization::FunctionalStorageImpl>(value)),
c10::DispatchKeySet(DispatchKey::Functionalize) | value.key_set(),
value.dtype()
),
value_(value)
{
set_constructor_metadata();
}
// Note [Functionalization: Alias Removal]
// When someone calls a view() op during the functionalization pass, e.g. 'b = a.view(...)',
// we link `b` and `a` to a shared Alias object to preserve the aliasing relationship.
//
// How do we do that?
//
// Every FunctionalTensorWrapper contains a dummy FunctionalStorageImpl, which subclasses from c10::StorageImpl.
// It doesn't contain any data (similar to MetaTensor storage), but it contains an Alias object that knows about the base tensor.
// When a tensor is created through a view operation, both the new and old tensor point to the same FunctionalStorageImpl.
//
// As mutations are applied to any of the views, we also queue each mutation up on the Alias object, so we can replay them.
// When the user requests a tensor that's had a view taken, we check if it's up to date.
// If it's not up to date, we first replay all of the queued up mutations onto the alias, and then re-apply the current view
// on top of the newly updated alias.
//
// Why do we queue up and lazily run mutations on the alias, instead of updating the alias eagerly?
// This behavior was taken from pytorch/xla, which inspired the alias-removal logic.
// One benefit of the laziness is that we save work in the cases where a user has multiple views and mutates one of them,
// but never uses the other views later in the program (in which case we'll never update the alias).
// It also has downsides though: repeatedly applying mutations to the same view without syncing
// will silently use up more and more memory as more mutations are queued up.
//
// Corresponding diagram:
//
// b = a.view(...)
//
//        a                                                      b
//        |                                                      |    If the user asks for b and it's out of date,
//        \/                                                     \/   we regenerate b by replaying its views from the alias.
//   . - - - - - - - - - - - - - .                . - - - - - - - - - - - - - - - .
//   |  FunctionalTensorWrapper  |                |    FunctionalTensorWrapper    |
//   . - - - - - - - - - - - - - .                . - - - - - - - - - - - - - - - .
//   |    value    |   storage   |                |    storage    |     value     |
//   . - - - - - - - - - - - - - .                . - - - - - - - - - - - - - - - .
//          |              \                           /                  |
//          |               \                         /                   |
//          |           . - - - - - - - - - - - - .                       |
//          |           |  FunctionalStorageImpl  |                       |
//          |           . - - - - - - - - - - - - .                       |
//          |           |          Alias          |                       |
//          |           . - - - - - - - - - - - - .                       |
//          |               /  mutations to a or b                        |
//          |              /   are queued onto Alias                      |
//          |             /                                               |
//          \/           /                                                \/
//   . - - - - - - - - - - - - - .                . - - - - - - - - - - - - - - - .
//   |         TensorImpl        |                |           TensorImpl          |
//   . - - - - - - - - - - - - - .                . - - - - - - - - - - - - - - - .
//   |    value    |   storage   |                |    storage    |     value     |
//   . - - - - - - - - - - - - - .                . - - - - - - - - - - - - - - - .
//          |                                                             |
//          |                                                             |
//          |      In this picture the two tensor views                   |
//          |      have their own storages, but backends like functorch   |
//          \/     are allowed to re-alias underneath the pass            \/
//   . - - - - - - - - - - - - - .                . - - - - - - - - - - - - - - - .
//   |    underlying_storage     |                |      underlying_storage       |
//   . - - - - - - - - - - - - - .                . - - - - - - - - - - - - - - - .
//
// This constructor is only used by view ops.
// - view_value: The output tensor that we need to wrap.
// - base: The "base" of the view that `view_value` was generated from.
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, functionalization::ViewMeta meta)
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
view_value.device()
),
value_(view_value)
{
set_constructor_metadata();
// Copy the original tensor's ViewMeta vector and push the current one.
if (base->view_metas_.size() > 0) {
view_metas_ = base->view_metas_; // copy
}
view_metas_.push_back(meta);
storage_ = base->storage_; // alias this tensor's storage with the base tensor's
}
functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
}
void FunctionalTensorWrapper::commit_update() {
auto storage_impl = functional_storage_impl();
storage_impl->add_update(value_, view_metas_);
// Invariant: commit_update() is called during an inplace operation.
// Tensor inputs to the operation are synced before running the op,
// so the current tensor must be up-to-date with its alias at this point.
generation_ = storage_impl->generation();
}
bool FunctionalTensorWrapper::is_aliased() const {
// Two FunctionalTensorWrapper objects are aliased if they share storage.
// That means that we can check if a given FunctionalTensorWrapper is aliased
// by checking the reference count on its storage.
return storage_.use_count() > 1;
}
bool FunctionalTensorWrapper::is_up_to_date() const {
if (!is_aliased()) return true;
auto alias_generation = functional_storage_impl()->generation();
return generation_ == alias_generation;
}
// See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(at::functionalization::ViewMeta meta) {
view_metas_.push_back(meta);
// Note [Functionalization Pass - Inplace View Ops]
// So, these ops are special - they're mutation AND view ops. They get special codegen.
// An example is transpose_, e.g. `a.transpose_()`
// Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
// We also need to force a sync (even if a is already up to date), because a's underlying tensor hasn't actually
// been updated to reflect the new view yet.
regenerate_from_base();
}
// Note [Functionalization: Mutation Removal]
// Mutation removal is used to take a program like this:
//
// a.add_(b)
//
// and replace it with a slightly different program that has the same semantics:
//
// tmp = a.add(b)
// a.replace_(tmp)
//
// Where the replace_() call is implemented directly in the functionalization pass, so it is transparent to the backend.
// This is useful for backends that aren't able to handle certain types of mutations, like functorch.
//
// Why do we need to wrap every tensor in a FunctionalTensorWrapper? Consider this program:
//
// Before:
// tensor.add_(batched_tensor)
//
// After:
// tmp = tensor.add(batched_tensor)
// tensor.replace_(tmp)
//
// In the above, tmp is a batched tensor (because adding a normal tensor to a batched tensor does broadcasting and creates a batched tensor).
// But we can't just replace the underlying memory backing `tensor` with `tmp` - a batched tensor takes up more space!
// Instead, every input, intermediate and output of the program is wrapped in a FunctionalTensorWrapper, which wraps the underlying tensor.
void FunctionalTensorWrapper::replace_(const Tensor& other) {
// TODO: going to need to change this if we want nested functionalize() transforms.
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(other));
value_ = other;
}
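// Rough shape of a codegen'd functionalization kernel for add_, written out by hand as a
// sketch (the real kernels are generated; names like `self`/`other` are placeholders and
// `other` is assumed to be a Scalar here):
//
//   at::functionalization::impl::sync(self);
//   at::Tensor tmp;
//   {
//     at::AutoDispatchSkipFunctionalize guard;
//     tmp = at::add(at::functionalization::impl::from_functional_tensor(self), other);
//   }
//   auto* self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
//   self_impl->replace_(tmp);
//   self_impl->commit_update();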
void FunctionalTensorWrapper::sync_() {
if (is_up_to_date()) {
return;
}
apply_updates();
regenerate_from_base();
}
void FunctionalTensorWrapper::regenerate_from_base() {
at::AutoDispatchSkipFunctionalize guard;
auto storage_impl = functional_storage_impl();
auto t = storage_impl->base();
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
// Reapply views to get the viewed tensor from the base in alias_
for (auto& view_meta: view_metas_) {
t = view_meta.forward_fn(t, view_meta.out_index);
}
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
replace_(t);
generation_ = storage_impl->generation();
}
void FunctionalTensorWrapper::apply_updates() {
// Apply all updates on alias_
auto storage_impl = functional_storage_impl();
storage_impl->apply_updates();
}
const char* FunctionalTensorWrapper::tensorimpl_type_name() const {
return "FunctionalTensorWrapper";
}
namespace functionalization {
namespace impl {
Tensor to_functional_tensor(const Tensor& tensor) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!isFunctionalTensor(tensor));
return at::detail::make_tensor<FunctionalTensorWrapper>(tensor);
}
TensorList to_functional_tensor(const c10::List<Tensor>& t_list) {
std::vector<Tensor> outputs(t_list.size());
for (const auto i : c10::irange(t_list.size())) {
outputs[i] = to_functional_tensor(t_list[i]);
}
return outputs;
}
std::vector<Tensor> to_functional_tensor(const std::vector<Tensor>& t_list) {
std::vector<Tensor> outputs(t_list.size());
for (const auto i : c10::irange(t_list.size())) {
outputs[i] = to_functional_tensor(t_list[i]);
}
return outputs;
}
TensorList to_functional_tensor(const TensorList& t_list) {
std::vector<Tensor> outputs(t_list.size());
for (const auto i : c10::irange(t_list.size())) {
outputs[i] = to_functional_tensor(t_list[i]);
}
return outputs;
}
Tensor from_functional_tensor(const Tensor& tensor) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(tensor));
auto impl = unsafeGetFunctionalWrapper(tensor);
return impl->value();
}
c10::optional<Tensor> from_functional_tensor(const c10::optional<Tensor>& t) {
if (t.has_value()) {
return c10::make_optional<Tensor>(from_functional_tensor(*t));
}
return c10::nullopt;
}
c10::List<Tensor> from_functional_tensor(const c10::List<Tensor>& t_list) {
c10::List<Tensor> outputs;
outputs.reserve(t_list.size());
for (const auto i : c10::irange(t_list.size())) {
outputs.push_back(from_functional_tensor(t_list[i]));
}
return outputs;
}
c10::List<c10::optional<Tensor>> from_functional_tensor(const c10::List<c10::optional<Tensor>>& t_list) {
c10::List<c10::optional<Tensor>> outputs;
outputs.reserve(t_list.size());
for (const auto i : c10::irange(t_list.size())) {
outputs.push_back(from_functional_tensor(t_list[i]));
}
return outputs;
}
TensorList from_functional_tensor(const TensorList& t_list) {
std::vector<Tensor> outputs(t_list.size());
for (const auto i : c10::irange(t_list.size())) {
outputs[i] = from_functional_tensor(t_list[i]);
}
return outputs;
}
void sync(const Tensor& t) {
if (t.unsafeGetTensorImpl()->is_wrapped_number()) {
// Unfortunately, we can't easily guarantee that wrapped numbers (scalar-tensors)
// get wrapped up in a FunctionalTensorWrapper object, since they skip the dispatcher.
// That shouldn't matter, since I don't think we're allowed to assign to wrapped numbers anyway.
return;
}
// Not every tensor that hits a functionalization kernel is necessarily a functional tensor.
// For example, xla_tensor.copy_(cpu_tensor) needs to hit the functionalization kernel
// to sync xla_tensor, but not cpu_tensor.
if (!at::functionalization::impl::isFunctionalTensor(t)) {
return;
}
auto functional_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(t);
functional_impl->sync_();
}
void sync(const c10::optional<Tensor>& t) {
if (t.has_value()) {
sync(*t);
}
}
void sync(const c10::List<Tensor> t_list) {
for (const auto i : c10::irange(t_list.size())) {
sync(t_list[i]);
}
}
void sync(const at::TensorList t_list) {
for (auto t: t_list) {
sync(t);
}
}
void sync(const c10::List<c10::optional<Tensor>> t_list) {
for (const auto i : c10::irange(t_list.size())) {
sync(t_list[i]);
}
}
Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
if (out_idx != 0) {
// Note [out_idx in ViewMeta]
// When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
// Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
meta = meta.to_out_idx(out_idx);
}
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
}
std::vector<Tensor> create_functional_tensor_with_view_meta(const c10::List<at::Tensor>& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta) {
std::vector<Tensor> outputs(view_to_wrap.size());
for (const auto i : c10::irange(view_to_wrap.size())) {
outputs[i] = create_functional_tensor_with_view_meta(view_to_wrap[i], base, meta, i);
}
return outputs;
}
std::vector<Tensor> create_functional_tensor_with_view_meta(const std::vector<at::Tensor>& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta) {
std::vector<Tensor> outputs(view_to_wrap.size());
for (const auto i : c10::irange(view_to_wrap.size())) {
outputs[i] = create_functional_tensor_with_view_meta(view_to_wrap[i], base, meta, i);
}
return outputs;
}
void mutate_view_meta(const at::Tensor& self, functionalization::ViewMeta meta) {
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
self_impl->mutate_view_meta(meta);
}
// Note [Propagating strides in the functionalization pass]
// In order to properly compute stride information, the functionalization pass
// calls each {view} op's reference implementation with meta tensors.
// The output meta tensor's stride info serves as a reference for what the correct strides should be.
void set_sizes_strides_offset(const Tensor& out, const Tensor& reference_out) {
out.unsafeGetTensorImpl()->set_sizes_and_strides(reference_out.sizes(), reference_out.strides());
out.unsafeGetTensorImpl()->set_storage_offset(reference_out.storage_offset());
}
void set_sizes_strides_offset(const std::vector<Tensor>& outs, const std::vector<Tensor>& reference_outs) {
TORCH_INTERNAL_ASSERT(outs.size() == reference_outs.size());
for (const auto i : c10::irange(reference_outs.size())) {
set_sizes_strides_offset(outs[i], reference_outs[i]);
}
}
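// Illustrative sketch (not the generated code) of how a view kernel can use this: run the
// reference implementation on a meta tensor to obtain the correct strides, then copy them
// onto the functionalized output (`self` and `out` are placeholders):
//
//   at::Tensor meta_self = at::empty(self.sizes(), self.options().device(at::kMeta));
//   at::Tensor reference_out = meta_self.transpose(0, 1);  // meta impl computes sizes/strides
//   at::functionalization::impl::set_sizes_strides_offset(out, reference_out);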
} // namespace impl
} // namespace functionalization
} // namespace at

View File

@ -0,0 +1,155 @@
#pragma once
#include <ATen/ArrayRef.h>
#include <ATen/core/List.h>
#include <ATen/FunctionalStorageImpl.h>
#include <c10/core/DispatchKey.h>
namespace at {
// Note [Functionalization Pass In Core]
// The Functionalization pass is used to remove aliasing from a pytorch program.
//
// This is useful for backends that don't support aliasing, like XLA and Vulkan.
// It's also necessary in order to remove mutation from a program, which is needed in Functorch.
//
// Consider this program:
// a = torch.ones(...)
// b = a.view(...)
// b.add_(1)
//
// In this program, b is meant to alias with a due to the use of view(). At the end of the program, both a and b are full of 2's.
// However, backends that don't support aliasing aren't able to correctly implement the view() operator.
// Instead, they can opt into the Functionalization pass, which will sit between the user and the backend,
// and provide the necessary aliasing logic.
//
// The functionalization pass will turn the above program into a slightly different program that has the same semantics,
// transparently to the user, and that backends like XLA/Vulkan are able to implement:
// a = torch.ones(...)
// b = a.view_copy(...) # view() replaced with view_copy(). Backends like XLA/Vulkan can implement this!
// b.add_(1)
// a.add_(1) # Our functionalization pass machinery knows that a and b are aliased - it applies b's mutation to a too.
//
// So, how does the functionalization pass keep track of which tensors are aliased?
// The pass works by wrapping EVERY tensor in the program inside a FunctionalTensorWrapper, which knows about its aliased tensors.
//
// See Note [Functionalization: Alias Removal] for details on the aliasing machinery.
// See Note [Functionalization: Mutation Removal] for details on mutation removal.
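// Minimal end-to-end sketch (hypothetical, for exposition only) of driving the pass with the
// helpers declared below:
//
//   at::Tensor a = at::ones({4});
//   at::Tensor fa = at::functionalization::impl::to_functional_tensor(a);
//   // ... ops on fa dispatch through the Functionalize kernels ...
//   at::functionalization::impl::sync(fa);               // flush pending mutations/views
//   at::Tensor result = at::functionalization::impl::from_functional_tensor(fa);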
struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
explicit FunctionalTensorWrapper(const Tensor& value);
// Additional constructor to create a FunctionalTensorWrapper directly from an underlying tensor that was created from a view.
// For example, the code b = a.view1() will generate a constructor call to FunctionalTensorWrapper(b, a, view1_meta)
explicit FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, functionalization::ViewMeta meta);
// Get the underlying, actual tensor, that doesn't know anything about functionalization.
const Tensor& value() const { return value_; };
// The concept of "level" is only ever important to functorch; it's exposed here
// as more of a hook for functorch to use.
int64_t level() const { return level_; };
void set_level(int64_t level) { level_ = level; }
// Syncs the underlying tensor with its alias, if it's out of date. This involves two steps:
// 1) Apply any pending updates/mutations to the alias
// 2) Replay the views (if any) to regenerate the current tensor off of the updated alias.
void sync_();
// Performs step (2) of the sync: replaying the views. This is its own public API because it's needed by view_inplace ops like transpose_.
// See Note [Functionalization Pass - Inplace View Ops]
void regenerate_from_base();
// Performs step (1) of the sync: applying pending updates to the alias. This is its own public API because it's needed by functorch.
// functorch wants to make sure that all input tensors to a functionalized program have been properly synced
// so it can properly propagate mutations to inputs.
// It can't just call sync_(), because the FunctionalTensorWrapper will look like it has no aliases and sync_ will be a noop.
// We use the reference count on storage_ to determine if the wrapper is aliased, and by the time functorch
// is ready to propagate updates to inputs, any intermediate views of the input created by the program will have been deallocated.
void apply_updates();
// Takes the current state of value_ and snapshots it, sending it as a pending update to the alias.
void commit_update();
// When any tensor is mutated, the tensor increments its alias's "generation".
// Separately, each tensor maintains its own "generation" counter, which is used to determine if it's up-to-date with its alias.
// The act of syncing a tensor will set a tensor's generation equal to its alias's generation.
bool is_up_to_date() const;
// Every FunctionalTensorWrapper contains a vector of ViewMeta objects describing the series of view ops that ran
// to generate the current tensor from the base tensor.
// This method is used by inplace-view ops like transpose_.
// It appends a ViewMeta to the existing stack, and refreshes the tensor by replaying the views off of the alias.
void mutate_view_meta(at::functionalization::ViewMeta meta);
// The functionalization pass can be used to remove mutations.
// It does so by replacing any mutation op with its corresponding out-of-place op, followed by a call to replace_().
// e.g:
//
// a.add_(1)
//
// will turn into:
//
// tmp = a.add(1)
// a.replace_(tmp)
//
// replace_() swaps out the wrapped tensor, value_, with tmp.
void replace_(const Tensor& other);
~FunctionalTensorWrapper() override = default;
private:
const char* tensorimpl_type_name() const override;
// Returns true if this FunctionalTensorWrapper is aliased with any other FunctionalTensorWrapper objects.
// During a functionalization pass, if we have `b = a.view()`, then a and b should both report as aliased.
bool is_aliased() const;
void set_constructor_metadata();
functionalization::FunctionalStorageImpl* functional_storage_impl() const;
// Note that value is not taken by reference: internally, the wrapper will change the value tensor that it points to over time.
Tensor value_;
int64_t level_;
size_t generation_ = 0;
std::vector<at::functionalization::ViewMeta> view_metas_;
};
// Utility functions for the functionalization pass.
namespace functionalization {
namespace impl {
TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(const Tensor& tensor) {
auto functional_impl = static_cast<FunctionalTensorWrapper*>(tensor.unsafeGetTensorImpl());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_impl != nullptr);
return functional_impl;
}
TORCH_API inline bool isFunctionalTensor(const at::Tensor& tensor) {
return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize);
}
TORCH_API Tensor to_functional_tensor(const Tensor& tensor);
TORCH_API TensorList to_functional_tensor(const c10::List<Tensor>& t_list);
TORCH_API std::vector<Tensor> to_functional_tensor(const std::vector<Tensor>& t_list);
TORCH_API TensorList to_functional_tensor(const TensorList& t_list);
TORCH_API Tensor from_functional_tensor(const Tensor& tensor);
TORCH_API c10::optional<Tensor> from_functional_tensor(const c10::optional<Tensor>& t);
TORCH_API c10::List<Tensor> from_functional_tensor(const c10::List<Tensor>& t_list);
TORCH_API c10::List<c10::optional<Tensor>> from_functional_tensor(const c10::List<c10::optional<Tensor>>& t_list);
TORCH_API TensorList from_functional_tensor(const TensorList& tensors);
TORCH_API void sync(const at::Tensor& t);
TORCH_API void sync(const c10::optional<Tensor>& t);
TORCH_API void sync(const c10::List<Tensor> t_list);
TORCH_API void sync(const at::TensorList t_list);
TORCH_API void sync(const c10::List<c10::optional<Tensor>> t_list);
Tensor create_functional_tensor_with_view_meta(const Tensor& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta, int64_t out_idx = 0);
std::vector<Tensor> create_functional_tensor_with_view_meta(const c10::List<Tensor>& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta);
std::vector<Tensor> create_functional_tensor_with_view_meta(const std::vector<Tensor>& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta);
void mutate_view_meta(const Tensor& self, functionalization::ViewMeta meta);
void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
void set_sizes_strides_offset(const std::vector<Tensor>& outs, const std::vector<Tensor>& meta_outs);
} // namespace impl
} // namespace functionalization
} // namespace at

View File

@ -84,6 +84,13 @@ struct TORCH_API AutoNonVariableTypeMode {
c10::impl::ExcludeDispatchKeyGuard autograd_guard_;
};
struct TORCH_API AutoDispatchSkipFunctionalize {
AutoDispatchSkipFunctionalize() :
dispatch_key_guard_(c10::DispatchKeySet(c10::DispatchKey::Functionalize)) {
}
c10::impl::ExcludeDispatchKeyGuard dispatch_key_guard_;
};
/* Note [AutoDispatchBelowADInplaceOrView]
* AutoDispatchBelowADInplaceOrView is equivalent to AutoNonVariableTypeMode
* before we split inplace & view ops out of VariableType kernel.

View File

@ -1,6 +1,8 @@
#include <ATen/core/Tensor.h>
#include <ATen/core/Formatting.h>
#include <ATen/core/VariableHooksInterface.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <iostream>
@ -113,7 +115,7 @@ const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
return *this;
}
// View Variables
// View Methods
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool TensorBase::is_view() const {

View File

@ -1,5 +1,6 @@
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <torch/library.h>
/*
@ -60,6 +61,55 @@ TORCH_LIBRARY_IMPL(_, AutogradMLC, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
namespace {
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet, torch::jit::Stack* stack) {
const auto& schema = op.schema();
TORCH_INTERNAL_ASSERT(!schema.hasAnyAliasInfo(), "mutating and aliasing ops should all have codegen'd kernels");
const auto num_arguments = schema.arguments().size();
const auto arguments_begin = stack->size() - num_arguments;
auto arguments = torch::jit::last(stack, num_arguments);
for (uint64_t idx = 0; idx < num_arguments; ++idx) {
const auto& ivalue = arguments[idx];
if (ivalue.isTensor()) {
at::Tensor t = ivalue.toTensor();
at::functionalization::impl::sync(t);
auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
(*stack)[arguments_begin + idx] = t_new;
} else if (ivalue.isTensorList()) {
auto tensors = ivalue.toTensorList();
at::functionalization::impl::sync(tensors);
auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(tensors));
(*stack)[arguments_begin + idx] = t_new;
}
}
{
at::AutoDispatchSkipFunctionalize guard;
op.redispatchBoxed(dispatchKeySet & c10::after_func_keyset, stack);
}
const auto num_returns = schema.returns().size();
const auto returns_begin = stack->size() - num_returns;
auto returns = torch::jit::last(stack, num_returns);
for (uint64_t idx = 0; idx < num_returns; ++idx) {
const auto& ivalue = returns[idx];
if (ivalue.isTensor()) {
at::Tensor t = ivalue.toTensor();
auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new;
} else if (ivalue.isTensorList()) {
auto tensors = ivalue.toTensorList();
auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(tensors));
(*stack)[returns_begin + idx] = t_new;
}
}
}
}
TORCH_LIBRARY_IMPL(_, Functionalize, m) {
m.fallback(torch::CppFunction::makeFromBoxedFunction<&functionalizeFallback>());
}
// see Note [ADInplaceOrView key]
TORCH_LIBRARY_IMPL(_, ADInplaceOrView, m) {
m.fallback(torch::CppFunction::makeFallthrough());

View File

@ -197,6 +197,8 @@ _(aten, atan2) \
_(aten, atleast_1d) \
_(aten, atleast_2d) \
_(aten, atleast_3d) \
_(aten, _autocast_to_reduced_precision) \
_(aten, _autocast_to_full_precision) \
_(aten, avg_pool1d) \
_(aten, avg_pool2d) \
_(aten, avg_pool2d_backward) \

View File

@ -23,22 +23,14 @@ struct BuiltinOpFunction : public Function {
TORCH_INTERNAL_ASSERT(schema_.returns().size() == 1);
}
const std::string& doc_string() const override {
c10::string_view doc_string() const override {
return doc_string_;
}
bool isGraphFunction() const override {
return false;
}
void run(Stack& stack) override {
callable_(stack);
}
void run(Stack&& stack) override {
callable_(stack);
}
c10::intrusive_ptr<c10::ivalue::Future> runAsync(
Stack& stack,
TaskLauncher /* not used */) override {
@ -48,50 +40,15 @@ struct BuiltinOpFunction : public Function {
return res;
}
at::IValue operator()(std::vector<at::IValue> stack, const Kwargs& kwargs)
override {
getSchema().checkAndNormalizeInputs(stack, kwargs);
callable_(stack);
return stack.front();
}
const c10::QualifiedName& qualname() const override {
return name_;
}
const std::string& name() const override {
return name_.name();
}
// if this isn't yet defined, run its method_creator function
void ensure_defined() override {
// nop
}
std::shared_ptr<Graph> graph() const override {
TORCH_INTERNAL_ASSERT(false , "BuiltinFunction had a graph requested "
"from it. This probably indicates that the JIT calling context needs a "
"special case on Function::isGraphFunction()");
}
std::shared_ptr<Graph> optimized_graph() const override {
TORCH_INTERNAL_ASSERT(false , "BuiltinFunction had a graph requested "
"from it. This probably indicates that the JIT calling context needs a "
"special case on Function::isGraphFunction()");
}
void clear_execution_info() override {
TORCH_INTERNAL_ASSERT(false , "BuiltinFunction had a graph requested "
"from it. This probably indicates that the JIT calling context needs a "
"special case on Function::isGraphFunction()");
}
GraphExecutor& get_executor() override {
TORCH_INTERNAL_ASSERT(false , "BuiltinFunction had a GraphExecutor requested "
"from it. This probably indicates that the JIT calling context needs a "
"special case on Function::isGraphFunction()");
}
const c10::FunctionSchema& getSchema() const override {
return schema_;
}
@ -100,29 +57,21 @@ struct BuiltinOpFunction : public Function {
return schema_.arguments().size();
}
void check_single_output() override {
TORCH_CHECK(schema_.returns().size() == 1);
}
std::string pretty_print_schema() const override {
#ifdef __NVCC__
// Disable the "statement is unreachable" warning
#pragma diag_suppress code_is_unreachable
#endif
TORCH_INTERNAL_ASSERT(false);
return "";
#ifdef __NVCC__
#pragma diag_default code_is_unreachable
#endif
}
Function& setSchema(c10::FunctionSchema schema) override {
schema_ = std::move(schema);
return *this;
}
bool call(Stack& stack, size_t, c10::function_ref<void(const Code&)>) override {
run(stack);
return false;
}
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)>) override {
run(stack);
return false;
}
~BuiltinOpFunction() override {}
private:

View File

@ -1,8 +1,10 @@
#pragma once
#include <ATen/core/function_schema.h>
#include <ATen/core/ivalue.h>
#include <ATen/core/qualified_name.h>
#include <mutex>
#include <c10/util/Exception.h>
#include <c10/util/FunctionRef.h>
namespace c10 {
struct FunctionSchema;
@ -16,7 +18,11 @@ namespace torch {
namespace jit {
struct Graph;
struct GraphExecutor;
struct Code;
namespace mobile {
struct Code;
}
using Stack = std::vector<at::IValue>;
using Kwargs = std::unordered_map<std::string, at::IValue>;
@ -30,50 +36,70 @@ TORCH_API void preoptimizeGraph(std::shared_ptr<Graph>& graph);
// execution of the function. Method is a wrapper around an
// underlying Function that also provides a `self` object.
struct TORCH_API Function {
virtual const std::string& doc_string() const {
static const std::string no_doc_string = "";
virtual c10::string_view doc_string() const {
static constexpr c10::string_view no_doc_string = "";
return no_doc_string;
}
virtual bool isGraphFunction() const = 0;
virtual bool isGraphFunction() const {
return false;
}
virtual void run(Stack& stack) = 0;
virtual void run(Stack&& stack) = 0;
virtual c10::intrusive_ptr<c10::ivalue::Future> runAsync(
Stack& stack,
TaskLauncher taskLauncher = at::launch) = 0;
TaskLauncher taskLauncher = at::launch) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
return {};
}
virtual at::IValue operator()(
std::vector<at::IValue> stack,
const Kwargs& kwargs = Kwargs()) = 0;
at::IValue operator()(
Stack stack,
const Kwargs& kwargs = Kwargs()) {
getSchema().checkAndNormalizeInputs(stack, kwargs);
run(stack);
return stack.front();
}
virtual const c10::QualifiedName& qualname() const = 0;
virtual const std::string& name() const = 0;
const std::string& name() const {
return qualname().name();
}
// if this isn't yet defined, run its method_creator function
virtual void ensure_defined() = 0;
virtual std::shared_ptr<Graph> graph() const = 0;
virtual std::shared_ptr<Graph> optimized_graph() const = 0;
virtual void clear_execution_info() = 0;
virtual GraphExecutor& get_executor() = 0;
virtual const c10::FunctionSchema& getSchema() const = 0;
virtual size_t num_inputs() const = 0;
virtual void check_single_output() = 0;
virtual std::string pretty_print_schema() const = 0;
virtual Function& setSchema(c10::FunctionSchema schema) = 0;
// call() defines how different interpreter implementations interact with
// Function objects. Basically, interpreters need to provide a callback to
// communicate to Functions what to do if provided a Code object.
// Alternatively we could design the signature to return an optional Code
// object, but that requires special handling of the null case in the
// interpreter, and the fallback behavior is defined by the Function itself
// rather than by the interpreter, so a callback approach is more reasonable
// than returning values.
// If call() returns true, the callback completed successfully; otherwise
// call() returns false.
// Overload for the server interpreter; a bailout size is needed for the graph executor.
virtual bool call(Stack&, size_t, c10::function_ref<void(const Code&)>) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
return false;
}
// Overload for mobile interpreter.
virtual bool call(Stack&, c10::function_ref<void(const mobile::Code&)>) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
return false;
}
virtual ~Function() {}
};
} // namespace jit

View File

@ -172,6 +172,10 @@ inline bool operator==(const Argument& lhs, const Argument& rhs) {
&& *lhs.alias_info() == *rhs.alias_info()));
}
inline bool operator!=(const Argument& lhs, const Argument& rhs) {
return !(lhs == rhs);
}
bool operator==(const FunctionSchema& lhs, const FunctionSchema& rhs);
struct FunctionSchema {

View File

@ -463,6 +463,7 @@ namespace c10 {
_(attr, df_output_vjps) \
_(attr, axes) \
_(attr, axis) \
_(attr, symbolic_shape_inputs) \
_(attr, broadcast) \
_(attr, direction) \
_(attr, ends) \

View File

@ -1937,14 +1937,14 @@ struct TORCH_API ClassAttribute {
TypePtr attributeType,
std::string attributeName) :
kind_(kind),
attributeType_(attributeType),
attributeType_(std::move(attributeType)),
attributeName_(std::move(attributeName)) {}
AttributeKind getKind() const {
return kind_;
}
TypePtr getType() const {
const TypePtr& getType() const {
return attributeType_;
}
@ -2026,22 +2026,22 @@ struct TORCH_API ClassType : public NamedType {
return attributes_[pos].getType();
}
TypePtr getAttribute(const std::string& name) const {
auto type = findAttribute(name);
const TypePtr& getAttribute(const std::string& name) const {
auto slot = findAttributeSlot(name);
TORCH_CHECK(
type,
slot,
repr_str(),
" does not have an attribute with name '",
name,
"'");
return type;
return attributes_[*slot].getType();
}
size_t numAttributes() const {
return attributes_.size();
}
TypePtr getAttribute(size_t slot) const {
const TypePtr& getAttribute(size_t slot) const {
AT_ASSERT(slot < attributes_.size());
return attributes_.at(slot).getType();
}
@ -2059,7 +2059,7 @@ struct TORCH_API ClassType : public NamedType {
c10::optional<size_t> findAttributeSlot(const std::string& name) const {
size_t slot = 0;
for (const auto& attr : attributes_) {
if (name.compare(attr.getName()) == 0) {
if (name == attr.getName()) {
return slot;
}
slot++;
@ -2094,7 +2094,7 @@ struct TORCH_API ClassType : public NamedType {
size_t addAttribute(
const std::string& name,
const TypePtr& type,
TypePtr type,
bool is_parameter = false,
bool is_buffer = false);
@ -2122,7 +2122,7 @@ struct TORCH_API ClassType : public NamedType {
bool is_buffer = false) {
auto slot_idx = findAttributeSlot(name);
if (!slot_idx) {
return addAttribute(name, ty, is_parameter, is_buffer);
return addAttribute(name, std::move(ty), is_parameter, is_buffer);
}
TORCH_CHECK(
@ -2130,7 +2130,7 @@ struct TORCH_API ClassType : public NamedType {
"Parameter field mismatch for the field '",
name,
"'");
TypePtr atype = getAttribute(*slot_idx);
const TypePtr& atype = getAttribute(*slot_idx);
TORCH_CHECK(
ty->isSubtypeOf(*atype),
ty->repr_str(),
@ -2227,7 +2227,7 @@ struct TORCH_API ClassType : public NamedType {
AT_ASSERT(numAttributes() == contained_types.size());
for(size_t i = 0; i < attributes_.size(); ++i) {
AT_ASSERT(attributes_[i].getType()->isSubtypeOf(*contained_types[i]));
ptr->addAttribute(attributes_[i].getName(), contained_types[i]);
ptr->addAttribute(attributes_[i].getName(), std::move(contained_types[i]));
}
// Copy methods over
for (const auto& method : methods()) {

View File

@ -2228,7 +2228,7 @@ void ClassType::addAttribute(ClassAttribute classAttribute) {
size_t ClassType::addAttribute(
const std::string& name,
const TypePtr& type,
TypePtr type,
bool is_parameter,
bool is_buffer) {
if (is_parameter && is_buffer){
@ -2248,16 +2248,13 @@ size_t ClassType::addAttribute(
kind = AttributeKind::BUFFER;
}
ClassAttribute ClassAttribute(kind, type, name);
addAttribute(ClassAttribute);
if (is_parameter || is_buffer) {
TORCH_INTERNAL_ASSERT(is_module(), "adding a parameter or buffer to a non module");
TORCH_CHECK(
(type->kind() == TensorType::Kind) ||
(type->kind() == OptionalType::Kind &&
type->expect<OptionalType>()->getElementType()->kind() ==
type->expectRef<OptionalType>().getElementType()->kind() ==
TensorType::Kind) ||
(type->kind() == UnionType::Kind &&
TensorType::get()->isSubtypeOf(type->expectRef<UnionType>())) ||
@ -2266,6 +2263,8 @@ size_t ClassType::addAttribute(
toString(type));
}
addAttribute(ClassAttribute(kind, std::move(type), name));
return slot;
}

View File

@ -1,6 +1,4 @@
#pragma once
#include <ATen/cpu/vec/functional_base.h>
#if !defined(__VSX__) || !defined(CPU_CAPABILITY_VSX)
#include <ATen/cpu/vec/functional_bfloat16.h>
#endif

View File

@ -0,0 +1,54 @@
#pragma once
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
#include <ATen/cpu/vec/vec_base.h>
namespace at {
namespace vec {
// See Note [Acceptable use of anonymous namespace in header]
namespace {
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
const Vectorized<BFloat16>& a) {
constexpr int64_t K = Vectorized<BFloat16>::size();
__at_align__ float arr[K];
__at_align__ BFloat16 arr2[K];
a.store(arr2);
convert(arr2, arr, K);
return std::make_tuple(
Vectorized<float>::loadu(arr),
Vectorized<float>::loadu(arr + Vectorized<float>::size()));
}
inline Vectorized<BFloat16> convert_float_bfloat16(
const Vectorized<float>& a,
const Vectorized<float>& b) {
constexpr int64_t K = Vectorized<BFloat16>::size();
__at_align__ float arr[K];
__at_align__ BFloat16 arr2[K];
a.store(arr);
b.store(arr + Vectorized<float>::size());
convert(arr, arr2, K);
return Vectorized<BFloat16>::loadu(arr2);
}
void load_fp32_from_bf16(const c10::BFloat16* data, Vectorized<float>& out) {
__at_align__ float values[Vectorized<float>::size()];
for (int k = 0; k < Vectorized<float>::size(); ++k) {
values[k] = data[k];
}
out = Vectorized<float>::loadu(values);
}
C10_UNUSED void load_fp32_from_bf16(
const c10::BFloat16* data,
Vectorized<float>& out1,
Vectorized<float>& out2) {
load_fp32_from_bf16(data, out1);
data += Vectorized<float>::size();
load_fp32_from_bf16(data, out2);
}
} // namespace
} // namespace vec
} // namespace at

View File

@ -3,6 +3,8 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
// Note: header order is important here
#include <ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h>
@ -11,8 +13,12 @@
#include <ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h>
namespace at {
namespace vec {

View File

@ -10,3 +10,17 @@
#else
#define AT_USE_CUSPARSE_GENERIC_API() 0
#endif
// cuSparse Generic API spsv function was added in CUDA 11.3.0
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
#define AT_USE_CUSPARSE_GENERIC_SPSV() 1
#else
#define AT_USE_CUSPARSE_GENERIC_SPSV() 0
#endif
// cuSparse Generic API spsm function was added in CUDA 11.3.1
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11600)
#define AT_USE_CUSPARSE_GENERIC_SPSM() 1
#else
#define AT_USE_CUSPARSE_GENERIC_SPSM() 0
#endif
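// Typical use of these guards (an illustrative sketch): select the cuSPARSE code path at
// compile time and fail loudly on older toolkits.
//
//   #if AT_USE_CUSPARSE_GENERIC_SPSV()
//     // call the cusparseSpSV_*-based implementation here
//   #else
//     TORCH_CHECK(false, "This operation requires CUDA 11.3 or later");
//   #endif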

View File

@ -0,0 +1,219 @@
/*
Provides the implementations of cuSPARSE function templates.
*/
#include <ATen/cuda/CUDASparseBlas.h>
namespace at {
namespace cuda {
namespace sparse {
template <>
void csrgeam2_bufferSizeExt<float>(
CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(float)) {
TORCH_CUDASPARSE_CHECK(cusparseScsrgeam2_bufferSizeExt(
handle,
m,
n,
alpha,
descrA,
nnzA,
csrSortedValA,
csrSortedRowPtrA,
csrSortedColIndA,
beta,
descrB,
nnzB,
csrSortedValB,
csrSortedRowPtrB,
csrSortedColIndB,
descrC,
csrSortedValC,
csrSortedRowPtrC,
csrSortedColIndC,
pBufferSizeInBytes));
}
template <>
void csrgeam2_bufferSizeExt<double>(
CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(double)) {
TORCH_CUDASPARSE_CHECK(cusparseDcsrgeam2_bufferSizeExt(
handle,
m,
n,
alpha,
descrA,
nnzA,
csrSortedValA,
csrSortedRowPtrA,
csrSortedColIndA,
beta,
descrB,
nnzB,
csrSortedValB,
csrSortedRowPtrB,
csrSortedColIndB,
descrC,
csrSortedValC,
csrSortedRowPtrC,
csrSortedColIndC,
pBufferSizeInBytes));
}
template <>
void csrgeam2_bufferSizeExt<c10::complex<float>>(
CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<float>)) {
TORCH_CUDASPARSE_CHECK(cusparseCcsrgeam2_bufferSizeExt(
handle,
m,
n,
reinterpret_cast<const cuComplex*>(alpha),
descrA,
nnzA,
reinterpret_cast<const cuComplex*>(csrSortedValA),
csrSortedRowPtrA,
csrSortedColIndA,
reinterpret_cast<const cuComplex*>(beta),
descrB,
nnzB,
reinterpret_cast<const cuComplex*>(csrSortedValB),
csrSortedRowPtrB,
csrSortedColIndB,
descrC,
reinterpret_cast<const cuComplex*>(csrSortedValC),
csrSortedRowPtrC,
csrSortedColIndC,
pBufferSizeInBytes));
}
template <>
void csrgeam2_bufferSizeExt<c10::complex<double>>(
CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<double>)) {
TORCH_CUDASPARSE_CHECK(cusparseZcsrgeam2_bufferSizeExt(
handle,
m,
n,
reinterpret_cast<const cuDoubleComplex*>(alpha),
descrA,
nnzA,
reinterpret_cast<const cuDoubleComplex*>(csrSortedValA),
csrSortedRowPtrA,
csrSortedColIndA,
reinterpret_cast<const cuDoubleComplex*>(beta),
descrB,
nnzB,
reinterpret_cast<const cuDoubleComplex*>(csrSortedValB),
csrSortedRowPtrB,
csrSortedColIndB,
descrC,
reinterpret_cast<const cuDoubleComplex*>(csrSortedValC),
csrSortedRowPtrC,
csrSortedColIndC,
pBufferSizeInBytes));
}
template <>
void csrgeam2<float>(CUSPARSE_CSRGEAM2_ARGTYPES(float)) {
TORCH_CUDASPARSE_CHECK(cusparseScsrgeam2(
handle,
m,
n,
alpha,
descrA,
nnzA,
csrSortedValA,
csrSortedRowPtrA,
csrSortedColIndA,
beta,
descrB,
nnzB,
csrSortedValB,
csrSortedRowPtrB,
csrSortedColIndB,
descrC,
csrSortedValC,
csrSortedRowPtrC,
csrSortedColIndC,
pBuffer));
}
template <>
void csrgeam2<double>(CUSPARSE_CSRGEAM2_ARGTYPES(double)) {
TORCH_CUDASPARSE_CHECK(cusparseDcsrgeam2(
handle,
m,
n,
alpha,
descrA,
nnzA,
csrSortedValA,
csrSortedRowPtrA,
csrSortedColIndA,
beta,
descrB,
nnzB,
csrSortedValB,
csrSortedRowPtrB,
csrSortedColIndB,
descrC,
csrSortedValC,
csrSortedRowPtrC,
csrSortedColIndC,
pBuffer));
}
template <>
void csrgeam2<c10::complex<float>>(
CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<float>)) {
TORCH_CUDASPARSE_CHECK(cusparseCcsrgeam2(
handle,
m,
n,
reinterpret_cast<const cuComplex*>(alpha),
descrA,
nnzA,
reinterpret_cast<const cuComplex*>(csrSortedValA),
csrSortedRowPtrA,
csrSortedColIndA,
reinterpret_cast<const cuComplex*>(beta),
descrB,
nnzB,
reinterpret_cast<const cuComplex*>(csrSortedValB),
csrSortedRowPtrB,
csrSortedColIndB,
descrC,
reinterpret_cast<cuComplex*>(csrSortedValC),
csrSortedRowPtrC,
csrSortedColIndC,
pBuffer));
}
template <>
void csrgeam2<c10::complex<double>>(
CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<double>)) {
TORCH_CUDASPARSE_CHECK(cusparseZcsrgeam2(
handle,
m,
n,
reinterpret_cast<const cuDoubleComplex*>(alpha),
descrA,
nnzA,
reinterpret_cast<const cuDoubleComplex*>(csrSortedValA),
csrSortedRowPtrA,
csrSortedColIndA,
reinterpret_cast<const cuDoubleComplex*>(beta),
descrB,
nnzB,
reinterpret_cast<const cuDoubleComplex*>(csrSortedValB),
csrSortedRowPtrB,
csrSortedColIndB,
descrC,
reinterpret_cast<cuDoubleComplex*>(csrSortedValC),
csrSortedRowPtrC,
csrSortedColIndC,
pBuffer));
}
} // namespace sparse
} // namespace cuda
} // namespace at

View File

@ -0,0 +1,110 @@
#pragma once
/*
Provides a subset of cuSPARSE functions as templates:
csrgeam2<scalar_t>(...)
where scalar_t is double, float, c10::complex<double> or c10::complex<float>.
The functions are available in at::cuda::sparse namespace.
*/
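// Illustrative call (hypothetical handle/descriptors/buffers; the argument layout follows
// CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES below):
//
//   size_t buffer_size = 0;
//   at::cuda::sparse::csrgeam2_bufferSizeExt<float>(
//       handle, m, n, &alpha,
//       descrA, nnzA, valA, rowPtrA, colIndA,
//       &beta, descrB, nnzB, valB, rowPtrB, colIndB,
//       descrC, valC, rowPtrC, colIndC,
//       &buffer_size);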
#include <ATen/cuda/CUDAContext.h>
namespace at {
namespace cuda {
namespace sparse {
#define CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(scalar_t) \
cusparseHandle_t handle, int m, int n, const scalar_t *alpha, \
const cusparseMatDescr_t descrA, int nnzA, \
const scalar_t *csrSortedValA, const int *csrSortedRowPtrA, \
const int *csrSortedColIndA, const scalar_t *beta, \
const cusparseMatDescr_t descrB, int nnzB, \
const scalar_t *csrSortedValB, const int *csrSortedRowPtrB, \
const int *csrSortedColIndB, const cusparseMatDescr_t descrC, \
const scalar_t *csrSortedValC, const int *csrSortedRowPtrC, \
const int *csrSortedColIndC, size_t *pBufferSizeInBytes
template <typename scalar_t>
inline void csrgeam2_bufferSizeExt(
CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(scalar_t)) {
TORCH_INTERNAL_ASSERT(
false,
"at::cuda::sparse::csrgeam2_bufferSizeExt: not implemented for ",
typeid(scalar_t).name());
}
template <>
void csrgeam2_bufferSizeExt<float>(
CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(float));
template <>
void csrgeam2_bufferSizeExt<double>(
CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(double));
template <>
void csrgeam2_bufferSizeExt<c10::complex<float>>(
CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<float>));
template <>
void csrgeam2_bufferSizeExt<c10::complex<double>>(
CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<double>));
#define CUSPARSE_CSRGEAM2_NNZ_ARGTYPES() \
cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, \
int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA, \
const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB, \
const int *csrSortedColIndB, const cusparseMatDescr_t descrC, \
int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace
template <typename scalar_t>
inline void csrgeam2Nnz(CUSPARSE_CSRGEAM2_NNZ_ARGTYPES()) {
TORCH_CUDASPARSE_CHECK(cusparseXcsrgeam2Nnz(
handle,
m,
n,
descrA,
nnzA,
csrSortedRowPtrA,
csrSortedColIndA,
descrB,
nnzB,
csrSortedRowPtrB,
csrSortedColIndB,
descrC,
csrSortedRowPtrC,
nnzTotalDevHostPtr,
workspace));
}
#define CUSPARSE_CSRGEAM2_ARGTYPES(scalar_t) \
cusparseHandle_t handle, int m, int n, const scalar_t *alpha, \
const cusparseMatDescr_t descrA, int nnzA, \
const scalar_t *csrSortedValA, const int *csrSortedRowPtrA, \
const int *csrSortedColIndA, const scalar_t *beta, \
const cusparseMatDescr_t descrB, int nnzB, \
const scalar_t *csrSortedValB, const int *csrSortedRowPtrB, \
const int *csrSortedColIndB, const cusparseMatDescr_t descrC, \
scalar_t *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC, \
void *pBuffer
template <typename scalar_t>
inline void csrgeam2(CUSPARSE_CSRGEAM2_ARGTYPES(scalar_t)) {
TORCH_INTERNAL_ASSERT(
false,
"at::cuda::sparse::csrgeam2: not implemented for ",
typeid(scalar_t).name());
}
template <>
void csrgeam2<float>(CUSPARSE_CSRGEAM2_ARGTYPES(float));
template <>
void csrgeam2<double>(CUSPARSE_CSRGEAM2_ARGTYPES(double));
template <>
void csrgeam2<c10::complex<float>>(
CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<float>));
template <>
void csrgeam2<c10::complex<double>>(
CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<double>));
} // namespace sparse
} // namespace cuda
} // namespace at

View File

@ -5,12 +5,12 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/cuda/MiscUtils.h>
#if AT_USE_CUSPARSE_GENERIC_API()
namespace at {
namespace cuda {
namespace sparse {
#if AT_USE_CUSPARSE_GENERIC_API()
namespace {
// If a specific GPU model does not provide native support for a given data
@ -93,7 +93,8 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input) {
CuSparseDnVecDescriptor::CuSparseDnVecDescriptor(const Tensor& input) {
// cuSPARSE doesn't support batched vectors
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() == 1);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
input.dim() == 1 || (input.dim() == 2 && input.size(-1) == 1));
// cuSPARSE doesn't support non-contiguous vectors
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.is_contiguous());
@ -149,8 +150,8 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input) {
descriptor_.reset(raw_descriptor);
}
#endif // AT_USE_CUSPARSE_GENERIC_API()
} // namespace sparse
} // namespace cuda
} // namespace at
#endif // AT_USE_CUSPARSE_GENERIC_API()

View File

@ -6,14 +6,14 @@
#include <c10/core/ScalarType.h>
#if AT_USE_CUSPARSE_GENERIC_API()
#if defined(USE_ROCM)
#include <type_traits>
#endif
namespace at {
namespace cuda {
namespace sparse {
cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);
template <typename T, cusparseStatus_t (*destructor)(T*)>
struct CuSparseDescriptorDeleter {
void operator()(T* x) {
@ -37,6 +37,25 @@ class CuSparseDescriptor {
std::unique_ptr<T, CuSparseDescriptorDeleter<T, destructor>> descriptor_;
};
#if defined(USE_ROCM)
// hipSPARSE doesn't define this
using cusparseMatDescr = std::remove_pointer<cusparseMatDescr_t>::type;
#endif
class TORCH_CUDA_CPP_API CuSparseMatDescriptor
: public CuSparseDescriptor<cusparseMatDescr, &cusparseDestroyMatDescr> {
public:
CuSparseMatDescriptor() {
cusparseMatDescr_t raw_descriptor;
TORCH_CUDASPARSE_CHECK(cusparseCreateMatDescr(&raw_descriptor));
descriptor_.reset(raw_descriptor);
}
};
#if AT_USE_CUSPARSE_GENERIC_API()
cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);
class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
: public CuSparseDescriptor<cusparseDnMatDescr, &cusparseDestroyDnMat> {
public:
@ -83,8 +102,54 @@ class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor
values.data_ptr()));
}
#endif
#if AT_USE_CUSPARSE_GENERIC_SPSV()
void set_mat_fill_mode(bool upper) {
cusparseFillMode_t fill_mode =
upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER;
TORCH_CUDASPARSE_CHECK(cusparseSpMatSetAttribute(
this->descriptor(),
CUSPARSE_SPMAT_FILL_MODE,
&fill_mode,
sizeof(fill_mode)));
}
void set_mat_diag_type(bool unit) {
cusparseDiagType_t diag_type =
unit ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT;
TORCH_CUDASPARSE_CHECK(cusparseSpMatSetAttribute(
this->descriptor(),
CUSPARSE_SPMAT_DIAG_TYPE,
&diag_type,
sizeof(diag_type)));
}
#endif
};
#if AT_USE_CUSPARSE_GENERIC_SPSV()
class TORCH_CUDA_CPP_API CuSparseSpSVDescriptor
: public CuSparseDescriptor<cusparseSpSVDescr, &cusparseSpSV_destroyDescr> {
public:
CuSparseSpSVDescriptor() {
cusparseSpSVDescr_t raw_descriptor;
TORCH_CUDASPARSE_CHECK(cusparseSpSV_createDescr(&raw_descriptor));
descriptor_.reset(raw_descriptor);
}
};
#endif
#if AT_USE_CUSPARSE_GENERIC_SPSM()
class TORCH_CUDA_CPP_API CuSparseSpSMDescriptor
: public CuSparseDescriptor<cusparseSpSMDescr, &cusparseSpSM_destroyDescr> {
public:
CuSparseSpSMDescriptor() {
cusparseSpSMDescr_t raw_descriptor;
TORCH_CUDASPARSE_CHECK(cusparseSpSM_createDescr(&raw_descriptor));
descriptor_.reset(raw_descriptor);
}
};
#endif
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
: public CuSparseDescriptor<cusparseSpGEMMDescr, &cusparseSpGEMM_destroyDescr> {
@ -97,8 +162,8 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
};
#endif
#endif // AT_USE_CUSPARSE_GENERIC_API()
} // namespace sparse
} // namespace cuda
} // namespace at
#endif // AT_USE_CUSPARSE_GENERIC_API()

View File

@ -1,6 +1,7 @@
#include <ATen/ATen.h>
#include <ATen/CPUApplyUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/TensorMeta.h>
#include <ATen/NativeFunctions.h>
#include <ATen/ExpandUtils.h>
@ -203,6 +204,45 @@ extern "C" void strsm_(char *side, char *uplo, char *trans, char *diag, int *n,
#endif
namespace at {
namespace meta {
TORCH_META_FUNC(triangular_solve)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular) {
TORCH_CHECK(self.dim() >= 2,
"torch.triangular_solve: Expected b to have at least 2 dimensions, but it has ", self.dim(), " dimensions instead");
TORCH_CHECK(A.dim() >= 2,
"torch.triangular_solve: Expected A to have at least 2 dimensions, but it has ", A.dim(), " dimensions instead");
at::native::linearSolveCheckInputs(self, A, "triangular_solve");
if (A.layout() == Layout::Strided) {
std::vector<int64_t> self_broadcast_size, A_broadcast_size;
std::tie(self_broadcast_size, A_broadcast_size) = at::native::_linalg_broadcast_batch_dims(self, A);
auto ndim = self_broadcast_size.size();
auto nrows = A.size(-2);
// make column major strides for BLAS
auto solution_strides = at::detail::defaultStrides(self_broadcast_size);
solution_strides[ndim - 2] = 1;
solution_strides[ndim - 1] = nrows;
set_output(0, self_broadcast_size, solution_strides, self.options(), {});
// make column major strides for BLAS
auto clone_A_strides = at::detail::defaultStrides(A_broadcast_size);
clone_A_strides[ndim - 2] = 1;
clone_A_strides[ndim - 1] = nrows;
set_output(1, A_broadcast_size, clone_A_strides, A.options(), {});
} else if (A.layout() == Layout::SparseCsr) {
// no broadcasting for non-strided layout
set_output(0, self.sizes(), {}, self.options(), {}); // make row major strides for Sparse BLAS
set_output(1, {0}, {}, self.options(), {}); // return 0-sized tensor
} else {
TORCH_INTERNAL_ASSERT(false, "triangular_solve: Got an unexpected layout.");
}
}
} // namespace meta
namespace native {
#if AT_BUILD_WITH_LAPACK()
@ -1593,31 +1633,25 @@ The result of the computation is saved in-place in 'result' tensor,
'unitriangular' if true then the diagonal elements of 'input' are assumed to be 1
and the actual diagonal values are not used.
*/
static std::tuple<Tensor&, Tensor&> triangular_solve_out_info(
Tensor& result,
Tensor& clone_input,
Tensor& infos,
static void triangular_solve_out_impl(
const Tensor& result,
const Tensor& clone_input,
const Tensor& input,
const Tensor& other,
bool upper, bool transpose, bool unitriangular) {
// These internal asserts make explicit the assumptions in the implementation
// Error check with the actual error messages are done on the higher level of
// the hierarchy of calls
TORCH_INTERNAL_ASSERT(input.dim() >= 2);
TORCH_INTERNAL_ASSERT(input.size(-2) == input.size(-1));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() >= 2);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.size(-2) == input.size(-1));
TORCH_INTERNAL_ASSERT(input.device() == other.device());
TORCH_INTERNAL_ASSERT(input.device() == result.device());
TORCH_INTERNAL_ASSERT(input.device() == clone_input.device());
TORCH_INTERNAL_ASSERT(input.device() == infos.device());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == other.device());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == result.device());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == clone_input.device());
TORCH_INTERNAL_ASSERT(input.scalar_type() == other.scalar_type());
TORCH_INTERNAL_ASSERT(input.scalar_type() == result.scalar_type());
TORCH_INTERNAL_ASSERT(input.scalar_type() == clone_input.scalar_type());
TORCH_INTERNAL_ASSERT(infos.scalar_type() == at::kInt);
TORCH_INTERNAL_ASSERT(infos.numel() == std::max<int64_t>(1, batchCount(input)));
TORCH_INTERNAL_ASSERT(infos.is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == other.scalar_type());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == result.scalar_type());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == clone_input.scalar_type());
// if 'result' has no elements we can modify it
if (result.numel() == 0) {
@ -1632,60 +1666,38 @@ static std::tuple<Tensor&, Tensor&> triangular_solve_out_info(
}
// 'result' and 'clone_input' must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT(result.mT().is_contiguous());
TORCH_INTERNAL_ASSERT(clone_input.mT().is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.mT().is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(clone_input.mT().is_contiguous());
// triangular_solve_stub performs calculations in-place
// 'result' must be a copy of 'other'
// 'clone_input' must be a copy of 'input'
TORCH_INTERNAL_ASSERT(result.sizes().equals(other.sizes()));
TORCH_INTERNAL_ASSERT(clone_input.sizes().equals(input.sizes()));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(other.sizes()));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(clone_input.sizes().equals(input.sizes()));
result.copy_(other);
clone_input.copy_(input);
triangular_solve_stub(input.device().type(), clone_input, result, /*left=*/true, upper, transpose ? TransposeType::Transpose : TransposeType::NoTranspose, unitriangular);
return std::tuple<Tensor&, Tensor&>(result, clone_input);
}
// Supports arbitrary batch dimensions for self and A
std::tuple<Tensor, Tensor> triangular_solve(const Tensor& self, const Tensor& A,
bool upper, bool transpose, bool unitriangular) {
TORCH_CHECK(self.dim() >= 2,
"torch.triangular_solve: Expected b to have at least 2 dimensions, but it has ", self.dim(), " dimensions instead");
TORCH_CHECK(A.dim() >= 2,
"torch.triangular_solve: Expected A to have at least 2 dimensions, but it has ", A.dim(), " dimensions instead");
TORCH_IMPL_FUNC(triangular_solve_out)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, const Tensor& result, const Tensor& clone_A) {
Tensor self_broadcast, A_broadcast;
std::tie(self_broadcast, A_broadcast) = _linalg_broadcast_batch_dims(self, A, "triangular_solve");
Tensor self_broadcasted, A_broadcasted;
std::tie(self_broadcasted, A_broadcasted) = _linalg_broadcast_batch_dims(self, A, "triangular_solve");
bool copy_needed = !result.transpose(-2, -1).is_contiguous();
copy_needed |= !clone_A.transpose(-2, -1).is_contiguous();
Tensor result = at::empty({0}, self.options());
Tensor clone_A = at::empty({0}, self.options());
Tensor infos = at::zeros({std::max<int64_t>(1, batchCount(self_broadcasted))}, self.options().dtype(kInt));
if (copy_needed) {
Tensor result_tmp = at::empty({0}, self.options());
Tensor clone_A_tmp = at::empty({0}, A.options());
triangular_solve_out_info(result, clone_A, infos, A_broadcasted, self_broadcasted, upper, transpose, unitriangular);
triangular_solve_out_impl(result_tmp, clone_A_tmp, A_broadcast, self_broadcast, upper, transpose, unitriangular);
if (self_broadcasted.dim() > 2) {
batchCheckErrors(infos, "triangular_solve");
result.copy_(result_tmp);
clone_A.copy_(clone_A_tmp);
} else {
singleCheckErrors(infos.item().toInt(), "triangular_solve");
triangular_solve_out_impl(result, clone_A, A_broadcast, self_broadcast, upper, transpose, unitriangular);
}
return std::tuple<Tensor, Tensor>(result, clone_A);
}
std::tuple<Tensor&, Tensor&> triangular_solve_out(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, Tensor& result, Tensor& clone_A) {
checkSameDevice("triangular_solve", result, self);
checkLinalgCompatibleDtype("triangular_solve", result, self);
checkSameDevice("triangular_solve", clone_A, self, "clone_A");
checkLinalgCompatibleDtype("triangular_solve", clone_A, self, "clone_A");
Tensor result_tmp, clone_A_tmp;
std::tie(result_tmp, clone_A_tmp) = at::native::triangular_solve(self, A, upper, transpose, unitriangular);
at::native::resize_output(result, result_tmp.sizes());
at::native::resize_output(clone_A, clone_A_tmp.sizes());
result.copy_(result_tmp);
clone_A.copy_(clone_A_tmp);
return std::tuple<Tensor&, Tensor&>(result, clone_A);
}
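The restructured kernel only round-trips through temporaries when the user-supplied outputs are not batched column-major (Fortran-contiguous), since triangular_solve_stub writes its results in that layout. A minimal standalone sketch of this "reuse the destination when the layout already matches, otherwise solve into scratch and copy back" pattern, using plain buffers instead of ATen tensors (names are illustrative):

#include <algorithm>
#include <cstdio>
#include <vector>

// Illustrative stand-in for triangular_solve_stub: it insists on a
// column-major destination and overwrites it in place.
static void solve_column_major(std::vector<double>& dest) {
  for (double& x : dest) x *= 0.5;  // placeholder computation
}

// Mirrors the copy_needed branch: reuse the caller's buffer when its
// layout already matches, otherwise solve into a temporary and copy back.
static void solve_out(std::vector<double>& result, bool result_is_column_major) {
  if (!result_is_column_major) {
    std::vector<double> tmp(result);   // scratch buffer in the required layout
    solve_column_major(tmp);
    std::copy(tmp.begin(), tmp.end(), result.begin());  // one extra copy
  } else {
    solve_column_major(result);        // zero-copy fast path
  }
}

int main() {
  std::vector<double> out{2.0, 4.0, 6.0};
  solve_out(out, /*result_is_column_major=*/false);
  std::printf("%g %g %g\n", out[0], out[1], out[2]);  // 1 2 3
}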
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -211,8 +211,8 @@ using lstsq_fn = void (*)(
DECLARE_DISPATCH(lstsq_fn, lstsq_stub);
using triangular_solve_fn = void (*)(
Tensor& /*A*/,
Tensor& /*B*/,
const Tensor& /*A*/,
const Tensor& /*B*/,
bool /*left*/,
bool /*upper*/,
TransposeType /*transpose*/,

View File

@ -795,7 +795,7 @@ This is an in-place routine, content of 'B' is overwritten.
and the actual diagonal values are not used.
*/
template<typename scalar_t>
void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
#if !AT_BUILD_WITH_BLAS()
TORCH_CHECK(
false,
@ -826,7 +826,7 @@ void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper, Transpo
#endif
}
void triangular_solve_kernel(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_kernel(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cpu", [&]{
apply_triangular_solve<scalar_t>(A, B, left, upper, transpose, unitriangular);
});

View File

@ -1337,22 +1337,13 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
auto input_shape = input.sizes().slice(2);
auto grad_output_shape = gO.sizes().slice(2);
if (kernel_size.size() == 1) {
auto expected_input_shape = (kernel_size[0] - 1) * gi_conv_params.dilation[1]
- 2 * gi_conv_params.padding[1]
+ (gi_conv_params.stride[1] * (grad_output_shape[0] - 1) + 1);
if (expected_input_shape != input_shape[0]) {
gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape;
}
} else {
for (const auto i : c10::irange(kernel_size.size())) {
// Check if whole input has been used or not
auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i]
- 2 * gi_conv_params.padding[i]
+ (gi_conv_params.stride[i] * (grad_output_shape[i] - 1) + 1);
if (expected_input_shape != input_shape[i]) {
gi_conv_params.output_padding[i] = input_shape[i] - expected_input_shape;
}
for (const auto i : c10::irange(kernel_size.size())) {
// Check if whole input has been used or not
auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i]
- 2 * gi_conv_params.padding[i]
+ (gi_conv_params.stride[i] * (grad_output_shape[i] - 1) + 1);
if (expected_input_shape != input_shape[i]) {
gi_conv_params.output_padding[i] = input_shape[i] - expected_input_shape;
}
}
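The unified loop reconstructs, per spatial dimension, how much of the input the forward convolution actually consumed: expected_input = (kernel - 1) * dilation - 2 * padding + stride * (grad_output - 1) + 1, and any shortfall becomes output_padding so the transposed convolution used for the input gradient recovers the original extent. A small worked check of that arithmetic (values chosen only for illustration):

#include <cstdint>
#include <cstdio>

// Input extent reconstructed from the forward convolution's output extent.
static int64_t expected_input_extent(int64_t kernel, int64_t dilation,
                                     int64_t padding, int64_t stride,
                                     int64_t grad_output_extent) {
  return (kernel - 1) * dilation - 2 * padding + stride * (grad_output_extent - 1) + 1;
}

int main() {
  // input=10, kernel=3, stride=2, padding=0, dilation=1 -> output=4,
  // but only 9 input columns were touched, so output_padding must be 1.
  int64_t input = 10, kernel = 3, dilation = 1, padding = 0, stride = 2;
  int64_t grad_output = (input + 2 * padding - ((kernel - 1) * dilation + 1)) / stride + 1;
  int64_t expected = expected_input_extent(kernel, dilation, padding, stride, grad_output);
  int64_t output_padding = (expected != input) ? input - expected : 0;
  std::printf("grad_output=%lld expected=%lld output_padding=%lld\n",
              (long long)grad_output, (long long)expected, (long long)output_padding);
}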

View File

@ -308,9 +308,7 @@ static inline void checkAllSameDim(TensorList tensors, int64_t dim) {
}
}
static inline std::tuple<Tensor,Tensor> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) {
linearSolveCheckInputs(arg1, arg2, name);
static inline std::tuple<std::vector<int64_t>, std::vector<int64_t>> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) {
// broadcast the batch dimensions of arg1 and arg2.
IntArrayRef arg1_batch_sizes(arg1.sizes().data(), arg1.ndimension() - 2);
IntArrayRef arg2_batch_sizes(arg2.sizes().data(), arg2.ndimension() - 2);
@ -321,6 +319,14 @@ static inline std::tuple<Tensor,Tensor> _linalg_broadcast_batch_dims(const Tenso
std::vector<int64_t> arg2_expand_size({expand_batch_portion});
arg2_expand_size.insert(arg2_expand_size.end(), { arg2.size(-2), arg2.size(-1) });
return std::make_tuple(std::move(arg1_expand_size), std::move(arg2_expand_size));
}
static inline std::tuple<Tensor,Tensor> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) {
linearSolveCheckInputs(arg1, arg2, name);
std::vector<int64_t> arg1_expand_size, arg2_expand_size;
std::tie(arg1_expand_size, arg2_expand_size) = at::native::_linalg_broadcast_batch_dims(arg1, arg2);
Tensor arg1_broadcasted = arg1.expand(arg1_expand_size);
Tensor arg2_broadcasted = arg2.expand(arg2_expand_size);
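The new overload returns only the broadcast sizes so callers can decide when (or whether) to materialize the expand. The batch portion is the usual right-aligned broadcast of everything except the last two matrix dimensions, with each operand's own (rows, cols) appended back. A minimal sketch of that size computation over plain vectors, assuming standard broadcasting rules (helper names are illustrative):

#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Right-aligned broadcast of two batch-shape vectors (1 broadcasts against anything).
static std::vector<int64_t> broadcast_batch(const std::vector<int64_t>& a,
                                            const std::vector<int64_t>& b) {
  std::vector<int64_t> out(std::max(a.size(), b.size()));
  for (size_t i = 0; i < out.size(); ++i) {
    int64_t da = i < out.size() - a.size() ? 1 : a[i - (out.size() - a.size())];
    int64_t db = i < out.size() - b.size() ? 1 : b[i - (out.size() - b.size())];
    if (da != db && da != 1 && db != 1) throw std::runtime_error("incompatible batch dims");
    out[i] = std::max(da, db);
  }
  return out;
}

// Expand size for one operand: broadcast batch portion + its own (rows, cols).
static std::vector<int64_t> expand_size(const std::vector<int64_t>& full_shape,
                                        const std::vector<int64_t>& batch_broadcast) {
  std::vector<int64_t> out = batch_broadcast;
  out.push_back(full_shape[full_shape.size() - 2]);  // rows
  out.push_back(full_shape[full_shape.size() - 1]);  // cols
  return out;
}

int main() {
  std::vector<int64_t> A{2, 1, 3, 3};   // batch (2, 1), 3x3 matrices
  std::vector<int64_t> B{5, 3, 4};      // batch (5,),   3x4 matrices
  auto batch = broadcast_batch({A.begin(), A.end() - 2}, {B.begin(), B.end() - 2});
  auto A_exp = expand_size(A, batch);   // {2, 5, 3, 3}
  auto B_exp = expand_size(B, batch);   // {2, 5, 3, 4}
  (void)A_exp; (void)B_exp;
}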

View File

@ -116,6 +116,43 @@ static inline Tensor to_impl(
self, dtype, layout, device, pin_memory, non_blocking, optional_memory_format);
}
// If input tensor is fp32, cast it to fp16, otherwise leave it alone.
// (this is intended to be used internally by the JIT autocast implementation)
Tensor _autocast_to_reduced_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) {
if (self.dtype() == at::ScalarType::Float &&
((self.device().is_cuda() && cuda_enabled) ||
(self.device().is_cpu() && cpu_enabled))
) {
at::ScalarType target = at::ScalarType::Undefined;
if (self.device().is_cuda()) {
target = cuda_dtype;
} else if (self.device().is_cpu()) {
target = cpu_dtype;
}
TORCH_INTERNAL_ASSERT(target != at::ScalarType::Undefined, "_autocast_to_reduced_precision requires legit ScalarType argument for given device");
return to_impl(
self, target, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
} else {
return self;
}
}
// If input tensor is fp16, cast it to fp32, otherwise leave it alone.
// (this is intended to be used internally by the JIT autocast implementation)
Tensor _autocast_to_full_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled) {
if (self.dtype() == at::ScalarType::Half &&
((self.device().is_cuda() && cuda_enabled) ||
(self.device().is_cpu() && cpu_enabled))
) {
return to_impl(
self, at::ScalarType::Float, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
} else {
return self;
}
}
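Both helpers are deliberately no-ops unless the tensor sits on a device for which autocast is enabled and already has the precision being converted away from, so the JIT can insert them unconditionally. A reduced sketch of that decision logic over enum stand-ins (types and names are illustrative, not the ATen API):

#include <cassert>

enum class Dev { CPU, CUDA };
enum class DType { Float, Half, BFloat16, Double };

struct FakeTensor {
  Dev device;
  DType dtype;
};

// Mirrors _autocast_to_reduced_precision: only fp32 tensors on an enabled
// device are cast; the target dtype depends on the device.
FakeTensor to_reduced_precision(FakeTensor t, bool cuda_enabled, bool cpu_enabled,
                                DType cuda_dtype, DType cpu_dtype) {
  bool enabled = (t.device == Dev::CUDA && cuda_enabled) ||
                 (t.device == Dev::CPU && cpu_enabled);
  if (t.dtype == DType::Float && enabled) {
    t.dtype = (t.device == Dev::CUDA) ? cuda_dtype : cpu_dtype;
  }
  return t;  // otherwise left alone
}

// Mirrors _autocast_to_full_precision: the inverse direction, fp16 -> fp32.
FakeTensor to_full_precision(FakeTensor t, bool cuda_enabled, bool cpu_enabled) {
  bool enabled = (t.device == Dev::CUDA && cuda_enabled) ||
                 (t.device == Dev::CPU && cpu_enabled);
  if (t.dtype == DType::Half && enabled) {
    t.dtype = DType::Float;
  }
  return t;
}

int main() {
  FakeTensor gpu_f32{Dev::CUDA, DType::Float};
  assert(to_reduced_precision(gpu_f32, true, false, DType::Half, DType::BFloat16).dtype == DType::Half);
  FakeTensor cpu_f64{Dev::CPU, DType::Double};
  assert(to_reduced_precision(cpu_f64, true, true, DType::Half, DType::BFloat16).dtype == DType::Double);
}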
Tensor to(
const Tensor& self,
c10::optional<ScalarType> dtype,

View File

@ -2561,4 +2561,27 @@ std::vector<Tensor> unflatten_dense_tensors(const Tensor& flat, TensorList tenso
return outputs;
}
}} // at::native
at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) {
auto output = self.clone();
auto slice = output.slice(dim, start, end, step);
TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
slice.copy_(src);
return output;
}
at::Tensor select_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, int64_t index) {
auto output = self.clone();
auto slice = output.select(dim, index);
TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
slice.copy_(src);
return output;
}
at::Tensor diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64_t offset, int64_t dim1, int64_t dim2) {
auto output = self.clone();
auto slice = output.diagonal(offset, dim1, dim2);
TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
slice.copy_(src);
return output;
}
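All three scatter ops share one recipe: clone self, take the corresponding view of the clone, check that the view's size matches src, copy src into it, and return the clone, leaving self untouched. A minimal sketch of slice_scatter semantics on a flat row-major 2-D buffer (a plain C++ stand-in, not the ATen implementation):

#include <cassert>
#include <cstdint>
#include <vector>

// Row-major matrix with slice_scatter along dim 0 (rows [start, end) with step).
struct Mat {
  int64_t rows, cols;
  std::vector<float> data;  // size rows * cols
};

Mat slice_scatter_rows(const Mat& self, const Mat& src,
                       int64_t start, int64_t end, int64_t step) {
  Mat out = self;  // clone: the input is never mutated
  int64_t slice_rows = (end - start + step - 1) / step;
  assert(src.rows == slice_rows && src.cols == self.cols);  // src must match the slice
  for (int64_t i = 0; i < slice_rows; ++i) {
    int64_t r = start + i * step;
    for (int64_t c = 0; c < self.cols; ++c) {
      out.data[r * self.cols + c] = src.data[i * src.cols + c];  // copy_ into the view
    }
  }
  return out;
}

int main() {
  Mat self{4, 2, std::vector<float>(8, 0.f)};
  Mat src{2, 2, {1, 2, 3, 4}};
  Mat out = slice_scatter_rows(self, src, /*start=*/0, /*end=*/4, /*step=*/2);
  assert(out.data[0] == 1 && out.data[4] == 3);  // rows 0 and 2 overwritten
  assert(self.data[0] == 0);                     // original untouched
}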
} // namespace native
} // namespace at

View File

@ -2,29 +2,48 @@
#include <ATen/CPUApplyUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/Resize.h>
#include <ATen/Parallel.h>
#include <ATen/TensorMeta.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TriangularOpsUtils.h>
#include <c10/util/irange.h>
namespace at {
namespace meta {
TORCH_META_FUNC(tril)(const Tensor& self, int64_t k) {
set_output(self.sizes(), self.options());
}
TORCH_META_FUNC(triu)(const Tensor& self, int64_t k) {
set_output(self.sizes(), self.options());
}
} // namespace meta
namespace native {
namespace {
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triu/tril ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
template <typename scalar_t, bool upper>
static void apply_triu_tril_single(
scalar_t* result, scalar_t* self, bool inplace,
int64_t k, int64_t n, int64_t m,
int64_t res_row_stride, int64_t res_col_stride,
int64_t self_row_stride, int64_t self_col_stride) {
template <typename scalar_t>
void apply_triu_tril_single(
scalar_t* result,
scalar_t* self,
bool inplace,
int64_t k,
int64_t n,
int64_t m,
int64_t res_row_stride,
int64_t res_col_stride,
int64_t self_row_stride,
int64_t self_col_stride,
bool upper) {
constexpr int64_t zero = 0;
if (upper) {
at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
for (const auto i : c10::irange(start, end)) {
parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
for (int64_t i : c10::irange(start, end)) {
for (int64_t j = 0; j < std::min(m, i + k); j++) {
result[i * res_row_stride + j * res_col_stride] = 0;
}
@ -36,8 +55,8 @@ static void apply_triu_tril_single(
}
});
} else {
at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
for (const auto i : c10::irange(start, end)) {
parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
for (int64_t i : c10::irange(start, end)) {
for (int64_t j = std::max(zero, i + k + 1); j < m; j++) {
result[i * res_row_stride + j * res_col_stride] = 0;
}
@ -51,108 +70,101 @@ static void apply_triu_tril_single(
}
}
template <typename scalar_t, bool upper>
void apply_triu_tril(Tensor& result, const Tensor& self, bool inplace, int64_t k) {
template <typename scalar_t>
void apply_triu_tril(const Tensor& result, const Tensor& self, bool inplace, int64_t k, bool upper) {
auto n = self.size(-2);
auto m = self.size(-1);
auto self_data = self.data_ptr<scalar_t>();
auto self_stride = (self.dim() > 2 && self.stride(-3) > 0) ? self.stride(-3) : 1;
auto batchsize = batchCountTrilTriu(result);
auto self_row_stride = self.stride(-2);
auto self_column_stride = self.stride(-1);
auto self_col_stride = self.stride(-1);
auto result_data = result.data_ptr<scalar_t>();
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t result_stride, result_row_stride, result_column_stride;
int64_t result_stride, result_row_stride, result_col_stride;
if (result_data != self_data) {
result_stride = (result.dim() > 2 && result.stride(-3) > 0) ? result.stride(-3) : 1;
result_row_stride = result.stride(-2);
result_column_stride = result.stride(-1);
result_col_stride = result.stride(-1);
} else {
result_stride = self_stride;
result_row_stride = self_row_stride;
result_column_stride = self_column_stride;
result_col_stride = self_col_stride;
}
at::parallel_for(0, batchsize, 0, [&](int64_t start, int64_t end) {
parallel_for(0, batchsize, 0, [&](int64_t start, int64_t end) {
for (const auto b : c10::irange(start, end)) {
scalar_t* self_batch = &self_data[b * self_stride];
scalar_t* result_batch = &result_data[b * result_stride];
apply_triu_tril_single<scalar_t, upper>(
result_batch, self_batch, inplace, k, n, m,
result_row_stride, result_column_stride, self_row_stride, self_column_stride);
apply_triu_tril_single<scalar_t>(
result_batch,
self_batch,
inplace,
k,
n,
m,
result_row_stride,
result_col_stride,
self_row_stride,
self_col_stride,
upper);
}
});
}
Tensor tril(const Tensor& self, int64_t k) {
Tensor result = at::empty({0}, self.options());
at::tril_out(result, self, k);
return result;
}
struct UpperTriangle {
static constexpr const char* op_name = "triu";
static constexpr bool upper = true;
};
Tensor& tril_cpu_(Tensor &self, int64_t k) {
struct LowerTriangle {
static constexpr const char *op_name = "tril";
static constexpr bool upper = false;
};
template <typename Triangle>
void compute_triu_tril(const Tensor& self, int64_t k, const Tensor &result) {
if (self.numel() == 0) {
return self;
return;
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
bool inplace;
bool inplace_op = self.is_same(result);
bool inplace_update = false;
Tensor self_c;
std::tie(inplace, self_c) = checkTrilTriuBatchContiguous(self, true);
Tensor result = inplace ? self : at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "tril", [&]{
apply_triu_tril<scalar_t, false>(result, self_c, inplace, k);
});
if (!inplace) self.copy_(result);
return self;
std::tie(inplace_update, self_c) = checkTrilTriuBatchContiguous(self, inplace_op);
Tensor result_c;
if (inplace_op && !inplace_update) {
result_c = at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
} else {
result_c = result;
}
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
ScalarType::BFloat16,
ScalarType::Half,
ScalarType::Bool,
self.scalar_type(),
Triangle::op_name,
[&]{
apply_triu_tril<scalar_t>(result_c, self_c, inplace_op && inplace_update, k, Triangle::upper);
});
if (inplace_op && !inplace_update) {
result.copy_(result_c);
}
}
Tensor& tril_cpu_out(const Tensor& self, int64_t k, Tensor &result) {
at::native::resize_output(result, self.sizes());
if (self.numel() == 0) {
return result;
}
Tensor self_c;
std::tie(std::ignore, self_c) = checkTrilTriuBatchContiguous(self, false);
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "tril", [&]{
apply_triu_tril<scalar_t, false>(result, self_c, false, k);
});
return result;
} // namespace
TORCH_IMPL_FUNC(tril_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
compute_triu_tril<LowerTriangle>(self, k, result);
}
Tensor triu(const Tensor& self, int64_t k) {
Tensor result = at::empty({0}, self.options());
at::triu_out(result, self, k);
return result;
}
Tensor& triu_cpu_(Tensor &self, int64_t k) {
if (self.numel() == 0) {
return self;
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
bool inplace;
Tensor self_c;
std::tie(inplace, self_c) = checkTrilTriuBatchContiguous(self, true);
Tensor result = inplace ? self : at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "triu", [&]{
apply_triu_tril<scalar_t, true>(result, self_c, inplace, k);
});
if (!inplace) self.copy_(result);
return self;
}
Tensor& triu_cpu_out(const Tensor& self, int64_t k, Tensor &result) {
at::native::resize_output(result, self.sizes());
if (self.numel() == 0) {
return result;
}
Tensor self_c;
std::tie(std::ignore, self_c) = checkTrilTriuBatchContiguous(self, false);
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "triu", [&]{
apply_triu_tril<scalar_t, true>(result, self_c, false, k);
});
return result;
TORCH_IMPL_FUNC(triu_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
compute_triu_tril<UpperTriangle>(self, k, result);
}
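After the refactor a single compute_triu_tril drives both ops; the UpperTriangle/LowerTriangle tags only carry the op name and the upper flag, and the masking rule itself stays simple: triu zeroes entries with j < i + k (keeping everything on and above diagonal k), tril zeroes entries with j > i + k. A compact sketch of that rule on a dense row-major matrix (illustrative only, without the batching or dispatch machinery):

#include <cstdint>
#include <cstdio>
#include <vector>

// Zero out the masked triangle of an n x m row-major matrix, diagonal offset k.
// upper == true  -> keep j >= i + k (triu); upper == false -> keep j <= i + k (tril).
void triu_tril(std::vector<float>& a, int64_t n, int64_t m, int64_t k, bool upper) {
  for (int64_t i = 0; i < n; ++i) {
    for (int64_t j = 0; j < m; ++j) {
      bool keep = upper ? (j >= i + k) : (j <= i + k);
      if (!keep) a[i * m + j] = 0.f;
    }
  }
}

int main() {
  std::vector<float> a(9, 1.f);           // 3x3 of ones
  triu_tril(a, 3, 3, /*k=*/0, /*upper=*/true);
  for (int64_t i = 0; i < 3; ++i) {
    std::printf("%g %g %g\n", a[i * 3], a[i * 3 + 1], a[i * 3 + 2]);
  }
  // prints: 1 1 1 / 0 1 1 / 0 0 1
}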
Tensor trace_backward(const Tensor& grad, IntArrayRef sizes) {
@ -166,6 +178,5 @@ Tensor trace_backward(const Tensor& grad, IntArrayRef sizes) {
return grad_input.view(sizes);
}
} // namespace native
} // namespace at

View File

@ -1965,7 +1965,7 @@ REGISTER_CUDA_DISPATCH(lu_stub, &apply_lu);
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangular_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
template <typename scalar_t>
static void apply_triangular_solve_batched_magma(Tensor& A, Tensor& b, bool left, bool upper, TransposeType transpose, bool unitriangular) {
static void apply_triangular_solve_batched_magma(const Tensor& A, const Tensor& b, bool left, bool upper, TransposeType transpose, bool unitriangular) {
#if !AT_MAGMA_ENABLED()
AT_ERROR("triangular_solve: MAGMA library not found in "
"compilation. Please rebuild with MAGMA.");
@ -2030,13 +2030,13 @@ AT_ERROR("triangular_solve: MAGMA library not found in "
#endif
}
void triangular_solve_batched_magma(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_batched_magma(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{
apply_triangular_solve_batched_magma<scalar_t>(A, B, left, upper, transpose, unitriangular);
});
}
void triangular_solve_kernel(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_kernel(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
// For batches smaller than 8 and matrix sizes larger than 64x64 cuBLAS forloop is faster than batched version
if (batchCount(A) <= 8 && A.size(-1) >= 64) {
triangular_solve_cublas(A, B, left, upper, transpose, unitriangular);

View File

@ -111,7 +111,7 @@ void lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, const Tensor& pi
}
template <typename scalar_t>
static void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
static void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
const auto trans = to_cublas(transpose);
cublasDiagType_t diag = unitriangular ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
@ -138,14 +138,14 @@ static void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper,
}
}
void triangular_solve_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{
apply_triangular_solve<scalar_t>(A, B, left, upper, transpose, unitriangular);
});
}
template <typename scalar_t>
static void apply_triangular_solve_batched(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
static void apply_triangular_solve_batched(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
const auto trans = to_cublas(transpose);
cublasDiagType_t diag = unitriangular ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
@ -170,7 +170,7 @@ static void apply_triangular_solve_batched(Tensor& A, Tensor& B, bool left, bool
at::cuda::blas::trsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A_ptr_array_data, lda, B_ptr_array_data, ldb, batch_size);
}
void triangular_solve_batched_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{
apply_triangular_solve_batched<scalar_t>(A, B, left, upper, transpose, unitriangular);
});

View File

@ -35,8 +35,8 @@ namespace at {
namespace native {
void geqrf_batched_cublas(const Tensor& input, const Tensor& tau);
void triangular_solve_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void triangular_solve_batched_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void triangular_solve_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void gels_batched_cublas(const Tensor& a, Tensor& b, Tensor& infos);
void lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, const Tensor& pivots, TransposeType transpose);

View File

@ -4,9 +4,10 @@
#include <ATen/native/BinaryOps.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAMathCompat.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/util/TypeSafeSignMath.h>
#include <type_traits>
@ -98,7 +99,7 @@ void div_floor_kernel_cuda(TensorIteratorBase& iter) {
} else if (isIntegralType(dtype, /*includeBool*/ false)) {
AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cuda", [&]() {
gpu_kernel_with_scalars(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
if (!std::is_unsigned<scalar_t>::value && (a < 0) != (b < 0)) {
if (c10::signs_differ(a, b)) {
// Subtracts one from the result of truncating division if the
// divisor and dividend have different sign(bit)s and the remainder of
// the division is nonzero

View File

@ -4,6 +4,7 @@
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/BinaryOps.h>
#include <ATen/native/TensorIterator.h>
#include <c10/util/TypeSafeSignMath.h>
#include <type_traits>
@ -17,7 +18,7 @@ void remainder_kernel_cuda(TensorIteratorBase& iter) {
AT_DISPATCH_INTEGRAL_TYPES(iter.common_dtype(), "remainder_cuda", [&]() {
gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
scalar_t r = a % b;
if (!std::is_unsigned<scalar_t>::value && (r != 0) && ((r < 0) != (b < 0))) {
if (r != 0 && c10::signs_differ(r, b)) {
r += b;
}
return r;
@ -28,7 +29,7 @@ void remainder_kernel_cuda(TensorIteratorBase& iter) {
gpu_kernel_with_scalars(iter,
[]GPU_LAMBDA(scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
auto mod = ::fmod(a, b);
if (!std::is_unsigned<scalar_t>::value && (mod != 0) && ((b < 0) != (mod < 0))) {
if (mod != 0 && c10::signs_differ(b, mod)) {
mod += b;
}
return mod;
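Both kernels now delegate the sign test to c10::signs_differ, which is false by construction for unsigned types; the arithmetic itself is unchanged: truncating division is decremented by one, and the % / fmod result shifted by b, whenever the operands disagree in sign and the remainder is nonzero. A scalar sketch of that adjustment with a local signs_differ helper (only the arithmetic is illustrated, not the c10 utility):

#include <cassert>
#include <type_traits>

// Sign disagreement test that is trivially false for unsigned types.
template <typename T>
constexpr bool signs_differ(T a, T b) {
  return !std::is_unsigned<T>::value && ((a < 0) != (b < 0));
}

// Floor division built from truncating integer division.
template <typename T>
T div_floor(T a, T b) {
  T q = a / b;            // truncates toward zero
  T r = a % b;
  if (r != 0 && signs_differ(a, b)) q -= 1;
  return q;
}

// Python-style modulo: the result takes the sign of the divisor.
template <typename T>
T mod_floor(T a, T b) {
  T r = a % b;
  if (r != 0 && signs_differ(r, b)) r += b;
  return r;
}

int main() {
  assert(div_floor(-7, 2) == -4 && mod_floor(-7, 2) == 1);
  assert(div_floor(7, -2) == -4 && mod_floor(7, -2) == -1);
  assert(div_floor(7u, 2u) == 3u);  // unsigned path never adjusts
}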

View File

@ -55,7 +55,7 @@ __global__ void triu_tril_kernel(
}
template <bool upper>
Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, const char* name) {
void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) {
int64_t N = self.numel();
dim3 dim_block = cuda::getApplyBlock();
dim3 dim_grid((N + dim_block.x - 1) / dim_block.x);
@ -76,31 +76,18 @@ Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, c
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
});
return result;
}
Tensor& tril_cuda_(Tensor &self, int64_t k) {
return tril_cuda_out(self, k, self);
}
Tensor& tril_cuda_out(const Tensor& self, int64_t k, Tensor &result) {
at::native::resize_output(result, self.sizes());
if (self.numel() == 0) {
return result;
TORCH_IMPL_FUNC(tril_cuda)(const Tensor& self, int64_t k, const Tensor &result) {
if (self.numel() != 0) {
triu_tril_cuda_template<false>(result, self, k, "tril");
}
return triu_tril_cuda_template<false>(result, self, k, "tril");
}
Tensor& triu_cuda_(Tensor &self, int64_t k) {
return triu_cuda_out(self, k, self);
}
Tensor& triu_cuda_out(const Tensor& self, int64_t k, Tensor &result) {
at::native::resize_output(result, self.sizes());
if (self.numel() == 0) {
return result;
TORCH_IMPL_FUNC(triu_cuda)(const Tensor& self, int64_t k, const Tensor &result) {
if (self.numel() != 0) {
triu_tril_cuda_template<true>(result, self, k, "triu");
}
return triu_tril_cuda_template<true>(result, self, k, "triu");
}
// Copy the kth diagonal of a matrix B to a vector A.

View File

@ -154,12 +154,6 @@ void nan_to_num_kernel_cuda(
}
void frexp_kernel_cuda(TensorIteratorBase& iter) {
#if defined(USE_ROCM)
// Reference: https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP-MATH.html
// https://github.com/ROCm-Developer-Tools/HIP/issues/2169
// ROCm does not support frexp function yet
TORCH_CHECK(false, "torch.frexp() is not implemented on ROCm platform.");
#else
AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half,
// The iter.dtype() here is the dtype of mantissa output.
// It's a floating point type and must be the same as the input's dtype.
@ -171,7 +165,6 @@ void frexp_kernel_cuda(TensorIteratorBase& iter) {
return {mantissa, exponent};
});
});
#endif
}
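With the ROCm guard gone the kernel always dispatches. frexp itself splits a finite nonzero float into a mantissa in [0.5, 1) and an integer exponent with x = mantissa * 2^exponent, the mantissa keeping the input's floating-point dtype; the tensor op applies the same decomposition elementwise. A tiny host-side illustration using std::frexp:

#include <cmath>
#include <cstdio>

int main() {
  float x = 20.0f;
  int exponent = 0;
  float mantissa = std::frexp(x, &exponent);  // x == mantissa * 2^exponent
  std::printf("%g = %g * 2^%d\n", x, mantissa, exponent);  // 20 = 0.625 * 2^5
}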
REGISTER_DISPATCH(bitwise_not_stub, &bitwise_not_kernel_cuda);

View File

@ -6,6 +6,7 @@
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Math.cuh>
#include <c10/util/TypeSafeSignMath.h>
#include <type_traits>
@ -38,8 +39,7 @@ void sign_kernel_cuda(TensorIteratorBase& iter){
} else {
AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "sign_cuda", [&]() {
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t {
scalar_t zero = scalar_t(0);
return (zero < a) - (a < zero);
return c10::signum(a);
});
});
}
@ -47,7 +47,7 @@ void sign_kernel_cuda(TensorIteratorBase& iter){
void signbit_kernel_cuda(TensorIteratorBase& iter){
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, ScalarType::Half, iter.input_dtype(), "signbit_cuda", [&]() {
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return !std::is_unsigned<scalar_t>::value && a < 0; });
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return is_negative(a); });
});
}
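The lambdas now lean on the type-safe helpers: signum(a) is the familiar (0 < a) - (a < 0) written so the negative-side comparison is short-circuited for unsigned types, and is_negative(a) is false by construction for them. Scalar stand-ins for both helpers (local sketches, not the c10 definitions):

#include <cassert>
#include <type_traits>

// (0 < a) - (a < 0), with the negative test short-circuited for unsigned types.
template <typename T>
constexpr T signum(T a) {
  return T(T(0) < a) - T(!std::is_unsigned<T>::value && a < T(0));
}

template <typename T>
constexpr bool is_negative(T a) {
  return !std::is_unsigned<T>::value && a < T(0);
}

int main() {
  assert(signum(-3) == -1 && signum(0) == 0 && signum(5) == 1);
  assert(signum(7u) == 1u && signum(0u) == 0u);
  assert(is_negative(-1) && !is_negative(2u));
}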

View File

@ -645,6 +645,7 @@
variants: function, method
device_check: NoCheck
device_guard: False
tags: inplace_view
dispatch:
CompositeExplicitAutograd: as_strided_
@ -3126,7 +3127,7 @@
- func: mv(Tensor self, Tensor vec) -> Tensor
variants: function, method
dispatch:
CPU, CUDA, SparseCsrCUDA: mv
CompositeExplicitAutograd: mv
SparseCPU, SparseCUDA, SparseCsrCPU: mv_sparse
- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
@ -3882,6 +3883,7 @@
# this. If this `Variable` is a view, throws an `std::runtime_error()`.
- func: detach_(Tensor(a!) self) -> Tensor(a!)
variants: function, method
tags: inplace_view
dispatch:
CompositeExplicitAutograd: detach_
@ -3910,6 +3912,27 @@
dispatch:
CompositeExplicitAutograd: slice_backward
- func: slice_scatter(Tensor self, Tensor src, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
CompositeExplicitAutograd: slice_scatter
- func: select_scatter(Tensor self, Tensor src, int dim, int index) -> Tensor
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
CompositeExplicitAutograd: select_scatter
- func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
CompositeExplicitAutograd: diagonal_scatter
- func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
variants: function, method
dispatch:
@ -4014,6 +4037,7 @@
variants: method
device_check: NoCheck
device_guard: False
tags: inplace_view
dispatch:
CompositeExplicitAutograd: squeeze_
@ -4021,6 +4045,7 @@
variants: method
device_check: NoCheck
device_guard: False
tags: inplace_view
dispatch:
CompositeExplicitAutograd: squeeze_
@ -4028,6 +4053,7 @@
variants: method
device_check: NoCheck
device_guard: False
tags: inplace_view
- func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
variants: function, method
@ -4259,6 +4285,7 @@
device_check: NoCheck
device_guard: False
variants: method
tags: inplace_view
dispatch:
CompositeExplicitAutograd: t_
@ -4365,6 +4392,7 @@
variants: method
device_check: NoCheck
device_guard: False
tags: inplace_view
dispatch:
CompositeExplicitAutograd: transpose_
@ -4501,6 +4529,7 @@
variants: method
device_check: NoCheck
device_guard: False
tags: inplace_view
dispatch:
CompositeExplicitAutograd: unsqueeze_
@ -5303,7 +5332,7 @@
- func: dequantize.self(Tensor self) -> Tensor
variants: function, method
dispatch:
CPU: dequantize_cpu
CPU, CUDA: dequantize_cpu_or_cuda
QuantizedCPU, QuantizedCUDA: dequantize_quantized
- func: dequantize.tensors(Tensor[] tensors) -> Tensor[]
@ -5425,6 +5454,14 @@
- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor)
variants: function
- func: _autocast_to_reduced_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) -> Tensor(a)
variants: method
device_guard: False
- func: _autocast_to_full_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled) -> Tensor(a)
variants: method
device_guard: False
- func: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor
device_check: NoCheck
device_guard: False
@ -6098,16 +6135,12 @@
CPU, CUDA: bitwise_right_shift
- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
structured_delegate: tril.out
variants: method
dispatch:
CPU: tril_cpu_
CUDA: tril_cuda_
- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
structured_delegate: triu.out
variants: method
dispatch:
CPU: triu_cpu_
CUDA: triu_cuda_
- func: digamma_(Tensor(a!) self) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -6221,24 +6254,24 @@
CPU, CUDA: cross
- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU: triu_cpu_out
CUDA: triu_cuda_out
CPU: triu_cpu
CUDA: triu_cuda
- func: triu(Tensor self, int diagonal=0) -> Tensor
structured_delegate: triu.out
variants: method, function
dispatch:
CompositeExplicitAutograd: triu
- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU: tril_cpu_out
CUDA: tril_cuda_out
CPU: tril_cpu
CUDA: tril_cuda
- func: tril(Tensor self, int diagonal=0) -> Tensor
structured_delegate: tril.out
variants: method, function
dispatch:
CompositeExplicitAutograd: tril
- func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
@ -6733,13 +6766,14 @@
CUDA: legacy_lstsq_cuda
- func: triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient)
structured: True
dispatch:
CPU, CUDA: triangular_solve_out
SparseCsrCUDA: triangular_solve_out_sparse_csr_cuda
- func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient)
structured_delegate: triangular_solve.X
variants: method, function
dispatch:
CPU, CUDA: triangular_solve
- func: symeig.e(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
dispatch:
@ -6786,6 +6820,7 @@
variants: method
device_check: NoCheck
device_guard: False
tags: inplace_view
# swapdims, alias for transpose
- func: swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
@ -6797,6 +6832,7 @@
variants: method
device_check: NoCheck
device_guard: False
tags: inplace_view
- func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
dispatch:

View File

@ -54,7 +54,8 @@ Tensor quantize_per_channel(
auto quantizer = make_per_channel_affine_quantizer(scales, zero_points, axis, dtype);
return quantizer->quantize(self);
}
Tensor dequantize_cpu(const Tensor& self) {
Tensor dequantize_cpu_or_cuda(const Tensor& self) {
TORCH_CHECK(!self.is_quantized());
return self.to(at::kFloat);
}

View File

@ -349,10 +349,8 @@ Tensor qnnpack_avg_pool2d(
pytorch_qnnp_operator_t qnnpack_operator{nullptr};
const pytorch_qnnp_status createStatus =
pytorch_qnnp_create_average_pooling2d_nhwc_q8(
padH /* input_padding_top */,
padW /* input_padding_right */,
padH /* input_padding_bottom */,
padW /* input_padding_left */,
padH /* input_padding_height */,
padW /* input_padding_width */,
kH /* kernel height */,
kW /* kernel width */,
dH /* stride height */,

View File

@ -41,18 +41,13 @@ static void convolution_q8(benchmark::State& state, const char* net, bool per_ch
const size_t inputPixelStride = groups * groupInputChannels;
const size_t effectiveKernelHeight = (kernelHeight - 1) * dilation + 1;
const size_t effectiveKernelWidth = (kernelWidth - 1) * dilation + 1;
const size_t paddingLeft = effectiveKernelWidth / 2;
const size_t paddingTop = effectiveKernelHeight / 2;
const size_t paddingRight = effectiveKernelWidth - 1 - paddingLeft;
const size_t paddingBottom = effectiveKernelHeight - 1 - paddingTop;
const size_t paddingWidth = effectiveKernelWidth / 2;
const size_t paddingHeight = effectiveKernelHeight / 2;
const size_t outputHeight =
(paddingTop + inputHeight + paddingBottom - effectiveKernelHeight) /
subsampling +
(inputHeight + paddingHeight * 2 - effectiveKernelHeight) / subsampling +
1;
const size_t outputWidth =
(paddingLeft + inputWidth + paddingRight - effectiveKernelWidth) /
subsampling +
1;
(inputWidth + paddingWidth * 2 - effectiveKernelWidth) / subsampling + 1;
std::vector<uint8_t> input(
batchSize * inputHeight * inputWidth * inputPixelStride);
@ -78,10 +73,8 @@ static void convolution_q8(benchmark::State& state, const char* net, bool per_ch
std::vector<float> requantization_scale(
num_zero_points_padded, 0.5 * 0.5 / 0.5);
status = pytorch_qnnp_create_convolution2d_nhwc_q8(
paddingTop,
paddingRight,
paddingBottom,
paddingLeft,
paddingHeight,
paddingWidth,
kernelHeight,
kernelWidth,
subsampling,

View File

@ -40,10 +40,8 @@ enum pytorch_qnnp_status pytorch_qnnp_deinitialize(void);
typedef struct pytorch_qnnp_operator* pytorch_qnnp_operator_t;
enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t input_padding_height,
uint32_t input_padding_width,
uint32_t kernel_height,
uint32_t kernel_width,
uint32_t subsampling_height,
@ -66,12 +64,9 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
pytorch_qnnp_operator_t* convolution);
enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
uint32_t input_padding_front,
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_back,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t input_padding_depth,
uint32_t input_padding_height,
uint32_t input_padding_width,
uint32_t kernel_depth,
uint32_t kernel_height,
uint32_t kernel_width,
@ -120,10 +115,8 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_convolution_ndhwc_q8(
pthreadpool_t threadpool);
enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t input_padding_height,
uint32_t input_padding_width,
uint32_t adjustment_height,
uint32_t adjustment_width,
uint32_t kernel_height,
@ -227,10 +220,8 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_global_average_pooling_nwc_q8(
size_t output_stride);
enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t input_padding_height,
uint32_t input_padding_width,
uint32_t pooling_height,
uint32_t pooling_width,
uint32_t stride_height,
@ -257,10 +248,8 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_average_pooling2d_nhwc_q8(
pthreadpool_t threadpool);
enum pytorch_qnnp_status pytorch_qnnp_create_max_pooling2d_nhwc_u8(
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t input_padding_height,
uint32_t input_padding_width,
uint32_t pooling_height,
uint32_t pooling_width,
uint32_t stride_height,

View File

@ -30,10 +30,8 @@ static inline size_t compute_output_dimension(
}
enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t input_padding_height,
uint32_t input_padding_width,
uint32_t pooling_height,
uint32_t pooling_width,
uint32_t stride_height,
@ -145,8 +143,7 @@ enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
goto error;
}
const bool any_padding = (input_padding_left | input_padding_top |
input_padding_right | input_padding_bottom) != 0;
const bool any_padding = (input_padding_width | input_padding_height) != 0;
const uint32_t kr = pytorch_qnnp_params.q8avgpool.kr;
const uint32_t mr = pytorch_qnnp_params.q8avgpool.mr;
const uint32_t qr = pytorch_qnnp_params.q8avgpool.qr;
@ -162,11 +159,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
average_pooling->zero_pointer = zero_buffer;
}
average_pooling->input_padding_top = input_padding_top;
average_pooling->input_padding_right = input_padding_right;
average_pooling->input_padding_bottom = input_padding_bottom;
average_pooling->input_padding_left = input_padding_left;
average_pooling->input_padding_height = input_padding_height;
average_pooling->input_padding_width = input_padding_width;
average_pooling->kernel_height = pooling_height;
average_pooling->kernel_width = pooling_width;
average_pooling->stride_height = stride_height;
@ -239,13 +233,11 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_average_pooling2d_nhwc_q8(
average_pooling->input_pixel_stride = input_pixel_stride;
average_pooling->output_height = compute_output_dimension(
average_pooling->input_padding_top + input_height +
average_pooling->input_padding_bottom,
input_height + average_pooling->input_padding_height * 2,
average_pooling->kernel_height,
average_pooling->stride_height);
average_pooling->output_width = compute_output_dimension(
average_pooling->input_padding_left + input_width +
average_pooling->input_padding_right,
input_width + average_pooling->input_padding_width * 2,
average_pooling->kernel_width,
average_pooling->stride_width);
average_pooling->output = output;
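The ATen callers always passed the same padding on both sides of each axis (padH / padW above), so the four per-side fields collapse into height/width and every output-size computation becomes a function of input + 2 * padding, with an effective kernel of (k - 1) * dilation + 1 where dilation applies. A small worked sketch of that output-dimension arithmetic (helper names are illustrative):

#include <cstddef>
#include <cstdio>

// Effective kernel extent once dilation is applied.
static size_t effective_kernel(size_t kernel, size_t dilation) {
  return (kernel - 1) * dilation + 1;
}

// Output extent of a strided window over the symmetrically padded input:
//   (input + 2 * padding - effective_kernel) / stride + 1
static size_t output_dimension(size_t input, size_t padding, size_t kernel,
                               size_t dilation, size_t stride) {
  const size_t padded = input + 2 * padding;  // same padding on both sides
  return (padded - effective_kernel(kernel, dilation)) / stride + 1;
}

int main() {
  // 28x28 input, 3x3 kernel, padding 1, stride 2, no dilation -> 14x14.
  std::printf("%zu\n", output_dimension(28, 1, 3, 1, 2));
}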

View File

@ -38,10 +38,8 @@ static inline size_t compute_output_dimension(
}
enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t input_padding_height,
uint32_t input_padding_width,
uint32_t kernel_height,
uint32_t kernel_width,
uint32_t subsampling_height,
@ -64,11 +62,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
pytorch_qnnp_operator_t* convolution_out) {
return pytorch_qnnp_create_convolution3d_ndhwc_q8(
0,
input_padding_top,
input_padding_right,
0,
input_padding_bottom,
input_padding_left,
input_padding_height,
input_padding_width,
1,
kernel_height,
kernel_width,
@ -95,12 +90,9 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
}
enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
uint32_t input_padding_front,
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_back,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t input_padding_depth,
uint32_t input_padding_height,
uint32_t input_padding_width,
uint32_t kernel_depth,
uint32_t kernel_height,
uint32_t kernel_width,
@ -190,52 +182,43 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
subsampling_height);
}
if (input_padding_top >= kernel_height) {
if (input_padding_depth >= kernel_depth) {
pytorch_qnnp_log_info(
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
"inefficiency in convolution with %" PRIu32 "x%" PRIu32 "x%" PRIu32
" kernel and %" PRIu32 "+%" PRIu32
" depth padding: "
"input depth padding is greater or equal to kernel depth",
kernel_depth,
kernel_height,
kernel_width,
input_padding_depth,
input_padding_depth);
}
if (input_padding_height >= kernel_height) {
pytorch_qnnp_log_info(
"inefficiency in convolution with %" PRIu32 "x%" PRIu32 "x%" PRIu32
" kernel and %" PRIu32 "+%" PRIu32
" height padding: "
"input top padding is greater or equal to kernel height",
kernel_width,
"input height padding is greater or equal to kernel height",
kernel_depth,
kernel_height,
input_padding_top,
input_padding_bottom);
kernel_width,
input_padding_height,
input_padding_height);
}
if (input_padding_bottom >= kernel_height) {
if (input_padding_width >= kernel_width) {
pytorch_qnnp_log_info(
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
" kernel and %" PRIu32 "+%" PRIu32
" height padding: "
"input bottom padding is greater or equal to kernel height",
kernel_width,
kernel_height,
input_padding_top,
input_padding_bottom);
}
if (input_padding_right >= kernel_width) {
pytorch_qnnp_log_info(
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
"inefficiency in convolution with %" PRIu32 "x%" PRIu32 "x%" PRIu32
" kernel and %" PRIu32 "+%" PRIu32
" width padding: "
"input right padding is greater or equal to kernel width",
kernel_width,
"input width padding is greater or equal to kernel width",
kernel_depth,
kernel_height,
input_padding_left,
input_padding_right);
}
if (input_padding_left >= kernel_width) {
pytorch_qnnp_log_info(
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
" kernel and %" PRIu32 "+%" PRIu32
" width padding: "
"input left padding is greater or equal to kernel width",
kernel_width,
kernel_height,
input_padding_left,
input_padding_right);
input_padding_width,
input_padding_width);
}
for (int i = 0; i < groups * group_output_channels; ++i) {
@ -262,8 +245,7 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
enum pytorch_qnnp_ukernel_type ukernel_type = pytorch_qnnp_ukernel_type_none;
const bool any_padding =
(input_padding_front | input_padding_left | input_padding_top |
input_padding_back | input_padding_right | input_padding_bottom) != 0;
(input_padding_depth | input_padding_height | input_padding_width) != 0;
if ((kernel_size == 9 || kernel_size == 25) && group_input_channels == 1 &&
group_output_channels == 1 && groups > 1) {
ukernel_type = pytorch_qnnp_ukernel_type_dwconv;
@ -503,13 +485,9 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);
}
convolution->input_padding_front = input_padding_front;
convolution->input_padding_top = input_padding_top;
convolution->input_padding_right = input_padding_right;
convolution->input_padding_back = input_padding_back;
convolution->input_padding_bottom = input_padding_bottom;
convolution->input_padding_left = input_padding_left;
convolution->input_padding_depth = input_padding_depth;
convolution->input_padding_height = input_padding_height;
convolution->input_padding_width = input_padding_width;
convolution->kernel_depth = kernel_depth;
convolution->kernel_height = kernel_height;
convolution->kernel_width = kernel_width;
@ -615,20 +593,17 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_convolution_ndhwc_q8(
convolution->input_pixel_stride = input_pixel_stride;
convolution->output_depth = compute_output_dimension(
convolution->input_padding_front + input_depth +
convolution->input_padding_back,
input_depth + convolution->input_padding_depth * 2,
convolution->kernel_depth,
convolution->dilation_depth,
convolution->stride_depth);
convolution->output_height = compute_output_dimension(
convolution->input_padding_top + input_height +
convolution->input_padding_bottom,
input_height + convolution->input_padding_height * 2,
convolution->kernel_height,
convolution->dilation_height,
convolution->stride_height);
convolution->output_width = compute_output_dimension(
convolution->input_padding_left + input_width +
convolution->input_padding_right,
input_width + convolution->input_padding_width * 2,
convolution->kernel_width,
convolution->dilation_width,
convolution->stride_width);

View File

@ -131,14 +131,14 @@ enum pytorch_qnnp_status qnnpackDeConv(
// Setup the kernel
const size_t output_width = compute_output_dimension(
input_width,
deconvolution->input_padding_left + deconvolution->input_padding_right,
deconvolution->input_padding_width * 2,
deconvolution->adjustment_width,
kernel_width,
deconvolution->dilation_width,
deconvolution->stride_width);
const size_t output_height = compute_output_dimension(
input_height,
deconvolution->input_padding_top + deconvolution->input_padding_bottom,
deconvolution->input_padding_height * 2,
deconvolution->adjustment_height,
kernel_height,
deconvolution->dilation_height,

View File

@ -36,10 +36,8 @@ static inline size_t compute_output_dimension(
}
enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t input_padding_height,
uint32_t input_padding_width,
uint32_t adjustment_height,
uint32_t adjustment_width,
uint32_t kernel_height,
@ -181,10 +179,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
deconvolution->zero_buffer = zero_buffer;
deconvolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);
deconvolution->input_padding_top = input_padding_top;
deconvolution->input_padding_right = input_padding_right;
deconvolution->input_padding_bottom = input_padding_bottom;
deconvolution->input_padding_left = input_padding_left;
deconvolution->input_padding_height = input_padding_height;
deconvolution->input_padding_width = input_padding_width;
deconvolution->adjustment_height = adjustment_height;
deconvolution->adjustment_width = adjustment_width;
@ -266,8 +262,7 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_deconvolution2d_nhwc_q8(
const size_t output_height = deconvolution->output_height =
compute_output_dimension(
input_height,
deconvolution->input_padding_top +
deconvolution->input_padding_bottom,
deconvolution->input_padding_height * 2,
deconvolution->adjustment_height,
kernel_height,
deconvolution->dilation_height,
@ -275,8 +270,7 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_deconvolution2d_nhwc_q8(
const size_t output_width = deconvolution->output_width =
compute_output_dimension(
input_width,
deconvolution->input_padding_left +
deconvolution->input_padding_right,
deconvolution->input_padding_width * 2,
deconvolution->adjustment_width,
kernel_width,
deconvolution->dilation_width,

View File

@ -40,9 +40,9 @@ void pytorch_qnnp_indirection_init_conv3d(
const size_t dilation_depth = op->dilation_depth;
const size_t dilation_height = op->dilation_height;
const size_t dilation_width = op->dilation_width;
const size_t input_padding_front = op->input_padding_front;
const size_t input_padding_top = op->input_padding_top;
const size_t input_padding_left = op->input_padding_left;
const size_t input_padding_depth = op->input_padding_depth;
const size_t input_padding_height = op->input_padding_height;
const size_t input_padding_width = op->input_padding_width;
const size_t output_size = output_depth * output_height * output_width;
const size_t kernel_size = kernel_depth * kernel_height * kernel_width;
@ -70,16 +70,16 @@ void pytorch_qnnp_indirection_init_conv3d(
for (size_t kernel_z = 0; kernel_z < kernel_depth; kernel_z++) {
const size_t input_z = output_z * stride_depth +
kernel_z * dilation_depth - input_padding_front;
kernel_z * dilation_depth - input_padding_depth;
if (input_z < input_depth) {
for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
const size_t input_y = output_y * stride_height +
kernel_y * dilation_height - input_padding_top;
kernel_y * dilation_height - input_padding_height;
if (input_y < input_height) {
for (size_t kernel_x = 0; kernel_x < kernel_width;
kernel_x++) {
const size_t input_x = output_x * stride_width +
kernel_x * dilation_width - input_padding_left;
kernel_x * dilation_width - input_padding_width;
const size_t index = (group * batch_size + image) *
tiled_output_size * kernel_size +
output_tile_start * kernel_size +
@ -154,19 +154,19 @@ void pytorch_qnnp_indirection_init_dwconv2d(
const size_t stride_width = op->stride_width;
const size_t dilation_height = op->dilation_height;
const size_t dilation_width = op->dilation_width;
const size_t input_padding_top = op->input_padding_top;
const size_t input_padding_left = op->input_padding_left;
const size_t input_padding_height = op->input_padding_height;
const size_t input_padding_width = op->input_padding_width;
for (size_t image = batch_start; image < batch_size; image++) {
for (size_t output_y = 0; output_y < output_height; output_y++) {
for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
const size_t input_y = output_y * stride_height +
kernel_y * dilation_height - input_padding_top;
kernel_y * dilation_height - input_padding_height;
if (input_y < input_height) {
for (size_t output_x = 0; output_x < output_width; output_x++) {
for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
const size_t input_x = output_x * stride_width +
kernel_x * dilation_width - input_padding_left;
kernel_x * dilation_width - input_padding_width;
const size_t index =
(image * output_height + output_y) * step_height +
output_x * step_width * kernel_height +
@ -217,8 +217,8 @@ void pytorch_qnnp_indirection_init_deconv2d(
const size_t stride_width = op->stride_width;
const size_t dilation_height = op->dilation_height;
const size_t dilation_width = op->dilation_width;
const size_t input_padding_top = op->input_padding_top;
const size_t input_padding_left = op->input_padding_left;
const size_t input_padding_height = op->input_padding_height;
const size_t input_padding_width = op->input_padding_width;
const size_t output_size = output_height * output_width;
const size_t kernel_size = kernel_height * kernel_width;
@ -237,11 +237,11 @@ void pytorch_qnnp_indirection_init_deconv2d(
const size_t output_x = output_index % output_width;
for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
const size_t y =
output_y + input_padding_top - kernel_y * dilation_height;
output_y + input_padding_height - kernel_y * dilation_height;
const size_t input_y = y / stride_height;
for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
const size_t x =
output_x + input_padding_left - kernel_x * dilation_width;
output_x + input_padding_width - kernel_x * dilation_width;
const size_t input_x = x / stride_width;
const size_t index = (group * batch_size + image) *
tiled_output_size * kernel_size +
@ -284,21 +284,21 @@ void pytorch_qnnp_indirection_init_maxpool2d(
const size_t stride_width = op->stride_width;
const size_t dilation_height = op->dilation_height;
const size_t dilation_width = op->dilation_width;
const size_t input_padding_top = op->input_padding_top;
const size_t input_padding_left = op->input_padding_left;
const size_t input_padding_height = op->input_padding_height;
const size_t input_padding_width = op->input_padding_width;
for (size_t image = batch_start; image < batch_size; image++) {
for (size_t output_y = 0; output_y < output_height; output_y++) {
for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
const size_t input_y =
doz(output_y * stride_height + pooling_y * dilation_height,
input_padding_top);
input_padding_height);
const size_t clamped_input_y = min(input_y, input_height - 1);
for (size_t output_x = 0; output_x < output_width; output_x++) {
for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
const size_t input_x =
doz(output_x * stride_width + pooling_x * dilation_width,
input_padding_left);
input_padding_width);
const size_t clamped_input_x = min(input_x, input_width - 1);
const size_t index =
(image * output_height + output_y) * step_height +

View File

@ -35,10 +35,8 @@ static inline size_t compute_output_dimension(
}
enum pytorch_qnnp_status pytorch_qnnp_create_max_pooling2d_nhwc_u8(
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t input_padding_height,
uint32_t input_padding_width,
uint32_t pooling_height,
uint32_t pooling_width,
uint32_t stride_height,
@ -117,10 +115,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_max_pooling2d_nhwc_u8(
goto error;
}
max_pooling->input_padding_top = input_padding_top;
max_pooling->input_padding_right = input_padding_right;
max_pooling->input_padding_bottom = input_padding_bottom;
max_pooling->input_padding_left = input_padding_left;
max_pooling->input_padding_height = input_padding_height;
max_pooling->input_padding_width = input_padding_width;
max_pooling->kernel_height = pooling_height;
max_pooling->kernel_width = pooling_width;
@ -180,14 +176,12 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_max_pooling2d_nhwc_u8(
max_pooling->input_pixel_stride = input_pixel_stride;
max_pooling->output_height = compute_output_dimension(
max_pooling->input_padding_top + input_height +
max_pooling->input_padding_bottom,
input_height + max_pooling->input_padding_height * 2,
max_pooling->kernel_height,
max_pooling->dilation_height,
max_pooling->stride_height);
max_pooling->output_width = compute_output_dimension(
max_pooling->input_padding_left + input_width +
max_pooling->input_padding_right,
input_width + max_pooling->input_padding_width * 2,
max_pooling->kernel_width,
max_pooling->dilation_width,
max_pooling->stride_width);

View File

@ -47,12 +47,9 @@ typedef struct {
struct pytorch_qnnp_operator {
size_t batch_size;
uint32_t input_padding_front;
uint32_t input_padding_back;
uint32_t input_padding_top;
uint32_t input_padding_right;
uint32_t input_padding_bottom;
uint32_t input_padding_left;
uint32_t input_padding_depth;
uint32_t input_padding_height;
uint32_t input_padding_width;
uint32_t adjustment_height;
uint32_t adjustment_width;
uint32_t kernel_depth;

View File

@ -22,69 +22,35 @@
class AveragePoolingOperatorTester {
public:
inline AveragePoolingOperatorTester& padding(uint32_t padding) {
this->paddingTop_ = padding;
this->paddingRight_ = padding;
this->paddingBottom_ = padding;
this->paddingLeft_ = padding;
this->paddingHeight_ = padding;
this->paddingWidth_ = padding;
return *this;
}
inline AveragePoolingOperatorTester& padding(
uint32_t paddingHeight,
uint32_t paddingWidth) {
this->paddingTop_ = paddingHeight;
this->paddingRight_ = paddingWidth;
this->paddingBottom_ = paddingHeight;
this->paddingLeft_ = paddingWidth;
this->paddingHeight_ = paddingHeight;
this->paddingWidth_ = paddingWidth;
return *this;
}
inline AveragePoolingOperatorTester& paddingHeight(uint32_t paddingHeight) {
this->paddingTop_ = paddingHeight;
this->paddingBottom_ = paddingHeight;
this->paddingHeight_ = paddingHeight;
return *this;
}
inline AveragePoolingOperatorTester& paddingWidth(uint32_t paddingWidth) {
this->paddingRight_ = paddingWidth;
this->paddingLeft_ = paddingWidth;
this->paddingWidth_ = paddingWidth;
return *this;
}
inline AveragePoolingOperatorTester& paddingTop(uint32_t paddingTop) {
this->paddingTop_ = paddingTop;
return *this;
inline uint32_t paddingHeight() const {
return this->paddingHeight_;
}
inline uint32_t paddingTop() const {
return this->paddingTop_;
}
inline AveragePoolingOperatorTester& paddingRight(uint32_t paddingRight) {
this->paddingRight_ = paddingRight;
return *this;
}
inline uint32_t paddingRight() const {
return this->paddingRight_;
}
inline AveragePoolingOperatorTester& paddingBottom(uint32_t paddingBottom) {
this->paddingBottom_ = paddingBottom;
return *this;
}
inline uint32_t paddingBottom() const {
return this->paddingBottom_;
}
inline AveragePoolingOperatorTester& paddingLeft(uint32_t paddingLeft) {
this->paddingLeft_ = paddingLeft;
return *this;
}
inline uint32_t paddingLeft() const {
return this->paddingLeft_;
inline uint32_t paddingWidth() const {
return this->paddingWidth_;
}
inline AveragePoolingOperatorTester& inputSize(
@ -211,8 +177,7 @@ class AveragePoolingOperatorTester {
}
inline size_t outputHeight() const {
const size_t paddedInputHeight =
paddingTop() + inputHeight() + paddingBottom();
const size_t paddedInputHeight = inputHeight() + paddingHeight() * 2;
if (paddedInputHeight <= poolingHeight()) {
return 1;
} else {
@ -221,8 +186,7 @@ class AveragePoolingOperatorTester {
}
inline size_t outputWidth() const {
const size_t paddedInputWidth =
paddingLeft() + inputWidth() + paddingRight();
const size_t paddedInputWidth = inputWidth() + paddingWidth() * 2;
if (paddedInputWidth <= poolingWidth()) {
return 1;
} else {
@ -303,7 +267,7 @@ class AveragePoolingOperatorTester {
inline size_t nextOutputHeight() const {
const size_t paddedNextInputHeight =
paddingTop() + nextInputHeight() + paddingBottom();
nextInputHeight() + paddingHeight() * 2;
if (paddedNextInputHeight <= poolingHeight()) {
return 1;
} else {
@ -312,8 +276,7 @@ class AveragePoolingOperatorTester {
}
inline size_t nextOutputWidth() const {
const size_t paddedNextInputWidth =
paddingLeft() + nextInputWidth() + paddingRight();
const size_t paddedNextInputWidth = nextInputWidth() + paddingWidth() * 2;
if (paddedNextInputWidth <= poolingWidth()) {
return 1;
} else {
@ -430,9 +393,9 @@ class AveragePoolingOperatorTester {
for (size_t c = 0; c < channels(); c++) {
double acc = 0.0f;
for (size_t py = 0; py < poolingHeight(); py++) {
const size_t iy = oy * strideHeight() + py - paddingTop();
const size_t iy = oy * strideHeight() + py - paddingHeight();
for (size_t px = 0; px < poolingWidth(); px++) {
const size_t ix = ox * strideWidth() + px - paddingLeft();
const size_t ix = ox * strideWidth() + px - paddingWidth();
if (ix < inputWidth() && iy < inputHeight()) {
acc += double(
int32_t(input
@ -482,10 +445,8 @@ class AveragePoolingOperatorTester {
ASSERT_EQ(
pytorch_qnnp_status_success,
pytorch_qnnp_create_average_pooling2d_nhwc_q8(
paddingTop(),
paddingRight(),
paddingBottom(),
paddingLeft(),
paddingHeight(),
paddingWidth(),
poolingHeight(),
poolingWidth(),
strideHeight(),
@ -596,9 +557,9 @@ class AveragePoolingOperatorTester {
for (size_t c = 0; c < channels(); c++) {
double acc = 0.0f;
for (size_t py = 0; py < poolingHeight(); py++) {
const size_t iy = oy * strideHeight() + py - paddingTop();
const size_t iy = oy * strideHeight() + py - paddingHeight();
for (size_t px = 0; px < poolingWidth(); px++) {
const size_t ix = ox * strideWidth() + px - paddingLeft();
const size_t ix = ox * strideWidth() + px - paddingWidth();
if (ix < inputWidth() && iy < inputHeight()) {
acc += double(
int32_t(input
@ -648,10 +609,8 @@ class AveragePoolingOperatorTester {
ASSERT_EQ(
pytorch_qnnp_status_success,
pytorch_qnnp_create_average_pooling2d_nhwc_q8(
paddingTop(),
paddingRight(),
paddingBottom(),
paddingLeft(),
paddingHeight(),
paddingWidth(),
poolingHeight(),
poolingWidth(),
strideHeight(),
@ -731,9 +690,9 @@ class AveragePoolingOperatorTester {
for (size_t c = 0; c < channels(); c++) {
double acc = 0.0f;
for (size_t py = 0; py < poolingHeight(); py++) {
const size_t iy = oy * strideHeight() + py - paddingTop();
const size_t iy = oy * strideHeight() + py - paddingHeight();
for (size_t px = 0; px < poolingWidth(); px++) {
const size_t ix = ox * strideWidth() + px - paddingLeft();
const size_t ix = ox * strideWidth() + px - paddingWidth();
if (ix < nextInputWidth() && iy < nextInputHeight()) {
acc += double(
int32_t(input
@ -847,10 +806,8 @@ class AveragePoolingOperatorTester {
}
private:
uint32_t paddingTop_{0};
uint32_t paddingRight_{0};
uint32_t paddingBottom_{0};
uint32_t paddingLeft_{0};
uint32_t paddingHeight_{0};
uint32_t paddingWidth_{0};
size_t inputHeight_{1};
size_t inputWidth_{1};
size_t channels_{1};
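
With only the paddingHeight()/paddingWidth() setters left, a padded case is now written as below; this is an illustrative invocation assembled from setters visible in this file (assuming the tester header is available), not one of the existing tests.

// Illustrative only: symmetric padding on both axes with the reworked tester.
AveragePoolingOperatorTester()
    .batchSize(1)
    .inputSize(7, 7)
    .paddingHeight(1)
    .paddingWidth(1)
    .poolingHeight(3)
    .poolingWidth(3)
    .channels(8)
    .testQ8();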

View File

@ -50,19 +50,16 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_1xM_pool_with_padding) {
channels += 3) {
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
poolSize++) {
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(2)
.inputWidth(poolSize + 2)
.paddingLeft(paddingLeft)
.paddingRight(paddingRight)
.poolingHeight(1)
.poolingWidth(poolSize)
.channels(channels)
.testQ8();
}
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(2)
.inputWidth(poolSize + 2)
.paddingWidth(paddingWidth)
.poolingHeight(1)
.poolingWidth(poolSize)
.channels(channels)
.testQ8();
}
}
}
@ -114,19 +111,16 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_Mx1_pool_with_padding) {
channels += 3) {
for (size_t poolSize = 2; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
poolSize++) {
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingTop(paddingTop)
.paddingBottom(paddingBottom)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)
.testQ8();
}
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingHeight(paddingHeight)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)
.testQ8();
}
}
}
@ -433,19 +427,16 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_1xM_pool_with_padding) {
channels += 3) {
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
poolSize++) {
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(2)
.inputWidth(poolSize + 2)
.paddingLeft(paddingLeft)
.paddingRight(paddingRight)
.poolingHeight(1)
.poolingWidth(poolSize)
.channels(channels)
.testQ8();
}
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(2)
.inputWidth(poolSize + 2)
.paddingWidth(paddingWidth)
.poolingHeight(1)
.poolingWidth(poolSize)
.channels(channels)
.testQ8();
}
}
}
@ -500,19 +491,16 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_padding) {
for (size_t poolSize = pytorch_qnnp_params.q8avgpool.mr + 1; poolSize <=
pytorch_qnnp_params.q8avgpool.mr + pytorch_qnnp_params.q8avgpool.qr;
poolSize++) {
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingTop(paddingTop)
.paddingBottom(paddingBottom)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)
.testQ8();
}
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingHeight(paddingHeight)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)
.testQ8();
}
}
}
@ -526,19 +514,16 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_stride) {
for (size_t poolSize = pytorch_qnnp_params.q8avgpool.mr + 1; poolSize <=
pytorch_qnnp_params.q8avgpool.mr + pytorch_qnnp_params.q8avgpool.qr;
poolSize++) {
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingTop(paddingTop)
.paddingBottom(paddingBottom)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)
.testQ8();
}
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingHeight(paddingHeight)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)
.testQ8();
}
}
}
@ -830,19 +815,16 @@ TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_1xM_pool_with_padding) {
channels++) {
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
poolSize++) {
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(2)
.inputWidth(poolSize + 2)
.paddingLeft(paddingLeft)
.paddingRight(paddingRight)
.poolingHeight(1)
.poolingWidth(poolSize)
.channels(channels)
.testQ8();
}
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(2)
.inputWidth(poolSize + 2)
.paddingWidth(paddingWidth)
.poolingHeight(1)
.poolingWidth(poolSize)
.channels(channels)
.testQ8();
}
}
}
@ -891,19 +873,16 @@ TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_Mx1_pool_with_padding) {
channels++) {
for (size_t poolSize = 2; poolSize <= 2 * pytorch_qnnp_params.q8avgpool.kr;
poolSize++) {
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingTop(paddingTop)
.paddingBottom(paddingBottom)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)
.testQ8();
}
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
AveragePoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingHeight(paddingHeight)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)
.testQ8();
}
}
}

View File

@ -27,69 +27,35 @@ using namespace qnnpack::testing;
class ConvolutionOperatorTester {
public:
inline ConvolutionOperatorTester& padding(uint32_t padding) {
this->paddingTop_ = padding;
this->paddingRight_ = padding;
this->paddingBottom_ = padding;
this->paddingLeft_ = padding;
this->paddingHeight_ = padding;
this->paddingWidth_ = padding;
return *this;
}
inline ConvolutionOperatorTester& padding(
uint32_t paddingHeight,
uint32_t paddingWidth) {
this->paddingTop_ = paddingHeight;
this->paddingRight_ = paddingWidth;
this->paddingBottom_ = paddingHeight;
this->paddingLeft_ = paddingWidth;
this->paddingHeight_ = paddingHeight;
this->paddingWidth_ = paddingWidth;
return *this;
}
inline ConvolutionOperatorTester& paddingHeight(uint32_t paddingHeight) {
this->paddingTop_ = paddingHeight;
this->paddingBottom_ = paddingHeight;
this->paddingHeight_ = paddingHeight;
return *this;
}
inline ConvolutionOperatorTester& paddingWidth(uint32_t paddingWidth) {
this->paddingRight_ = paddingWidth;
this->paddingLeft_ = paddingWidth;
this->paddingWidth_ = paddingWidth;
return *this;
}
inline ConvolutionOperatorTester& paddingTop(uint32_t paddingTop) {
this->paddingTop_ = paddingTop;
return *this;
inline uint32_t paddingWidth() const {
return this->paddingWidth_;
}
inline uint32_t paddingTop() const {
return this->paddingTop_;
}
inline ConvolutionOperatorTester& paddingRight(uint32_t paddingRight) {
this->paddingRight_ = paddingRight;
return *this;
}
inline uint32_t paddingRight() const {
return this->paddingRight_;
}
inline ConvolutionOperatorTester& paddingBottom(uint32_t paddingBottom) {
this->paddingBottom_ = paddingBottom;
return *this;
}
inline uint32_t paddingBottom() const {
return this->paddingBottom_;
}
inline ConvolutionOperatorTester& paddingLeft(uint32_t paddingLeft) {
this->paddingLeft_ = paddingLeft;
return *this;
}
inline uint32_t paddingLeft() const {
return this->paddingLeft_;
inline uint32_t paddingHeight() const {
return this->paddingHeight_;
}
inline ConvolutionOperatorTester& inputSize(
@ -325,8 +291,7 @@ class ConvolutionOperatorTester {
}
inline size_t outputHeight() const {
const size_t paddedInputHeight =
paddingTop() + inputHeight() + paddingBottom();
const size_t paddedInputHeight = inputHeight() + paddingHeight() * 2;
if (paddedInputHeight <= dilatedKernelHeight()) {
return 1;
} else {
@ -336,8 +301,7 @@ class ConvolutionOperatorTester {
}
inline size_t outputWidth() const {
const size_t paddedInputWidth =
paddingLeft() + inputWidth() + paddingRight();
const size_t paddedInputWidth = inputWidth() + paddingWidth() * 2;
if (paddedInputWidth <= dilatedKernelWidth()) {
return 1;
} else {
@ -437,11 +401,11 @@ class ConvolutionOperatorTester {
for (size_t ox = 0; ox < outputWidth(); ox++) {
for (size_t ky = 0; ky < kernelHeight(); ky++) {
const size_t iy = oy * subsamplingHeight() +
ky * dilationHeight() - paddingTop();
ky * dilationHeight() - paddingHeight();
if (iy < inputHeight()) {
for (size_t kx = 0; kx < kernelWidth(); kx++) {
const size_t ix = ox * subsamplingWidth() +
kx * dilationWidth() - paddingLeft();
kx * dilationWidth() - paddingWidth();
if (ix < inputWidth()) {
for (size_t g = 0; g < groups(); g++) {
for (size_t oc = 0; oc < groupOutputChannels(); oc++) {
@ -515,10 +479,8 @@ class ConvolutionOperatorTester {
ASSERT_EQ(
pytorch_qnnp_status_success,
pytorch_qnnp_create_convolution2d_nhwc_q8(
paddingTop(),
paddingRight(),
paddingBottom(),
paddingLeft(),
paddingHeight(),
paddingWidth(),
kernelHeight(),
kernelWidth(),
subsamplingHeight(),
@ -641,10 +603,8 @@ class ConvolutionOperatorTester {
}
private:
uint32_t paddingTop_{0};
uint32_t paddingRight_{0};
uint32_t paddingBottom_{0};
uint32_t paddingLeft_{0};
uint32_t paddingHeight_{0};
uint32_t paddingWidth_{0};
size_t inputHeight_{1};
size_t inputWidth_{1};
uint32_t groups_{1};
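
The convolution tester derives output extents the same way: pad symmetrically, then divide by the stride after removing the dilated kernel. A compact sketch of the relation encoded by outputHeight()/outputWidth() above, written as a standalone helper for illustration:

// Sketch only: expected convolution output size with symmetric padding.
static inline size_t conv_output_dimension(
    size_t input, size_t padding, size_t kernel,
    size_t dilation, size_t subsampling) {
  const size_t padded_input = input + padding * 2;
  const size_t dilated_kernel = (kernel - 1) * dilation + 1;
  if (padded_input <= dilated_kernel) {
    return 1;  // the kernel overhangs the padded input
  }
  return (padded_input - dilated_kernel) / subsampling + 1;
}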

View File

@ -272,45 +272,27 @@ _STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_without_padding,
.iterations(3)
)
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_left_padding,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingLeft(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3)
)
_STATIC_AND_RUNTIME_TEST(
CONVOLUTION_OP,
3x3_with_width_padding,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingWidth(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3))
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_right_padding,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingRight(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3)
)
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_top_padding,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingTop(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3)
)
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_bottom_padding,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingBottom(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3)
)
_STATIC_AND_RUNTIME_TEST(
CONVOLUTION_OP,
3x3_with_height_padding,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingHeight(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3))
_STATIC_TEST(CONVOLUTION_OP, 3x3_with_input_stride,
ConvolutionOperatorTester()
@ -833,49 +815,29 @@ _STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_without_padding_per_channel,
.per_channel(true)
)
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_left_padding_per_channel,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingLeft(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3)
.per_channel(true)
)
_STATIC_AND_RUNTIME_TEST(
CONVOLUTION_OP,
3x3_with_width_padding_per_channel,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingWidth(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3)
.per_channel(true))
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_right_padding_per_channel,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingRight(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3)
.per_channel(true)
)
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_top_padding_per_channel,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingTop(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3)
.per_channel(true)
)
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_bottom_padding_per_channel,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingBottom(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3)
.per_channel(true)
)
_STATIC_AND_RUNTIME_TEST(
CONVOLUTION_OP,
3x3_with_height_padding_per_channel,
ConvolutionOperatorTester()
.inputSize(13, 12)
.paddingHeight(1)
.kernelSize(3, 3)
.groupInputChannels(15)
.groupOutputChannels(17)
.iterations(3)
.per_channel(true))
_STATIC_TEST(CONVOLUTION_OP, 3x3_with_input_stride_per_channel,
ConvolutionOperatorTester()

View File

@ -27,77 +27,35 @@ using namespace qnnpack::testing;
class DeconvolutionOperatorTester {
public:
inline DeconvolutionOperatorTester& padding(uint32_t padding) {
this->paddingTop_ = padding;
this->paddingRight_ = padding;
this->paddingBottom_ = padding;
this->paddingLeft_ = padding;
this->paddingHeight_ = padding;
this->paddingWidth_ = padding;
return *this;
}
inline DeconvolutionOperatorTester& padding(
uint32_t paddingHeight,
uint32_t paddingWidth) {
this->paddingTop_ = paddingHeight;
this->paddingRight_ = paddingWidth;
this->paddingBottom_ = paddingHeight;
this->paddingLeft_ = paddingWidth;
this->paddingHeight_ = paddingHeight;
this->paddingWidth_ = paddingWidth;
return *this;
}
inline DeconvolutionOperatorTester& paddingHeight(uint32_t paddingHeight) {
this->paddingTop_ = paddingHeight;
this->paddingBottom_ = paddingHeight;
this->paddingHeight_ = paddingHeight;
return *this;
}
inline uint32_t paddingHeight() const {
return this->paddingTop_ + this->paddingBottom_;
return this->paddingHeight_;
}
inline DeconvolutionOperatorTester& paddingWidth(uint32_t paddingWidth) {
this->paddingRight_ = paddingWidth;
this->paddingLeft_ = paddingWidth;
this->paddingWidth_ = paddingWidth;
return *this;
}
inline uint32_t paddingWidth() const {
return this->paddingLeft_ + this->paddingRight_;
}
inline DeconvolutionOperatorTester& paddingTop(uint32_t paddingTop) {
this->paddingTop_ = paddingTop;
return *this;
}
inline uint32_t paddingTop() const {
return this->paddingTop_;
}
inline DeconvolutionOperatorTester& paddingRight(uint32_t paddingRight) {
this->paddingRight_ = paddingRight;
return *this;
}
inline uint32_t paddingRight() const {
return this->paddingRight_;
}
inline DeconvolutionOperatorTester& paddingBottom(uint32_t paddingBottom) {
this->paddingBottom_ = paddingBottom;
return *this;
}
inline uint32_t paddingBottom() const {
return this->paddingBottom_;
}
inline DeconvolutionOperatorTester& paddingLeft(uint32_t paddingLeft) {
this->paddingLeft_ = paddingLeft;
return *this;
}
inline uint32_t paddingLeft() const {
return this->paddingLeft_;
return this->paddingWidth_;
}
inline DeconvolutionOperatorTester& adjustmentHeight(
@ -353,12 +311,12 @@ class DeconvolutionOperatorTester {
inline size_t outputHeight() const {
return strideHeight() * (inputHeight() - 1) + adjustmentHeight() +
dilatedKernelHeight() - paddingHeight();
dilatedKernelHeight() - paddingHeight() * 2;
}
inline size_t outputWidth() const {
return strideWidth() * (inputWidth() - 1) + adjustmentWidth() +
dilatedKernelWidth() - paddingWidth();
dilatedKernelWidth() - paddingWidth() * 2;
}
inline DeconvolutionOperatorTester& qmin(uint8_t qmin) {
@ -451,11 +409,11 @@ class DeconvolutionOperatorTester {
for (size_t oy = 0; oy < outputHeight(); oy++) {
for (size_t ox = 0; ox < outputWidth(); ox++) {
for (size_t ky = 0; ky < kernelHeight(); ky++) {
const size_t y = oy + paddingTop() - ky * dilationHeight();
const size_t y = oy + paddingHeight() - ky * dilationHeight();
const size_t iy = y / strideHeight();
if (iy * strideHeight() == y && iy < inputHeight()) {
for (size_t kx = 0; kx < kernelWidth(); kx++) {
const size_t x = ox + paddingLeft() - kx * dilationWidth();
const size_t x = ox + paddingWidth() - kx * dilationWidth();
const size_t ix = x / strideWidth();
if (ix * strideWidth() == x && ix < inputWidth()) {
for (size_t g = 0; g < groups(); g++) {
@ -531,10 +489,8 @@ class DeconvolutionOperatorTester {
ASSERT_EQ(
pytorch_qnnp_status_success,
pytorch_qnnp_create_deconvolution2d_nhwc_q8(
paddingTop(),
paddingRight(),
paddingBottom(),
paddingLeft(),
paddingHeight(),
paddingWidth(),
adjustmentHeight(),
adjustmentWidth(),
kernelHeight(),
@ -674,10 +630,8 @@ class DeconvolutionOperatorTester {
}
private:
uint32_t paddingTop_{0};
uint32_t paddingRight_{0};
uint32_t paddingBottom_{0};
uint32_t paddingLeft_{0};
uint32_t paddingHeight_{0};
uint32_t paddingWidth_{0};
size_t inputHeight_{1};
size_t inputWidth_{1};
uint32_t groups_{1};
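
For deconvolution the symmetric padding is subtracted twice, matching the usual transposed-convolution size formula. A small standalone sketch of the expression used by outputHeight()/outputWidth() above, with a worked example (the numbers are made up for illustration):

// Sketch only: transposed-convolution output size with symmetric padding.
static inline size_t deconv_output_dimension(
    size_t input, size_t padding, size_t adjustment,
    size_t kernel, size_t dilation, size_t stride) {
  const size_t dilated_kernel = (kernel - 1) * dilation + 1;
  return stride * (input - 1) + adjustment + dilated_kernel - 2 * padding;
}
// e.g. deconv_output_dimension(5, 1, 0, 3, 1, 2) == 2 * 4 + 0 + 3 - 2 * 1 == 9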

View File

@ -21,69 +21,35 @@
class MaxPoolingOperatorTester {
public:
inline MaxPoolingOperatorTester& padding(uint32_t padding) {
this->paddingTop_ = padding;
this->paddingRight_ = padding;
this->paddingBottom_ = padding;
this->paddingLeft_ = padding;
this->paddingHeight_ = padding;
this->paddingWidth_ = padding;
return *this;
}
inline MaxPoolingOperatorTester& padding(
uint32_t paddingHeight,
uint32_t paddingWidth) {
this->paddingTop_ = paddingHeight;
this->paddingRight_ = paddingWidth;
this->paddingBottom_ = paddingHeight;
this->paddingLeft_ = paddingWidth;
this->paddingHeight_ = paddingHeight;
this->paddingWidth_ = paddingWidth;
return *this;
}
inline MaxPoolingOperatorTester& paddingHeight(uint32_t paddingHeight) {
this->paddingTop_ = paddingHeight;
this->paddingBottom_ = paddingHeight;
this->paddingHeight_ = paddingHeight;
return *this;
}
inline MaxPoolingOperatorTester& paddingWidth(uint32_t paddingWidth) {
this->paddingRight_ = paddingWidth;
this->paddingLeft_ = paddingWidth;
this->paddingWidth_ = paddingWidth;
return *this;
}
inline MaxPoolingOperatorTester& paddingTop(uint32_t paddingTop) {
this->paddingTop_ = paddingTop;
return *this;
inline uint32_t paddingHeight() const {
return this->paddingHeight_;
}
inline uint32_t paddingTop() const {
return this->paddingTop_;
}
inline MaxPoolingOperatorTester& paddingRight(uint32_t paddingRight) {
this->paddingRight_ = paddingRight;
return *this;
}
inline uint32_t paddingRight() const {
return this->paddingRight_;
}
inline MaxPoolingOperatorTester& paddingBottom(uint32_t paddingBottom) {
this->paddingBottom_ = paddingBottom;
return *this;
}
inline uint32_t paddingBottom() const {
return this->paddingBottom_;
}
inline MaxPoolingOperatorTester& paddingLeft(uint32_t paddingLeft) {
this->paddingLeft_ = paddingLeft;
return *this;
}
inline uint32_t paddingLeft() const {
return this->paddingLeft_;
inline uint32_t paddingWidth() const {
return this->paddingWidth_;
}
inline MaxPoolingOperatorTester& inputSize(
@ -255,8 +221,7 @@ class MaxPoolingOperatorTester {
}
inline size_t outputHeight() const {
const size_t paddedInputHeight =
paddingTop() + inputHeight() + paddingBottom();
const size_t paddedInputHeight = inputHeight() + paddingHeight() * 2;
if (paddedInputHeight <= dilatedPoolingHeight()) {
return 1;
} else {
@ -265,8 +230,7 @@ class MaxPoolingOperatorTester {
}
inline size_t outputWidth() const {
const size_t paddedInputWidth =
paddingLeft() + inputWidth() + paddingRight();
const size_t paddedInputWidth = inputWidth() + paddingWidth() * 2;
if (paddedInputWidth <= dilatedPoolingWidth()) {
return 1;
} else {
@ -344,7 +308,7 @@ class MaxPoolingOperatorTester {
inline size_t nextOutputHeight() const {
const size_t paddedNextInputHeight =
paddingTop() + nextInputHeight() + paddingBottom();
nextInputHeight() + paddingHeight() * 2;
if (paddedNextInputHeight <= dilatedPoolingHeight()) {
return 1;
} else {
@ -354,8 +318,7 @@ class MaxPoolingOperatorTester {
}
inline size_t nextOutputWidth() const {
const size_t paddedNextInputWidth =
paddingLeft() + nextInputWidth() + paddingRight();
const size_t paddedNextInputWidth = nextInputWidth() + paddingWidth() * 2;
if (paddedNextInputWidth <= dilatedPoolingWidth()) {
return 1;
} else {
@ -429,11 +392,11 @@ class MaxPoolingOperatorTester {
for (size_t c = 0; c < channels(); c++) {
uint8_t maxValue = 0;
for (size_t py = 0; py < poolingHeight(); py++) {
const size_t iy =
oy * strideHeight() + py * dilationHeight() - paddingTop();
const size_t iy = oy * strideHeight() + py * dilationHeight() -
paddingHeight();
for (size_t px = 0; px < poolingWidth(); px++) {
const size_t ix =
ox * strideWidth() + px * dilationWidth() - paddingLeft();
const size_t ix = ox * strideWidth() + px * dilationWidth() -
paddingWidth();
if (ix < inputWidth() && iy < inputHeight()) {
maxValue = std::max(
maxValue,
@ -462,10 +425,8 @@ class MaxPoolingOperatorTester {
ASSERT_EQ(
pytorch_qnnp_status_success,
pytorch_qnnp_create_max_pooling2d_nhwc_u8(
paddingTop(),
paddingRight(),
paddingBottom(),
paddingLeft(),
paddingHeight(),
paddingWidth(),
poolingHeight(),
poolingWidth(),
strideHeight(),
@ -569,11 +530,11 @@ class MaxPoolingOperatorTester {
for (size_t c = 0; c < channels(); c++) {
uint8_t maxValue = 0;
for (size_t py = 0; py < poolingHeight(); py++) {
const size_t iy =
oy * strideHeight() + py * dilationHeight() - paddingTop();
const size_t iy = oy * strideHeight() + py * dilationHeight() -
paddingHeight();
for (size_t px = 0; px < poolingWidth(); px++) {
const size_t ix =
ox * strideWidth() + px * dilationWidth() - paddingLeft();
const size_t ix = ox * strideWidth() + px * dilationWidth() -
paddingWidth();
if (ix < inputWidth() && iy < inputHeight()) {
maxValue = std::max(
maxValue,
@ -602,10 +563,8 @@ class MaxPoolingOperatorTester {
ASSERT_EQ(
pytorch_qnnp_status_success,
pytorch_qnnp_create_max_pooling2d_nhwc_u8(
paddingTop(),
paddingRight(),
paddingBottom(),
paddingLeft(),
paddingHeight(),
paddingWidth(),
poolingHeight(),
poolingWidth(),
strideHeight(),
@ -680,11 +639,11 @@ class MaxPoolingOperatorTester {
for (size_t c = 0; c < channels(); c++) {
uint8_t maxValue = 0;
for (size_t py = 0; py < poolingHeight(); py++) {
const size_t iy =
oy * strideHeight() + py * dilationHeight() - paddingTop();
const size_t iy = oy * strideHeight() + py * dilationHeight() -
paddingHeight();
for (size_t px = 0; px < poolingWidth(); px++) {
const size_t ix =
ox * strideWidth() + px * dilationWidth() - paddingLeft();
const size_t ix = ox * strideWidth() + px * dilationWidth() -
paddingWidth();
if (ix < nextInputWidth() && iy < nextInputHeight()) {
maxValue = std::max(
maxValue,
@ -775,10 +734,8 @@ class MaxPoolingOperatorTester {
}
private:
uint32_t paddingTop_{0};
uint32_t paddingRight_{0};
uint32_t paddingBottom_{0};
uint32_t paddingLeft_{0};
uint32_t paddingHeight_{0};
uint32_t paddingWidth_{0};
size_t inputHeight_{1};
size_t inputWidth_{1};
size_t channels_{1};
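
Note that the reference loops above rely on size_t wraparound: when a pooling position falls into the padding, subtracting paddingHeight()/paddingWidth() underflows, and the subsequent "ix < inputWidth()" / "iy < inputHeight()" checks reject it. A self-contained sketch of that pattern (the function name and signature are inventions of this sketch):

// Sketch only: padded positions are skipped via unsigned wraparound, as in the
// reference max-pooling loops above.
#include <algorithm>
#include <cstddef>
#include <cstdint>

uint8_t reference_max_pool_at(
    const uint8_t* input, size_t inputHeight, size_t inputWidth,
    size_t oy, size_t ox,
    size_t poolingHeight, size_t poolingWidth,
    size_t strideHeight, size_t strideWidth,
    size_t dilationHeight, size_t dilationWidth,
    size_t paddingHeight, size_t paddingWidth) {
  uint8_t maxValue = 0;
  for (size_t py = 0; py < poolingHeight; py++) {
    // Wraps to a huge value when the position lands in the top padding.
    const size_t iy = oy * strideHeight + py * dilationHeight - paddingHeight;
    for (size_t px = 0; px < poolingWidth; px++) {
      const size_t ix = ox * strideWidth + px * dilationWidth - paddingWidth;
      if (ix < inputWidth && iy < inputHeight) {
        maxValue = std::max(maxValue, input[iy * inputWidth + ix]);
      }
    }
  }
  return maxValue;
}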

View File

@ -50,14 +50,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_small_1xM_pool_with_padding) {
channels += 3) {
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
poolSize++) {
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
MaxPoolingOperatorTester()
.batchSize(1)
.inputHeight(2)
.inputWidth(poolSize + 2)
.paddingLeft(paddingLeft)
.paddingRight(paddingRight)
.paddingWidth(paddingWidth)
.poolingHeight(1)
.poolingWidth(poolSize)
.channels(channels)
@ -134,14 +133,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_small_Mx1_pool_with_padding) {
channels += 3) {
for (size_t poolSize = 2; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
poolSize++) {
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
MaxPoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingTop(paddingTop)
.paddingBottom(paddingBottom)
.paddingHeight(paddingHeight)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)
@ -335,14 +333,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_large_1xM_pool_with_padding) {
channels += 3) {
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
poolSize++) {
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
MaxPoolingOperatorTester()
.batchSize(1)
.inputHeight(2)
.inputWidth(poolSize + 2)
.paddingLeft(paddingLeft)
.paddingRight(paddingRight)
.paddingWidth(paddingWidth)
.poolingHeight(1)
.poolingWidth(poolSize)
.channels(channels)
@ -423,14 +420,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_padding) {
for (size_t poolSize = pytorch_qnnp_params.u8maxpool.mr; poolSize <=
pytorch_qnnp_params.u8maxpool.mr + pytorch_qnnp_params.u8maxpool.qr;
poolSize++) {
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
MaxPoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingTop(paddingTop)
.paddingBottom(paddingBottom)
.paddingHeight(paddingHeight)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)
@ -627,14 +623,13 @@ TEST(MAX_POOLING_OP, unit_batch_few_channels_1xM_pool_with_padding) {
channels++) {
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
poolSize++) {
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
MaxPoolingOperatorTester()
.batchSize(1)
.inputHeight(2)
.inputWidth(poolSize + 2)
.paddingLeft(paddingLeft)
.paddingRight(paddingRight)
.paddingWidth(paddingWidth)
.poolingHeight(1)
.poolingWidth(poolSize)
.channels(channels)
@ -707,14 +702,13 @@ TEST(MAX_POOLING_OP, unit_batch_few_channels_Mx1_pool_with_padding) {
channels++) {
for (size_t poolSize = 2; poolSize <= 2 * pytorch_qnnp_params.u8maxpool.kr;
poolSize++) {
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
MaxPoolingOperatorTester()
.batchSize(1)
.inputHeight(poolSize + 1)
.inputWidth(3)
.paddingTop(paddingTop)
.paddingBottom(paddingBottom)
.paddingHeight(paddingHeight)
.poolingHeight(poolSize)
.poolingWidth(1)
.channels(channels)

View File

@ -183,12 +183,9 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1;
convolution->dilation_height = dilation_[kSpatialDim - 2];
convolution->dilation_width = dilation_[kSpatialDim - 1];
convolution->input_padding_top = padding_[kSpatialDim - 2];
convolution->input_padding_left = padding_[kSpatialDim - 1];
convolution->input_padding_bottom = padding_[kSpatialDim - 2];
convolution->input_padding_right = padding_[kSpatialDim - 1];
convolution->input_padding_front = kSpatialDim == 3 ? padding_[0] : 0;
convolution->input_padding_back = kSpatialDim == 3 ? padding_[0] : 0;
convolution->input_padding_height = padding_[kSpatialDim - 2];
convolution->input_padding_width = padding_[kSpatialDim - 1];
convolution->input_padding_depth = kSpatialDim == 3 ? padding_[0] : 0;
convolution->per_channel = is_per_channel;
convolution->transpose = transpose_;

View File

@ -298,10 +298,8 @@ void check_maxpool2d_params(
const pytorch_qnnp_status createStatus =
pytorch_qnnp_create_max_pooling2d_nhwc_u8(
padH /* input_padding_top */,
padW /* input_padding_right */,
padH /* input_padding_bottom */,
padW /* input_padding_left */,
padH /* input_padding_height */,
padW /* input_padding_width */,
kH /* pooling height */,
kW /* pooling width */,
strideH /* stride height */,

View File

@ -270,7 +270,7 @@ Tensor _sparse_csr_addmm(
Tensor add_sparse_csr(const Tensor& self, const Tensor& other, const Scalar& alpha) {
auto commonDtype = at::result_type(self, other);
alpha_check(commonDtype, alpha);
Tensor result = at::empty({0}, self.options().dtype(commonDtype));
Tensor result = at::empty({0, 0}, self.options().dtype(commonDtype));
return at::add_out(result, self, other, alpha); // redispatch!
}

View File

@ -147,5 +147,30 @@ Tensor& addmv_out_sparse_csr_cuda(
return result;
}
/*
Solves a system of linear equations whose coefficients are represented in a sparse triangular matrix A:
op(A) X = B.
Args:
* `B` - dense Tensor of size m × nrhs.
* `A` - sparse Tensor of size m × m.
* `upper` - controls whether upper or lower triangular part of A is considered in computations.
* `transpose` - if true then op(A) = A^T.
* `unitriangular` - if true then the diagonal elements of A are assumed to be one.
* `X` - dense Tensor of size m × nrhs.
* `clone_A` - cloned matrix A, required only for compatibility with strided layout interface.
*/
std::tuple<Tensor&, Tensor&> triangular_solve_out_sparse_csr_cuda(
const Tensor& B,
const Tensor& A,
bool upper,
bool transpose,
bool unitriangular,
Tensor& X,
Tensor& clone_A) {
sparse::impl::cuda::triangular_solve_out_sparse_csr(A, B, X, upper, transpose, unitriangular);
return std::tuple<Tensor&, Tensor&>(X, clone_A);
}
} // namespace native
} // namespace at
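
The new kernel can be exercised directly with a small upper-triangular CSR system. Only the kernel signature below comes from the code above; the factory calls, values, and the availability of the generated declaration are assumptions of this sketch, and a CUDA 11.3+ build is required per the version checks in the implementation.

// Illustrative only: solve A X = B for A = [[2, 1], [0, 4]], B = [[5], [8]].
#include <ATen/ATen.h>
// Assumes the generated declaration of triangular_solve_out_sparse_csr_cuda is visible.

at::Tensor demo_sparse_csr_triangular_solve_cuda() {
  auto idx_opts = at::device(at::kCUDA).dtype(at::kLong);
  auto val_opts = at::device(at::kCUDA).dtype(at::kDouble);
  const at::Tensor A = at::sparse_csr_tensor(
      at::tensor({0, 2, 3}, idx_opts),   // crow_indices
      at::tensor({0, 1, 1}, idx_opts),   // col_indices
      at::tensor({2.0, 1.0, 4.0}, val_opts),
      {2, 2}, val_opts);
  const at::Tensor B = at::tensor({5.0, 8.0}, val_opts).reshape({2, 1});
  at::Tensor X = at::empty_like(B);
  at::Tensor clone_A;  // only needed for interface compatibility with the strided path
  at::native::triangular_solve_out_sparse_csr_cuda(
      B, A, /*upper=*/true, /*transpose=*/false, /*unitriangular=*/false, X, clone_A);
  return X;  // expected [[1.5], [2.0]]
}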

View File

@ -1,8 +1,10 @@
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDADataType.h>
#include <ATen/cuda/CUDASparse.h>
#include <ATen/cuda/CUDASparseBlas.h>
#include <ATen/cuda/CUDASparseDescriptors.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/cuda/MiscUtils.h>
#include <ATen/native/sparse/cuda/SparseBlasImpl.h>
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
@ -452,6 +454,326 @@ void addmv_out_sparse_csr(
#endif
}
/*
Computes C = alpha * A + beta * B
Args:
* `A` - [in] sparse Tensor of size m × n.
* `B` - [in] sparse Tensor of size m × n.
* `C` - [out] sparse Tensor of size m × n.
*/
void add_out_sparse_csr(
const at::sparse_csr::SparseCsrTensor& A,
const at::sparse_csr::SparseCsrTensor& B,
const Scalar& alpha,
const Scalar& beta,
const at::sparse_csr::SparseCsrTensor& C) {
IntArrayRef A_sizes = A.sizes();
auto ndim = A.dim();
int m = at::native::cuda_int_cast(A_sizes[ndim - 2], "m");
int n = at::native::cuda_int_cast(A_sizes[ndim - 1], "n");
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.sizes().equals(B.sizes()) && A.sizes().equals(C.sizes()));
// Only 32-bit indices are supported
auto A_32 = at::native::_sparse_csr_tensor_unsafe(
A.crow_indices().to(kInt),
A.col_indices().to(kInt),
A.values(),
A.sizes(),
A.scalar_type(),
A.layout(),
A.device());
auto B_32 = at::native::_sparse_csr_tensor_unsafe(
B.crow_indices().to(kInt),
B.col_indices().to(kInt),
B.values(),
B.sizes(),
B.scalar_type(),
B.layout(),
B.device());
// Modify C tensor in-place to swap indices tensors with 32-bit variants
indices_to_32_bit_inplace(C);
int nnzA = at::native::cuda_int_cast(A_32._nnz(), "nnzA");
int nnzB = at::native::cuda_int_cast(B_32._nnz(), "nnzB");
auto desc = at::cuda::sparse::CuSparseMatDescriptor();
auto A_crow_indices = A_32.crow_indices();
auto B_crow_indices = B_32.crow_indices();
auto C_crow_indices = C.crow_indices();
auto A_crow_indices_ptr = A_crow_indices.data_ptr<int>();
auto B_crow_indices_ptr = B_crow_indices.data_ptr<int>();
auto C_crow_indices_ptr = C_crow_indices.data_ptr<int>();
auto A_col_indices = A_32.col_indices();
auto B_col_indices = B_32.col_indices();
auto C_col_indices = C.col_indices();
auto A_col_indices_ptr = A_col_indices.data_ptr<int>();
auto B_col_indices_ptr = B_col_indices.data_ptr<int>();
auto C_col_indices_ptr = C_col_indices.data_ptr<int>();
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
C.scalar_type(), "add_out_sparse_csr_cuda_impl", [&] {
auto beta_ = beta.to<scalar_t>();
auto alpha_ = alpha.to<scalar_t>();
auto A_values = A_32.values();
auto B_values = B_32.values();
auto C_values = C.values();
auto A_values_ptr = A_values.data_ptr<scalar_t>();
auto B_values_ptr = B_values.data_ptr<scalar_t>();
auto C_values_ptr = C_values.data_ptr<scalar_t>();
auto handle = at::cuda::getCurrentCUDASparseHandle();
TORCH_CUDASPARSE_CHECK(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
size_t buffer_size;
at::cuda::sparse::csrgeam2_bufferSizeExt<scalar_t>(
handle,
m,
n,
&alpha_,
desc.descriptor(),
nnzA,
A_values_ptr,
A_crow_indices_ptr,
A_col_indices_ptr,
&beta_,
desc.descriptor(),
nnzB,
B_values_ptr,
B_crow_indices_ptr,
B_col_indices_ptr,
desc.descriptor(),
C_values_ptr,
C_crow_indices_ptr,
C_col_indices_ptr,
&buffer_size // output
);
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
auto work_data = allocator.allocate(buffer_size);
int nnzC = -1;
at::cuda::sparse::csrgeam2Nnz<scalar_t>(
handle,
m,
n,
desc.descriptor(),
nnzA,
A_crow_indices_ptr,
A_col_indices_ptr,
desc.descriptor(),
nnzB,
B_crow_indices_ptr,
B_col_indices_ptr,
desc.descriptor(),
C_crow_indices_ptr,
&nnzC,
work_data.get());
// Resize result using nnz information from cusparse
col_indices_and_values_resize_(C, nnzC);
C_col_indices = C.col_indices();
C_values = C.values();
C_col_indices_ptr = C_col_indices.data_ptr<int>();
C_values_ptr = C_values.data_ptr<scalar_t>();
at::cuda::sparse::csrgeam2<scalar_t>(
handle,
m,
n,
&alpha_,
desc.descriptor(),
nnzA,
A_values_ptr,
A_crow_indices_ptr,
A_col_indices_ptr,
&beta_,
desc.descriptor(),
nnzB,
B_values_ptr,
B_crow_indices_ptr,
B_col_indices_ptr,
desc.descriptor(),
C_values_ptr,
C_crow_indices_ptr,
C_col_indices_ptr,
work_data.get());
});
}
/*
Solves a system of linear equations whose coefficients are represented in a sparse triangular matrix A:
op(A) X = B.
Args:
* `A` - sparse Tensor of size m × m.
* `B` - dense Tensor of size m × nrhs.
* `X` - dense Tensor of size m × nrhs.
* `upper` - controls whether upper or lower triangular part of A is considered in computations.
* `transpose` - if true then op(A) = A^T.
* `unitriangular` - if true then the diagonal elements of A are assumed to be one.
*/
void triangular_solve_out_sparse_csr(
const at::sparse_csr::SparseCsrTensor& A,
const Tensor& B,
const Tensor& X,
bool upper,
bool transpose,
bool unitriangular) {
#if !AT_USE_CUSPARSE_GENERIC_SPSV()
TORCH_CHECK(
false,
"Calling triangular solve on a sparse GPU tensor requires compiling ",
"PyTorch with at least CUDA 11.3. ",
"Please use PyTorch built with newer CUDA version.");
#else
if (B.numel() == 0 || X.numel() == 0 || A._nnz() == 0) {
return;
}
c10::MaybeOwned<Tensor> B_ = prepare_dense_matrix_for_cusparse(B);
c10::MaybeOwned<Tensor> X_ = prepare_dense_matrix_for_cusparse(X);
// TODO: update this to support COO sparse layout
auto descA = at::cuda::sparse::CuSparseSpMatCsrDescriptor(A);
descA.set_mat_fill_mode(upper);
descA.set_mat_diag_type(unitriangular);
cusparseOperation_t opA = transpose ? CUSPARSE_OPERATION_TRANSPOSE
: CUSPARSE_OPERATION_NON_TRANSPOSE;
if (B.size(-1) == 1) {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
X.scalar_type(), "triangular_solve_out_sparse_csr_cuda_impl", [&] {
scalar_t alpha = 1;
cudaDataType compute_type = at::cuda::getCudaDataType<scalar_t>();
auto handle = at::cuda::getCurrentCUDASparseHandle();
size_t buffer_size;
auto desc_spsv = at::cuda::sparse::CuSparseSpSVDescriptor();
auto descB = at::cuda::sparse::CuSparseDnVecDescriptor(*B_);
auto descX = at::cuda::sparse::CuSparseDnVecDescriptor(*X_);
TORCH_CUDASPARSE_CHECK(cusparseSpSV_bufferSize(
handle,
opA,
&alpha,
descA.descriptor(),
descB.descriptor(),
descX.descriptor(),
compute_type,
CUSPARSE_SPSV_ALG_DEFAULT,
desc_spsv.descriptor(),
&buffer_size // output
));
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
auto work_data = allocator.allocate(buffer_size);
TORCH_CUDASPARSE_CHECK(cusparseSpSV_analysis(
handle,
opA,
&alpha,
descA.descriptor(),
descB.descriptor(),
descX.descriptor(),
compute_type,
CUSPARSE_SPSV_ALG_DEFAULT,
desc_spsv.descriptor(),
work_data.get()));
TORCH_CUDASPARSE_CHECK(cusparseSpSV_solve(
handle,
opA,
&alpha,
descA.descriptor(),
descB.descriptor(),
descX.descriptor(),
compute_type,
CUSPARSE_SPSV_ALG_DEFAULT,
desc_spsv.descriptor()));
});
} else {
#if !AT_USE_CUSPARSE_GENERIC_SPSM()
TORCH_CHECK(
false,
"Calling triangular solve on a sparse GPU tensor requires compiling ",
"PyTorch with at least CUDA 11.3.1. ",
"Please use PyTorch built with newer CUDA version.");
#else
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
X.scalar_type(), "triangular_solve_out_sparse_csr_cuda_impl", [&] {
scalar_t alpha = 1;
cudaDataType compute_type = at::cuda::getCudaDataType<scalar_t>();
auto handle = at::cuda::getCurrentCUDASparseHandle();
size_t buffer_size;
// TODO: support mixed memory format
IntArrayRef X_strides = X_->strides();
IntArrayRef B_strides = B_->strides();
auto ndim = X_->dim();
bool is_X_row_major = (X_strides[ndim - 1] == 1);
bool is_B_row_major = (B_strides[ndim - 1] == 1);
TORCH_INTERNAL_ASSERT(is_X_row_major && is_B_row_major);
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto desc_spsm = at::cuda::sparse::CuSparseSpSMDescriptor();
auto descB = at::cuda::sparse::CuSparseDnMatDescriptor(*B_);
auto descX = at::cuda::sparse::CuSparseDnMatDescriptor(*X_);
TORCH_CUDASPARSE_CHECK(cusparseSpSM_bufferSize(
handle,
opA,
opB,
&alpha,
descA.descriptor(),
descB.descriptor(),
descX.descriptor(),
compute_type,
CUSPARSE_SPSM_ALG_DEFAULT,
desc_spsm.descriptor(),
&buffer_size // output
));
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
auto work_data = allocator.allocate(buffer_size);
TORCH_CUDASPARSE_CHECK(cusparseSpSM_analysis(
handle,
opA,
opB,
&alpha,
descA.descriptor(),
descB.descriptor(),
descX.descriptor(),
compute_type,
CUSPARSE_SPSM_ALG_DEFAULT,
desc_spsm.descriptor(),
work_data.get()));
TORCH_CUDASPARSE_CHECK(cusparseSpSM_solve(
handle,
opA,
opB,
&alpha,
descA.descriptor(),
descB.descriptor(),
descX.descriptor(),
compute_type,
CUSPARSE_SPSM_ALG_DEFAULT,
desc_spsm.descriptor()));
});
#endif // !AT_USE_CUSPARSE_GENERIC_SPSM()
}
if (!X.is_same(*X_)) {
X.copy_(*X_);
}
#endif // !AT_USE_CUSPARSE_GENERIC_SPSV()
}
} // namespace cuda
} // namespace impl
} // namespace sparse

View File

@ -24,6 +24,21 @@ void addmv_out_sparse_csr(
const Scalar& alpha,
const Tensor& result);
void add_out_sparse_csr(
const at::sparse_csr::SparseCsrTensor& mat1,
const at::sparse_csr::SparseCsrTensor& mat2,
const Scalar& alpha,
const Scalar& beta,
const at::sparse_csr::SparseCsrTensor& result);
void triangular_solve_out_sparse_csr(
const at::sparse_csr::SparseCsrTensor& A,
const Tensor& B,
const Tensor& X,
bool upper,
bool transpose,
bool unitriangular);
} // namespace cuda
} // namespace impl
} // namespace sparse

View File

@ -19,6 +19,7 @@
#include <ATen/cuda/ThrustAllocator.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <ATen/native/sparse/cuda/SparseBlasImpl.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <ATen/native/sparse/cuda/SparseCUDATensorMath.cuh>
@ -187,11 +188,16 @@ Tensor& add_out_sparse_csr_cuda(
const Scalar& alpha,
SparseCsrTensor& out) {
if (self.layout() == kStrided) {
return add_out_dense_sparse_csr_cuda(out, self, other, alpha);
add_out_dense_sparse_csr_cuda(out, self, other, alpha);
} else {
TORCH_CHECK(
false,
"NotImplementedError: Addition of sparse CSR tensors is not yet implemented.")
self.sizes().equals(other.sizes()),
"torch.add: Expected input tensors to have the same shape, but got tensor `self` with shape ",
self.sizes(),
" and tensor `other` with shape ",
other.sizes());
at::native::resize_as_sparse_csr_(out, self);
sparse::impl::cuda::add_out_sparse_csr(self, other, Scalar(1), alpha, out);
}
return out;
}
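
With this dispatch change, adding two sparse CSR tensors on CUDA now goes through the cuSPARSE csrgeam2 path instead of raising. A usage sketch follows; the explicit CSR construction is an assumption of this sketch, only the add path comes from the patch.

#include <ATen/ATen.h>

at::Tensor demo_sparse_csr_add_cuda() {
  // Illustrative only: a = [[1, 0], [0, 2]], b = [[0, 3], [4, 0]].
  auto idx_opts = at::device(at::kCUDA).dtype(at::kLong);
  auto val_opts = at::device(at::kCUDA).dtype(at::kFloat);
  const at::Tensor a = at::sparse_csr_tensor(
      at::tensor({0, 1, 2}, idx_opts), at::tensor({0, 1}, idx_opts),
      at::tensor({1.0f, 2.0f}, val_opts), {2, 2}, val_opts);
  const at::Tensor b = at::sparse_csr_tensor(
      at::tensor({0, 1, 2}, idx_opts), at::tensor({1, 0}, idx_opts),
      at::tensor({3.0f, 4.0f}, val_opts), {2, 2}, val_opts);
  // Routed through add_out_sparse_csr_cuda above; result is [[1, 3], [4, 2]].
  return at::add(a, b, /*alpha=*/1);
}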

View File

@ -145,6 +145,23 @@ void Context::flush() {
command().pool.purge();
}
void Context::wait(const at::Tensor& src) {
// wait only if Vulkan tensor
if (at::kVulkan == src.device().type()) {
api::Command::Pool& command_pool = command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
using Future = ops::vTensor::Future<const void, ops::vTensor::Access::Read>;
const ops::vTensor& v_src = ops::convert(src);
const Future v_src_future = v_src.host<const void>(command_buffer);
// This wait() is a no-op if data is not out of sync. More often than
// not though, waits here are expected as the GPU catches up with
// compute submitted from CPU.
v_src_future.wait();
}
}
bool available() {
return context();
}

View File

@ -56,6 +56,10 @@ class Context final {
void flush();
// Use this function only for debugging and testing when you want to make sure
// all GPU operations get finished before calling flush(). Otherwise, it may crash.
void wait(const at::Tensor& src);
private:
VkDevice device();
VkQueue queue();
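
A minimal usage sketch for the wait() helper declared above, following the comment's guidance to use it only for debugging and tests; the producing op and the include path are assumptions of this sketch.

// Illustrative only: make sure the GPU has finished producing a Vulkan tensor
// before flushing the context while debugging.
#include <ATen/ATen.h>
#include <ATen/native/vulkan/api/Context.h>  // include path assumed for this sketch

void debug_wait_then_flush(const at::Tensor& cpu_input) {
  // add() is used as a stand-in producer; any Vulkan-backed op would do here.
  const at::Tensor vulkan_output = cpu_input.vulkan() + cpu_input.vulkan();
  at::native::vulkan::api::context()->wait(vulkan_output);
  at::native::vulkan::api::context()->flush();
}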

View File

@ -12,17 +12,23 @@ void copy_texture_to_texture(
api::Command::Buffer& command_buffer,
api::Resource::Image::Object& src_image,
api::Resource::Image::Object& dst_image,
api::utils::uvec3 src_extents,
api::utils::uvec3 copy_extents,
api::utils::uvec3 src_offset,
api::utils::uvec3 dst_offset) {
VkImageCopy copy_info{};
copy_info.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
copy_info.srcSubresource.layerCount = 1;
copy_info.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
copy_info.dstSubresource.layerCount = 1;
copy_info.extent.width = src_extents.data[0u];
copy_info.extent.height = src_extents.data[1u];
copy_info.extent.depth = src_extents.data[2u];
copy_info.extent.width = copy_extents.data[0u];
copy_info.extent.height = copy_extents.data[1u];
copy_info.extent.depth = copy_extents.data[2u];
copy_info.srcOffset.x = src_offset.data[0u];
copy_info.srcOffset.y = src_offset.data[1u];
copy_info.srcOffset.z = src_offset.data[2u];
copy_info.dstOffset.x = dst_offset.data[0u];
copy_info.dstOffset.y = dst_offset.data[1u];
copy_info.dstOffset.z = dst_offset.data[2u];
// To use vkCmdCopyImage, the stage of src & dst image must be set to vTensor::Stage::Transfer.
vkCmdCopyImage(

View File

@ -18,7 +18,8 @@ void copy_texture_to_texture(
api::Command::Buffer& command_buffer,
api::Resource::Image::Object& src_image,
api::Resource::Image::Object& dst_image,
api::utils::uvec3 src_extents,
api::utils::uvec3 copy_extents,
api::utils::uvec3 src_offset,
api::utils::uvec3 dst_offset);
} // namespace utils

View File

@ -96,6 +96,61 @@ Tensor cat_feature(const TensorList tensors, vTensor& v_output) {
return convert(v_output);
}
Tensor cat_feature_mult4ch(const TensorList tensors, vTensor& v_output) {
api::Context* const context = api::context();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
int64_t depth_size_allprior = 0;
int64_t ch_interval = 0;
for (const auto& tensor : tensors) {
ch_interval += tensor.sizes()[1];
}
const int64_t depth_interval = ch_interval / 4;
auto dst_image = v_output.image(
command_buffer,
vTensor::Stage::Transfer,
vTensor::Access::Write);
uvec3 src_offset{};
uvec3 dst_offset{};
for (const auto& tensor : tensors) {
const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan();
const vTensor& v_self = convert(self);
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
auto src_image = v_self.image(
command_buffer,
vTensor::Stage::Transfer);
const uint32_t depth_slice = safe_downcast<uint32_t>(tensor.sizes()[1] / 4);
uvec3 copy_extents {v_self.extents().data[0u],
v_self.extents().data[1u],
depth_slice};
for (int b = 0; b < tensor.sizes()[0]; ++b) {
src_offset.data[2u] = safe_downcast<uint32_t>(depth_slice * b);
dst_offset.data[2u] = depth_size_allprior + safe_downcast<uint32_t>(depth_interval * b);
api::helper::copy_texture_to_texture(command_buffer,
src_image,
dst_image,
copy_extents,
src_offset,
dst_offset);
}
depth_size_allprior += depth_slice;
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
Tensor cat_width(const TensorList tensors, vTensor& v_output) {
TORCH_CHECK(false, "Vulkan cat not implemented for width dimension!");
}
@ -110,6 +165,7 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) {
vTensor::Stage::Transfer,
vTensor::Access::Write);
uvec3 src_offset{};
uvec3 dst_offset{};
for (const auto& tensor : tensors) {
const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan();
@ -123,6 +179,7 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) {
src_image,
dst_image,
v_self.extents(),
src_offset,
dst_offset);
// Increment by height
@ -148,11 +205,16 @@ Tensor cat(
at::Tensor tensor = tensors[0];
int64_t cat_dim_size = 0;
bool is_mult4ch = true;
for (const auto & t : tensors) {
TORCH_INTERNAL_ASSERT(
t.dim() == 4, "Vulkan cat expects 4 dimensional inputs");
if (t.sizes()[1] % 4 != 0) {
is_mult4ch = false;
}
for (int d = 0; d < 4; ++d) {
if (d == dim) {
continue;
@ -179,6 +241,9 @@ Tensor cat(
return cat_height(tensors, v_output);
}
else if (dim == 1) {
if (is_mult4ch) {
return cat_feature_mult4ch(tensors, v_output);
}
return cat_feature(tensors, v_output);
}
return cat_batch(tensors, v_output);
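
An illustrative call that exercises the new fast path: when every input is 4-D and has a channel count divisible by 4, concatenation along dim 1 is handled by cat_feature_mult4ch through texture-to-texture copies. The tensors below are made up for the sketch.

#include <ATen/ATen.h>

at::Tensor vulkan_cat_channels_mult4() {
  // Illustrative only: both inputs have channel counts divisible by 4, so the
  // dim=1 concatenation takes the cat_feature_mult4ch path added above.
  const at::Tensor a = at::rand({1, 4, 8, 8});
  const at::Tensor b = at::rand({1, 8, 8, 8});
  return at::cat({a.vulkan(), b.vulkan()}, /*dim=*/1).cpu();  // shape {1, 12, 8, 8}
}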

Some files were not shown because too many files have changed in this diff.