Update on "fix torch::deploy OSS CI: previously the tests for torch::deploy were being skipped"

We match the build environment against linux-xenial-cuda11.3 in .jenkins/pytorch/build.sh but against linux-xenial-cuda11.1 in .jenkins/pytorch/test.sh. I suspect this mismatch causes the torch::deploy tests not to be run. Sending this PR out; I will verify in CI.

Differential Revision: [D31939631](https://our.internmc.facebook.com/intern/diff/D31939631)

[ghstack-poisoned]
This commit is contained in: f3e739463d
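The mismatch described above comes down to a substring check on BUILD_ENVIRONMENT: the test script only enables the torch::deploy suite when its expected environment marker appears in the name that build.sh produced. A minimal Python sketch of that failure mode (illustrative only, not the actual .jenkins shell code; the function and marker names here are assumptions):

# Illustration of how a stale substring gate on BUILD_ENVIRONMENT silently
# skips a test suite when build and test scripts disagree on the CUDA version.
def should_run_torch_deploy_tests(build_environment: str, expected_marker: str) -> bool:
    # The suite only runs when the marker appears in the environment name;
    # a stale marker returns False and the tests are skipped without any error.
    return expected_marker in build_environment

build_env = "linux-xenial-cuda11.3-py3.6-gcc7"   # what build.sh sets up
print(should_run_torch_deploy_tests(build_env, "linux-xenial-cuda11.1"))  # False: silently skipped
print(should_run_torch_deploy_tests(build_env, "linux-xenial-cuda11.3"))  # True: tests run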
@@ -4,11 +4,8 @@ from cimodel.lib.miniutils import quote
from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN


# TODO: make this generated from a matrix rather than just a static list
# NOTE: All hardcoded docker image builds have been migrated to GHA
IMAGE_NAMES = [
    "pytorch-linux-bionic-rocm4.1-py3.6",
    "pytorch-linux-bionic-rocm4.2-py3.6",
    "pytorch-linux-bionic-rocm4.3.1-py3.6",
]

# This entry should be an element from the list above
.circleci/config.yml (generated): 9 changes

@@ -6506,15 +6506,6 @@ workflows:
      when: << pipeline.parameters.run_binary_tests >>
  build:
    jobs:
      - docker_build_job:
          name: "docker-pytorch-linux-bionic-rocm4.1-py3.6"
          image_name: "pytorch-linux-bionic-rocm4.1-py3.6"
      - docker_build_job:
          name: "docker-pytorch-linux-bionic-rocm4.2-py3.6"
          image_name: "pytorch-linux-bionic-rocm4.2-py3.6"
      - docker_build_job:
          name: "docker-pytorch-linux-bionic-rocm4.3.1-py3.6"
          image_name: "pytorch-linux-bionic-rocm4.3.1-py3.6"
      - pytorch_linux_build:
          name: pytorch_linux_xenial_py3_6_gcc5_4_build
          requires:
@@ -140,8 +140,6 @@ def generate_required_docker_images(items):

def gen_build_workflows_tree():
    build_workflows_functions = [
        # For rocm images, which don't have a circleci job equivalent
        cimodel.data.simple.docker_definitions.get_workflow_jobs,
        pytorch_build_definitions.get_workflow_jobs,
        cimodel.data.simple.macos_definitions.get_workflow_jobs,
        cimodel.data.simple.android_definitions.get_workflow_jobs,
.github/generated-ciflow-ruleset.json (generated, vendored): 4 changes

@@ -9,7 +9,6 @@
        "linux-bionic-cuda10.2-py3.9-gcc7",
        "linux-bionic-py3.6-clang9",
        "linux-vulkan-bionic-py3.6-clang9",
        "linux-xenial-cuda10.2-py3.6-gcc7",
        "linux-xenial-cuda11.3-py3.6-gcc7",
        "linux-xenial-py3-clang5-mobile-build",
        "linux-xenial-py3-clang5-mobile-code-analysis",
@@ -47,7 +46,6 @@
        "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
        "libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
        "linux-bionic-cuda10.2-py3.9-gcc7",
        "linux-xenial-cuda10.2-py3.6-gcc7",
        "linux-xenial-cuda11.3-py3.6-gcc7",
        "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
        "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck",
@@ -82,7 +80,6 @@
        "linux-bionic-cuda10.2-py3.9-gcc7",
        "linux-bionic-py3.6-clang9",
        "linux-vulkan-bionic-py3.6-clang9",
        "linux-xenial-cuda10.2-py3.6-gcc7",
        "linux-xenial-cuda11.3-py3.6-gcc7",
        "linux-xenial-py3-clang5-mobile-build",
        "linux-xenial-py3-clang5-mobile-code-analysis",
@@ -121,7 +118,6 @@
    ],
    "ciflow/slow": [
        "linux-bionic-cuda10.2-py3.9-gcc7",
        "linux-xenial-cuda10.2-py3.6-gcc7",
        "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck"
    ],
    "ciflow/slow-gradcheck": [
.github/scale-config.yml (vendored): 4 changes

@@ -15,6 +15,7 @@
#    os: linux
#    max_available: 20
#    disk_size: 50
#    is_ephemeral: true

runner_types:
  linux.2xlarge:
@@ -27,16 +28,19 @@ runner_types:
    os: linux
    max_available: 125
    disk_size: 150
    is_ephemeral: false
  linux.4xlarge.nvidia.gpu:
    instance_type: g3.4xlarge
    os: linux
    max_available: 125
    disk_size: 150
    is_ephemeral: false
  linux.16xlarge.nvidia.gpu:
    instance_type: g3.16xlarge
    os: linux
    max_available: 10
    disk_size: 150
    is_ephemeral: false
  windows.4xlarge:
    instance_type: c5d.4xlarge
    os: windows
.github/scripts/generate_ci_workflows.py (vendored): 37 changes

@@ -17,6 +17,7 @@ DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
GITHUB_DIR = Path(__file__).resolve().parent.parent

WINDOWS_CPU_TEST_RUNNER = "windows.4xlarge"
# contains 1 gpu
WINDOWS_CUDA_TEST_RUNNER = "windows.8xlarge.nvidia.gpu"
WINDOWS_RUNNERS = {
    WINDOWS_CPU_TEST_RUNNER,
@@ -24,7 +25,8 @@ WINDOWS_RUNNERS = {
}

LINUX_CPU_TEST_RUNNER = "linux.2xlarge"
LINUX_CUDA_TEST_RUNNER = "linux.8xlarge.nvidia.gpu"
# contains 1 gpu
LINUX_CUDA_TEST_RUNNER = "linux.4xlarge.nvidia.gpu"
LINUX_RUNNERS = {
    LINUX_CPU_TEST_RUNNER,
    LINUX_CUDA_TEST_RUNNER,
@@ -411,17 +413,6 @@ LINUX_WORKFLOWS = [
        build_environment="linux-bionic-cuda10.2-py3.9-gcc7",
        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
        test_runner_type=LINUX_CUDA_TEST_RUNNER,
        num_test_shards=2,
        ciflow_config=CIFlowConfig(
            run_on_canary=True,
            labels={LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA}
        ),
    ),
    CIWorkflow(
        arch="linux",
        build_environment="linux-xenial-cuda10.2-py3.6-gcc7",
        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
        test_runner_type=LINUX_CUDA_TEST_RUNNER,
        enable_jit_legacy_test=1,
        enable_multigpu_test=1,
        enable_nogpu_no_avx_test=1,
@@ -429,7 +420,8 @@ LINUX_WORKFLOWS = [
        enable_slow_test=1,
        num_test_shards=2,
        ciflow_config=CIFlowConfig(
            labels=set([LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]),
            run_on_canary=True,
            labels={LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA}
        ),
    ),
    CIWorkflow(
@@ -539,14 +531,23 @@ BAZEL_WORKFLOWS = [
    ),
]

DOCKER_WORKFLOWS = [
    DockerWorkflow(
        build_environment="docker-builds",
        docker_images=sorted({
DOCKER_IMAGES = {
    f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9",  # for pytorch/xla
    f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.1-py3.6",  # for rocm
    f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.2-py3.6",  # for rocm
    f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.6",  # for rocm
}

DOCKER_IMAGES.update({
    workflow.docker_image_base
    for workflow in [*LINUX_WORKFLOWS, *BAZEL_WORKFLOWS]
    if workflow.docker_image_base
        }),
})

DOCKER_WORKFLOWS = [
    DockerWorkflow(
        build_environment="docker-builds",
        docker_images=sorted(DOCKER_IMAGES),
        # Run weekly to ensure they can build
        is_scheduled="1 * */7 * *",
    ),
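The last hunk above replaces the inline sorted set comprehension with a module-level DOCKER_IMAGES set that is then unioned with each workflow's docker_image_base. A small self-contained sketch of that pattern (the Workflow class and image names here are stand-ins, not the real CIWorkflow objects):

from typing import NamedTuple, Optional

class Workflow(NamedTuple):
    docker_image_base: Optional[str]

# Seed the set with images that have no workflow of their own (e.g. rocm).
DOCKER_IMAGES = {"registry/pytorch-linux-bionic-rocm4.1-py3.6"}

workflows = [
    Workflow("registry/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"),
    Workflow(None),  # workflows without a docker image are skipped
]

# A set deduplicates repeated image bases; sorted() keeps the generated
# workflow file deterministic from run to run.
DOCKER_IMAGES.update({w.docker_image_base for w in workflows if w.docker_image_base})
print(sorted(DOCKER_IMAGES))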
@@ -16,7 +16,7 @@ from typing_extensions import TypedDict


BUILD_ENVIRONMENT = os.getenv('BUILD_ENVIRONMENT')

assert BUILD_ENVIRONMENT is not None

class Config(TypedDict):
    num_shards: int
@@ -70,6 +70,7 @@ def main() -> None:
    if not run_as_if_on_trunk() and NUM_TEST_SHARDS_ON_PULL_REQUEST:
        NUM_TEST_SHARDS = int(NUM_TEST_SHARDS_ON_PULL_REQUEST)
    MULTIGPU_RUNNER_TYPE = os.getenv('MULTIGPU_RUNNER_TYPE')
    DISTRIBUTED_GPU_RUNNER_TYPE = os.getenv('DISTRIBUTED_GPU_RUNNER_TYPE', TEST_RUNNER_TYPE)
    NOGPU_RUNNER_TYPE = os.getenv('NOGPU_RUNNER_TYPE')
    configs: Dict[str, Config] = {}
    if os.getenv('ENABLE_JIT_LEGACY_TEST'):
@@ -84,7 +85,10 @@ def main() -> None:
    if os.getenv('ENABLE_FORCE_ON_CPU_TEST'):
        configs['force_on_cpu'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE}
    if os.getenv('ENABLE_DISTRIBUTED_TEST'):
        configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
        configs['distributed'] = {
            'num_shards': 1,
            'runner': DISTRIBUTED_GPU_RUNNER_TYPE if "cuda" in str(BUILD_ENVIRONMENT) else TEST_RUNNER_TYPE
        }
    if os.getenv('ENABLE_SLOW_TEST'):
        configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
    if os.getenv('ENABLE_DOCS_TEST'):
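The second hunk above makes the distributed test config pick a dedicated GPU runner whenever the build environment is a CUDA one. A minimal standalone sketch of that selection (the environment-variable names mirror the hunk; the default values are illustrative):

import os

def distributed_runner(build_environment: str) -> str:
    test_runner = os.getenv('TEST_RUNNER_TYPE', 'linux.4xlarge.nvidia.gpu')
    distributed_gpu_runner = os.getenv('DISTRIBUTED_GPU_RUNNER_TYPE', test_runner)
    # CUDA builds get the larger distributed GPU runner; everything else
    # keeps using the ordinary test runner.
    return distributed_gpu_runner if 'cuda' in build_environment else test_runner

print(distributed_runner('linux-xenial-cuda10.2-py3.6-gcc7'))
print(distributed_runner('linux-bionic-py3.6-clang9'))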
.github/templates/bazel_ci_workflow.yml.j2 (vendored): 2 changes

@@ -21,7 +21,7 @@ on:
    steps:
    !{{ common.setup_ec2_linux() }}
    !{{ common.checkout_pytorch("recursive") }}
    !{{ common.calculate_docker_image() }}
    !{{ common.calculate_docker_image(false) }}
    - name: Pull Docker image
      run: |
        !{{ common.pull_docker("${DOCKER_IMAGE}") }}
.github/templates/common.yml.j2 (vendored): 4 changes

@@ -183,7 +183,7 @@ concurrency:
        run: echo "${LABELS}"
{%- endmacro -%}

{%- macro calculate_docker_image() -%}
{%- macro calculate_docker_image(always_rebuild) -%}
  - name: Calculate docker image tag
    id: calculate-tag
    run: |
@@ -198,6 +198,7 @@ concurrency:
      BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
    run: |
      set -x
{%- if not always_rebuild %}
      # Check if image already exists, if it does then skip building it
      if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
        exit 0
@@ -222,6 +223,7 @@ concurrency:
        echo "       contact the PyTorch team to restore the original images"
        exit 1
      fi
{%- endif %}
      echo ::set-output name=rebuild::yes
  - name: Build and push docker image
    if: ${{ steps.check.outputs.rebuild }}
@@ -7,6 +7,7 @@ name: !{{ build_environment }}
{%- endblock %}

on:
  workflow_dispatch:
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
@@ -35,7 +36,7 @@ jobs:
    steps:
    !{{ common.setup_ec2_linux() }}
    !{{ common.checkout_pytorch("recursive") }}
    !{{ common.calculate_docker_image() }}
    !{{ common.calculate_docker_image(true) }}
    - name: Pull Docker image
      run: |
        !{{ common.pull_docker("${DOCKER_IMAGE}") }}
.github/templates/linux_ci_workflow.yml.j2 (vendored): 3 changes

@@ -54,7 +54,7 @@ jobs:
    steps:
    !{{ common.setup_ec2_linux() }}
    !{{ common.checkout_pytorch("recursive") }}
    !{{ common.calculate_docker_image() }}
    !{{ common.calculate_docker_image(false) }}
    - name: Pull Docker image
      run: |
        !{{ common.pull_docker("${DOCKER_IMAGE}") }}
@@ -152,6 +152,7 @@ jobs:
      ENABLE_NOARCH_TEST: !{{ enable_noarch_test }}
      NUM_TEST_SHARDS: !{{ num_test_shards }}
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-docker-builds.yml (generated, vendored): 33 changes

@@ -4,6 +4,7 @@
name: docker-builds

on:
  workflow_dispatch:
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
@@ -22,10 +23,18 @@ jobs:
    strategy:
      matrix:
        include:
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9'
            docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7'
            docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9'
            docker_image_short_name: 'pytorch-linux-bionic-py3.6-clang9'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.1-py3.6'
            docker_image_short_name: 'pytorch-linux-bionic-rocm4.1-py3.6'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.2-py3.6'
            docker_image_short_name: 'pytorch-linux-bionic-rocm4.2-py3.6'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.6'
            docker_image_short_name: 'pytorch-linux-bionic-rocm4.3.1-py3.6'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7'
            docker_image_short_name: 'pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7'
@@ -110,30 +119,6 @@ jobs:
          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
        run: |
          set -x
          # Check if image already exists, if it does then skip building it
          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
            exit 0
          fi
          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
            # if we're on the base branch then use the parent commit
            MERGE_BASE=$(git rev-parse HEAD~)
          else
            # otherwise we're on a PR, so use the most recent base commit
            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
          fi
          # Covers the case where a previous tag doesn't exist for the tree
          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
            exit 1
          fi
          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
          # If no image exists but the hash is the same as the previous hash then we should error out here
          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
            echo "       contact the PyTorch team to restore the original images"
            exit 1
          fi
          echo ::set-output name=rebuild::yes
      - name: Build and push docker image
        if: ${{ steps.check.outputs.rebuild }}
.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml (generated, vendored): 13 changes

@@ -260,19 +260,20 @@ jobs:
    runs-on: ubuntu-18.04
    needs: [ciflow_should_run]
    env:
      TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
      ENABLE_DISTRIBUTED_TEST: 1
      ENABLE_JIT_LEGACY_TEST: ''
      ENABLE_MULTIGPU_TEST: ''
      ENABLE_NOGPU_NO_AVX_TEST: ''
      ENABLE_NOGPU_NO_AVX2_TEST: ''
      ENABLE_SLOW_TEST: ''
      ENABLE_JIT_LEGACY_TEST: 1
      ENABLE_MULTIGPU_TEST: 1
      ENABLE_NOGPU_NO_AVX_TEST: 1
      ENABLE_NOGPU_NO_AVX2_TEST: 1
      ENABLE_SLOW_TEST: 1
      ENABLE_DOCS_TEST: ''
      ENABLE_BACKWARDS_COMPAT_TEST: ''
      ENABLE_XLA_TEST: ''
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-bionic-py3.6-clang9.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: 1
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 1
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml (generated, vendored): 517 changes

@@ -1,517 +0,0 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-xenial-cuda10.2-py3.6-gcc7

on:
  pull_request:
    types: [opened, synchronize, reopened, unassigned]
  push:
    branches:
      - master
      - release/*
  workflow_dispatch:

env:
  BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7
  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
  TORCH_CUDA_ARCH_LIST: 5.2
  IN_CI: 1
  IS_GHA: 1
  # This is used for the phase of adding wheel tests only, will be removed once completed
  IN_WHEEL_TEST: 1
  # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  AWS_DEFAULT_REGION: us-east-1
  CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
  CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
concurrency:
  group: linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:

  ciflow_should_run:
    runs-on: ubuntu-18.04
    env:
      IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }}
      LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow') }}
      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
    if: ${{ (github.repository == 'pytorch/pytorch') && (
            (github.event_name == 'push') ||
            (github.event_name == 'schedule') ||
            (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) ||
            (false))
         }}
    steps:
      - name: noop
        run: echo running ciflow_should_run
      - name: print labels
        run: echo "${LABELS}"

  build:
    runs-on: linux.2xlarge
    needs: [ciflow_should_run]
    env:
      JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-build
    outputs:
      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
          bash /tmp/ecr-login.sh
          rm /tmp/ecr-login.sh
      - name: Chown workspace
        env:
          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE:?}/*"
          rm -f ~/.ssh/authorized_keys
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Calculate docker image tag
        id: calculate-tag
        run: |
          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
          echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
          echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"
          echo "::set-output name=docker_tag::${DOCKER_TAG}"
          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
      - name: Check if image should be built
        id: check
        env:
          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
        run: |
          set -x
          # Check if image already exists, if it does then skip building it
          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
            exit 0
          fi
          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
            # if we're on the base branch then use the parent commit
            MERGE_BASE=$(git rev-parse HEAD~)
          else
            # otherwise we're on a PR, so use the most recent base commit
            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
          fi
          # Covers the case where a previous tag doesn't exist for the tree
          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
            exit 1
          fi
          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
          # If no image exists but the hash is the same as the previous hash then we should error out here
          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
            echo "       contact the PyTorch team to restore the original images"
            exit 1
          fi
          echo ::set-output name=rebuild::yes
      - name: Build and push docker image
        if: ${{ steps.check.outputs.rebuild }}
        env:
          DOCKER_SKIP_S3_UPLOAD: 1
        working-directory: .circleci/docker
        run: |
          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
          ./build_docker.sh
      - name: Pull Docker image
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py
      - name: Build
        env:
          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
        run: |
          # detached container should get cleaned up by teardown_ec2_linux
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
            -e JOB_BASE_NAME \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e AWS_DEFAULT_REGION \
            -e IS_GHA \
            -e CIRCLE_PR_NUMBER \
            -e CIRCLE_SHA1 \
            -e CIRCLE_BRANCH \
            -e GITHUB_RUN_ID \
            -e SCCACHE_BUCKET \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            -e PR_LABELS \
            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --detach \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
      - name: Display and upload binary build size statistics (Click Me)
        # temporary hack: set CIRCLE_* vars, until we update
        # tools/stats/print_test_stats.py to natively support GitHub Actions
        env:
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
        run: |
          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
          export COMMIT_TIME
          pip3 install requests==2.26 boto3==1.16.34
          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Archive artifacts into zip
        run: |
          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
      - uses: seemethere/upload-artifact-s3@v3
        name: Store PyTorch Build Artifacts on S3
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
          retention-days: 14
          if-no-files-found: error
          path:
            artifacts.zip
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        env:
          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Clean up docker images
        if: always()
        run: |
          # Prune all of the docker images
          docker system prune -af

  generate-test-matrix:
    runs-on: ubuntu-18.04
    needs: [ciflow_should_run]
    env:
      TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      ENABLE_DISTRIBUTED_TEST: 1
      ENABLE_JIT_LEGACY_TEST: 1
      ENABLE_MULTIGPU_TEST: 1
      ENABLE_NOGPU_NO_AVX_TEST: 1
      ENABLE_NOGPU_NO_AVX2_TEST: 1
      ENABLE_SLOW_TEST: 1
      ENABLE_DOCS_TEST: ''
      ENABLE_BACKWARDS_COMPAT_TEST: ''
      ENABLE_XLA_TEST: ''
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
      ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
    container:
      image: python:3.9
    steps:
      - name: Install dependencies
        run: pip install typing-extensions==3.10
      - name: Clone pytorch/pytorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
      - name: Generating test matrix
        id: set-matrix
        run: .github/scripts/generate_pytorch_test_matrix.py

  test:
    needs: [build, generate-test-matrix, ciflow_should_run]
    strategy:
      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    env:
      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
      JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-test
      TEST_CONFIG: ${{ matrix.config }}
      SHARD_NUMBER: ${{ matrix.shard }}
      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
      PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
          bash /tmp/ecr-login.sh
          rm /tmp/ecr-login.sh
      - name: Chown workspace
        env:
          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE:?}/*"
          rm -f ~/.ssh/authorized_keys
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Pull Docker image
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
        run: |
          bash .github/scripts/install_nvidia_utils_linux.sh
          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
      - name: Determine shm-size
        run: |
          shm_size="1g"
          case "${BUILD_ENVIRONMENT}" in
            *cuda*)
              shm_size="2g"
              ;;
            *rocm*)
              shm_size="8g"
              ;;
          esac
          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
        name: Download PyTorch Build Artifacts
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
      - name: Unzip artifacts
        run: |
          unzip -o artifacts.zip
      - name: Output disk space left
        run: |
          sudo df -H
      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py
      - name: Test
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
        # Time out the test phase after 240 minutes
        timeout-minutes: 240
        run: |
          set -x

          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.jenkins/caffe2/test.sh
          else
            TEST_COMMAND=.jenkins/pytorch/test.sh
          fi
          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e GITHUB_ACTIONS \
            -e IN_CI \
            -e IS_GHA \
            -e CIRCLE_BRANCH \
            -e CIRCLE_SHA1 \
            -e CIRCLE_PR_NUMBER \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e JOB_BASE_NAME \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e PYTORCH_IGNORE_DISABLED_ISSUES \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Install render_test_results dependencies
        if: always()
        shell: bash
        run: |
          python3 -m pip install junitparser==2.1.1 rich==10.9.0
      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
        if: always()
        shell: bash
        # Encoding is weird on windows, just try to default to utf-8 if possible
        env:
          PYTHONIOENCODING: "utf-8"
        run: |
          python3 tools/render_junit.py test/
      - name: Zip test reports for upload
        if: always()
        env:
          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
        run: |
          # Remove any previous test reports if they exist
          rm -f test-reports-*.zip
          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Test Reports on S3
        if: always()
        with:
          retention-days: 14
          if-no-files-found: error
          path:
            test-reports-*.zip
      - name: Display and upload test statistics (Click Me)
        if: always()
        # temporary hack: set CIRCLE_* vars, until we update
        # tools/stats/print_test_stats.py to natively support GitHub Actions
        env:
          AWS_DEFAULT_REGION: us-east-1
          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
          JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-test
          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
        shell: bash
        run: |
          python3 -m pip install -r requirements.txt
          python3 -m pip install boto3==1.16.34
          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        env:
          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af
.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml (generated, vendored): 3 changes

@@ -260,7 +260,7 @@ jobs:
    runs-on: ubuntu-18.04
    needs: [ciflow_should_run]
    env:
      TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
      ENABLE_DISTRIBUTED_TEST: 1
      ENABLE_JIT_LEGACY_TEST: ''
      ENABLE_MULTIGPU_TEST: ''
@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-xenial-py3.6-gcc7.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 1
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
@@ -258,7 +258,7 @@ jobs:
    runs-on: ubuntu-18.04
    needs: [ciflow_should_run]
    env:
      TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
      ENABLE_DISTRIBUTED_TEST: ''
      ENABLE_JIT_LEGACY_TEST: ''
      ENABLE_MULTIGPU_TEST: ''
@@ -271,6 +271,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml (generated, vendored): 3 changes

@@ -258,7 +258,7 @@ jobs:
    runs-on: ubuntu-18.04
    needs: [ciflow_should_run]
    env:
      TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
      ENABLE_DISTRIBUTED_TEST: 1
      ENABLE_JIT_LEGACY_TEST: ''
      ENABLE_MULTIGPU_TEST: ''
@@ -271,6 +271,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/lint.yml (vendored): 2 changes

@@ -105,7 +105,7 @@ jobs:
      - name: Ensure all test files have header containing ownership information
        if: always()
        run: |
          (! git grep -L "# Owner(s): \[" test/distributed/**/test_*.py) || (printf "The above test files are missing a comment header with ownership information; please add the following line\n\n# Owner(s): [\"<owner: label>\"]\n\nto the top of each test file. The owner should be an existing pytorch/pytorch label."; false)
          (! git grep -L "# Owner(s): \[" ./test/distributed/**/test_*.py || (printf "The above test files are missing a comment header with ownership information; please add the following line\n\n# Owner(s): [\"<owner: label>\"]\n\nto the top of each test file. The owner should be an existing pytorch/pytorch label."; false))

  clang-format:
    runs-on: ubuntu-18.04
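The lint rule above fails the job when any test/distributed test file lacks an ownership comment header. A rough Python equivalent of that check, for illustration only (the real job uses git grep as shown):

# Flag distributed test files missing an "# Owner(s): [" header.
from pathlib import Path

def files_missing_owner_header(root: str = "test/distributed") -> list:
    missing = []
    for path in Path(root).rglob("test_*.py"):
        text = path.read_text(encoding="utf-8", errors="ignore")
        if "# Owner(s): [" not in text:
            missing.append(str(path))
    return missing

if __name__ == "__main__":
    offenders = files_missing_owner_header()
    if offenders:
        print("Missing ownership header:", *offenders, sep="\n  ")
        raise SystemExit(1)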
@@ -181,9 +181,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
  # JIT C++ extensions require ninja, so put it into PATH.
  export PATH="/var/lib/jenkins/.local/bin:$PATH"
  if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then
    pip install -q --user flatbuffers==2.0
    wget https://ortpypackage.blob.core.windows.net/ort-nightly/ort_nightly-1.8.0.dev202107131-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
    pip install -q --user ort_nightly-1.8.0.dev202107131-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
    pip install -q --user flatbuffers==2.0 onnxruntime==1.9.0
  fi
  "$ROOT_DIR/scripts/onnx/test.sh"
fi
@@ -523,7 +523,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
  # TODO: run some C++ tests
  echo "no-op at the moment"
elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || ("${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1) ]]; then
  if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-cuda11.3*-test1* ]]; then
  if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-cuda11.3* ]]; then
    echo "Testing torch::deploy"
    test_torch_deploy
  fi
  test_without_numpy
@@ -127,6 +127,11 @@ genrule(
        "aten/src/ATen/Declarations.yaml",
        "aten/src/ATen/RegisterBackendSelect.cpp",
        "aten/src/ATen/RegisterCPU.cpp",
        "aten/src/ATen/RegisterFunctionalization_0.cpp",
        "aten/src/ATen/RegisterFunctionalization_1.cpp",
        "aten/src/ATen/RegisterFunctionalization_2.cpp",
        "aten/src/ATen/RegisterFunctionalization_3.cpp",
        # "aten/src/ATen/RegisterFunctionalizationEverything.cpp",
        "aten/src/ATen/RegisterMkldnnCPU.cpp",
        "aten/src/ATen/RegisterQuantizedCPU.cpp",
        "aten/src/ATen/RegisterSparseCPU.cpp",
@@ -143,6 +148,7 @@ genrule(
        "aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h",
        "aten/src/ATen/CompositeImplicitAutogradFunctions.h",
        "aten/src/ATen/CompositeImplicitAutogradFunctions_inl.h",
        "aten/src/ATen/FunctionalInverses.h",
        "aten/src/ATen/Functions.h",
        "aten/src/ATen/Functions.cpp",
        "aten/src/ATen/RedispatchFunctions.h",
@@ -11,6 +11,7 @@
/torch/nn/ @albanD @jbschlosser
/torch/optim/ @albanD
/test/test_public_bindings.py @albanD
/docs/source/conf.py @albanD
/aten/src/ATen/native/native_functions.yaml @ezyang

# Tensorpipe RPC Agent.
@@ -1026,6 +1026,8 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
  m.impl("size.int", static_cast<int64_t (*)(const Tensor&, int64_t)>(native::size));
  m.impl("_add_batch_dim", native::_add_batch_dim);
  m.impl("_remove_batch_dim", native::_remove_batch_dim);
  m.impl("_make_dual", native::_make_dual);
  m.impl("is_same_size", native::is_same_size);

  m.impl("sum.dim_IntList", sum_batching_rule);
  m.impl("is_complex", native::is_complex);
234
aten/src/ATen/FunctionalInverses.cpp
Normal file
234
aten/src/ATen/FunctionalInverses.cpp
Normal file
|
|
@ -0,0 +1,234 @@
|
|||
|
||||
#include <ATen/FunctionalInverses.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/ExpandUtils.h>
|
||||
namespace at {
|
||||
namespace functionalization {
|
||||
|
||||
// This logic is similar to autograd code for view backwards calls.
|
||||
// We can't easily share it though, because (eventually) these functions
|
||||
// will all call `permute/unsqueeze_copy()` instead of `permute/unsqueeze`.
|
||||
|
||||
Tensor permute_inverse(const Tensor& self, IntArrayRef dims) {
|
||||
// invert the permutation
|
||||
auto ndims = dims.size();
|
||||
std::vector<int64_t> dims_(ndims);
|
||||
for(const auto i : c10::irange(ndims)) {
|
||||
dims_[at::maybe_wrap_dim(dims[i], ndims)] = i;
|
||||
}
|
||||
return self.permute(dims_);
|
||||
}
|
||||
|
||||
Tensor unsqueeze_to(const Tensor & self, IntArrayRef sizes) {
|
||||
auto result = self;
|
||||
|
||||
int64_t nDims = sizes.size();
|
||||
for(const auto dim : c10::irange(nDims)) {
|
||||
if (sizes[dim] == 1) {
|
||||
result = result.unsqueeze(dim);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor unsqueeze_to(const Tensor & self, int64_t dim, IntArrayRef sizes) {
|
||||
dim = at::maybe_wrap_dim(dim, sizes.size());
|
||||
// in NumPy it's not an error to unsqueeze a scalar, but we still need to avoided
|
||||
// unsqueezing in the backward.
|
||||
if (sizes.size() > 0 && sizes[dim] == 1) {
|
||||
return self.unsqueeze(dim);
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
// Note [Functionalization Pass: View Inverses].
|
||||
// This file contains the implementation of each "view inverse".
|
||||
// These aren't really true inverses in the mathematically sense: each view inverse describes how to undo
|
||||
// the original view (although it takes in different arguments).
|
||||
//
|
||||
// E.g. Below is an example of a program that has alias operations removed, and the role that view inverses play:
|
||||
//
|
||||
// normal program with views and mutations:
|
||||
// view1 = input1.view_op(args...)
|
||||
// view1.add_(1) (perform a mutation on the view, which should also modify input)
|
||||
|
||||
// version of the program with no aliasing, that instead uses view_inverse functions:
|
||||
// view_copy1 = input1.view_copy_op(args...)
|
||||
// view_copy1.add_(1) (perform a mutation on view_copy1. At this point, input1 is NOT modified)
|
||||
// x = view_op_inverse(input1, view_copy1, args...)
|
||||
//
|
||||
// at this point, input1 and x should be equal
|
||||
//
|
||||
// Note that input1 is also passed as an argument to view_op_inverse in the above example.
|
||||
// This isn't actually required for most view operators: it's only required for view ops
|
||||
// where you can't figure out what the size of the base tensor is given just the view tensor and arguments.
|
||||
// Examples are slice/select/scatter/squeeze/as_strided.
|
||||
// We happen to be passing in the base tensor in all cases, mostly to make the codegen simpler.
|
||||
// But you'll see below that the "base" argument is ignored by most view_inverse implementations.
|
||||
|
||||
// ----------------------------------------------------------
|
||||
// Implementations of each view_inverse() function are below.
|
||||
// One of these needs to be implemented for every existing non-composite view operator.
|
||||
// The codegen automatically generates the corresponding function declaration.
|
||||
// ----------------------------------------------------------
|
||||
|
||||
Tensor FunctionalInverses::_fw_primal_inverse(const at::Tensor& base, const at::Tensor& mutated_view, int64_t level) {
|
||||
TORCH_INTERNAL_ASSERT(false, "Attempted to call _fw_primal() during the functionalization pass. For now, this is not supported.");
|
||||
return Tensor();
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::view_as_real_inverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return at::view_as_complex(mutated_view);
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::view_as_complex_inverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return at::view_as_real(mutated_view.resolve_conj());
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::_conj_inverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return mutated_view.conj();
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::_neg_view_inverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return mutated_view.neg();
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::as_strided_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset) {
|
||||
TORCH_INTERNAL_ASSERT(false, "as_strided has not been implemented in the functionalization pass yet");
|
||||
return Tensor();
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::diagonal_inverse(const Tensor& base, const Tensor& mutated_view, int64_t offset, int64_t dim1, int64_t dim2) {
|
||||
return base.diagonal_scatter(mutated_view, offset, dim1, dim2);
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::expand_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, bool implicit) {
|
||||
return at::sum_to(mutated_view, base.sizes());
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::permute_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef dims) {
|
||||
return at::functionalization::permute_inverse(mutated_view, dims);
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::_reshape_alias_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride) {
|
||||
// Note that I'm directly calling reshape(), and ignoring the strides.
|
||||
// _reshape_alias() isn't available from user code, and is an implementation detail of reshape().
|
||||
// Specifically, passing in the strides directly can get us into trouble in cases like:
|
||||
// b = a[0]; c = b.reshape(...); c.add_(1); print(a)
|
||||
// When we eventually run the _reshape_alias_inverse() call here, if we were to pass in both sizes and strides,
|
||||
// The call would fail because `mutated_view` doesn't have enough bytes of storage.
|
||||
return mutated_view.reshape(base.sizes());
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::select_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim, int64_t index) {
|
||||
return base.select_scatter(mutated_view, dim, index);
|
||||
}
|
||||
Tensor FunctionalInverses::detach_inverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
// the functionalization pass doesn't care about autograd metadata - as a view, I think detach() is just an identity function
|
||||
return mutated_view;
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) {
|
||||
return base.slice_scatter(mutated_view, dim, start, end, step);
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, int64_t split_size, int64_t dim) {
|
||||
// It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can.
|
||||
// For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i
|
||||
// on top of the base tensor.
|
||||
// For autograd, we have all of the tensors outputted by split() and we just want to stack them.
|
||||
dim = at::maybe_wrap_dim(dim, base.sizes().size());
|
||||
auto dim_size = base.size(dim);
|
||||
auto start = mutated_view_idx * split_size;
|
||||
auto end = start + split_size;
|
||||
if (end > dim_size) end = dim_size;
|
||||
return base.slice_scatter(mutated_view, dim, start, end, 1);
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::split_with_sizes_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, at::IntArrayRef split_sizes, int64_t dim) {
|
||||
dim = at::maybe_wrap_dim(dim, base.sizes().size());
|
||||
auto dim_size = base.size(dim);
|
||||
int64_t start = 0;
|
||||
for (auto i = 0; i < mutated_view_idx; ++i) {
|
||||
start += split_sizes[i];
|
||||
}
|
||||
auto end = start + split_sizes[mutated_view_idx];
|
||||
if (end > dim_size) end = dim_size;
|
||||
return base.slice_scatter(mutated_view, dim, start, end, 1);
|
||||
}

Tensor FunctionalInverses::squeeze_inverse(const Tensor& base, const Tensor& mutated_view) {
  return unsqueeze_to(mutated_view, base.sizes());
}

Tensor FunctionalInverses::squeeze_dim_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim) {
  return unsqueeze_to(mutated_view, dim, base.sizes());
}

Tensor FunctionalInverses::t_inverse(const Tensor& base, const Tensor& mutated_view) {
  return mutated_view.t();
}

Tensor FunctionalInverses::transpose_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim0, int64_t dim1) {
  return mutated_view.transpose(dim0, dim1);
}

Tensor FunctionalInverses::unsqueeze_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim) {
  return mutated_view.squeeze(dim);
}

Tensor FunctionalInverses::_indices_inverse(const Tensor& base, const Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call _indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::_values_inverse(const Tensor& base, const Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call _values() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::indices_inverse(const Tensor& base, const Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::values_inverse(const Tensor& base, const Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call values() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::crow_indices_inverse(const at::Tensor& base, const at::Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call crow_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::col_indices_inverse(const at::Tensor& base, const at::Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call col_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::unbind_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, int64_t dim) {
  dim = at::maybe_wrap_dim(dim, base.sizes().size());
  return base.select_scatter(mutated_view, dim, mutated_view_idx);
}

Tensor FunctionalInverses::view_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size) {
  return mutated_view.view(base.sizes());
}

Tensor FunctionalInverses::view_dtype_inverse(const Tensor& base, const Tensor& mutated_view, at::ScalarType dtype) {
  return mutated_view.view(base.scalar_type());
}

Tensor FunctionalInverses::unfold_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dimension, int64_t size, int64_t step) {
  // I think autograd and the functionalization pass want the exact same thing here, but need to test to confirm.
  return unfold_backward(mutated_view, base.sizes(), dimension, size, step);
}

Tensor FunctionalInverses::alias_inverse(const Tensor& base, const Tensor& mutated_view) {
  return mutated_view;
}

} // functionalization
} // at
117  aten/src/ATen/FunctionalStorageImpl.cpp  Normal file
@@ -0,0 +1,117 @@
#include <ATen/FunctionalStorageImpl.h>

#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <c10/util/Exception.h>
#include <vector>

namespace at {
namespace functionalization {

ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
  if (out_idx == this->out_index) return *this;
  return ViewMeta(forward_fn, reverse_fn, out_idx);
}

Alias::Alias(const at::Tensor& base) {
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base));
  base_ = base;
}

const at::Tensor& Alias::base() const {
  return base_;
}

void Alias::add_update(const at::Tensor& updated_val, const std::vector<ViewMeta>& metas) {
  updates_.push_back({updated_val, metas});
  generation_++;
}

// Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the Alias object.
// We start out with <original_base> and <mutated_view>, and our goal is to end up with <mutated_base>.
// Consider this program:
//
// base = ...
// a = base.view1()
// b = a.view2()
// c = b.view3()
// c.add_(3)
//
// Then the functionalization pass will queue an update as follows:
//
// update.new_val = c  # the updated value of c
// update.view_metas = [view1_meta, view2_meta, view3_meta]
//
// Syncing any of a, b or c will eventually call apply_update() on the alias, and the following will run:
//
// tmp_values = [base, a, b]  # NB: c is not necessary
// t = update.new_val
// t = view3_inverse(b, t, 0)  # 0 is the output index; these are all single-output views, so it's 0
// t = view2_inverse(a, t, 0)
// t = view1_inverse(base, t, 0)  # t now represents the updated alias.
// alias.base_ = t
const Tensor apply_update(const Alias::Update& update, const Tensor& base) {
  at::Tensor t = update.new_val;
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  std::vector<at::Tensor> tmp_values({base});
  for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
    at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
    // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided.
    // All of these ops require additional information to recover the sizes of the original tensor.
    // If we needed to, we could probably apply this optimization and only bother computing tmp_values
    // for those necessary view ops.
    tmp_values.push_back(std::move(next_view));
  }
  for (int i = update.view_metas.size() - 1; i >= 0; --i) {
    int64_t out_idx = update.view_metas[i].out_index;
    // Each view inverse is implemented in ViewInverses.cpp.
    t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
  }
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  return t;
}
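
For readers who want to see the replay loop in isolation, here is a self-contained sketch (hypothetical, not part of the diff) that mirrors apply_update(): it records the intermediate bases for a two-view chain, then walks the chain backwards through the corresponding *_scatter inverses:

#include <ATen/ATen.h>
#include <functional>
#include <vector>

void view_chain_replay_sketch() {
  using ViewFn = std::function<at::Tensor(const at::Tensor&)>;
  using InverseFn = std::function<at::Tensor(const at::Tensor&, const at::Tensor&)>;

  // A toy chain: b = base.select(0, 1); c = b.slice(0, 0, 2)
  std::vector<ViewFn> forward = {
      [](const at::Tensor& t) { return t.select(0, 1); },
      [](const at::Tensor& t) { return t.slice(0, 0, 2); },
  };
  std::vector<InverseFn> reverse = {
      [](const at::Tensor& b, const at::Tensor& v) { return b.select_scatter(v, 0, 1); },
      [](const at::Tensor& b, const at::Tensor& v) { return b.slice_scatter(v, 0, 0, 2, 1); },
  };

  at::Tensor base = at::zeros({3, 4});
  // Forward pass: record every intermediate base, like tmp_values in apply_update().
  std::vector<at::Tensor> tmp_values = {base};
  for (size_t i = 0; i + 1 < forward.size(); ++i) {
    tmp_values.push_back(forward[i](tmp_values.back()));
  }
  // Pretend the innermost view was mutated.
  at::Tensor t = forward[1](forward[0](base)) + 1;
  // Reverse pass: walk the chain backwards, applying each view inverse.
  for (int i = static_cast<int>(reverse.size()) - 1; i >= 0; --i) {
    t = reverse[i](tmp_values[i], t);
  }
  // t is now the "mutated base": the update is reflected at base[1, 0:2].
  TORCH_CHECK(t.sizes() == base.sizes());
}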

void Alias::apply_updates() {
  // N.B.: none of the tensors used in this function should be FunctionalTensorWrappers at this point.
  // The only reason we currently need the TLS exclude guard here is because of functorch's DynamicLayer stack.
  // It adds the Functionalize key into TLS before redispatching to the functionalization kernels,
  // which means that we need to explicitly exclude it here before doing any other work underneath the pass.
  at::AutoDispatchSkipFunctionalize guard;
  for (auto& update_data: updates_) {
    base_ = apply_update(update_data, base_);
  }
  updates_.clear();
}

FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& value)
  : c10::StorageImpl(
      c10::StorageImpl::use_byte_size_t(),
      value.numel() * value.dtype().itemsize(),
      DataPtr{nullptr, value.device()},
      // Using a null allocator, since FunctionalTensorImpls aren't resizeable.
      nullptr,
      /*resizeable=*/false
    ),
    alias_(Alias(value))
  {}

void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& view_metas) {
  alias_.add_update(updated_val, view_metas);
}

void FunctionalStorageImpl::apply_updates() {
  alias_.apply_updates();
}

const Tensor& FunctionalStorageImpl::base() {
  return alias_.base();
}

size_t FunctionalStorageImpl::generation() const {
  return alias_.generation();
}

} // namespace functionalization
} // namespace at

110  aten/src/ATen/FunctionalStorageImpl.h  Normal file
@@ -0,0 +1,110 @@
#pragma once

#include <ATen/Tensor.h>

namespace at {
namespace functionalization {

// See Note [Functionalization Pass In Core]

// ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta on b that looks like:
//
// ViewMeta(
//   [<captures>](const Tensor& base, int64_t mutated_view_idx) {
//     return base.view1(...);
//   },
//   [<captures>](const at::Tensor& base, const at::Tensor& mutated_view, int64_t mutated_view_idx) -> at::Tensor {
//     return at::functionalization::impl::view1_inverse(base, mutated_view, ...);
//   }
// )
//
// The forward_fn lambda describes how to replay view1 on a tensor.
//
// The reverse_fn lambda describes how, given a tensor that is already a view, to get the corresponding base tensor.
// See Note [Functionalization Pass: View Inverses] for details.
struct ViewMeta {
  ViewMeta(
    std::function<Tensor(const Tensor&, int64_t)> forward,
    std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
    int64_t out_idx = 0) :
      forward_fn(forward),
      reverse_fn(reverse),
      out_index(out_idx)
  {}

  std::function<Tensor(const Tensor&, int64_t)> forward_fn;
  std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
  // See Note [out_idx in ViewMeta]
  int64_t out_index;

  // Returns a copy of the current ViewMeta, if out_idx matches the current out_index.
  // Otherwise, returns a new ViewMeta with the same forward/reverse functions, but a new out index.
  ViewMeta to_out_idx(int64_t out_idx);
};
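
As a concrete (hypothetical) instance of that signature, a ViewMeta for `b = a.transpose(0, 1)` could be built as below; in the real pass these lambdas are generated by codegen for every view op, so this is illustration only:

#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

// Sketch only - transpose is its own inverse, so reverse_fn simply transposes back.
at::functionalization::ViewMeta make_transpose_meta() {
  return at::functionalization::ViewMeta(
      [](const at::Tensor& base, int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.transpose(0, 1);          // replay the view on a new base
      },
      [](const at::Tensor& /*base*/, const at::Tensor& mutated_view, int64_t /*mutated_view_idx*/) -> at::Tensor {
        return mutated_view.transpose(0, 1);  // recover the base from the (mutated) view
      });
}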

// Alias represents the state shared by (potentially multiple) views of the same tensor.
// For example, in the following code:
//
// b = a.view1(...)
// c = b.view2(...)
// b.add_(1)
// --> alias.add_update(b, {view1_meta})
//
// The call to add_(1) will result in a call to alias.add_update(b, {view1_meta}), queueing up
// the mutation from b onto the alias.
// Later, suppose c is used in an expression (e.g. you try to print c, or pass it to an operator).
// Doing so will involve "syncing" c.
// First we apply any pending updates to the alias, and then we regenerate c
// by replaying its views off of the updated alias. E.g.:
//
// print(str(c))
// --> c.sync_()
// --> alias.apply_updates()  // after this, the alias will be updated to reflect the mutation to b
class Alias {
 public:
  struct Update {
    const at::Tensor& new_val;
    const std::vector<ViewMeta>& view_metas;
  };
  explicit Alias(const at::Tensor& base);
  const at::Tensor& base() const;
  size_t generation() const { return generation_; }
  void add_update(const at::Tensor& updated_val, const std::vector<ViewMeta>& metas);
  void apply_updates();
 private:
  // NB: base_ should always point to a tensor BELOW the current functionalization layer.
  // This is mainly to avoid reference cycles.
  // e.g. given `b = a.view(...)`
  // Both a.storage_ and b.storage_ are a FunctionalStorageImpl containing an Alias, which contains a Tensor `base_`.
  // In this case (where a and b are FunctionalTensorWrappers), base_ should point not to a, but to a's unwrapped value, `a.value_`.
  // See Note [Functionalization: Alias Removal] for a diagram that shows this visually.
  at::Tensor base_;
  std::vector<Update> updates_;
  // generation_ gets incremented every time a mutation is queued onto the alias.
  // It is used to determine if a given tensor is "up to date", or if it needs to be regenerated from the alias.
  size_t generation_ = 0;
};
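
Putting the two pieces together, here is a sketch of the lazy-update contract (again hypothetical, reusing the make_transpose_meta() helper sketched above): add_update() only queues work and bumps the generation; nothing touches base_ until apply_updates() runs.

#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

void alias_update_sketch() {
  at::Tensor base = at::zeros({2, 3});
  at::functionalization::Alias alias(base);

  // Queue a mutation that was made through a transposed view of the base.
  std::vector<at::functionalization::ViewMeta> metas = {make_transpose_meta()};
  at::Tensor mutated_view = base.transpose(0, 1) + 1;
  alias.add_update(mutated_view, metas);  // bumps alias.generation(), but applies nothing yet

  // A wrapper whose generation_ is stale would now call apply_updates()
  // (and then replay its own views) to catch up.
  alias.apply_updates();
  // alias.base() now reflects the mutation that was made through the view.
}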

// FunctionalStorageImpl is a subclass of StorageImpl used by the functionalization pass.
// It has no underlying data (similar to meta storage).
// It also knows how to reflect mutations to tensors in the absence of a valid data pointer.
// It does this by separately storing an Alias object, which knows how to reflect mutations
// that may have happened to views of the original tensor.
struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
  explicit FunctionalStorageImpl(const Tensor& value);

  void add_update(const Tensor& updated_val, const std::vector<ViewMeta>& view_metas);
  void apply_updates();
  const Tensor& base();
  size_t generation() const;

  ~FunctionalStorageImpl() override = default;

 private:
  at::functionalization::Alias alias_;
};

} // namespace functionalization
} // namespace at
377  aten/src/ATen/FunctionalTensorWrapper.cpp  Normal file
@@ -0,0 +1,377 @@

#include <ATen/FunctionalTensorWrapper.h>

#include <ATen/FunctionalInverses.h>
#include <ATen/TensorUtils.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <c10/util/Exception.h>

#include <c10/util/irange.h>

namespace at {

void FunctionalTensorWrapper::set_constructor_metadata() {
  TORCH_INTERNAL_ASSERT(value_.defined());
  // Note: "level" is a concept that we don't know how to compute in core.
  // For now I'm retroactively setting this in functorch,
  // but once Open Multiple Dispatch lands we should be able to calculate this in core.
  level_ = -1;
  // shallow_copy_from overwrites the storage and dispatch keyset...
  auto functional_storage = storage_;
  shallow_copy_from(value_.getIntrusivePtr());
  storage_ = functional_storage;
  storage_access_should_throw_ = false;
  key_set_ = c10::DispatchKeySet(c10::DispatchKey::Functionalize) | value_.key_set();
}

FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& value)
  : c10::TensorImpl(
      c10::Storage(c10::make_intrusive<functionalization::FunctionalStorageImpl>(value)),
      c10::DispatchKeySet(DispatchKey::Functionalize) | value.key_set(),
      value.dtype()
    ),
    value_(value)
{
  set_constructor_metadata();
}

// Note [Functionalization: Alias Removal]
// When someone calls a view() op during the functionalization pass, e.g. 'b = a.view(...)',
// we link `b` and `a` to a shared Alias object to preserve the aliasing relationship.
//
// How do we do that?
//
// Every FunctionalTensorWrapper contains a dummy FunctionalStorageImpl, which subclasses from c10::StorageImpl.
// It doesn't contain any data (similar to MetaTensor storage), but it contains an Alias object that knows about the base tensor.
// When a tensor is created through a view operation, both the new and old tensor point to the same FunctionalStorageImpl.
//
// As mutations are applied to any of the views, we also queue each mutation up on the Alias object, so we can replay them.
// When the user requests a tensor that's had a view taken, we check if it's up to date.
// If it's not up to date, we first replay all of the queued up mutations onto the alias, and then re-apply the current view
// on top of the newly updated alias.
//
// Why do we queue up and lazily run mutations on the alias, instead of updating the alias eagerly?
// This behavior was taken from pytorch/xla, which the alias-removal logic was inspired by.
// One benefit of the laziness is that we save work in the cases where a user has multiple views and mutates one of them,
// but never uses the other views later in the program (in which case we'll never update the alias).
// It also has downsides though: repeatedly applying mutations to the same view without syncing
// will silently use up more and more memory as more mutations are queued up.
//
// Corresponding diagram:
//
// b = a.view(...)
//
//        a                                                      b
//        |                                                      |
//        |     If the user asks for b and it's out of date,     |
//       \/     we regenerate b by replaying its views          \/
//              from the alias.
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - .
//  |  FunctionalTensorWrapper  |              |  FunctionalTensorWrapper  |
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - .
//  |     value   |   storage   |              |   storage   |    value    |
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - .
//         |               \                          /               |
//         |                 \                      /                 |
//         |                  . - - - - - - - - - - - - .             |
//         |                  |  FunctionalStorageImpl  |             |
//         |                  . - - - - - - - - - - - - .             |
//         |                  |          Alias          |             |
//         |                  . - - - - - - - - - - - - .             |
//         |                 /   mutations to a or b                  |
//         |               /     are queued onto the Alias            |
//         |             /                                            |
//        \/           /                                             \/
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - - - .
//  |         TensorImpl        |              |           TensorImpl          |
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - - - .
//  |     value   |   storage   |              |   storage   |     value       |
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - - - .
//         |                                          |
//         |                                          |
//         |   In this picture the two tensor views   |
//         |   have their own storages, but backends  |
//         |   like functorch are allowed to re-alias |
//        \/   underneath the pass.                  \/
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - - - .
//  |     underlying_storage    |              |       underlying_storage      |
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - - - .
//
// This constructor is only used by view ops.
// - view_value: The output tensor that we need to wrap.
// - base: The "base" of the view that `view_value` was generated from.
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, functionalization::ViewMeta meta)
  : c10::TensorImpl(
      c10::DispatchKeySet(DispatchKey::Functionalize),
      view_value.dtype(),
      view_value.device()
    ),
    value_(view_value)
{
  set_constructor_metadata();
  // Copy the original tensor's ViewMeta vector and push the current one.
  if (base->view_metas_.size() > 0) {
    view_metas_ = base->view_metas_;  // copy
  }
  view_metas_.push_back(meta);
  storage_ = base->storage_;  // alias this tensor's storage with the base tensor's
}
functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
  return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
}

void FunctionalTensorWrapper::commit_update() {
  auto storage_impl = functional_storage_impl();
  storage_impl->add_update(value_, view_metas_);
  // Invariant: commit_update() is called during an inplace operation.
  // Tensor inputs to the operation are synced before running the op,
  // so the current tensor must be up-to-date with its alias at this point.
  generation_ = storage_impl->generation();
}

bool FunctionalTensorWrapper::is_aliased() const {
  // Two FunctionalTensorWrapper objects are aliased if they share storage.
  // That means that we can check if a given FunctionalTensorWrapper is aliased
  // by checking the reference count on its storage.
  return storage_.use_count() > 1;
}

bool FunctionalTensorWrapper::is_up_to_date() const {
  if (!is_aliased()) return true;
  auto alias_generation = functional_storage_impl()->generation();
  return generation_ == alias_generation;
}

// See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(at::functionalization::ViewMeta meta) {
  view_metas_.push_back(meta);
  // Note [Functionalization Pass - Inplace View Ops]
  // So, these ops are special - they're mutation AND view ops. They get special codegen.
  // An example is transpose_, e.g. `a.transpose_()`
  // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
  // We also need to force a sync (even if a is already up to date), because a's underlying tensor hasn't actually
  // been updated to reflect the new view yet.
  regenerate_from_base();
}

// Note [Functionalization: Mutation Removal]
// Mutation removal is used to take a program like this:
//
// a.add_(b)
//
// and replace it with a slightly different program that has the same semantics:
//
// tmp = a.add(b)
// a.replace_(tmp)
//
// Where the replace_() call is implemented directly in the functionalization pass, so it is transparent to the backend.
// This is useful for backends that aren't able to handle certain types of mutations, like functorch.
//
// Why do we need to wrap every tensor in a FunctionalTensorWrapper? Consider this program:
//
// Before:
// tensor.add_(batched_tensor)
//
// After:
// tmp = tensor.add(batched_tensor)
// tensor.replace_(tmp)
//
// In the above, tmp is a batched tensor (because adding a normal tensor to a batched tensor does broadcasting and creates a batched tensor).
// But we can't just replace the underlying memory backing `tensor` with `tmp` - a batched tensor takes up more space!
// Instead, every input, intermediate and output of the program is wrapped in a FunctionalTensorWrapper, which wraps the underlying tensor.
void FunctionalTensorWrapper::replace_(const Tensor& other) {
  // TODO: going to need to change this if we want nested functionalize() transforms.
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(other));
  value_ = other;
}
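
The codegen'd functionalization kernels themselves are not shown in this diff, but the shape of the rewrite can be sketched by hand. The helper below is illustrative only (the name and exact call sequence are assumptions, not the generated code); it assumes both inputs are already functional-wrapped:

#include <ATen/ATen.h>
#include <ATen/FunctionalTensorWrapper.h>

at::Tensor& functionalized_add__sketch(at::Tensor& self, const at::Tensor& other) {
  auto* self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
  at::Tensor tmp;
  {
    // Run the out-of-place op on the unwrapped tensors, below the Functionalize key.
    at::AutoDispatchSkipFunctionalize guard;
    tmp = self_impl->value().add(at::functionalization::impl::from_functional_tensor(other));
  }
  self_impl->replace_(tmp);    // swap in the new value: a.replace_(tmp)
  self_impl->commit_update();  // queue the mutation onto the shared alias
  return self;
}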

void FunctionalTensorWrapper::sync_() {
  if (is_up_to_date()) {
    return;
  }
  apply_updates();
  regenerate_from_base();
}

void FunctionalTensorWrapper::regenerate_from_base() {
  at::AutoDispatchSkipFunctionalize guard;
  auto storage_impl = functional_storage_impl();
  auto t = storage_impl->base();
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  // Reapply views to get the viewed tensor from the base in alias_
  for (auto& view_meta: view_metas_) {
    t = view_meta.forward_fn(t, view_meta.out_index);
  }
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  replace_(t);
  generation_ = storage_impl->generation();
}

void FunctionalTensorWrapper::apply_updates() {
  // Apply all updates on alias_
  auto storage_impl = functional_storage_impl();
  storage_impl->apply_updates();
}

const char* FunctionalTensorWrapper::tensorimpl_type_name() const {
  return "FunctionalTensorWrapper";
}

namespace functionalization {
namespace impl {

Tensor to_functional_tensor(const Tensor& tensor) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!isFunctionalTensor(tensor));
  return at::detail::make_tensor<FunctionalTensorWrapper>(tensor);
}
TensorList to_functional_tensor(const c10::List<Tensor>& t_list) {
  std::vector<Tensor> outputs(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs[i] = to_functional_tensor(t_list[i]);
  }
  return outputs;
}
std::vector<Tensor> to_functional_tensor(const std::vector<Tensor>& t_list) {
  std::vector<Tensor> outputs(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs[i] = to_functional_tensor(t_list[i]);
  }
  return outputs;
}
TensorList to_functional_tensor(const TensorList& t_list) {
  std::vector<Tensor> outputs(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs[i] = to_functional_tensor(t_list[i]);
  }
  return outputs;
}

Tensor from_functional_tensor(const Tensor& tensor) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(tensor));
  auto impl = unsafeGetFunctionalWrapper(tensor);
  return impl->value();
}
c10::optional<Tensor> from_functional_tensor(const c10::optional<Tensor>& t) {
  if (t.has_value()) {
    return c10::make_optional<Tensor>(from_functional_tensor(*t));
  }
  return c10::nullopt;
}
c10::List<Tensor> from_functional_tensor(const c10::List<Tensor>& t_list) {
  c10::List<Tensor> outputs;
  outputs.reserve(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs.push_back(from_functional_tensor(t_list[i]));
  }
  return outputs;
}
c10::List<c10::optional<Tensor>> from_functional_tensor(const c10::List<c10::optional<Tensor>>& t_list) {
  c10::List<c10::optional<Tensor>> outputs;
  outputs.reserve(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs.push_back(from_functional_tensor(t_list[i]));
  }
  return outputs;
}
TensorList from_functional_tensor(const TensorList& t_list) {
  std::vector<Tensor> outputs(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs.push_back(from_functional_tensor(t_list[i]));
  }
  return outputs;
}

void sync(const Tensor& t) {
  if (t.unsafeGetTensorImpl()->is_wrapped_number()) {
    // Unfortunately, we can't easily guarantee that wrapped numbers (scalar-tensors)
    // get wrapped up in a FunctionalTensorWrapper object, since they skip the dispatcher.
    // That shouldn't matter, since I don't think we're allowed to assign to wrapped numbers anyway.
    return;
  }
  // Not every tensor that hits a functionalization kernel is necessarily a functional tensor.
  // For example, xla_tensor.copy_(cpu_tensor) needs to hit the functionalization kernel
  // to sync xla_tensor, but not cpu_tensor.
  if (!at::functionalization::impl::isFunctionalTensor(t)) {
    return;
  }
  auto functional_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(t);
  functional_impl->sync_();
}
void sync(const c10::optional<Tensor>& t) {
  if (t.has_value()) {
    sync(*t);
  }
}
void sync(const c10::List<Tensor> t_list) {
  for (const auto i : c10::irange(t_list.size())) {
    sync(t_list[i]);
  }
}
void sync(const at::TensorList t_list) {
  for (auto t: t_list) {
    sync(t);
  }
}
void sync(const c10::List<c10::optional<Tensor>> t_list) {
  for (const auto i : c10::irange(t_list.size())) {
    sync(t_list[i]);
  }
}

Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
  auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
  if (out_idx != 0) {
    // Note [out_idx in ViewMeta]
    // When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
    // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
    meta = meta.to_out_idx(out_idx);
  }
  return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
}

std::vector<Tensor> create_functional_tensor_with_view_meta(const c10::List<at::Tensor>& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta) {
  std::vector<Tensor> outputs(view_to_wrap.size());
  for (const auto i : c10::irange(view_to_wrap.size())) {
    outputs[i] = create_functional_tensor_with_view_meta(view_to_wrap[i], base, meta, i);
  }
  return outputs;
}

std::vector<Tensor> create_functional_tensor_with_view_meta(const std::vector<at::Tensor>& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta) {
  std::vector<Tensor> outputs(view_to_wrap.size());
  for (const auto i : c10::irange(view_to_wrap.size())) {
    outputs[i] = create_functional_tensor_with_view_meta(view_to_wrap[i], base, meta, i);
  }
  return outputs;
}

void mutate_view_meta(const at::Tensor& self, functionalization::ViewMeta meta) {
  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
  auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
  self_impl->mutate_view_meta(meta);
}

// Note [Propagating strides in the functionalization pass]
// In order to properly compute stride information, the functionalization pass
// calls each {view} reference implementation with meta tensors.
// The output meta tensor's stride info serves as a reference for what the correct strides should be.
void set_sizes_strides_offset(const Tensor& out, const Tensor& reference_out) {
  out.unsafeGetTensorImpl()->set_sizes_and_strides(reference_out.sizes(), reference_out.strides());
  out.unsafeGetTensorImpl()->set_storage_offset(reference_out.storage_offset());
}

void set_sizes_strides_offset(const std::vector<Tensor>& outs, const std::vector<Tensor>& reference_outs) {
  TORCH_INTERNAL_ASSERT(outs.size() == reference_outs.size());
  for (const auto i : c10::irange(reference_outs.size())) {
    set_sizes_strides_offset(outs[i], reference_outs[i]);
  }
}
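
A small sketch of what set_sizes_strides_offset() accomplishes, using an eager transpose as the reference instead of a meta-tensor replay (illustrative only):

#include <ATen/ATen.h>
#include <ATen/FunctionalTensorWrapper.h>

void stride_propagation_sketch() {
  at::Tensor base = at::zeros({2, 3});
  at::Tensor reference_out = base.transpose(0, 1);  // sizes {3, 2}, strides {1, 3}
  at::Tensor out = at::zeros({3, 2});               // contiguous: strides {2, 1}
  at::functionalization::impl::set_sizes_strides_offset(out, reference_out);
  // out now reports the reference geometry instead of its contiguous strides.
  TORCH_CHECK(out.strides() == reference_out.strides());
}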

} // namespace impl
} // namespace functionalization
} // namespace at

155  aten/src/ATen/FunctionalTensorWrapper.h  Normal file
@@ -0,0 +1,155 @@

#pragma once

#include <ATen/ArrayRef.h>
#include <ATen/core/List.h>
#include <ATen/FunctionalStorageImpl.h>

#include <c10/core/DispatchKey.h>

namespace at {

// Note [Functionalization Pass In Core]
// The Functionalization pass is used to remove aliasing from a pytorch program.
//
// This is useful for backends that don't support aliasing, like XLA and Vulkan.
// It's also necessary in order to remove mutation from a program, which is needed in Functorch.
//
// Consider this program:
// a = torch.ones(...)
// b = a.view(...)
// b.add_(1)
//
// In this program, b is meant to alias with a due to the use of view(). At the end of the program, both a and b are full of 2's.
// However, backends that don't support aliasing aren't able to correctly implement the view() operator.
// Instead, they can opt into the Functionalization pass, which will sit between the user and the backend,
// and provide the necessary aliasing logic.
//
// The functionalization pass will turn the above program into a slightly different program that has the same semantics,
// transparently to the user, that backends like XLA/Vulkan are able to implement:
// a = torch.ones(...)
// b = a.view_copy(...)  # view() replaced with view_copy(). Backends like XLA/Vulkan can implement this!
// b.add_(1)
// a.add_(1)  # Our functionalization pass machinery knows that a and b are aliased - it applies b's mutation to a too.
//
// So, how does the functionalization pass keep track of which tensors are aliased?
// The pass works by wrapping EVERY tensor in the program inside of a FunctionalTensorWrapper, which knows about its alias'd tensors.
//
// See Note [Functionalization: Alias Removal] for details on the aliasing machinery.
// See Note [Functionalization: Mutation Removal] for details on mutation removal.
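
The claimed equivalence is easy to check on eager tensors. The sketch below (not part of the diff) uses clone() as a stand-in for view_copy() and verifies that both programs end in the same state:

#include <ATen/ATen.h>

void functionalization_equivalence_sketch() {
  // Original program: b aliases a.
  at::Tensor a1 = at::ones({4});
  at::Tensor b1 = a1.view({2, 2});
  b1.add_(1);

  // Functionalized form: the view becomes a copy, and the mutation is replayed onto a.
  at::Tensor a2 = at::ones({4});
  at::Tensor b2 = a2.view({2, 2}).clone();  // stand-in for view_copy()
  b2.add_(1);
  a2.add_(1);

  TORCH_CHECK(at::allclose(a1, a2) && at::allclose(b1, b2));
}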

struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  explicit FunctionalTensorWrapper(const Tensor& value);
  // Additional constructor to create a FunctionalTensorWrapper directly from an underlying tensor that was created from a view.
  // For example, the code b = a.view1() will generate a constructor call to FunctionalTensorWrapper(b, a, view1_meta)
  explicit FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, functionalization::ViewMeta meta);

  // Get the underlying, actual tensor, that doesn't know anything about functionalization.
  const Tensor& value() const { return value_; };
  // The concept of "level" is only ever important to functorch; it's exposed here
  // as more of a hook for functorch to use.
  int64_t level() const { return level_; };
  void set_level(int64_t level) { level_ = level; }

  // Syncs the underlying tensor with its alias, if it's out of date. This involves two steps:
  // 1) Apply any pending updates/mutations to the alias
  // 2) Replay the views (if any) to regenerate the current tensor off of the updated alias.
  void sync_();
  // Performs step (1) of the sync. This is its own public API because it's needed by view_inplace ops like transpose_.
  // See Note [Functionalization Pass - Inplace View Ops]
  void regenerate_from_base();
  // Performs step (2) of the sync. This is its own public API because it's needed by functorch.
  // functorch wants to make sure that all input tensors to a functionalized program have been properly synced
  // so it can properly propagate mutations to inputs.
  // It can't just call sync_(), because the FunctionalTensorWrapper will look like it has no aliases and sync_ will be a no-op.
  // We use the reference count on storage_ to determine if the wrapper is aliased, and by the time functorch
  // is ready to propagate updates to inputs, any intermediate views of the input created by the program will have been deallocated.
  void apply_updates();
  // Takes the current state of value_ and snapshots it, sending it as a pending update to the alias.
  void commit_update();
  // When any tensor is mutated, the tensor increments its alias's "generation".
  // Separately, each tensor maintains its own "generation" counter, which is used to determine if it's up-to-date with its alias.
  // The act of syncing a tensor will set the tensor's generation equal to its alias's generation.
  bool is_up_to_date() const;
  // Every FunctionalTensorWrapper contains a vector of ViewMeta objects describing the series of view ops that ran
  // to generate the current tensor from the base tensor.
  // This method is used by inplace-view ops like transpose_.
  // It appends a ViewMeta to the existing stack, and refreshes the tensor by replaying the views off of the alias.
  void mutate_view_meta(at::functionalization::ViewMeta meta);

  // The functionalization pass can be used to remove mutations.
  // It does so by replacing any mutation op with its corresponding out-of-place op, followed by a call to replace_().
  // e.g.:
  //
  // a.add_(1)
  //
  // will turn into:
  //
  // tmp = a.add(1)
  // a.replace_(tmp)
  //
  // replace_() swaps out the wrapped tensor, value_, with tmp.
  void replace_(const Tensor& other);

  ~FunctionalTensorWrapper() override = default;

 private:
  const char* tensorimpl_type_name() const override;
  // Returns true if this FunctionalTensorWrapper is aliased with any other FunctionalTensorWrapper objects.
  // During a functionalization pass, if we have `b = a.view()`, then a and b should both report as aliased.
  bool is_aliased() const;
  void set_constructor_metadata();
  functionalization::FunctionalStorageImpl* functional_storage_impl() const;

  // Note that value is not taken by reference: internally, the wrapper will change the value tensor that it points to over time.
  Tensor value_;
  int64_t level_;

  size_t generation_ = 0;
  std::vector<at::functionalization::ViewMeta> view_metas_;
};

// Utility functions for the functionalization pass.

namespace functionalization {
namespace impl {

TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(const Tensor& tensor) {
  auto functional_impl = static_cast<FunctionalTensorWrapper*>(tensor.unsafeGetTensorImpl());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_impl != nullptr);
  return functional_impl;
}

TORCH_API inline bool isFunctionalTensor(const at::Tensor& tensor) {
  return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize);
}
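
A quick sketch of how the helpers declared here fit together (illustrative; the function name is made up):

#include <ATen/ATen.h>
#include <ATen/FunctionalTensorWrapper.h>

void wrapper_roundtrip_sketch() {
  at::Tensor t = at::ones({2, 2});
  at::Tensor f = at::functionalization::impl::to_functional_tensor(t);

  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(f));
  auto* wrapper = at::functionalization::impl::unsafeGetFunctionalWrapper(f);
  at::Tensor inner = wrapper->value();  // the plain tensor the wrapper currently points at
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(inner));

  at::Tensor unwrapped = at::functionalization::impl::from_functional_tensor(f);
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(unwrapped));
}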

TORCH_API Tensor to_functional_tensor(const Tensor& tensor);
TORCH_API TensorList to_functional_tensor(const c10::List<Tensor>& t_list);
TORCH_API std::vector<Tensor> to_functional_tensor(const std::vector<Tensor>& t_list);
TORCH_API TensorList to_functional_tensor(const TensorList& t_list);

TORCH_API Tensor from_functional_tensor(const Tensor& tensor);
TORCH_API c10::optional<Tensor> from_functional_tensor(const c10::optional<Tensor>& t);
TORCH_API c10::List<Tensor> from_functional_tensor(const c10::List<Tensor>& t_list);
TORCH_API c10::List<c10::optional<Tensor>> from_functional_tensor(const c10::List<c10::optional<Tensor>>& t_list);
TORCH_API TensorList from_functional_tensor(const TensorList& tensors);

TORCH_API void sync(const at::Tensor& t);
TORCH_API void sync(const c10::optional<Tensor>& t);
TORCH_API void sync(const c10::List<Tensor> t_list);
TORCH_API void sync(const at::TensorList t_list);
TORCH_API void sync(const c10::List<c10::optional<Tensor>> t_list);

Tensor create_functional_tensor_with_view_meta(const Tensor& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta, int64_t out_idx = 0);
std::vector<Tensor> create_functional_tensor_with_view_meta(const c10::List<Tensor>& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta);
std::vector<Tensor> create_functional_tensor_with_view_meta(const std::vector<Tensor>& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta);

void mutate_view_meta(const Tensor& self, functionalization::ViewMeta meta);

void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
void set_sizes_strides_offset(const std::vector<Tensor>& outs, const std::vector<Tensor>& meta_outs);

} // namespace impl
} // namespace functionalization
} // namespace at

@@ -84,6 +84,13 @@ struct TORCH_API AutoNonVariableTypeMode {
  c10::impl::ExcludeDispatchKeyGuard autograd_guard_;
};

struct TORCH_API AutoDispatchSkipFunctionalize {
  AutoDispatchSkipFunctionalize() :
    dispatch_key_guard_(c10::DispatchKeySet(c10::DispatchKey::Functionalize)) {
  }
  c10::impl::ExcludeDispatchKeyGuard dispatch_key_guard_;
};
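
Usage is the usual RAII pattern; a hedged sketch (the surrounding function and its tensor arguments are placeholders, not code from this PR):

#include <ATen/ATen.h>
#include <ATen/core/LegacyTypeDispatch.h>

void skip_functionalize_sketch(const at::Tensor& unwrapped_self, const at::Tensor& unwrapped_other) {
  // While the guard is alive, the Functionalize key is excluded from TLS,
  // so these calls dispatch straight to the kernels underneath the pass.
  at::AutoDispatchSkipFunctionalize guard;
  at::Tensor tmp = unwrapped_self.add(unwrapped_other);
  (void)tmp;
}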

/* Note [AutoDispatchBelowADInplaceOrView]
 * AutoDispatchBelowADInplaceOrView is equivalent to AutoNonVariableTypeMode
 * before we split inplace & view ops out of VariableType kernel.

@@ -1,6 +1,8 @@
#include <ATen/core/Tensor.h>
#include <ATen/core/Formatting.h>
#include <ATen/core/VariableHooksInterface.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/FunctionalTensorWrapper.h>

#include <iostream>

@@ -113,7 +115,7 @@ const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
  return *this;
}

// View Variables
// View Methods
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

bool TensorBase::is_view() const {

@@ -1,5 +1,6 @@
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <torch/library.h>

/*

@@ -60,6 +61,55 @@ TORCH_LIBRARY_IMPL(_, AutogradMLC, m) {
  m.fallback(torch::CppFunction::makeFallthrough());
}

namespace {
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet, torch::jit::Stack* stack) {
  const auto& schema = op.schema();
  TORCH_INTERNAL_ASSERT(!schema.hasAnyAliasInfo(), "mutating and aliasing ops should all have codegen'd kernels");
  const auto num_arguments = schema.arguments().size();
  const auto arguments_begin = stack->size() - num_arguments;
  auto arguments = torch::jit::last(stack, num_arguments);

  for (uint64_t idx = 0; idx < num_arguments; ++idx) {
    const auto& ivalue = arguments[idx];
    if (ivalue.isTensor()) {
      at::Tensor t = ivalue.toTensor();
      at::functionalization::impl::sync(t);
      auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
      (*stack)[arguments_begin + idx] = t_new;
    } else if (ivalue.isTensorList()) {
      auto tensors = ivalue.toTensorList();
      at::functionalization::impl::sync(tensors);
      auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(tensors));
      (*stack)[arguments_begin + idx] = t_new;
    }
  }
  {
    at::AutoDispatchSkipFunctionalize guard;
    op.redispatchBoxed(dispatchKeySet & c10::after_func_keyset, stack);
  }
  const auto num_returns = schema.returns().size();
  const auto returns_begin = stack->size() - num_returns;
  auto returns = torch::jit::last(stack, num_returns);

  for (uint64_t idx = 0; idx < num_returns; ++idx) {
    const auto& ivalue = returns[idx];
    if (ivalue.isTensor()) {
      at::Tensor t = ivalue.toTensor();
      auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
      (*stack)[returns_begin + idx] = t_new;
    } else if (ivalue.isTensorList()) {
      auto tensors = ivalue.toTensorList();
      auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(tensors));
      (*stack)[returns_begin + idx] = t_new;
    }
  }
}
}

TORCH_LIBRARY_IMPL(_, Functionalize, m) {
  m.fallback(torch::CppFunction::makeFromBoxedFunction<&functionalizeFallback>());
}

// see Note [ADInplaceOrView key]
TORCH_LIBRARY_IMPL(_, ADInplaceOrView, m) {
  m.fallback(torch::CppFunction::makeFallthrough());
@@ -197,6 +197,8 @@ _(aten, atan2) \
_(aten, atleast_1d) \
_(aten, atleast_2d) \
_(aten, atleast_3d) \
_(aten, _autocast_to_reduced_precision) \
_(aten, _autocast_to_full_precision) \
_(aten, avg_pool1d) \
_(aten, avg_pool2d) \
_(aten, avg_pool2d_backward) \

@@ -23,22 +23,14 @@ struct BuiltinOpFunction : public Function {
    TORCH_INTERNAL_ASSERT(schema_.returns().size() == 1);
  }

  const std::string& doc_string() const override {
  c10::string_view doc_string() const override {
    return doc_string_;
  }

  bool isGraphFunction() const override {
    return false;
  }

  void run(Stack& stack) override {
    callable_(stack);
  }

  void run(Stack&& stack) override {
    callable_(stack);
  }

  c10::intrusive_ptr<c10::ivalue::Future> runAsync(
      Stack& stack,
      TaskLauncher /* not used */) override {

@@ -48,50 +40,15 @@ struct BuiltinOpFunction : public Function {
    return res;
  }

  at::IValue operator()(std::vector<at::IValue> stack, const Kwargs& kwargs)
      override {
    getSchema().checkAndNormalizeInputs(stack, kwargs);
    callable_(stack);
    return stack.front();
  }

  const c10::QualifiedName& qualname() const override {
    return name_;
  }

  const std::string& name() const override {
    return name_.name();
  }

  // if this isn't yet defined, run its method_creator function
  void ensure_defined() override {
    // nop
  }

  std::shared_ptr<Graph> graph() const override {
    TORCH_INTERNAL_ASSERT(false, "BuiltinFunction had a graph requested "
        "from it. This probably indicates that the JIT calling context needs a "
        "special case on Function::isGraphFunction()");
  }

  std::shared_ptr<Graph> optimized_graph() const override {
    TORCH_INTERNAL_ASSERT(false, "BuiltinFunction had a graph requested "
        "from it. This probably indicates that the JIT calling context needs a "
        "special case on Function::isGraphFunction()");
  }

  void clear_execution_info() override {
    TORCH_INTERNAL_ASSERT(false, "BuiltinFunction had a graph requested "
        "from it. This probably indicates that the JIT calling context needs a "
        "special case on Function::isGraphFunction()");
  }

  GraphExecutor& get_executor() override {
    TORCH_INTERNAL_ASSERT(false, "BuiltinFunction had a GraphExecutor requested "
        "from it. This probably indicates that the JIT calling context needs a "
        "special case on Function::isGraphFunction()");
  }

  const c10::FunctionSchema& getSchema() const override {
    return schema_;
  }

@@ -100,29 +57,21 @@ struct BuiltinOpFunction : public Function {
    return schema_.arguments().size();
  }

  void check_single_output() override {
    TORCH_CHECK(schema_.returns().size() == 1);
  }

  std::string pretty_print_schema() const override {
#ifdef __NVCC__
    // Disable the "statement is unreachable" warning
#pragma diag_suppress code_is_unreachable
#endif

    TORCH_INTERNAL_ASSERT(false);
    return "";

#ifdef __NVCC__
#pragma diag_default code_is_unreachable
#endif
  }

  Function& setSchema(c10::FunctionSchema schema) override {
    schema_ = std::move(schema);
    return *this;
  }

  bool call(Stack& stack, size_t, c10::function_ref<void(const Code&)>) override {
    run(stack);
    return false;
  }

  bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)>) override {
    run(stack);
    return false;
  }

  ~BuiltinOpFunction() override {}

 private:
@@ -1,8 +1,10 @@
#pragma once

#include <ATen/core/function_schema.h>
#include <ATen/core/ivalue.h>
#include <ATen/core/qualified_name.h>
#include <mutex>
#include <c10/util/Exception.h>
#include <c10/util/FunctionRef.h>

namespace c10 {
struct FunctionSchema;

@@ -16,7 +18,11 @@ namespace torch {
namespace jit {

struct Graph;
struct GraphExecutor;
struct Code;

namespace mobile {
struct Code;
}

using Stack = std::vector<at::IValue>;
using Kwargs = std::unordered_map<std::string, at::IValue>;

@@ -30,50 +36,70 @@ TORCH_API void preoptimizeGraph(std::shared_ptr<Graph>& graph);
// execution of the function. Method is a wrapper around an
// underlying Function that also provides a `self` object.
struct TORCH_API Function {
  virtual const std::string& doc_string() const {
    static const std::string no_doc_string = "";
  virtual c10::string_view doc_string() const {
    static constexpr c10::string_view no_doc_string = "";
    return no_doc_string;
  }

  virtual bool isGraphFunction() const = 0;
  virtual bool isGraphFunction() const {
    return false;
  }

  virtual void run(Stack& stack) = 0;

  virtual void run(Stack&& stack) = 0;

  virtual c10::intrusive_ptr<c10::ivalue::Future> runAsync(
      Stack& stack,
      TaskLauncher taskLauncher = at::launch) = 0;
      TaskLauncher taskLauncher = at::launch) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
    return {};
  }

  virtual at::IValue operator()(
      std::vector<at::IValue> stack,
      const Kwargs& kwargs = Kwargs()) = 0;
  at::IValue operator()(
      Stack stack,
      const Kwargs& kwargs = Kwargs()) {
    getSchema().checkAndNormalizeInputs(stack, kwargs);
    run(stack);
    return stack.front();
  }

  virtual const c10::QualifiedName& qualname() const = 0;

  virtual const std::string& name() const = 0;
  const std::string& name() const {
    return qualname().name();
  }

  // if this isn't yet defined, run its method_creator function
  virtual void ensure_defined() = 0;

  virtual std::shared_ptr<Graph> graph() const = 0;

  virtual std::shared_ptr<Graph> optimized_graph() const = 0;

  virtual void clear_execution_info() = 0;

  virtual GraphExecutor& get_executor() = 0;

  virtual const c10::FunctionSchema& getSchema() const = 0;

  virtual size_t num_inputs() const = 0;

  virtual void check_single_output() = 0;

  virtual std::string pretty_print_schema() const = 0;

  virtual Function& setSchema(c10::FunctionSchema schema) = 0;

  // call() defines how different interpreter implementations interact with
  // Function objects. Basically, interpreters need to provide a callback to
  // communicate to Functions what to do if provided a Code object.
  // Alternatively we could design the signature to return an optional Code
  // object, but that requires special handling of the null case in the interpreter,
  // and the fallback behavior is not well defined by the interpreter but rather
  // by the Functions themselves, so a callback approach is more reasonable than
  // returning values.
  // If call() returns true, then the callback completed successfully; otherwise
  // call() returns false.

  // Overload for server interpreter, a bailout size is needed for graph executor.
  virtual bool call(Stack&, size_t, c10::function_ref<void(const Code&)>) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
    return false;
  }

  // Overload for mobile interpreter.
  virtual bool call(Stack&, c10::function_ref<void(const mobile::Code&)>) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
    return false;
  }

  virtual ~Function() {}
};
} // namespace jit
@ -172,6 +172,10 @@ inline bool operator==(const Argument& lhs, const Argument& rhs) {
|
|||
&& *lhs.alias_info() == *rhs.alias_info()));
|
||||
}
|
||||
|
||||
inline bool operator!=(const Argument& lhs, const Argument& rhs) {
  return !(lhs == rhs);
}

bool operator==(const FunctionSchema& lhs, const FunctionSchema& rhs);

struct FunctionSchema {

@@ -463,6 +463,7 @@ namespace c10 {
  _(attr, df_output_vjps) \
  _(attr, axes) \
  _(attr, axis) \
  _(attr, symbolic_shape_inputs) \
  _(attr, broadcast) \
  _(attr, direction) \
  _(attr, ends) \

@@ -1937,14 +1937,14 @@ struct TORCH_API ClassAttribute {
      TypePtr attributeType,
      std::string attributeName) :
    kind_(kind),
    attributeType_(attributeType),
    attributeType_(std::move(attributeType)),
    attributeName_(std::move(attributeName)) {}

  AttributeKind getKind() const {
    return kind_;
  }

  TypePtr getType() const {
  const TypePtr& getType() const {
    return attributeType_;
  }

@@ -2026,22 +2026,22 @@ struct TORCH_API ClassType : public NamedType {
    return attributes_[pos].getType();
  }

  TypePtr getAttribute(const std::string& name) const {
    auto type = findAttribute(name);
  const TypePtr& getAttribute(const std::string& name) const {
    auto slot = findAttributeSlot(name);
    TORCH_CHECK(
        type,
        slot,
        repr_str(),
        " does not have an attribute with name '",
        name,
        "'");
    return type;
    return attributes_[*slot].getType();
  }

  size_t numAttributes() const {
    return attributes_.size();
  }

  TypePtr getAttribute(size_t slot) const {
  const TypePtr& getAttribute(size_t slot) const {
    AT_ASSERT(slot < attributes_.size());
    return attributes_.at(slot).getType();
  }

@@ -2059,7 +2059,7 @@ struct TORCH_API ClassType : public NamedType {
  c10::optional<size_t> findAttributeSlot(const std::string& name) const {
    size_t slot = 0;
    for (const auto& attr : attributes_) {
      if (name.compare(attr.getName()) == 0) {
      if (name == attr.getName()) {
        return slot;
      }
      slot++;

@@ -2094,7 +2094,7 @@ struct TORCH_API ClassType : public NamedType {

  size_t addAttribute(
      const std::string& name,
      const TypePtr& type,
      TypePtr type,
      bool is_parameter = false,
      bool is_buffer = false);

@@ -2122,7 +2122,7 @@ struct TORCH_API ClassType : public NamedType {
      bool is_buffer = false) {
    auto slot_idx = findAttributeSlot(name);
    if (!slot_idx) {
      return addAttribute(name, ty, is_parameter, is_buffer);
      return addAttribute(name, std::move(ty), is_parameter, is_buffer);
    }

    TORCH_CHECK(

@@ -2130,7 +2130,7 @@ struct TORCH_API ClassType : public NamedType {
        "Parameter field mismatch for the field '",
        name,
        "'");
    TypePtr atype = getAttribute(*slot_idx);
    const TypePtr& atype = getAttribute(*slot_idx);
    TORCH_CHECK(
        ty->isSubtypeOf(*atype),
        ty->repr_str(),

@@ -2227,7 +2227,7 @@ struct TORCH_API ClassType : public NamedType {
    AT_ASSERT(numAttributes() == contained_types.size());
    for(size_t i = 0; i < attributes_.size(); ++i) {
      AT_ASSERT(attributes_[i].getType()->isSubtypeOf(*contained_types[i]));
      ptr->addAttribute(attributes_[i].getName(), contained_types[i]);
      ptr->addAttribute(attributes_[i].getName(), std::move(contained_types[i]));
    }
    // Copy methods over
    for (const auto& method : methods()) {

@@ -2228,7 +2228,7 @@ void ClassType::addAttribute(ClassAttribute classAttribute) {

size_t ClassType::addAttribute(
    const std::string& name,
    const TypePtr& type,
    TypePtr type,
    bool is_parameter,
    bool is_buffer) {
  if (is_parameter && is_buffer){

@@ -2248,16 +2248,13 @@ size_t ClassType::addAttribute(
    kind = AttributeKind::BUFFER;
  }

  ClassAttribute ClassAttribute(kind, type, name);

  addAttribute(ClassAttribute);

  if (is_parameter || is_buffer) {
    TORCH_INTERNAL_ASSERT(is_module(), "adding a parameter or buffer to a non module");
    TORCH_CHECK(
        (type->kind() == TensorType::Kind) ||
        (type->kind() == OptionalType::Kind &&
         type->expect<OptionalType>()->getElementType()->kind() ==
         type->expectRef<OptionalType>().getElementType()->kind() ==
             TensorType::Kind) ||
        (type->kind() == UnionType::Kind &&
         TensorType::get()->isSubtypeOf(type->expectRef<UnionType>())) ||

@@ -2266,6 +2263,8 @@ size_t ClassType::addAttribute(
        toString(type));
  }

  addAttribute(ClassAttribute(kind, std::move(type), name));

  return slot;
}
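The recurring pattern in the hunks above is switching attribute setters from `const TypePtr&` to by-value `TypePtr` plus `std::move`, and returning `const TypePtr&` from getters. A minimal standalone sketch of that idea (not PyTorch code; `TypePtr` is only a `std::shared_ptr` stand-in here):

#include <memory>
#include <string>
#include <utility>

using TypePtr = std::shared_ptr<std::string>;  // stand-in for c10::TypePtr

struct Attribute {
  // Taking the shared_ptr by value lets callers move into it, so at most
  // one refcount bump happens along the whole call chain.
  Attribute(TypePtr type, std::string name)
      : type_(std::move(type)), name_(std::move(name)) {}

  // Returning a const reference avoids a refcount bump on every read.
  const TypePtr& type() const { return type_; }

 private:
  TypePtr type_;
  std::string name_;
};

int main() {
  TypePtr t = std::make_shared<std::string>("Tensor");
  Attribute attr(std::move(t), "weight");  // no extra copy of the control block
  return attr.type()->empty() ? 1 : 0;
}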
@@ -1,6 +1,4 @@
#pragma once

#include <ATen/cpu/vec/functional_base.h>
#if !defined(__VSX__) || !defined(CPU_CAPABILITY_VSX)
#include <ATen/cpu/vec/functional_bfloat16.h>
#endif
54  aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h  Normal file
@@ -0,0 +1,54 @@
#pragma once

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
#include <ATen/cpu/vec/vec_base.h>
namespace at {
namespace vec {
// See Note [Acceptable use of anonymous namespace in header]
namespace {

inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
    const Vectorized<BFloat16>& a) {
  constexpr int64_t K = Vectorized<BFloat16>::size();
  __at_align__ float arr[K];
  __at_align__ BFloat16 arr2[K];
  a.store(arr2);
  convert(arr2, arr, K);
  return std::make_tuple(
      Vectorized<float>::loadu(arr),
      Vectorized<float>::loadu(arr + Vectorized<float>::size()));
}

inline Vectorized<BFloat16> convert_float_bfloat16(
    const Vectorized<float>& a,
    const Vectorized<float>& b) {
  constexpr int64_t K = Vectorized<BFloat16>::size();
  __at_align__ float arr[K];
  __at_align__ BFloat16 arr2[K];
  a.store(arr);
  b.store(arr + Vectorized<float>::size());
  convert(arr, arr2, K);
  return Vectorized<BFloat16>::loadu(arr2);
}

void load_fp32_from_bf16(const c10::BFloat16* data, Vectorized<float>& out) {
  __at_align__ float values[Vectorized<float>::size()];
  for (int k = 0; k < Vectorized<float>::size(); ++k) {
    values[k] = data[k];
  }
  out = Vectorized<float>::loadu(values);
}

C10_UNUSED void load_fp32_from_bf16(
    const c10::BFloat16* data,
    Vectorized<float>& out1,
    Vectorized<float>& out2) {
  load_fp32_from_bf16(data, out1);
  data += Vectorized<float>::size();
  load_fp32_from_bf16(data, out2);
}

} // namespace
} // namespace vec
} // namespace at
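The new VSX header widens BFloat16 through a store/convert/reload round-trip rather than dedicated vector intrinsics. The underlying scalar trick — bfloat16 is simply the high 16 bits of an IEEE float — can be sketched outside of ATen like this (truncating conversion only; a production conversion also handles round-to-nearest-even and NaN):

#include <cstdint>
#include <cstring>

// Widen one bfloat16 (stored as uint16_t) to float: place the 16 bits
// in the upper half of a 32-bit pattern and reinterpret as float.
static float bf16_to_float(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// Narrow a float to bfloat16 by truncation.
static uint16_t float_to_bf16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}

int main() {
  uint16_t b = float_to_bf16(1.5f);           // 1.5 is exactly representable in bf16
  return bf16_to_float(b) == 1.5f ? 0 : 1;
}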
@@ -3,6 +3,8 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>

// Note: header order is important here
#include <ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h>

@@ -11,8 +13,12 @@
#include <ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h>

#include <ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h>

#include <ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h>

namespace at {
namespace vec {
@@ -10,3 +10,17 @@
#else
#define AT_USE_CUSPARSE_GENERIC_API() 0
#endif

// cuSparse Generic API spsv function was added in CUDA 11.3.0
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
#define AT_USE_CUSPARSE_GENERIC_SPSV() 1
#else
#define AT_USE_CUSPARSE_GENERIC_SPSV() 0
#endif

// cuSparse Generic API spsm function was added in CUDA 11.3.1
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11600)
#define AT_USE_CUSPARSE_GENERIC_SPSM() 1
#else
#define AT_USE_CUSPARSE_GENERIC_SPSM() 0
#endif
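These function-style macros gate code on the cuSPARSE version found at compile time, so callers can branch without repeating raw version checks. A hypothetical call site (the guard name is the one defined above; `spsv_path()` and `fallback_path()` are placeholders, not real functions):

#if AT_USE_CUSPARSE_GENERIC_SPSV()
  spsv_path();      // CUDA toolkit new enough for the cusparseSpSV_* generic API
#else
  fallback_path();  // older toolkits take some other code path
#endif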
219  aten/src/ATen/cuda/CUDASparseBlas.cpp  Normal file
@@ -0,0 +1,219 @@
/*
  Provides the implementations of cuSPARSE function templates.
*/

#include <ATen/cuda/CUDASparseBlas.h>

namespace at {
namespace cuda {
namespace sparse {

template <>
void csrgeam2_bufferSizeExt<float>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(float)) {
  TORCH_CUDASPARSE_CHECK(cusparseScsrgeam2_bufferSizeExt(
      handle, m, n,
      alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA,
      beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB,
      descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
      pBufferSizeInBytes));
}

template <>
void csrgeam2_bufferSizeExt<double>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(double)) {
  TORCH_CUDASPARSE_CHECK(cusparseDcsrgeam2_bufferSizeExt(
      handle, m, n,
      alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA,
      beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB,
      descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
      pBufferSizeInBytes));
}

template <>
void csrgeam2_bufferSizeExt<c10::complex<float>>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<float>)) {
  TORCH_CUDASPARSE_CHECK(cusparseCcsrgeam2_bufferSizeExt(
      handle, m, n,
      reinterpret_cast<const cuComplex*>(alpha), descrA, nnzA,
      reinterpret_cast<const cuComplex*>(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
      reinterpret_cast<const cuComplex*>(beta), descrB, nnzB,
      reinterpret_cast<const cuComplex*>(csrSortedValB), csrSortedRowPtrB, csrSortedColIndB,
      descrC, reinterpret_cast<const cuComplex*>(csrSortedValC), csrSortedRowPtrC, csrSortedColIndC,
      pBufferSizeInBytes));
}

template <>
void csrgeam2_bufferSizeExt<c10::complex<double>>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<double>)) {
  TORCH_CUDASPARSE_CHECK(cusparseZcsrgeam2_bufferSizeExt(
      handle, m, n,
      reinterpret_cast<const cuDoubleComplex*>(alpha), descrA, nnzA,
      reinterpret_cast<const cuDoubleComplex*>(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
      reinterpret_cast<const cuDoubleComplex*>(beta), descrB, nnzB,
      reinterpret_cast<const cuDoubleComplex*>(csrSortedValB), csrSortedRowPtrB, csrSortedColIndB,
      descrC, reinterpret_cast<const cuDoubleComplex*>(csrSortedValC), csrSortedRowPtrC, csrSortedColIndC,
      pBufferSizeInBytes));
}

template <>
void csrgeam2<float>(CUSPARSE_CSRGEAM2_ARGTYPES(float)) {
  TORCH_CUDASPARSE_CHECK(cusparseScsrgeam2(
      handle, m, n,
      alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA,
      beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB,
      descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
      pBuffer));
}

template <>
void csrgeam2<double>(CUSPARSE_CSRGEAM2_ARGTYPES(double)) {
  TORCH_CUDASPARSE_CHECK(cusparseDcsrgeam2(
      handle, m, n,
      alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA,
      beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB,
      descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
      pBuffer));
}

template <>
void csrgeam2<c10::complex<float>>(
    CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<float>)) {
  TORCH_CUDASPARSE_CHECK(cusparseCcsrgeam2(
      handle, m, n,
      reinterpret_cast<const cuComplex*>(alpha), descrA, nnzA,
      reinterpret_cast<const cuComplex*>(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
      reinterpret_cast<const cuComplex*>(beta), descrB, nnzB,
      reinterpret_cast<const cuComplex*>(csrSortedValB), csrSortedRowPtrB, csrSortedColIndB,
      descrC, reinterpret_cast<cuComplex*>(csrSortedValC), csrSortedRowPtrC, csrSortedColIndC,
      pBuffer));
}

template <>
void csrgeam2<c10::complex<double>>(
    CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<double>)) {
  TORCH_CUDASPARSE_CHECK(cusparseZcsrgeam2(
      handle, m, n,
      reinterpret_cast<const cuDoubleComplex*>(alpha), descrA, nnzA,
      reinterpret_cast<const cuDoubleComplex*>(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
      reinterpret_cast<const cuDoubleComplex*>(beta), descrB, nnzB,
      reinterpret_cast<const cuDoubleComplex*>(csrSortedValB), csrSortedRowPtrB, csrSortedColIndB,
      descrC, reinterpret_cast<cuDoubleComplex*>(csrSortedValC), csrSortedRowPtrC, csrSortedColIndC,
      pBuffer));
}

} // namespace sparse
} // namespace cuda
} // namespace at
110  aten/src/ATen/cuda/CUDASparseBlas.h  Normal file
@@ -0,0 +1,110 @@
#pragma once

/*
  Provides a subset of cuSPARSE functions as templates:

    csrgeam2<scalar_t>(...)

  where scalar_t is double, float, c10::complex<double> or c10::complex<float>.
  The functions are available in at::cuda::sparse namespace.
*/

#include <ATen/cuda/CUDAContext.h>

namespace at {
namespace cuda {
namespace sparse {

#define CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(scalar_t)               \
  cusparseHandle_t handle, int m, int n, const scalar_t *alpha,       \
      const cusparseMatDescr_t descrA, int nnzA,                      \
      const scalar_t *csrSortedValA, const int *csrSortedRowPtrA,     \
      const int *csrSortedColIndA, const scalar_t *beta,              \
      const cusparseMatDescr_t descrB, int nnzB,                      \
      const scalar_t *csrSortedValB, const int *csrSortedRowPtrB,     \
      const int *csrSortedColIndB, const cusparseMatDescr_t descrC,   \
      const scalar_t *csrSortedValC, const int *csrSortedRowPtrC,     \
      const int *csrSortedColIndC, size_t *pBufferSizeInBytes

template <typename scalar_t>
inline void csrgeam2_bufferSizeExt(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(scalar_t)) {
  TORCH_INTERNAL_ASSERT(
      false,
      "at::cuda::sparse::csrgeam2_bufferSizeExt: not implemented for ",
      typeid(scalar_t).name());
}

template <>
void csrgeam2_bufferSizeExt<float>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(float));
template <>
void csrgeam2_bufferSizeExt<double>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(double));
template <>
void csrgeam2_bufferSizeExt<c10::complex<float>>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<float>));
template <>
void csrgeam2_bufferSizeExt<c10::complex<double>>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<double>));

#define CUSPARSE_CSRGEAM2_NNZ_ARGTYPES()                                      \
  cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,     \
      int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,     \
      const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB, \
      const int *csrSortedColIndB, const cusparseMatDescr_t descrC,           \
      int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace

template <typename scalar_t>
inline void csrgeam2Nnz(CUSPARSE_CSRGEAM2_NNZ_ARGTYPES()) {
  TORCH_CUDASPARSE_CHECK(cusparseXcsrgeam2Nnz(
      handle, m, n,
      descrA, nnzA, csrSortedRowPtrA, csrSortedColIndA,
      descrB, nnzB, csrSortedRowPtrB, csrSortedColIndB,
      descrC, csrSortedRowPtrC, nnzTotalDevHostPtr, workspace));
}

#define CUSPARSE_CSRGEAM2_ARGTYPES(scalar_t)                                  \
  cusparseHandle_t handle, int m, int n, const scalar_t *alpha,               \
      const cusparseMatDescr_t descrA, int nnzA,                              \
      const scalar_t *csrSortedValA, const int *csrSortedRowPtrA,             \
      const int *csrSortedColIndA, const scalar_t *beta,                      \
      const cusparseMatDescr_t descrB, int nnzB,                              \
      const scalar_t *csrSortedValB, const int *csrSortedRowPtrB,             \
      const int *csrSortedColIndB, const cusparseMatDescr_t descrC,           \
      scalar_t *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,  \
      void *pBuffer

template <typename scalar_t>
inline void csrgeam2(CUSPARSE_CSRGEAM2_ARGTYPES(scalar_t)) {
  TORCH_INTERNAL_ASSERT(
      false,
      "at::cuda::sparse::csrgeam2: not implemented for ",
      typeid(scalar_t).name());
}

template <>
void csrgeam2<float>(CUSPARSE_CSRGEAM2_ARGTYPES(float));
template <>
void csrgeam2<double>(CUSPARSE_CSRGEAM2_ARGTYPES(double));
template <>
void csrgeam2<c10::complex<float>>(
    CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<float>));
template <>
void csrgeam2<c10::complex<double>>(
    CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<double>));

} // namespace sparse
} // namespace cuda
} // namespace at
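The header above follows a common ATen pattern: a generic template that fails loudly at runtime plus explicit specializations declared for the supported scalar types and defined in the .cpp. A self-contained sketch of that pattern (plain C++, no CUDA; `axpy` is an invented stand-in):

#include <cstdio>
#include <cstdlib>
#include <typeinfo>

template <typename scalar_t>
inline void axpy(int n, scalar_t alpha, const scalar_t* x, scalar_t* y) {
  // Generic fallback: reject scalar types that have no specialization.
  std::fprintf(stderr, "axpy: not implemented for %s\n", typeid(scalar_t).name());
  std::abort();
}

template <>
void axpy<float>(int n, float alpha, const float* x, float* y) {
  // The real backend call (e.g. a vendor BLAS routine) would go here.
  for (int i = 0; i < n; ++i) y[i] += alpha * x[i];
}

int main() {
  float x[3] = {1, 2, 3}, y[3] = {0, 0, 0};
  axpy<float>(3, 2.0f, x, y);  // dispatches to the specialization
  return y[2] == 6.0f ? 0 : 1;
}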
@@ -5,12 +5,12 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/cuda/MiscUtils.h>

#if AT_USE_CUSPARSE_GENERIC_API()

namespace at {
namespace cuda {
namespace sparse {

#if AT_USE_CUSPARSE_GENERIC_API()

namespace {

// If a specific GPU model does not provide native support for a given data

@@ -93,7 +93,8 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input) {

CuSparseDnVecDescriptor::CuSparseDnVecDescriptor(const Tensor& input) {
  // cuSPARSE doesn't support batched vectors
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() == 1);
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
      input.dim() == 1 || (input.dim() == 2 && input.size(-1) == 1));

  // cuSPARSE doesn't support non-contiguous vectors
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.is_contiguous());

@@ -149,8 +150,8 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input) {
  descriptor_.reset(raw_descriptor);
}

#endif // AT_USE_CUSPARSE_GENERIC_API()

} // namespace sparse
} // namespace cuda
} // namespace at

#endif // AT_USE_CUSPARSE_GENERIC_API()
@@ -6,14 +6,14 @@

#include <c10/core/ScalarType.h>

#if AT_USE_CUSPARSE_GENERIC_API()
#if defined(USE_ROCM)
#include <type_traits>
#endif

namespace at {
namespace cuda {
namespace sparse {

cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);

template <typename T, cusparseStatus_t (*destructor)(T*)>
struct CuSparseDescriptorDeleter {
  void operator()(T* x) {

@@ -37,6 +37,25 @@ class CuSparseDescriptor {
  std::unique_ptr<T, CuSparseDescriptorDeleter<T, destructor>> descriptor_;
};

#if defined(USE_ROCM)
// hipSPARSE doesn't define this
using cusparseMatDescr = std::remove_pointer<cusparseMatDescr_t>::type;
#endif

class TORCH_CUDA_CPP_API CuSparseMatDescriptor
    : public CuSparseDescriptor<cusparseMatDescr, &cusparseDestroyMatDescr> {
 public:
  CuSparseMatDescriptor() {
    cusparseMatDescr_t raw_descriptor;
    TORCH_CUDASPARSE_CHECK(cusparseCreateMatDescr(&raw_descriptor));
    descriptor_.reset(raw_descriptor);
  }
};

#if AT_USE_CUSPARSE_GENERIC_API()

cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);

class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
    : public CuSparseDescriptor<cusparseDnMatDescr, &cusparseDestroyDnMat> {
 public:

@@ -83,8 +102,54 @@ class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor
        values.data_ptr()));
  }
#endif

#if AT_USE_CUSPARSE_GENERIC_SPSV()
  void set_mat_fill_mode(bool upper) {
    cusparseFillMode_t fill_mode =
        upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER;
    TORCH_CUDASPARSE_CHECK(cusparseSpMatSetAttribute(
        this->descriptor(), CUSPARSE_SPMAT_FILL_MODE, &fill_mode, sizeof(fill_mode)));
  }

  void set_mat_diag_type(bool unit) {
    cusparseDiagType_t diag_type =
        unit ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT;
    TORCH_CUDASPARSE_CHECK(cusparseSpMatSetAttribute(
        this->descriptor(), CUSPARSE_SPMAT_DIAG_TYPE, &diag_type, sizeof(diag_type)));
  }
#endif
};

#if AT_USE_CUSPARSE_GENERIC_SPSV()
class TORCH_CUDA_CPP_API CuSparseSpSVDescriptor
    : public CuSparseDescriptor<cusparseSpSVDescr, &cusparseSpSV_destroyDescr> {
 public:
  CuSparseSpSVDescriptor() {
    cusparseSpSVDescr_t raw_descriptor;
    TORCH_CUDASPARSE_CHECK(cusparseSpSV_createDescr(&raw_descriptor));
    descriptor_.reset(raw_descriptor);
  }
};
#endif

#if AT_USE_CUSPARSE_GENERIC_SPSM()
class TORCH_CUDA_CPP_API CuSparseSpSMDescriptor
    : public CuSparseDescriptor<cusparseSpSMDescr, &cusparseSpSM_destroyDescr> {
 public:
  CuSparseSpSMDescriptor() {
    cusparseSpSMDescr_t raw_descriptor;
    TORCH_CUDASPARSE_CHECK(cusparseSpSM_createDescr(&raw_descriptor));
    descriptor_.reset(raw_descriptor);
  }
};
#endif

#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
    : public CuSparseDescriptor<cusparseSpGEMMDescr, &cusparseSpGEMM_destroyDescr> {

@@ -97,8 +162,8 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
};
#endif

#endif // AT_USE_CUSPARSE_GENERIC_API()

} // namespace sparse
} // namespace cuda
} // namespace at

#endif // AT_USE_CUSPARSE_GENERIC_API()
@@ -1,6 +1,7 @@
#include <ATen/ATen.h>
#include <ATen/CPUApplyUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/TensorMeta.h>
#include <ATen/NativeFunctions.h>
#include <ATen/ExpandUtils.h>

@@ -203,6 +204,45 @@ extern "C" void strsm_(char *side, char *uplo, char *trans, char *diag, int *n,
#endif

namespace at {
namespace meta {

TORCH_META_FUNC(triangular_solve)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular) {
  TORCH_CHECK(self.dim() >= 2,
           "torch.triangular_solve: Expected b to have at least 2 dimensions, but it has ", self.dim(), " dimensions instead");
  TORCH_CHECK(A.dim() >= 2,
           "torch.triangular_solve: Expected A to have at least 2 dimensions, but it has ", A.dim(), " dimensions instead");

  at::native::linearSolveCheckInputs(self, A, "triangular_solve");

  if (A.layout() == Layout::Strided) {
    std::vector<int64_t> self_broadcast_size, A_broadcast_size;
    std::tie(self_broadcast_size, A_broadcast_size) = at::native::_linalg_broadcast_batch_dims(self, A);

    auto ndim = self_broadcast_size.size();
    auto nrows = A.size(-2);

    // make column major strides for BLAS
    auto solution_strides = at::detail::defaultStrides(self_broadcast_size);
    solution_strides[ndim - 2] = 1;
    solution_strides[ndim - 1] = nrows;
    set_output(0, self_broadcast_size, solution_strides, self.options(), {});

    // make column major strides for BLAS
    auto clone_A_strides = at::detail::defaultStrides(A_broadcast_size);
    clone_A_strides[ndim - 2] = 1;
    clone_A_strides[ndim - 1] = nrows;
    set_output(1, A_broadcast_size, clone_A_strides, A.options(), {});
  } else if (A.layout() == Layout::SparseCsr) {
    // no broadcasting for non-strided layout
    set_output(0, self.sizes(), {}, self.options(), {}); // make row major strides for Sparse BLAS
    set_output(1, {0}, {}, self.options(), {}); // return 0-sized tensor
  } else {
    TORCH_INTERNAL_ASSERT(false, "triangular_solve: Got an unexpected layout.");
  }
}

} // namespace meta

namespace native {

#if AT_BUILD_WITH_LAPACK()
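The meta function pre-allocates outputs with batched column-major ("Fortran contiguous") strides so the LAPACK/BLAS backends can work in place. The stride trick — start from the default row-major strides, then set the last two dimensions to 1 and nrows — can be checked in isolation (a sketch; `default_strides` is a stand-in for at::detail::defaultStrides):

#include <cstdint>
#include <vector>

// Row-major (default) strides for a given shape.
static std::vector<int64_t> default_strides(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> strides(sizes.size(), 1);
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * sizes[i + 1];
  }
  return strides;
}

int main() {
  std::vector<int64_t> sizes = {4, 3, 5};   // batch of 4 matrices, each 3 x 5
  auto strides = default_strides(sizes);    // {15, 5, 1}
  const auto ndim = sizes.size();
  strides[ndim - 2] = 1;                    // column-major within each matrix
  strides[ndim - 1] = sizes[ndim - 2];      // = nrows (3); batch stride stays 15
  return (strides[0] == 15 && strides[1] == 1 && strides[2] == 3) ? 0 : 1;
}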
@@ -1593,31 +1633,25 @@ The result of the computation is saved in-place in 'result' tensor,
'unitriangular' if true then the diagonal elements of 'input' are assumed to be 1
and the actual diagonal values are not used.
*/
static std::tuple<Tensor&, Tensor&> triangular_solve_out_info(
    Tensor& result,
    Tensor& clone_input,
    Tensor& infos,
static void triangular_solve_out_impl(
    const Tensor& result,
    const Tensor& clone_input,
    const Tensor& input,
    const Tensor& other,
    bool upper, bool transpose, bool unitriangular) {
  // These internal asserts make explicit the assumptions in the implementation
  // Error check with the actual error messages are done on the higher level of
  // the hierarchy of calls
  TORCH_INTERNAL_ASSERT(input.dim() >= 2);
  TORCH_INTERNAL_ASSERT(input.size(-2) == input.size(-1));
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() >= 2);
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.size(-2) == input.size(-1));

  TORCH_INTERNAL_ASSERT(input.device() == other.device());
  TORCH_INTERNAL_ASSERT(input.device() == result.device());
  TORCH_INTERNAL_ASSERT(input.device() == clone_input.device());
  TORCH_INTERNAL_ASSERT(input.device() == infos.device());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == other.device());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == result.device());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == clone_input.device());

  TORCH_INTERNAL_ASSERT(input.scalar_type() == other.scalar_type());
  TORCH_INTERNAL_ASSERT(input.scalar_type() == result.scalar_type());
  TORCH_INTERNAL_ASSERT(input.scalar_type() == clone_input.scalar_type());

  TORCH_INTERNAL_ASSERT(infos.scalar_type() == at::kInt);
  TORCH_INTERNAL_ASSERT(infos.numel() == std::max<int64_t>(1, batchCount(input)));
  TORCH_INTERNAL_ASSERT(infos.is_contiguous());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == other.scalar_type());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == result.scalar_type());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == clone_input.scalar_type());

  // if 'result' has no elements we can modify it
  if (result.numel() == 0) {

@@ -1632,60 +1666,38 @@ static std::tuple<Tensor&, Tensor&> triangular_solve_out_info(
  }

  // 'result' and 'clone_input' must be in batched column major order (Fortran contiguous)
  TORCH_INTERNAL_ASSERT(result.mT().is_contiguous());
  TORCH_INTERNAL_ASSERT(clone_input.mT().is_contiguous());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.mT().is_contiguous());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(clone_input.mT().is_contiguous());

  // triangular_solve_stub performs calculations in-place
  // 'result' must be a copy of 'other'
  // 'clone_input' must be a copy of 'input'
  TORCH_INTERNAL_ASSERT(result.sizes().equals(other.sizes()));
  TORCH_INTERNAL_ASSERT(clone_input.sizes().equals(input.sizes()));
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(other.sizes()));
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(clone_input.sizes().equals(input.sizes()));
  result.copy_(other);
  clone_input.copy_(input);

  triangular_solve_stub(input.device().type(), clone_input, result, /*left=*/true, upper, transpose ? TransposeType::Transpose : TransposeType::NoTranspose, unitriangular);

  return std::tuple<Tensor&, Tensor&>(result, clone_input);
}

// Supports arbitrary batch dimensions for self and A
std::tuple<Tensor, Tensor> triangular_solve(const Tensor& self, const Tensor& A,
                                            bool upper, bool transpose, bool unitriangular) {
  TORCH_CHECK(self.dim() >= 2,
           "torch.triangular_solve: Expected b to have at least 2 dimensions, but it has ", self.dim(), " dimensions instead");
  TORCH_CHECK(A.dim() >= 2,
           "torch.triangular_solve: Expected A to have at least 2 dimensions, but it has ", A.dim(), " dimensions instead");
TORCH_IMPL_FUNC(triangular_solve_out)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, const Tensor& result, const Tensor& clone_A) {
  Tensor self_broadcast, A_broadcast;
  std::tie(self_broadcast, A_broadcast) = _linalg_broadcast_batch_dims(self, A, "triangular_solve");

  Tensor self_broadcasted, A_broadcasted;
  std::tie(self_broadcasted, A_broadcasted) = _linalg_broadcast_batch_dims(self, A, "triangular_solve");
  bool copy_needed = !result.transpose(-2, -1).is_contiguous();
  copy_needed |= !clone_A.transpose(-2, -1).is_contiguous();

  Tensor result = at::empty({0}, self.options());
  Tensor clone_A = at::empty({0}, self.options());
  Tensor infos = at::zeros({std::max<int64_t>(1, batchCount(self_broadcasted))}, self.options().dtype(kInt));
  if (copy_needed) {
    Tensor result_tmp = at::empty({0}, self.options());
    Tensor clone_A_tmp = at::empty({0}, A.options());

  triangular_solve_out_info(result, clone_A, infos, A_broadcasted, self_broadcasted, upper, transpose, unitriangular);
    triangular_solve_out_impl(result_tmp, clone_A_tmp, A_broadcast, self_broadcast, upper, transpose, unitriangular);

  if (self_broadcasted.dim() > 2) {
    batchCheckErrors(infos, "triangular_solve");
  } else {
    singleCheckErrors(infos.item().toInt(), "triangular_solve");
  }

  return std::tuple<Tensor, Tensor>(result, clone_A);
}

std::tuple<Tensor&, Tensor&> triangular_solve_out(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, Tensor& result, Tensor& clone_A) {
  checkSameDevice("triangular_solve", result, self);
  checkLinalgCompatibleDtype("triangular_solve", result, self);
  checkSameDevice("triangular_solve", clone_A, self, "clone_A");
  checkLinalgCompatibleDtype("triangular_solve", clone_A, self, "clone_A");
  Tensor result_tmp, clone_A_tmp;
  std::tie(result_tmp, clone_A_tmp) = at::native::triangular_solve(self, A, upper, transpose, unitriangular);
  at::native::resize_output(result, result_tmp.sizes());
  at::native::resize_output(clone_A, clone_A_tmp.sizes());
  result.copy_(result_tmp);
  clone_A.copy_(clone_A_tmp);
  return std::tuple<Tensor&, Tensor&>(result, clone_A);
  } else {
    triangular_solve_out_impl(result, clone_A, A_broadcast, self_broadcast, upper, transpose, unitriangular);
  }
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -211,8 +211,8 @@ using lstsq_fn = void (*)(
DECLARE_DISPATCH(lstsq_fn, lstsq_stub);

using triangular_solve_fn = void (*)(
    Tensor& /*A*/,
    Tensor& /*B*/,
    const Tensor& /*A*/,
    const Tensor& /*B*/,
    bool /*left*/,
    bool /*upper*/,
    TransposeType /*transpose*/,
@@ -795,7 +795,7 @@ This is an in-place routine, content of 'B' is overwritten.
and the actual diagonal values are not used.
*/
template<typename scalar_t>
void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
#if !AT_BUILD_WITH_BLAS()
  TORCH_CHECK(
      false,

@@ -826,7 +826,7 @@ void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper, Transpo
#endif
}

void triangular_solve_kernel(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_kernel(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cpu", [&]{
    apply_triangular_solve<scalar_t>(A, B, left, upper, transpose, unitriangular);
  });
@@ -1337,14 +1337,6 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
  auto input_shape = input.sizes().slice(2);
  auto grad_output_shape = gO.sizes().slice(2);

  if (kernel_size.size() == 1) {
    auto expected_input_shape = (kernel_size[0] - 1) * gi_conv_params.dilation[1]
        - 2 * gi_conv_params.padding[1]
        + (gi_conv_params.stride[1] * (grad_output_shape[0] - 1) + 1);
    if (expected_input_shape != input_shape[0]) {
      gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape;
    }
  } else {
    for (const auto i : c10::irange(kernel_size.size())) {
      // Check if whole input has been used or not
      auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i]

@@ -1354,7 +1346,6 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
      gi_conv_params.output_padding[i] = input_shape[i] - expected_input_shape;
    }
  }
  }

  if (gO.is_cuda()) {
    gO = gO.contiguous();
@@ -308,9 +308,7 @@ static inline void checkAllSameDim(TensorList tensors, int64_t dim) {
  }
}

static inline std::tuple<Tensor,Tensor> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) {
  linearSolveCheckInputs(arg1, arg2, name);

static inline std::tuple<std::vector<int64_t>, std::vector<int64_t>> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) {
  // broadcast the batch dimensions of arg1 and arg2.
  IntArrayRef arg1_batch_sizes(arg1.sizes().data(), arg1.ndimension() - 2);
  IntArrayRef arg2_batch_sizes(arg2.sizes().data(), arg2.ndimension() - 2);

@@ -321,6 +319,14 @@ static inline std::tuple<Tensor,Tensor> _linalg_broadcast_batch_dims(const Tenso

  std::vector<int64_t> arg2_expand_size({expand_batch_portion});
  arg2_expand_size.insert(arg2_expand_size.end(), { arg2.size(-2), arg2.size(-1) });
  return std::make_tuple(std::move(arg1_expand_size), std::move(arg2_expand_size));
}

static inline std::tuple<Tensor,Tensor> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) {
  linearSolveCheckInputs(arg1, arg2, name);

  std::vector<int64_t> arg1_expand_size, arg2_expand_size;
  std::tie(arg1_expand_size, arg2_expand_size) = at::native::_linalg_broadcast_batch_dims(arg1, arg2);

  Tensor arg1_broadcasted = arg1.expand(arg1_expand_size);
  Tensor arg2_broadcasted = arg2.expand(arg2_expand_size);
@@ -116,6 +116,43 @@ static inline Tensor to_impl(
      self, dtype, layout, device, pin_memory, non_blocking, optional_memory_format);
}

// If input tensor is fp32, cast it to fp16, otherwise leave it alone.
// (this is intended to be used internally by the JIT autocast implementation)
Tensor _autocast_to_reduced_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) {
  if (self.dtype() == at::ScalarType::Float &&
      ((self.device().is_cuda() && cuda_enabled) ||
       (self.device().is_cpu() && cpu_enabled))
      ) {
    at::ScalarType target = at::ScalarType::Undefined;
    if (self.device().is_cuda()) {
      target = cuda_dtype;
    } else if (self.device().is_cpu()) {
      target = cpu_dtype;
    }

    TORCH_INTERNAL_ASSERT(target != at::ScalarType::Undefined, "_autocast_to_reduced_precision requires legit ScalarType argument for given device");

    return to_impl(
        self, target, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
  } else {
    return self;
  }
}

// If input tensor is fp16, cast it to fp32, otherwise leave it alone.
// (this is intended to be used internally by the JIT autocast implementation)
Tensor _autocast_to_full_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled) {
  if (self.dtype() == at::ScalarType::Half &&
      ((self.device().is_cuda() && cuda_enabled) ||
       (self.device().is_cpu() && cpu_enabled))
      ) {
    return to_impl(
        self, at::ScalarType::Float, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
  } else {
    return self;
  }
}

Tensor to(
    const Tensor& self,
    c10::optional<ScalarType> dtype,
@@ -2561,4 +2561,27 @@ std::vector<Tensor> unflatten_dense_tensors(const Tensor& flat, TensorList tenso
  return outputs;
}

}} // at::native
at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) {
  auto output = self.clone();
  auto slice = output.slice(dim, start, end, step);
  TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
  slice.copy_(src);
  return output;
}
at::Tensor select_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, int64_t index) {
  auto output = self.clone();
  auto slice = output.select(dim, index);
  TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
  slice.copy_(src);
  return output;
}
at::Tensor diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64_t offset, int64_t dim1, int64_t dim2) {
  auto output = self.clone();
  auto slice = output.diagonal(offset, dim1, dim2);
  TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
  slice.copy_(src);
  return output;
}

} // namespace native
} // namespace at
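All three *_scatter ops above share one shape: clone the input, take a view into the clone, check sizes, copy the source into the view, and return the clone. The same clone-view-copy idea on a plain buffer (a row "slice" of a row-major 2-D array; a sketch, not ATen code):

#include <algorithm>
#include <cassert>
#include <vector>

// Write `src` into row `row` of a copy of `self` (n_rows x n_cols, row-major).
static std::vector<int> row_scatter(const std::vector<int>& self,
                                    const std::vector<int>& src,
                                    int n_cols, int row) {
  std::vector<int> out = self;                          // clone
  assert(static_cast<int>(src.size()) == n_cols);       // shape check
  std::copy(src.begin(), src.end(), out.begin() + row * n_cols);  // copy into the "view"
  return out;                                           // original `self` is untouched
}

int main() {
  std::vector<int> m = {1, 2, 3, 4, 5, 6};              // 2 x 3
  auto out = row_scatter(m, {7, 8, 9}, /*n_cols=*/3, /*row=*/1);
  return (out[3] == 7 && m[3] == 4) ? 0 : 1;
}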
@@ -2,29 +2,48 @@
#include <ATen/CPUApplyUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/Resize.h>

#include <ATen/Parallel.h>
#include <ATen/TensorMeta.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TriangularOpsUtils.h>
#include <c10/util/irange.h>

namespace at {
namespace meta {

TORCH_META_FUNC(tril)(const Tensor& self, int64_t k) {
  set_output(self.sizes(), self.options());
}

TORCH_META_FUNC(triu)(const Tensor& self, int64_t k) {
  set_output(self.sizes(), self.options());
}

} // namespace meta

namespace native {
namespace {

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triu/tril ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template <typename scalar_t, bool upper>
static void apply_triu_tril_single(
    scalar_t* result, scalar_t* self, bool inplace,
    int64_t k, int64_t n, int64_t m,
    int64_t res_row_stride, int64_t res_col_stride,
    int64_t self_row_stride, int64_t self_col_stride) {

template <typename scalar_t>
void apply_triu_tril_single(
    scalar_t* result,
    scalar_t* self,
    bool inplace,
    int64_t k,
    int64_t n,
    int64_t m,
    int64_t res_row_stride,
    int64_t res_col_stride,
    int64_t self_row_stride,
    int64_t self_col_stride,
    bool upper) {
  constexpr int64_t zero = 0;

  if (upper) {
    at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
      for (const auto i : c10::irange(start, end)) {
    parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
      for (int64_t i : c10::irange(start, end)) {
        for (int64_t j = 0; j < std::min(m, i + k); j++) {
          result[i * res_row_stride + j * res_col_stride] = 0;
        }

@@ -36,8 +55,8 @@ static void apply_triu_tril_single(
      }
    });
  } else {
    at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
      for (const auto i : c10::irange(start, end)) {
    parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
      for (int64_t i : c10::irange(start, end)) {
        for (int64_t j = std::max(zero, i + k + 1); j < m; j++) {
          result[i * res_row_stride + j * res_col_stride] = 0;
        }

@@ -51,108 +70,101 @@ static void apply_triu_tril_single(
  }
}

template <typename scalar_t, bool upper>
void apply_triu_tril(Tensor& result, const Tensor& self, bool inplace, int64_t k) {
template <typename scalar_t>
void apply_triu_tril(const Tensor& result, const Tensor& self, bool inplace, int64_t k, bool upper) {
  auto n = self.size(-2);
  auto m = self.size(-1);
  auto self_data = self.data_ptr<scalar_t>();
  auto self_stride = (self.dim() > 2 && self.stride(-3) > 0) ? self.stride(-3) : 1;
  auto batchsize = batchCountTrilTriu(result);
  auto self_row_stride = self.stride(-2);
  auto self_column_stride = self.stride(-1);
  auto self_col_stride = self.stride(-1);

  auto result_data = result.data_ptr<scalar_t>();
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t result_stride, result_row_stride, result_column_stride;
  int64_t result_stride, result_row_stride, result_col_stride;
  if (result_data != self_data) {
    result_stride = (result.dim() > 2 && result.stride(-3) > 0) ? result.stride(-3) : 1;
    result_row_stride = result.stride(-2);
    result_column_stride = result.stride(-1);
    result_col_stride = result.stride(-1);
  } else {
    result_stride = self_stride;
    result_row_stride = self_row_stride;
    result_column_stride = self_column_stride;
    result_col_stride = self_col_stride;
  }

  at::parallel_for(0, batchsize, 0, [&](int64_t start, int64_t end) {
  parallel_for(0, batchsize, 0, [&](int64_t start, int64_t end) {
    for (const auto b : c10::irange(start, end)) {
      scalar_t* self_batch = &self_data[b * self_stride];
      scalar_t* result_batch = &result_data[b * result_stride];
      apply_triu_tril_single<scalar_t, upper>(
          result_batch, self_batch, inplace, k, n, m,
          result_row_stride, result_column_stride, self_row_stride, self_column_stride);
      apply_triu_tril_single<scalar_t>(
          result_batch,
          self_batch,
          inplace,
          k,
          n,
          m,
          result_row_stride,
          result_col_stride,
          self_row_stride,
          self_col_stride,
          upper);
    }
  });
}

Tensor tril(const Tensor& self, int64_t k) {
  Tensor result = at::empty({0}, self.options());
  at::tril_out(result, self, k);
  return result;
}
struct UpperTriangle {
  static constexpr const char* op_name = "triu";
  static constexpr bool upper = true;
};

Tensor& tril_cpu_(Tensor &self, int64_t k) {
struct LowerTriangle {
  static constexpr const char *op_name = "tril";
  static constexpr bool upper = false;
};

template <typename Triangle>
void compute_triu_tril(const Tensor& self, int64_t k, const Tensor &result) {
  if (self.numel() == 0) {
    return self;
    return;
  }
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  bool inplace;

  bool inplace_op = self.is_same(result);

  bool inplace_update = false;
  Tensor self_c;
  std::tie(inplace, self_c) = checkTrilTriuBatchContiguous(self, true);
  Tensor result = inplace ? self : at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "tril", [&]{
    apply_triu_tril<scalar_t, false>(result, self_c, inplace, k);
  std::tie(inplace_update, self_c) = checkTrilTriuBatchContiguous(self, inplace_op);

  Tensor result_c;
  if (inplace_op && !inplace_update) {
    result_c = at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  } else {
    result_c = result;
  }

  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
      ScalarType::BFloat16,
      ScalarType::Half,
      ScalarType::Bool,
      self.scalar_type(),
      Triangle::op_name,
      [&]{
        apply_triu_tril<scalar_t>(result_c, self_c, inplace_op && inplace_update, k, Triangle::upper);
      });
  if (!inplace) self.copy_(result);
  return self;

  if (inplace_op && !inplace_update) {
    result.copy_(result_c);
  }
}

Tensor& tril_cpu_out(const Tensor& self, int64_t k, Tensor &result) {
  at::native::resize_output(result, self.sizes());
  if (self.numel() == 0) {
    return result;
  }
  Tensor self_c;
  std::tie(std::ignore, self_c) = checkTrilTriuBatchContiguous(self, false);
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "tril", [&]{
    apply_triu_tril<scalar_t, false>(result, self_c, false, k);
  });
  return result;
} // namespace

TORCH_IMPL_FUNC(tril_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
  compute_triu_tril<LowerTriangle>(self, k, result);
}

Tensor triu(const Tensor& self, int64_t k) {
  Tensor result = at::empty({0}, self.options());
  at::triu_out(result, self, k);
  return result;
}

Tensor& triu_cpu_(Tensor &self, int64_t k) {
  if (self.numel() == 0) {
    return self;
  }
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  bool inplace;
  Tensor self_c;
  std::tie(inplace, self_c) = checkTrilTriuBatchContiguous(self, true);
  Tensor result = inplace ? self : at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "triu", [&]{
    apply_triu_tril<scalar_t, true>(result, self_c, inplace, k);
  });
  if (!inplace) self.copy_(result);
  return self;
}

Tensor& triu_cpu_out(const Tensor& self, int64_t k, Tensor &result) {
  at::native::resize_output(result, self.sizes());
  if (self.numel() == 0) {
    return result;
  }
  Tensor self_c;
  std::tie(std::ignore, self_c) = checkTrilTriuBatchContiguous(self, false);
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "triu", [&]{
    apply_triu_tril<scalar_t, true>(result, self_c, false, k);
  });
  return result;
TORCH_IMPL_FUNC(triu_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
  compute_triu_tril<UpperTriangle>(self, k, result);
}

Tensor trace_backward(const Tensor& grad, IntArrayRef sizes) {

@@ -166,6 +178,5 @@ Tensor trace_backward(const Tensor& grad, IntArrayRef sizes) {
  return grad_input.view(sizes);
}

} // namespace native
} // namespace at
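The CPU rewrite above funnels triu and tril through one helper parameterized by a small "tag" struct carrying the op name and the upper/lower flag as constexpr members. The pattern in isolation (a sketch with invented tag names):

#include <cstdio>

struct Upper { static constexpr const char* name = "triu"; static constexpr bool upper = true; };
struct Lower { static constexpr const char* name = "tril"; static constexpr bool upper = false; };

template <typename Triangle>
void run() {
  // Both members are compile-time constants, so the branch can be folded away.
  std::printf("%s zeroes the %s triangle\n",
              Triangle::name, Triangle::upper ? "lower" : "upper");
}

int main() {
  run<Upper>();
  run<Lower>();
  return 0;
}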
@@ -1965,7 +1965,7 @@ REGISTER_CUDA_DISPATCH(lu_stub, &apply_lu);
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangular_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template <typename scalar_t>
static void apply_triangular_solve_batched_magma(Tensor& A, Tensor& b, bool left, bool upper, TransposeType transpose, bool unitriangular) {
static void apply_triangular_solve_batched_magma(const Tensor& A, const Tensor& b, bool left, bool upper, TransposeType transpose, bool unitriangular) {
#if !AT_MAGMA_ENABLED()
AT_ERROR("triangular_solve: MAGMA library not found in "
    "compilation. Please rebuild with MAGMA.");

@@ -2030,13 +2030,13 @@ AT_ERROR("triangular_solve: MAGMA library not found in "
#endif
}

void triangular_solve_batched_magma(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_batched_magma(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{
    apply_triangular_solve_batched_magma<scalar_t>(A, B, left, upper, transpose, unitriangular);
  });
}

void triangular_solve_kernel(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_kernel(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  // For batches smaller than 8 and matrix sizes larger than 64x64 cuBLAS forloop is faster than batched version
  if (batchCount(A) <= 8 && A.size(-1) >= 64) {
    triangular_solve_cublas(A, B, left, upper, transpose, unitriangular);
@@ -111,7 +111,7 @@ void lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, const Tensor& pi
}

template <typename scalar_t>
static void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
static void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
  const auto trans = to_cublas(transpose);
  cublasDiagType_t diag = unitriangular ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;

@@ -138,14 +138,14 @@ static void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper,
  }
}

void triangular_solve_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{
    apply_triangular_solve<scalar_t>(A, B, left, upper, transpose, unitriangular);
  });
}

template <typename scalar_t>
static void apply_triangular_solve_batched(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
static void apply_triangular_solve_batched(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
  const auto trans = to_cublas(transpose);
  cublasDiagType_t diag = unitriangular ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;

@@ -170,7 +170,7 @@ static void apply_triangular_solve_batched(Tensor& A, Tensor& B, bool left, bool
  at::cuda::blas::trsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A_ptr_array_data, lda, B_ptr_array_data, ldb, batch_size);
}

void triangular_solve_batched_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{
    apply_triangular_solve_batched<scalar_t>(A, B, left, upper, transpose, unitriangular);
  });
@@ -35,8 +35,8 @@ namespace at {
namespace native {

void geqrf_batched_cublas(const Tensor& input, const Tensor& tau);
void triangular_solve_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void triangular_solve_batched_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void triangular_solve_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void gels_batched_cublas(const Tensor& a, Tensor& b, Tensor& infos);
void lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, const Tensor& pivots, TransposeType transpose);
@@ -4,9 +4,10 @@
#include <ATen/native/BinaryOps.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAMathCompat.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/util/TypeSafeSignMath.h>

#include <type_traits>

@@ -98,7 +99,7 @@ void div_floor_kernel_cuda(TensorIteratorBase& iter) {
  } else if (isIntegralType(dtype, /*includeBool*/ false)) {
    AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cuda", [&]() {
      gpu_kernel_with_scalars(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
        if (!std::is_unsigned<scalar_t>::value && (a < 0) != (b < 0)) {
        if (c10::signs_differ(a, b)) {
          // Subtracts one from the results of truncation division if the
          // divisor and dividend have different sign(bit)s and the remainder of
          // the division is nonzero
@@ -4,6 +4,7 @@
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/BinaryOps.h>
#include <ATen/native/TensorIterator.h>
#include <c10/util/TypeSafeSignMath.h>

#include <type_traits>

@@ -17,7 +18,7 @@ void remainder_kernel_cuda(TensorIteratorBase& iter) {
  AT_DISPATCH_INTEGRAL_TYPES(iter.common_dtype(), "remainder_cuda", [&]() {
    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
      scalar_t r = a % b;
      if (!std::is_unsigned<scalar_t>::value && (r != 0) && ((r < 0) != (b < 0))) {
      if (r != 0 && c10::signs_differ(r, b)) {
        r += b;
      }
      return r;

@@ -28,7 +29,7 @@ void remainder_kernel_cuda(TensorIteratorBase& iter) {
    gpu_kernel_with_scalars(iter,
      []GPU_LAMBDA(scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
        auto mod = ::fmod(a, b);
        if (!std::is_unsigned<scalar_t>::value && (mod != 0) && ((b < 0) != (mod < 0))) {
        if (mod != 0 && c10::signs_differ(b, mod)) {
          mod += b;
        }
        return mod;
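Several kernels here replace the hand-written `!std::is_unsigned<scalar_t>::value && (a < 0) != (b < 0)` test with a `c10::signs_differ` helper pulled in from TypeSafeSignMath.h. A hedged sketch of such a helper (invented names, not the c10 implementation; requires C++17 for `if constexpr`):

#include <type_traits>

template <typename T>
constexpr bool sketch_is_negative(const T& x) {
  // For unsigned types this is always false, avoiding the
  // "comparison of unsigned expression < 0" warning.
  if constexpr (std::is_unsigned<T>::value) {
    return false;
  } else {
    return x < T(0);
  }
}

template <typename T, typename U>
constexpr bool sketch_signs_differ(const T& a, const U& b) {
  return sketch_is_negative(a) != sketch_is_negative(b);
}

static_assert(!sketch_signs_differ(3u, 5), "an unsigned value is never negative");
static_assert(sketch_signs_differ(-3, 5), "mixed signs are detected");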
@@ -55,7 +55,7 @@ __global__ void triu_tril_kernel(
}

template <bool upper>
Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, const char* name) {
void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) {
  int64_t N = self.numel();
  dim3 dim_block = cuda::getApplyBlock();
  dim3 dim_grid((N + dim_block.x - 1) / dim_block.x);

@@ -76,31 +76,18 @@ Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, c
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }
  });
  return result;
}

Tensor& tril_cuda_(Tensor &self, int64_t k) {
  return tril_cuda_out(self, k, self);
}

Tensor& tril_cuda_out(const Tensor& self, int64_t k, Tensor &result) {
  at::native::resize_output(result, self.sizes());
  if (self.numel() == 0) {
    return result;
TORCH_IMPL_FUNC(tril_cuda)(const Tensor& self, int64_t k, const Tensor &result) {
  if (self.numel() != 0) {
    triu_tril_cuda_template<false>(result, self, k, "tril");
  }
  return triu_tril_cuda_template<false>(result, self, k, "tril");
}

Tensor& triu_cuda_(Tensor &self, int64_t k) {
  return triu_cuda_out(self, k, self);
}

Tensor& triu_cuda_out(const Tensor& self, int64_t k, Tensor &result) {
  at::native::resize_output(result, self.sizes());
  if (self.numel() == 0) {
    return result;
TORCH_IMPL_FUNC(triu_cuda)(const Tensor& self, int64_t k, const Tensor &result) {
  if (self.numel() != 0) {
    triu_tril_cuda_template<true>(result, self, k, "triu");
  }
  return triu_tril_cuda_template<true>(result, self, k, "triu");
}

// Copy the kth diagonal of a matrix B to a vector A.
@@ -154,12 +154,6 @@ void nan_to_num_kernel_cuda(
}

void frexp_kernel_cuda(TensorIteratorBase& iter) {
#if defined(USE_ROCM)
  // Reference: https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP-MATH.html
  // https://github.com/ROCm-Developer-Tools/HIP/issues/2169
  // ROCm does not support frexp function yet
  TORCH_CHECK(false, "torch.frexp() is not implemented on ROCm platform.");
#else
  AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half,
    // The iter.dtype() here is the dtype of mantissa output.
    // It's a floating point type and must be the same as the input's dtype.

@@ -171,7 +165,6 @@ void frexp_kernel_cuda(TensorIteratorBase& iter) {
      return {mantissa, exponent};
    });
  });
#endif
}

REGISTER_DISPATCH(bitwise_not_stub, &bitwise_not_kernel_cuda);
@ -6,6 +6,7 @@
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Math.cuh>
#include <c10/util/TypeSafeSignMath.h>

#include <type_traits>

@ -38,8 +39,7 @@ void sign_kernel_cuda(TensorIteratorBase& iter){
} else {
AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "sign_cuda", [&]() {
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t {
scalar_t zero = scalar_t(0);
return (zero < a) - (a < zero);
return c10::signum(a);
});
});
}

@ -47,7 +47,7 @@ void sign_kernel_cuda(TensorIteratorBase& iter){

void signbit_kernel_cuda(TensorIteratorBase& iter){
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, ScalarType::Half, iter.input_dtype(), "signbit_cuda", [&]() {
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return !std::is_unsigned<scalar_t>::value && a < 0; });
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return is_negative(a); });
});
}

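The sign and signbit kernels above now delegate to helpers from c10/util/TypeSafeSignMath.h. A minimal sketch of the semantics being replaced; the helpers are assumed to generalize these expressions safely to unsigned types, which is the point of the header name.

#include <type_traits>

template <typename T>
T signum_reference(T a) {
  // -1, 0, or +1: exactly the (zero < a) - (a < zero) expression the old lambda used.
  return (T(0) < a) - (a < T(0));
}

template <typename T>
bool is_negative_reference(T a) {
  // Unsigned types can never be negative; this mirrors the old signbit lambda.
  return !std::is_unsigned<T>::value && a < T(0);
}
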
@ -645,6 +645,7 @@
|
|||
variants: function, method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: as_strided_
|
||||
|
||||
|
|
@ -3126,7 +3127,7 @@
|
|||
- func: mv(Tensor self, Tensor vec) -> Tensor
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CPU, CUDA, SparseCsrCUDA: mv
|
||||
CompositeExplicitAutograd: mv
|
||||
SparseCPU, SparseCUDA, SparseCsrCPU: mv_sparse
|
||||
|
||||
- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
|
||||
|
|
@ -3882,6 +3883,7 @@
|
|||
# this. If this `Variable` is a view, throws an `std::runtime_error()`.
|
||||
- func: detach_(Tensor(a!) self) -> Tensor(a!)
|
||||
variants: function, method
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: detach_
|
||||
|
||||
|
|
@ -3910,6 +3912,27 @@
|
|||
dispatch:
|
||||
CompositeExplicitAutograd: slice_backward
|
||||
|
||||
- func: slice_scatter(Tensor self, Tensor src, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor
|
||||
variants: function, method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: slice_scatter
|
||||
|
||||
- func: select_scatter(Tensor self, Tensor src, int dim, int index) -> Tensor
|
||||
variants: function, method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: select_scatter
|
||||
|
||||
- func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
|
||||
variants: function, method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: diagonal_scatter
|
||||
|
||||
- func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
|
||||
variants: function, method
|
||||
dispatch:
|
||||
|
|
@ -4014,6 +4037,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: squeeze_
|
||||
|
||||
|
|
@ -4021,6 +4045,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: squeeze_
|
||||
|
||||
|
|
@ -4028,6 +4053,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
|
||||
- func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
|
||||
variants: function, method
|
||||
|
|
@ -4259,6 +4285,7 @@
|
|||
device_check: NoCheck
|
||||
device_guard: False
|
||||
variants: method
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: t_
|
||||
|
||||
|
|
@ -4365,6 +4392,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: transpose_
|
||||
|
||||
|
|
@ -4501,6 +4529,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: unsqueeze_
|
||||
|
||||
|
|
@ -5303,7 +5332,7 @@
|
|||
- func: dequantize.self(Tensor self) -> Tensor
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CPU: dequantize_cpu
|
||||
CPU, CUDA: dequantize_cpu_or_cuda
|
||||
QuantizedCPU, QuantizedCUDA: dequantize_quantized
|
||||
|
||||
- func: dequantize.tensors(Tensor[] tensors) -> Tensor[]
|
||||
|
|
@ -5425,6 +5454,14 @@
|
|||
- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor)
|
||||
variants: function
|
||||
|
||||
- func: _autocast_to_reduced_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) -> Tensor(a)
|
||||
variants: method
|
||||
device_guard: False
|
||||
|
||||
- func: _autocast_to_full_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled) -> Tensor(a)
|
||||
variants: method
|
||||
device_guard: False
|
||||
|
||||
- func: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
|
|
@ -6098,16 +6135,12 @@
|
|||
CPU, CUDA: bitwise_right_shift
|
||||
|
||||
- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
|
||||
structured_delegate: tril.out
|
||||
variants: method
|
||||
dispatch:
|
||||
CPU: tril_cpu_
|
||||
CUDA: tril_cuda_
|
||||
|
||||
- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
|
||||
structured_delegate: triu.out
|
||||
variants: method
|
||||
dispatch:
|
||||
CPU: triu_cpu_
|
||||
CUDA: triu_cuda_
|
||||
|
||||
- func: digamma_(Tensor(a!) self) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
|
|
@ -6221,24 +6254,24 @@
|
|||
CPU, CUDA: cross
|
||||
|
||||
- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU: triu_cpu_out
|
||||
CUDA: triu_cuda_out
|
||||
CPU: triu_cpu
|
||||
CUDA: triu_cuda
|
||||
|
||||
- func: triu(Tensor self, int diagonal=0) -> Tensor
|
||||
structured_delegate: triu.out
|
||||
variants: method, function
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: triu
|
||||
|
||||
- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU: tril_cpu_out
|
||||
CUDA: tril_cuda_out
|
||||
CPU: tril_cpu
|
||||
CUDA: tril_cuda
|
||||
|
||||
- func: tril(Tensor self, int diagonal=0) -> Tensor
|
||||
structured_delegate: tril.out
|
||||
variants: method, function
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: tril
|
||||
|
||||
- func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
|
||||
dispatch:
|
||||
|
|
@ -6733,13 +6766,14 @@
|
|||
CUDA: legacy_lstsq_cuda
|
||||
|
||||
- func: triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU, CUDA: triangular_solve_out
|
||||
SparseCsrCUDA: triangular_solve_out_sparse_csr_cuda
|
||||
|
||||
- func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient)
|
||||
structured_delegate: triangular_solve.X
|
||||
variants: method, function
|
||||
dispatch:
|
||||
CPU, CUDA: triangular_solve
|
||||
|
||||
- func: symeig.e(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
|
||||
dispatch:
|
||||
|
|
@ -6786,6 +6820,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
|
||||
# swapdims, alias for transpose
|
||||
- func: swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
|
||||
|
|
@ -6797,6 +6832,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
|
||||
- func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
dispatch:
|
||||
|
|
|
|||
|
|
@ -54,7 +54,8 @@ Tensor quantize_per_channel(
auto quantizer = make_per_channel_affine_quantizer(scales, zero_points, axis, dtype);
return quantizer->quantize(self);
}
Tensor dequantize_cpu(const Tensor& self) {

Tensor dequantize_cpu_or_cuda(const Tensor& self) {
TORCH_CHECK(!self.is_quantized());
return self.to(at::kFloat);
}

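A hypothetical usage note for the dequantize_cpu_or_cuda path above: for a tensor that is not quantized, dequantize now simply converts to float on either device. The tensor contents and the arange call here are illustrative only.

#include <ATen/ATen.h>

void dequantize_passthrough_demo() {
  at::Tensor x = at::arange(4, at::kInt);   // a plain, non-quantized tensor
  at::Tensor y = at::dequantize(x);         // routed to dequantize_cpu_or_cuda
  TORCH_CHECK(y.scalar_type() == at::kFloat);
}
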
@ -349,10 +349,8 @@ Tensor qnnpack_avg_pool2d(
|
|||
pytorch_qnnp_operator_t qnnpack_operator{nullptr};
|
||||
const pytorch_qnnp_status createStatus =
|
||||
pytorch_qnnp_create_average_pooling2d_nhwc_q8(
|
||||
padH /* input_padding_top */,
|
||||
padW /* input_padding_right */,
|
||||
padH /* input_padding_bottom */,
|
||||
padW /* input_padding_left */,
|
||||
padH /* input_padding_height */,
|
||||
padW /* input_padding_width */,
|
||||
kH /* kernel height */,
|
||||
kW /* kernel width */,
|
||||
dH /* stride height */,
|
||||
|
|
|
|||
|
|
@ -41,18 +41,13 @@ static void convolution_q8(benchmark::State& state, const char* net, bool per_ch
const size_t inputPixelStride = groups * groupInputChannels;
const size_t effectiveKernelHeight = (kernelHeight - 1) * dilation + 1;
const size_t effectiveKernelWidth = (kernelWidth - 1) * dilation + 1;
const size_t paddingLeft = effectiveKernelWidth / 2;
const size_t paddingTop = effectiveKernelHeight / 2;
const size_t paddingRight = effectiveKernelWidth - 1 - paddingLeft;
const size_t paddingBottom = effectiveKernelHeight - 1 - paddingTop;
const size_t paddingWidth = effectiveKernelWidth / 2;
const size_t paddingHeight = effectiveKernelHeight / 2;
const size_t outputHeight =
(paddingTop + inputHeight + paddingBottom - effectiveKernelHeight) /
subsampling +
(inputHeight + paddingHeight * 2 - effectiveKernelHeight) / subsampling +
1;
const size_t outputWidth =
(paddingLeft + inputWidth + paddingRight - effectiveKernelWidth) /
subsampling +
1;
(inputWidth + paddingWidth * 2 - effectiveKernelWidth) / subsampling + 1;

std::vector<uint8_t> input(
batchSize * inputHeight * inputWidth * inputPixelStride);

@ -78,10 +73,8 @@ static void convolution_q8(benchmark::State& state, const char* net, bool per_ch
std::vector<float> requantization_scale(
num_zero_points_padded, 0.5 * 0.5 / 0.5);
status = pytorch_qnnp_create_convolution2d_nhwc_q8(
paddingTop,
paddingRight,
paddingBottom,
paddingLeft,
paddingHeight,
paddingWidth,
kernelHeight,
kernelWidth,
subsampling,

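The benchmark above replaces four asymmetric padding values with a symmetric paddingHeight/paddingWidth pair, and its output-size arithmetic reduces to the familiar convolution formula. A small sketch under the same symmetric-padding convention:

#include <cstddef>

static inline size_t conv_output_dimension(
    size_t input, size_t padding, size_t kernel, size_t dilation, size_t subsampling) {
  const size_t effective_kernel = (kernel - 1) * dilation + 1;
  // Matches (input + padding * 2 - effectiveKernel) / subsampling + 1 above.
  return (input + padding * 2 - effective_kernel) / subsampling + 1;
}
// e.g. conv_output_dimension(13, 1, 3, 1, 1) == 13
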
@ -40,10 +40,8 @@ enum pytorch_qnnp_status pytorch_qnnp_deinitialize(void);
|
|||
typedef struct pytorch_qnnp_operator* pytorch_qnnp_operator_t;
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t kernel_height,
|
||||
uint32_t kernel_width,
|
||||
uint32_t subsampling_height,
|
||||
|
|
@ -66,12 +64,9 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
|
|||
pytorch_qnnp_operator_t* convolution);
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
||||
uint32_t input_padding_front,
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_back,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_depth,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t kernel_depth,
|
||||
uint32_t kernel_height,
|
||||
uint32_t kernel_width,
|
||||
|
|
@ -120,10 +115,8 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_convolution_ndhwc_q8(
|
|||
pthreadpool_t threadpool);
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t adjustment_height,
|
||||
uint32_t adjustment_width,
|
||||
uint32_t kernel_height,
|
||||
|
|
@ -227,10 +220,8 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_global_average_pooling_nwc_q8(
|
|||
size_t output_stride);
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t pooling_height,
|
||||
uint32_t pooling_width,
|
||||
uint32_t stride_height,
|
||||
|
|
@ -257,10 +248,8 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_average_pooling2d_nhwc_q8(
|
|||
pthreadpool_t threadpool);
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_max_pooling2d_nhwc_u8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t pooling_height,
|
||||
uint32_t pooling_width,
|
||||
uint32_t stride_height,
|
||||
|
|
|
|||
|
|
@ -30,10 +30,8 @@ static inline size_t compute_output_dimension(
|
|||
}
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t pooling_height,
|
||||
uint32_t pooling_width,
|
||||
uint32_t stride_height,
|
||||
|
|
@ -145,8 +143,7 @@ enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
goto error;
}

const bool any_padding = (input_padding_left | input_padding_top |
input_padding_right | input_padding_bottom) != 0;
const bool any_padding = (input_padding_width | input_padding_height) != 0;
const uint32_t kr = pytorch_qnnp_params.q8avgpool.kr;
const uint32_t mr = pytorch_qnnp_params.q8avgpool.mr;
const uint32_t qr = pytorch_qnnp_params.q8avgpool.qr;

@ -162,11 +159,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
average_pooling->zero_pointer = zero_buffer;
}

average_pooling->input_padding_top = input_padding_top;
average_pooling->input_padding_right = input_padding_right;
average_pooling->input_padding_bottom = input_padding_bottom;
average_pooling->input_padding_left = input_padding_left;
average_pooling->input_padding_height = input_padding_height;
average_pooling->input_padding_width = input_padding_width;
average_pooling->kernel_height = pooling_height;
average_pooling->kernel_width = pooling_width;
average_pooling->stride_height = stride_height;

@ -239,13 +233,11 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_average_pooling2d_nhwc_q8(
average_pooling->input_pixel_stride = input_pixel_stride;

average_pooling->output_height = compute_output_dimension(
average_pooling->input_padding_top + input_height +
average_pooling->input_padding_bottom,
input_height + average_pooling->input_padding_height * 2,
average_pooling->kernel_height,
average_pooling->stride_height);
average_pooling->output_width = compute_output_dimension(
average_pooling->input_padding_left + input_width +
average_pooling->input_padding_right,
input_width + average_pooling->input_padding_width * 2,
average_pooling->kernel_width,
average_pooling->stride_width);
average_pooling->output = output;

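The setup code above now feeds compute_output_dimension the symmetrically padded extent (input + padding * 2). The helper's body lives earlier in average-pooling.c and is not shown in this hunk, so the following is a sketch of the standard pooling formula it is assumed to apply:

#include <cstddef>

static inline size_t pooling_output_dimension(
    size_t padded_input, size_t kernel, size_t stride) {
  // padded_input already includes the padding on both sides.
  return (padded_input - kernel) / stride + 1;
}
// e.g. pooling_output_dimension(4 + 1 * 2, 2, 2) == 3
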
@ -38,10 +38,8 @@ static inline size_t compute_output_dimension(
|
|||
}
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t kernel_height,
|
||||
uint32_t kernel_width,
|
||||
uint32_t subsampling_height,
|
||||
|
|
@ -64,11 +62,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
|
|||
pytorch_qnnp_operator_t* convolution_out) {
|
||||
return pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
||||
0,
|
||||
input_padding_top,
|
||||
input_padding_right,
|
||||
0,
|
||||
input_padding_bottom,
|
||||
input_padding_left,
|
||||
input_padding_height,
|
||||
input_padding_width,
|
||||
1,
|
||||
kernel_height,
|
||||
kernel_width,
|
||||
|
|
@ -95,12 +90,9 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
|
|||
}
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
||||
uint32_t input_padding_front,
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_back,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_depth,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t kernel_depth,
|
||||
uint32_t kernel_height,
|
||||
uint32_t kernel_width,
|
||||
|
|
@ -190,52 +182,43 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
|||
subsampling_height);
|
||||
}
|
||||
|
||||
if (input_padding_top >= kernel_height) {
|
||||
if (input_padding_depth >= kernel_depth) {
|
||||
pytorch_qnnp_log_info(
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32 "x%" PRIu32
|
||||
" kernel and %" PRIu32 "+%" PRIu32
|
||||
" depth padding: "
|
||||
"input depth padding is greater or equal to kernel depth",
|
||||
kernel_depth,
|
||||
kernel_height,
|
||||
kernel_width,
|
||||
input_padding_depth,
|
||||
input_padding_depth);
|
||||
}
|
||||
|
||||
if (input_padding_height >= kernel_height) {
|
||||
pytorch_qnnp_log_info(
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32 "x%" PRIu32
|
||||
" kernel and %" PRIu32 "+%" PRIu32
|
||||
" height padding: "
|
||||
"input top padding is greater or equal to kernel height",
|
||||
kernel_width,
|
||||
"input height padding is greater or equal to kernel height",
|
||||
kernel_depth,
|
||||
kernel_height,
|
||||
input_padding_top,
|
||||
input_padding_bottom);
|
||||
kernel_width,
|
||||
input_padding_height,
|
||||
input_padding_height);
|
||||
}
|
||||
|
||||
if (input_padding_bottom >= kernel_height) {
|
||||
if (input_padding_width >= kernel_width) {
|
||||
pytorch_qnnp_log_info(
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
|
||||
" kernel and %" PRIu32 "+%" PRIu32
|
||||
" height padding: "
|
||||
"input bottom padding is greater or equal to kernel height",
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
input_padding_top,
|
||||
input_padding_bottom);
|
||||
}
|
||||
|
||||
if (input_padding_right >= kernel_width) {
|
||||
pytorch_qnnp_log_info(
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32 "x%" PRIu32
|
||||
" kernel and %" PRIu32 "+%" PRIu32
|
||||
" width padding: "
|
||||
"input right padding is greater or equal to kernel width",
|
||||
kernel_width,
|
||||
"input width padding is greater or equal to kernel width",
|
||||
kernel_depth,
|
||||
kernel_height,
|
||||
input_padding_left,
|
||||
input_padding_right);
|
||||
}
|
||||
|
||||
if (input_padding_left >= kernel_width) {
|
||||
pytorch_qnnp_log_info(
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
|
||||
" kernel and %" PRIu32 "+%" PRIu32
|
||||
" width padding: "
|
||||
"input left padding is greater or equal to kernel width",
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
input_padding_left,
|
||||
input_padding_right);
|
||||
input_padding_width,
|
||||
input_padding_width);
|
||||
}
|
||||
|
||||
for (int i = 0; i < groups * group_output_channels; ++i) {
|
||||
|
|
@ -262,8 +245,7 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
|||
|
||||
enum pytorch_qnnp_ukernel_type ukernel_type = pytorch_qnnp_ukernel_type_none;
|
||||
const bool any_padding =
|
||||
(input_padding_front | input_padding_left | input_padding_top |
|
||||
input_padding_back | input_padding_right | input_padding_bottom) != 0;
|
||||
(input_padding_depth | input_padding_height | input_padding_width) != 0;
|
||||
if ((kernel_size == 9 || kernel_size == 25) && group_input_channels == 1 &&
|
||||
group_output_channels == 1 && groups > 1) {
|
||||
ukernel_type = pytorch_qnnp_ukernel_type_dwconv;
|
||||
|
|
@ -503,13 +485,9 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
|||
convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);
|
||||
}
|
||||
|
||||
convolution->input_padding_front = input_padding_front;
|
||||
convolution->input_padding_top = input_padding_top;
|
||||
convolution->input_padding_right = input_padding_right;
|
||||
convolution->input_padding_back = input_padding_back;
|
||||
convolution->input_padding_bottom = input_padding_bottom;
|
||||
convolution->input_padding_left = input_padding_left;
|
||||
|
||||
convolution->input_padding_depth = input_padding_depth;
|
||||
convolution->input_padding_height = input_padding_height;
|
||||
convolution->input_padding_width = input_padding_width;
|
||||
convolution->kernel_depth = kernel_depth;
|
||||
convolution->kernel_height = kernel_height;
|
||||
convolution->kernel_width = kernel_width;
|
||||
|
|
@ -615,20 +593,17 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_convolution_ndhwc_q8(
|
|||
convolution->input_pixel_stride = input_pixel_stride;
|
||||
|
||||
convolution->output_depth = compute_output_dimension(
|
||||
convolution->input_padding_front + input_depth +
|
||||
convolution->input_padding_back,
|
||||
input_depth + convolution->input_padding_depth * 2,
|
||||
convolution->kernel_depth,
|
||||
convolution->dilation_depth,
|
||||
convolution->stride_depth);
|
||||
convolution->output_height = compute_output_dimension(
|
||||
convolution->input_padding_top + input_height +
|
||||
convolution->input_padding_bottom,
|
||||
input_height + convolution->input_padding_height * 2,
|
||||
convolution->kernel_height,
|
||||
convolution->dilation_height,
|
||||
convolution->stride_height);
|
||||
convolution->output_width = compute_output_dimension(
|
||||
convolution->input_padding_left + input_width +
|
||||
convolution->input_padding_right,
|
||||
input_width + convolution->input_padding_width * 2,
|
||||
convolution->kernel_width,
|
||||
convolution->dilation_width,
|
||||
convolution->stride_width);
|
||||
|
|
|
|||
|
|
@ -131,14 +131,14 @@ enum pytorch_qnnp_status qnnpackDeConv(
// Setup the kernel
const size_t output_width = compute_output_dimension(
input_width,
deconvolution->input_padding_left + deconvolution->input_padding_right,
deconvolution->input_padding_width * 2,
deconvolution->adjustment_width,
kernel_width,
deconvolution->dilation_width,
deconvolution->stride_width);
const size_t output_height = compute_output_dimension(
input_height,
deconvolution->input_padding_top + deconvolution->input_padding_bottom,
deconvolution->input_padding_height * 2,
deconvolution->adjustment_height,
kernel_height,
deconvolution->dilation_height,

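For the deconvolution setup above, compute_output_dimension now receives the doubled symmetric padding. Its body is not part of this hunk, so the following is a sketch of the usual transposed-convolution size formula under that assumption:

#include <cstddef>

static inline size_t deconv_output_dimension(
    size_t input, size_t padding_total, size_t adjustment,
    size_t kernel, size_t dilation, size_t stride) {
  const size_t effective_kernel = (kernel - 1) * dilation + 1;
  return stride * (input - 1) + adjustment + effective_kernel - padding_total;
}
// e.g. deconv_output_dimension(4, 2, 0, 3, 1, 2) == 7
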
@ -36,10 +36,8 @@ static inline size_t compute_output_dimension(
|
|||
}
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t adjustment_height,
|
||||
uint32_t adjustment_width,
|
||||
uint32_t kernel_height,
|
||||
|
|
@ -181,10 +179,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
|
|||
deconvolution->zero_buffer = zero_buffer;
|
||||
deconvolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);
|
||||
|
||||
deconvolution->input_padding_top = input_padding_top;
|
||||
deconvolution->input_padding_right = input_padding_right;
|
||||
deconvolution->input_padding_bottom = input_padding_bottom;
|
||||
deconvolution->input_padding_left = input_padding_left;
|
||||
deconvolution->input_padding_height = input_padding_height;
|
||||
deconvolution->input_padding_width = input_padding_width;
|
||||
deconvolution->adjustment_height = adjustment_height;
|
||||
deconvolution->adjustment_width = adjustment_width;
|
||||
|
||||
|
|
@ -266,8 +262,7 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_deconvolution2d_nhwc_q8(
|
|||
const size_t output_height = deconvolution->output_height =
|
||||
compute_output_dimension(
|
||||
input_height,
|
||||
deconvolution->input_padding_top +
|
||||
deconvolution->input_padding_bottom,
|
||||
deconvolution->input_padding_height * 2,
|
||||
deconvolution->adjustment_height,
|
||||
kernel_height,
|
||||
deconvolution->dilation_height,
|
||||
|
|
@ -275,8 +270,7 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_deconvolution2d_nhwc_q8(
|
|||
const size_t output_width = deconvolution->output_width =
|
||||
compute_output_dimension(
|
||||
input_width,
|
||||
deconvolution->input_padding_left +
|
||||
deconvolution->input_padding_right,
|
||||
deconvolution->input_padding_width * 2,
|
||||
deconvolution->adjustment_width,
|
||||
kernel_width,
|
||||
deconvolution->dilation_width,
|
||||
|
|
|
|||
|
|
@ -40,9 +40,9 @@ void pytorch_qnnp_indirection_init_conv3d(
|
|||
const size_t dilation_depth = op->dilation_depth;
|
||||
const size_t dilation_height = op->dilation_height;
|
||||
const size_t dilation_width = op->dilation_width;
|
||||
const size_t input_padding_front = op->input_padding_front;
|
||||
const size_t input_padding_top = op->input_padding_top;
|
||||
const size_t input_padding_left = op->input_padding_left;
|
||||
const size_t input_padding_depth = op->input_padding_depth;
|
||||
const size_t input_padding_height = op->input_padding_height;
|
||||
const size_t input_padding_width = op->input_padding_width;
|
||||
|
||||
const size_t output_size = output_depth * output_height * output_width;
|
||||
const size_t kernel_size = kernel_depth * kernel_height * kernel_width;
|
||||
|
|
@ -70,16 +70,16 @@ void pytorch_qnnp_indirection_init_conv3d(
|
|||
|
||||
for (size_t kernel_z = 0; kernel_z < kernel_depth; kernel_z++) {
|
||||
const size_t input_z = output_z * stride_depth +
|
||||
kernel_z * dilation_depth - input_padding_front;
|
||||
kernel_z * dilation_depth - input_padding_depth;
|
||||
if (input_z < input_depth) {
|
||||
for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
|
||||
const size_t input_y = output_y * stride_height +
|
||||
kernel_y * dilation_height - input_padding_top;
|
||||
kernel_y * dilation_height - input_padding_height;
|
||||
if (input_y < input_height) {
|
||||
for (size_t kernel_x = 0; kernel_x < kernel_width;
|
||||
kernel_x++) {
|
||||
const size_t input_x = output_x * stride_width +
|
||||
kernel_x * dilation_width - input_padding_left;
|
||||
kernel_x * dilation_width - input_padding_width;
|
||||
const size_t index = (group * batch_size + image) *
|
||||
tiled_output_size * kernel_size +
|
||||
output_tile_start * kernel_size +
|
||||
|
|
@ -154,19 +154,19 @@ void pytorch_qnnp_indirection_init_dwconv2d(
const size_t stride_width = op->stride_width;
const size_t dilation_height = op->dilation_height;
const size_t dilation_width = op->dilation_width;
const size_t input_padding_top = op->input_padding_top;
const size_t input_padding_left = op->input_padding_left;
const size_t input_padding_height = op->input_padding_height;
const size_t input_padding_width = op->input_padding_width;

for (size_t image = batch_start; image < batch_size; image++) {
for (size_t output_y = 0; output_y < output_height; output_y++) {
for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
const size_t input_y = output_y * stride_height +
kernel_y * dilation_height - input_padding_top;
kernel_y * dilation_height - input_padding_height;
if (input_y < input_height) {
for (size_t output_x = 0; output_x < output_width; output_x++) {
for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
const size_t input_x = output_x * stride_width +
kernel_x * dilation_width - input_padding_left;
kernel_x * dilation_width - input_padding_width;
const size_t index =
(image * output_height + output_y) * step_height +
output_x * step_width * kernel_height +

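The indirection code above subtracts the padding from an unsigned coordinate and then checks only input_y < input_height. That single comparison suffices because the subtraction wraps around for would-be negative values; a minimal sketch of the idiom:

#include <cstddef>

static inline bool padded_coordinate_in_bounds(
    size_t output, size_t stride, size_t kernel, size_t dilation,
    size_t padding, size_t input_extent) {
  const size_t input = output * stride + kernel * dilation - padding;
  // If the subtraction went "negative", input wrapped to a huge value,
  // so this one test also rejects coordinates that land in the padding.
  return input < input_extent;
}
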
@ -217,8 +217,8 @@ void pytorch_qnnp_indirection_init_deconv2d(
|
|||
const size_t stride_width = op->stride_width;
|
||||
const size_t dilation_height = op->dilation_height;
|
||||
const size_t dilation_width = op->dilation_width;
|
||||
const size_t input_padding_top = op->input_padding_top;
|
||||
const size_t input_padding_left = op->input_padding_left;
|
||||
const size_t input_padding_height = op->input_padding_height;
|
||||
const size_t input_padding_width = op->input_padding_width;
|
||||
|
||||
const size_t output_size = output_height * output_width;
|
||||
const size_t kernel_size = kernel_height * kernel_width;
|
||||
|
|
@ -237,11 +237,11 @@ void pytorch_qnnp_indirection_init_deconv2d(
|
|||
const size_t output_x = output_index % output_width;
|
||||
for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
|
||||
const size_t y =
|
||||
output_y + input_padding_top - kernel_y * dilation_height;
|
||||
output_y + input_padding_height - kernel_y * dilation_height;
|
||||
const size_t input_y = y / stride_height;
|
||||
for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
|
||||
const size_t x =
|
||||
output_x + input_padding_left - kernel_x * dilation_width;
|
||||
output_x + input_padding_width - kernel_x * dilation_width;
|
||||
const size_t input_x = x / stride_width;
|
||||
const size_t index = (group * batch_size + image) *
|
||||
tiled_output_size * kernel_size +
|
||||
|
|
@ -284,21 +284,21 @@ void pytorch_qnnp_indirection_init_maxpool2d(
const size_t stride_width = op->stride_width;
const size_t dilation_height = op->dilation_height;
const size_t dilation_width = op->dilation_width;
const size_t input_padding_top = op->input_padding_top;
const size_t input_padding_left = op->input_padding_left;
const size_t input_padding_height = op->input_padding_height;
const size_t input_padding_width = op->input_padding_width;

for (size_t image = batch_start; image < batch_size; image++) {
for (size_t output_y = 0; output_y < output_height; output_y++) {
for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
const size_t input_y =
doz(output_y * stride_height + pooling_y * dilation_height,
input_padding_top);
input_padding_height);
const size_t clamped_input_y = min(input_y, input_height - 1);
for (size_t output_x = 0; output_x < output_width; output_x++) {
for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
const size_t input_x =
doz(output_x * stride_width + pooling_x * dilation_width,
input_padding_left);
input_padding_width);
const size_t clamped_input_x = min(input_x, input_width - 1);
const size_t index =
(image * output_height + output_y) * step_height +

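The max-pooling indirection above clamps padded coordinates with doz ("difference or zero") before the min() clamp to the last valid row or column. A sketch of the assumed helper, which saturates the subtraction at zero:

#include <cstddef>

static inline size_t doz_reference(size_t a, size_t b) {
  return a >= b ? a - b : 0;
}
// doz_reference(3, 5) == 0, doz_reference(5, 3) == 2
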
@ -35,10 +35,8 @@ static inline size_t compute_output_dimension(
|
|||
}
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_max_pooling2d_nhwc_u8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t pooling_height,
|
||||
uint32_t pooling_width,
|
||||
uint32_t stride_height,
|
||||
|
|
@ -117,10 +115,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_max_pooling2d_nhwc_u8(
|
|||
goto error;
|
||||
}
|
||||
|
||||
max_pooling->input_padding_top = input_padding_top;
|
||||
max_pooling->input_padding_right = input_padding_right;
|
||||
max_pooling->input_padding_bottom = input_padding_bottom;
|
||||
max_pooling->input_padding_left = input_padding_left;
|
||||
max_pooling->input_padding_height = input_padding_height;
|
||||
max_pooling->input_padding_width = input_padding_width;
|
||||
|
||||
max_pooling->kernel_height = pooling_height;
|
||||
max_pooling->kernel_width = pooling_width;
|
||||
|
|
@ -180,14 +176,12 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_max_pooling2d_nhwc_u8(
|
|||
max_pooling->input_pixel_stride = input_pixel_stride;
|
||||
|
||||
max_pooling->output_height = compute_output_dimension(
|
||||
max_pooling->input_padding_top + input_height +
|
||||
max_pooling->input_padding_bottom,
|
||||
input_height + max_pooling->input_padding_height * 2,
|
||||
max_pooling->kernel_height,
|
||||
max_pooling->dilation_height,
|
||||
max_pooling->stride_height);
|
||||
max_pooling->output_width = compute_output_dimension(
|
||||
max_pooling->input_padding_left + input_width +
|
||||
max_pooling->input_padding_right,
|
||||
input_width + max_pooling->input_padding_width * 2,
|
||||
max_pooling->kernel_width,
|
||||
max_pooling->dilation_width,
|
||||
max_pooling->stride_width);
|
||||
|
|
|
|||
|
|
@ -47,12 +47,9 @@ typedef struct {

struct pytorch_qnnp_operator {
size_t batch_size;
uint32_t input_padding_front;
uint32_t input_padding_back;
uint32_t input_padding_top;
uint32_t input_padding_right;
uint32_t input_padding_bottom;
uint32_t input_padding_left;
uint32_t input_padding_depth;
uint32_t input_padding_height;
uint32_t input_padding_width;
uint32_t adjustment_height;
uint32_t adjustment_width;
uint32_t kernel_depth;

@ -22,69 +22,35 @@
|
|||
class AveragePoolingOperatorTester {
|
||||
public:
|
||||
inline AveragePoolingOperatorTester& padding(uint32_t padding) {
|
||||
this->paddingTop_ = padding;
|
||||
this->paddingRight_ = padding;
|
||||
this->paddingBottom_ = padding;
|
||||
this->paddingLeft_ = padding;
|
||||
this->paddingHeight_ = padding;
|
||||
this->paddingWidth_ = padding;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& padding(
|
||||
uint32_t paddingHeight,
|
||||
uint32_t paddingWidth) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingHeight(uint32_t paddingHeight) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingWidth(uint32_t paddingWidth) {
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingTop(uint32_t paddingTop) {
|
||||
this->paddingTop_ = paddingTop;
|
||||
return *this;
|
||||
inline uint32_t paddingHeight() const {
|
||||
return this->paddingHeight_;
|
||||
}
|
||||
|
||||
inline uint32_t paddingTop() const {
|
||||
return this->paddingTop_;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingRight(uint32_t paddingRight) {
|
||||
this->paddingRight_ = paddingRight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingRight() const {
|
||||
return this->paddingRight_;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingBottom(uint32_t paddingBottom) {
|
||||
this->paddingBottom_ = paddingBottom;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingBottom() const {
|
||||
return this->paddingBottom_;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingLeft(uint32_t paddingLeft) {
|
||||
this->paddingLeft_ = paddingLeft;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingLeft() const {
|
||||
return this->paddingLeft_;
|
||||
inline uint32_t paddingWidth() const {
|
||||
return this->paddingWidth_;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& inputSize(
|
||||
|
|
@ -211,8 +177,7 @@ class AveragePoolingOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t outputHeight() const {
|
||||
const size_t paddedInputHeight =
|
||||
paddingTop() + inputHeight() + paddingBottom();
|
||||
const size_t paddedInputHeight = inputHeight() + paddingHeight() * 2;
|
||||
if (paddedInputHeight <= poolingHeight()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -221,8 +186,7 @@ class AveragePoolingOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t outputWidth() const {
|
||||
const size_t paddedInputWidth =
|
||||
paddingLeft() + inputWidth() + paddingRight();
|
||||
const size_t paddedInputWidth = inputWidth() + paddingWidth() * 2;
|
||||
if (paddedInputWidth <= poolingWidth()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -303,7 +267,7 @@ class AveragePoolingOperatorTester {
|
|||
|
||||
inline size_t nextOutputHeight() const {
|
||||
const size_t paddedNextInputHeight =
|
||||
paddingTop() + nextInputHeight() + paddingBottom();
|
||||
nextInputHeight() + paddingHeight() * 2;
|
||||
if (paddedNextInputHeight <= poolingHeight()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -312,8 +276,7 @@ class AveragePoolingOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t nextOutputWidth() const {
|
||||
const size_t paddedNextInputWidth =
|
||||
paddingLeft() + nextInputWidth() + paddingRight();
|
||||
const size_t paddedNextInputWidth = nextInputWidth() + paddingWidth() * 2;
|
||||
if (paddedNextInputWidth <= poolingWidth()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -430,9 +393,9 @@ class AveragePoolingOperatorTester {
|
|||
for (size_t c = 0; c < channels(); c++) {
|
||||
double acc = 0.0f;
|
||||
for (size_t py = 0; py < poolingHeight(); py++) {
|
||||
const size_t iy = oy * strideHeight() + py - paddingTop();
|
||||
const size_t iy = oy * strideHeight() + py - paddingHeight();
|
||||
for (size_t px = 0; px < poolingWidth(); px++) {
|
||||
const size_t ix = ox * strideWidth() + px - paddingLeft();
|
||||
const size_t ix = ox * strideWidth() + px - paddingWidth();
|
||||
if (ix < inputWidth() && iy < inputHeight()) {
|
||||
acc += double(
|
||||
int32_t(input
|
||||
|
|
@ -482,10 +445,8 @@ class AveragePoolingOperatorTester {
|
|||
ASSERT_EQ(
|
||||
pytorch_qnnp_status_success,
|
||||
pytorch_qnnp_create_average_pooling2d_nhwc_q8(
|
||||
paddingTop(),
|
||||
paddingRight(),
|
||||
paddingBottom(),
|
||||
paddingLeft(),
|
||||
paddingHeight(),
|
||||
paddingWidth(),
|
||||
poolingHeight(),
|
||||
poolingWidth(),
|
||||
strideHeight(),
|
||||
|
|
@ -596,9 +557,9 @@ class AveragePoolingOperatorTester {
|
|||
for (size_t c = 0; c < channels(); c++) {
|
||||
double acc = 0.0f;
|
||||
for (size_t py = 0; py < poolingHeight(); py++) {
|
||||
const size_t iy = oy * strideHeight() + py - paddingTop();
|
||||
const size_t iy = oy * strideHeight() + py - paddingHeight();
|
||||
for (size_t px = 0; px < poolingWidth(); px++) {
|
||||
const size_t ix = ox * strideWidth() + px - paddingLeft();
|
||||
const size_t ix = ox * strideWidth() + px - paddingWidth();
|
||||
if (ix < inputWidth() && iy < inputHeight()) {
|
||||
acc += double(
|
||||
int32_t(input
|
||||
|
|
@ -648,10 +609,8 @@ class AveragePoolingOperatorTester {
|
|||
ASSERT_EQ(
|
||||
pytorch_qnnp_status_success,
|
||||
pytorch_qnnp_create_average_pooling2d_nhwc_q8(
|
||||
paddingTop(),
|
||||
paddingRight(),
|
||||
paddingBottom(),
|
||||
paddingLeft(),
|
||||
paddingHeight(),
|
||||
paddingWidth(),
|
||||
poolingHeight(),
|
||||
poolingWidth(),
|
||||
strideHeight(),
|
||||
|
|
@ -731,9 +690,9 @@ class AveragePoolingOperatorTester {
|
|||
for (size_t c = 0; c < channels(); c++) {
|
||||
double acc = 0.0f;
|
||||
for (size_t py = 0; py < poolingHeight(); py++) {
|
||||
const size_t iy = oy * strideHeight() + py - paddingTop();
|
||||
const size_t iy = oy * strideHeight() + py - paddingHeight();
|
||||
for (size_t px = 0; px < poolingWidth(); px++) {
|
||||
const size_t ix = ox * strideWidth() + px - paddingLeft();
|
||||
const size_t ix = ox * strideWidth() + px - paddingWidth();
|
||||
if (ix < nextInputWidth() && iy < nextInputHeight()) {
|
||||
acc += double(
|
||||
int32_t(input
|
||||
|
|
@ -847,10 +806,8 @@ class AveragePoolingOperatorTester {
|
|||
}
|
||||
|
||||
private:
|
||||
uint32_t paddingTop_{0};
|
||||
uint32_t paddingRight_{0};
|
||||
uint32_t paddingBottom_{0};
|
||||
uint32_t paddingLeft_{0};
|
||||
uint32_t paddingHeight_{0};
|
||||
uint32_t paddingWidth_{0};
|
||||
size_t inputHeight_{1};
|
||||
size_t inputWidth_{1};
|
||||
size_t channels_{1};
|
||||
|
|
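The tester's outputHeight()/outputWidth() above now derive the padded extent as input + padding * 2 and clamp the result to at least one output element; the else-branch is cut off in the hunks shown here, so the full expression below is a sketch under the usual pooling convention:

#include <cstddef>

static inline size_t expected_pooling_output(
    size_t input, size_t padding, size_t pooling, size_t stride) {
  const size_t padded_input = input + padding * 2;
  if (padded_input <= pooling) {
    return 1;  // mirrors the early return in outputHeight()/outputWidth()
  }
  return (padded_input - pooling) / stride + 1;
}
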
|
|||
|
|
@ -50,14 +50,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_1xM_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -65,7 +63,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_1xM_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_1xM_pool_with_stride) {
|
||||
|
|
@ -114,14 +111,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_Mx1_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 2; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -129,7 +124,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_Mx1_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_Mx1_pool_with_stride) {
|
||||
|
|
@ -433,14 +427,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_1xM_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -448,7 +440,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_1xM_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_1xM_pool_with_stride) {
|
||||
|
|
@ -500,14 +491,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_padding) {
|
|||
for (size_t poolSize = pytorch_qnnp_params.q8avgpool.mr + 1; poolSize <=
|
||||
pytorch_qnnp_params.q8avgpool.mr + pytorch_qnnp_params.q8avgpool.qr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -515,7 +504,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_stride) {
|
||||
|
|
@ -526,14 +514,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_stride) {
|
|||
for (size_t poolSize = pytorch_qnnp_params.q8avgpool.mr + 1; poolSize <=
|
||||
pytorch_qnnp_params.q8avgpool.mr + pytorch_qnnp_params.q8avgpool.qr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -541,7 +527,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_stride) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(
|
||||
|
|
@ -830,14 +815,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_1xM_pool_with_padding) {
|
|||
channels++) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -845,7 +828,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_1xM_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_1xM_pool_with_stride) {
|
||||
|
|
@ -891,14 +873,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_Mx1_pool_with_padding) {
|
|||
channels++) {
|
||||
for (size_t poolSize = 2; poolSize <= 2 * pytorch_qnnp_params.q8avgpool.kr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -906,7 +886,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_Mx1_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_Mx1_pool_with_stride) {
|
||||
|
|
|
|||
|
|
@ -27,69 +27,35 @@ using namespace qnnpack::testing;
|
|||
class ConvolutionOperatorTester {
|
||||
public:
|
||||
inline ConvolutionOperatorTester& padding(uint32_t padding) {
|
||||
this->paddingTop_ = padding;
|
||||
this->paddingRight_ = padding;
|
||||
this->paddingBottom_ = padding;
|
||||
this->paddingLeft_ = padding;
|
||||
this->paddingHeight_ = padding;
|
||||
this->paddingWidth_ = padding;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& padding(
|
||||
uint32_t paddingHeight,
|
||||
uint32_t paddingWidth) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingHeight(uint32_t paddingHeight) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingWidth(uint32_t paddingWidth) {
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingTop(uint32_t paddingTop) {
|
||||
this->paddingTop_ = paddingTop;
|
||||
return *this;
|
||||
inline uint32_t paddingWidth() const {
|
||||
return this->paddingWidth_;
|
||||
}
|
||||
|
||||
inline uint32_t paddingTop() const {
|
||||
return this->paddingTop_;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingRight(uint32_t paddingRight) {
|
||||
this->paddingRight_ = paddingRight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingRight() const {
|
||||
return this->paddingRight_;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingBottom(uint32_t paddingBottom) {
|
||||
this->paddingBottom_ = paddingBottom;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingBottom() const {
|
||||
return this->paddingBottom_;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingLeft(uint32_t paddingLeft) {
|
||||
this->paddingLeft_ = paddingLeft;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingLeft() const {
|
||||
return this->paddingLeft_;
|
||||
inline uint32_t paddingHeight() const {
|
||||
return this->paddingHeight_;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& inputSize(
|
||||
|
|
@ -325,8 +291,7 @@ class ConvolutionOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t outputHeight() const {
|
||||
const size_t paddedInputHeight =
|
||||
paddingTop() + inputHeight() + paddingBottom();
|
||||
const size_t paddedInputHeight = inputHeight() + paddingHeight() * 2;
|
||||
if (paddedInputHeight <= dilatedKernelHeight()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -336,8 +301,7 @@ class ConvolutionOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t outputWidth() const {
|
||||
const size_t paddedInputWidth =
|
||||
paddingLeft() + inputWidth() + paddingRight();
|
||||
const size_t paddedInputWidth = inputWidth() + paddingWidth() * 2;
|
||||
if (paddedInputWidth <= dilatedKernelWidth()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -437,11 +401,11 @@ class ConvolutionOperatorTester {
|
|||
for (size_t ox = 0; ox < outputWidth(); ox++) {
|
||||
for (size_t ky = 0; ky < kernelHeight(); ky++) {
|
||||
const size_t iy = oy * subsamplingHeight() +
|
||||
ky * dilationHeight() - paddingTop();
|
||||
ky * dilationHeight() - paddingHeight();
|
||||
if (iy < inputHeight()) {
|
||||
for (size_t kx = 0; kx < kernelWidth(); kx++) {
|
||||
const size_t ix = ox * subsamplingWidth() +
|
||||
kx * dilationWidth() - paddingLeft();
|
||||
kx * dilationWidth() - paddingWidth();
|
||||
if (ix < inputWidth()) {
|
||||
for (size_t g = 0; g < groups(); g++) {
|
||||
for (size_t oc = 0; oc < groupOutputChannels(); oc++) {
|
||||
|
|
@ -515,10 +479,8 @@ class ConvolutionOperatorTester {
|
|||
ASSERT_EQ(
|
||||
pytorch_qnnp_status_success,
|
||||
pytorch_qnnp_create_convolution2d_nhwc_q8(
|
||||
paddingTop(),
|
||||
paddingRight(),
|
||||
paddingBottom(),
|
||||
paddingLeft(),
|
||||
paddingHeight(),
|
||||
paddingWidth(),
|
||||
kernelHeight(),
|
||||
kernelWidth(),
|
||||
subsamplingHeight(),
|
||||
|
|
@ -641,10 +603,8 @@ class ConvolutionOperatorTester {
|
|||
}
|
||||
|
||||
private:
|
||||
uint32_t paddingTop_{0};
|
||||
uint32_t paddingRight_{0};
|
||||
uint32_t paddingBottom_{0};
|
||||
uint32_t paddingLeft_{0};
|
||||
uint32_t paddingHeight_{0};
|
||||
uint32_t paddingWidth_{0};
|
||||
size_t inputHeight_{1};
|
||||
size_t inputWidth_{1};
|
||||
uint32_t groups_{1};
|
||||
|
|
|
|||
|
|
@ -272,45 +272,27 @@ _STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_without_padding,
|
|||
.iterations(3)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_left_padding,
|
||||
_STATIC_AND_RUNTIME_TEST(
|
||||
CONVOLUTION_OP,
|
||||
3x3_with_width_padding,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingLeft(1)
|
||||
.paddingWidth(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
)
|
||||
.iterations(3))
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_right_padding,
|
||||
_STATIC_AND_RUNTIME_TEST(
|
||||
CONVOLUTION_OP,
|
||||
3x3_with_height_padding,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingRight(1)
|
||||
.paddingHeight(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_top_padding,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingTop(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_bottom_padding,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingBottom(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
)
|
||||
.iterations(3))
|
||||
|
||||
_STATIC_TEST(CONVOLUTION_OP, 3x3_with_input_stride,
|
||||
ConvolutionOperatorTester()
|
||||
|
|
@ -833,49 +815,29 @@ _STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_without_padding_per_channel,
|
|||
.per_channel(true)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_left_padding_per_channel,
|
||||
_STATIC_AND_RUNTIME_TEST(
|
||||
CONVOLUTION_OP,
|
||||
3x3_with_width_padding_per_channel,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingLeft(1)
|
||||
.paddingWidth(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
.per_channel(true)
|
||||
)
|
||||
.per_channel(true))
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_right_padding_per_channel,
|
||||
_STATIC_AND_RUNTIME_TEST(
|
||||
CONVOLUTION_OP,
|
||||
3x3_with_height_padding_per_channel,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingRight(1)
|
||||
.paddingHeight(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
.per_channel(true)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_top_padding_per_channel,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingTop(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
.per_channel(true)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_bottom_padding_per_channel,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingBottom(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
.per_channel(true)
|
||||
)
|
||||
.per_channel(true))
|
||||
|
||||
_STATIC_TEST(CONVOLUTION_OP, 3x3_with_input_stride_per_channel,
|
||||
ConvolutionOperatorTester()
|
||||
|
|
|
|||
|
|
@ -27,77 +27,35 @@ using namespace qnnpack::testing;
|
|||
class DeconvolutionOperatorTester {
|
||||
public:
|
||||
inline DeconvolutionOperatorTester& padding(uint32_t padding) {
|
||||
this->paddingTop_ = padding;
|
||||
this->paddingRight_ = padding;
|
||||
this->paddingBottom_ = padding;
|
||||
this->paddingLeft_ = padding;
|
||||
this->paddingHeight_ = padding;
|
||||
this->paddingWidth_ = padding;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& padding(
|
||||
uint32_t paddingHeight,
|
||||
uint32_t paddingWidth) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingHeight(uint32_t paddingHeight) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingHeight() const {
|
||||
return this->paddingTop_ + this->paddingBottom_;
|
||||
return this->paddingHeight_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingWidth(uint32_t paddingWidth) {
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingWidth() const {
|
||||
return this->paddingLeft_ + this->paddingRight_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingTop(uint32_t paddingTop) {
|
||||
this->paddingTop_ = paddingTop;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingTop() const {
|
||||
return this->paddingTop_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingRight(uint32_t paddingRight) {
|
||||
this->paddingRight_ = paddingRight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingRight() const {
|
||||
return this->paddingRight_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingBottom(uint32_t paddingBottom) {
|
||||
this->paddingBottom_ = paddingBottom;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingBottom() const {
|
||||
return this->paddingBottom_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingLeft(uint32_t paddingLeft) {
|
||||
this->paddingLeft_ = paddingLeft;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingLeft() const {
|
||||
return this->paddingLeft_;
|
||||
return this->paddingWidth_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& adjustmentHeight(
|
||||
|
|
@@ -353,12 +311,12 @@ class DeconvolutionOperatorTester {

inline size_t outputHeight() const {
return strideHeight() * (inputHeight() - 1) + adjustmentHeight() +
dilatedKernelHeight() - paddingHeight();
dilatedKernelHeight() - paddingHeight() * 2;
}

inline size_t outputWidth() const {
return strideWidth() * (inputWidth() - 1) + adjustmentWidth() +
dilatedKernelWidth() - paddingWidth();
dilatedKernelWidth() - paddingWidth() * 2;
}

inline DeconvolutionOperatorTester& qmin(uint8_t qmin) {
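For reference, the updated outputHeight()/outputWidth() arithmetic above now subtracts the symmetric padding twice. A minimal sketch of the same formula (illustrative Python, not the tester's API):

  def deconv_output_dim(input_dim, stride, kernel, dilation, padding, adjustment):
      dilated_kernel = (kernel - 1) * dilation + 1
      # mirrors: stride * (input - 1) + adjustment + dilated_kernel - padding * 2
      return stride * (input_dim - 1) + adjustment + dilated_kernel - 2 * padding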
@@ -451,11 +409,11 @@ class DeconvolutionOperatorTester {
for (size_t oy = 0; oy < outputHeight(); oy++) {
for (size_t ox = 0; ox < outputWidth(); ox++) {
for (size_t ky = 0; ky < kernelHeight(); ky++) {
const size_t y = oy + paddingTop() - ky * dilationHeight();
const size_t y = oy + paddingHeight() - ky * dilationHeight();
const size_t iy = y / strideHeight();
if (iy * strideHeight() == y && iy < inputHeight()) {
for (size_t kx = 0; kx < kernelWidth(); kx++) {
const size_t x = ox + paddingLeft() - kx * dilationWidth();
const size_t x = ox + paddingWidth() - kx * dilationWidth();
const size_t ix = x / strideWidth();
if (ix * strideWidth() == x && ix < inputWidth()) {
for (size_t g = 0; g < groups(); g++) {
@@ -531,10 +489,8 @@ class DeconvolutionOperatorTester {
ASSERT_EQ(
pytorch_qnnp_status_success,
pytorch_qnnp_create_deconvolution2d_nhwc_q8(
paddingTop(),
paddingRight(),
paddingBottom(),
paddingLeft(),
paddingHeight(),
paddingWidth(),
adjustmentHeight(),
adjustmentWidth(),
kernelHeight(),
@@ -674,10 +630,8 @@ class DeconvolutionOperatorTester {
}

 private:
uint32_t paddingTop_{0};
uint32_t paddingRight_{0};
uint32_t paddingBottom_{0};
uint32_t paddingLeft_{0};
uint32_t paddingHeight_{0};
uint32_t paddingWidth_{0};
size_t inputHeight_{1};
size_t inputWidth_{1};
uint32_t groups_{1};
@ -21,69 +21,35 @@
|
|||
class MaxPoolingOperatorTester {
|
||||
public:
|
||||
inline MaxPoolingOperatorTester& padding(uint32_t padding) {
|
||||
this->paddingTop_ = padding;
|
||||
this->paddingRight_ = padding;
|
||||
this->paddingBottom_ = padding;
|
||||
this->paddingLeft_ = padding;
|
||||
this->paddingHeight_ = padding;
|
||||
this->paddingWidth_ = padding;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& padding(
|
||||
uint32_t paddingHeight,
|
||||
uint32_t paddingWidth) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingHeight(uint32_t paddingHeight) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingWidth(uint32_t paddingWidth) {
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingTop(uint32_t paddingTop) {
|
||||
this->paddingTop_ = paddingTop;
|
||||
return *this;
|
||||
inline uint32_t paddingHeight() const {
|
||||
return this->paddingHeight_;
|
||||
}
|
||||
|
||||
inline uint32_t paddingTop() const {
|
||||
return this->paddingTop_;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingRight(uint32_t paddingRight) {
|
||||
this->paddingRight_ = paddingRight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingRight() const {
|
||||
return this->paddingRight_;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingBottom(uint32_t paddingBottom) {
|
||||
this->paddingBottom_ = paddingBottom;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingBottom() const {
|
||||
return this->paddingBottom_;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingLeft(uint32_t paddingLeft) {
|
||||
this->paddingLeft_ = paddingLeft;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingLeft() const {
|
||||
return this->paddingLeft_;
|
||||
inline uint32_t paddingWidth() const {
|
||||
return this->paddingWidth_;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& inputSize(
|
||||
|
|
@@ -255,8 +221,7 @@ class MaxPoolingOperatorTester {
}

inline size_t outputHeight() const {
const size_t paddedInputHeight =
paddingTop() + inputHeight() + paddingBottom();
const size_t paddedInputHeight = inputHeight() + paddingHeight() * 2;
if (paddedInputHeight <= dilatedPoolingHeight()) {
return 1;
} else {
@@ -265,8 +230,7 @@ class MaxPoolingOperatorTester {
}

inline size_t outputWidth() const {
const size_t paddedInputWidth =
paddingLeft() + inputWidth() + paddingRight();
const size_t paddedInputWidth = inputWidth() + paddingWidth() * 2;
if (paddedInputWidth <= dilatedPoolingWidth()) {
return 1;
} else {
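The pooling tester follows the same convention: the padded extent is now inputHeight() + paddingHeight() * 2 instead of paddingTop() + inputHeight() + paddingBottom(). A small sketch of the resulting output-size computation; the final return statement is the usual pooling arithmetic and is assumed here, since the hunk cuts off before the else branch:

  def pool_output_dim(input_dim, padding, pool, dilation, stride):
      padded = input_dim + 2 * padding
      dilated_pool = (pool - 1) * dilation + 1
      if padded <= dilated_pool:
          return 1
      # assumed continuation of the truncated else branch
      return (padded - dilated_pool) // stride + 1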
@ -344,7 +308,7 @@ class MaxPoolingOperatorTester {
|
|||
|
||||
inline size_t nextOutputHeight() const {
|
||||
const size_t paddedNextInputHeight =
|
||||
paddingTop() + nextInputHeight() + paddingBottom();
|
||||
nextInputHeight() + paddingHeight() * 2;
|
||||
if (paddedNextInputHeight <= dilatedPoolingHeight()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -354,8 +318,7 @@ class MaxPoolingOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t nextOutputWidth() const {
|
||||
const size_t paddedNextInputWidth =
|
||||
paddingLeft() + nextInputWidth() + paddingRight();
|
||||
const size_t paddedNextInputWidth = nextInputWidth() + paddingWidth() * 2;
|
||||
if (paddedNextInputWidth <= dilatedPoolingWidth()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@@ -429,11 +392,11 @@ class MaxPoolingOperatorTester {
for (size_t c = 0; c < channels(); c++) {
uint8_t maxValue = 0;
for (size_t py = 0; py < poolingHeight(); py++) {
const size_t iy =
oy * strideHeight() + py * dilationHeight() - paddingTop();
const size_t iy = oy * strideHeight() + py * dilationHeight() -
paddingHeight();
for (size_t px = 0; px < poolingWidth(); px++) {
const size_t ix =
ox * strideWidth() + px * dilationWidth() - paddingLeft();
const size_t ix = ox * strideWidth() + px * dilationWidth() -
paddingWidth();
if (ix < inputWidth() && iy < inputHeight()) {
maxValue = std::max(
maxValue,
@@ -462,10 +425,8 @@ class MaxPoolingOperatorTester {
ASSERT_EQ(
pytorch_qnnp_status_success,
pytorch_qnnp_create_max_pooling2d_nhwc_u8(
paddingTop(),
paddingRight(),
paddingBottom(),
paddingLeft(),
paddingHeight(),
paddingWidth(),
poolingHeight(),
poolingWidth(),
strideHeight(),
@ -569,11 +530,11 @@ class MaxPoolingOperatorTester {
|
|||
for (size_t c = 0; c < channels(); c++) {
|
||||
uint8_t maxValue = 0;
|
||||
for (size_t py = 0; py < poolingHeight(); py++) {
|
||||
const size_t iy =
|
||||
oy * strideHeight() + py * dilationHeight() - paddingTop();
|
||||
const size_t iy = oy * strideHeight() + py * dilationHeight() -
|
||||
paddingHeight();
|
||||
for (size_t px = 0; px < poolingWidth(); px++) {
|
||||
const size_t ix =
|
||||
ox * strideWidth() + px * dilationWidth() - paddingLeft();
|
||||
const size_t ix = ox * strideWidth() + px * dilationWidth() -
|
||||
paddingWidth();
|
||||
if (ix < inputWidth() && iy < inputHeight()) {
|
||||
maxValue = std::max(
|
||||
maxValue,
|
||||
|
|
@ -602,10 +563,8 @@ class MaxPoolingOperatorTester {
|
|||
ASSERT_EQ(
|
||||
pytorch_qnnp_status_success,
|
||||
pytorch_qnnp_create_max_pooling2d_nhwc_u8(
|
||||
paddingTop(),
|
||||
paddingRight(),
|
||||
paddingBottom(),
|
||||
paddingLeft(),
|
||||
paddingHeight(),
|
||||
paddingWidth(),
|
||||
poolingHeight(),
|
||||
poolingWidth(),
|
||||
strideHeight(),
|
||||
|
|
@ -680,11 +639,11 @@ class MaxPoolingOperatorTester {
|
|||
for (size_t c = 0; c < channels(); c++) {
|
||||
uint8_t maxValue = 0;
|
||||
for (size_t py = 0; py < poolingHeight(); py++) {
|
||||
const size_t iy =
|
||||
oy * strideHeight() + py * dilationHeight() - paddingTop();
|
||||
const size_t iy = oy * strideHeight() + py * dilationHeight() -
|
||||
paddingHeight();
|
||||
for (size_t px = 0; px < poolingWidth(); px++) {
|
||||
const size_t ix =
|
||||
ox * strideWidth() + px * dilationWidth() - paddingLeft();
|
||||
const size_t ix = ox * strideWidth() + px * dilationWidth() -
|
||||
paddingWidth();
|
||||
if (ix < nextInputWidth() && iy < nextInputHeight()) {
|
||||
maxValue = std::max(
|
||||
maxValue,
|
||||
|
|
@ -775,10 +734,8 @@ class MaxPoolingOperatorTester {
|
|||
}
|
||||
|
||||
private:
|
||||
uint32_t paddingTop_{0};
|
||||
uint32_t paddingRight_{0};
|
||||
uint32_t paddingBottom_{0};
|
||||
uint32_t paddingLeft_{0};
|
||||
uint32_t paddingHeight_{0};
|
||||
uint32_t paddingWidth_{0};
|
||||
size_t inputHeight_{1};
|
||||
size_t inputWidth_{1};
|
||||
size_t channels_{1};
|
||||
|
|
|
|||
|
|
@ -50,14 +50,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_small_1xM_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -134,14 +133,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_small_Mx1_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 2; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -335,14 +333,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_large_1xM_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -423,14 +420,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_padding) {
|
|||
for (size_t poolSize = pytorch_qnnp_params.u8maxpool.mr; poolSize <=
|
||||
pytorch_qnnp_params.u8maxpool.mr + pytorch_qnnp_params.u8maxpool.qr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -627,14 +623,13 @@ TEST(MAX_POOLING_OP, unit_batch_few_channels_1xM_pool_with_padding) {
|
|||
channels++) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -707,14 +702,13 @@ TEST(MAX_POOLING_OP, unit_batch_few_channels_Mx1_pool_with_padding) {
|
|||
channels++) {
|
||||
for (size_t poolSize = 2; poolSize <= 2 * pytorch_qnnp_params.u8maxpool.kr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
|
|||
|
|
@@ -183,12 +183,9 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1;
convolution->dilation_height = dilation_[kSpatialDim - 2];
convolution->dilation_width = dilation_[kSpatialDim - 1];
convolution->input_padding_top = padding_[kSpatialDim - 2];
convolution->input_padding_left = padding_[kSpatialDim - 1];
convolution->input_padding_bottom = padding_[kSpatialDim - 2];
convolution->input_padding_right = padding_[kSpatialDim - 1];
convolution->input_padding_front = kSpatialDim == 3 ? padding_[0] : 0;
convolution->input_padding_back = kSpatialDim == 3 ? padding_[0] : 0;
convolution->input_padding_height = padding_[kSpatialDim - 2];
convolution->input_padding_width = padding_[kSpatialDim - 1];
convolution->input_padding_depth = kSpatialDim == 3 ? padding_[0] : 0;
convolution->per_channel = is_per_channel;
convolution->transpose = transpose_;
@@ -298,10 +298,8 @@ void check_maxpool2d_params(

const pytorch_qnnp_status createStatus =
pytorch_qnnp_create_max_pooling2d_nhwc_u8(
padH /* input_padding_top */,
padW /* input_padding_right */,
padH /* input_padding_bottom */,
padW /* input_padding_left */,
padH /* input_padding_height */,
padW /* input_padding_width */,
kH /* pooling height */,
kW /* pooling width */,
strideH /* stride height */,
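The call above forwards the symmetric padH/padW pair into QNNPACK's quantized max-pooling operator. A hedged end-to-end sketch of the Python path that can reach this code when the QNNPACK backend is used (shapes and values are illustrative):

  import torch

  x = torch.rand(1, 3, 8, 8)
  qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.quint8)
  # padding is the symmetric (padH, padW) pair handed to pytorch_qnnp_create_max_pooling2d_nhwc_u8
  qy = torch.nn.quantized.functional.max_pool2d(qx, kernel_size=2, stride=2, padding=1)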
@@ -270,7 +270,7 @@ Tensor _sparse_csr_addmm(
Tensor add_sparse_csr(const Tensor& self, const Tensor& other, const Scalar& alpha) {
auto commonDtype = at::result_type(self, other);
alpha_check(commonDtype, alpha);
Tensor result = at::empty({0}, self.options().dtype(commonDtype));
Tensor result = at::empty({0, 0}, self.options().dtype(commonDtype));
return at::add_out(result, self, other, alpha); // redispatch!
}
@@ -147,5 +147,30 @@ Tensor& addmv_out_sparse_csr_cuda(
return result;
}

/*
Solves a system of linear equations whose coefficients are represented in a sparse triangular matrix A:
op(A) X = B.

Args:
* `B` - dense Tensor of size m × nrhs.
* `A` - sparse Tensor of size m × m.
* `upper` - controls whether upper or lower triangular part of A is considered in computations.
* `transpose` - if true then op(A) = A^T.
* `unitriangular` - if true then the diagonal elements of A are assumed to be one.
* `X` - dense Tensor of size m × nrhs.
* `clone_A` - cloned matrix A, required only for compatibility with strided layout interface.
*/
std::tuple<Tensor&, Tensor&> triangular_solve_out_sparse_csr_cuda(
const Tensor& B,
const Tensor& A,
bool upper,
bool transpose,
bool unitriangular,
Tensor& X,
Tensor& clone_A) {
sparse::impl::cuda::triangular_solve_out_sparse_csr(A, B, X, upper, transpose, unitriangular);
return std::tuple<Tensor&, Tensor&>(X, clone_A);
}

} // namespace native
} // namespace at
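The comment block above documents the new sparse-CSR entry point for triangular_solve. A dense illustration of the op(A) X = B semantics it implements (plain torch calls, not the sparse CUDA kernel; the second return value corresponds to clone_A):

  import torch

  A = torch.tensor([[2., 0.],
                    [1., 3.]])   # lower-triangular coefficients
  B = torch.tensor([[4.],
                    [7.]])
  X, _ = torch.triangular_solve(B, A, upper=False)
  # A @ X == B, so X == [[2.0000], [1.6667]]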
@ -1,8 +1,10 @@
|
|||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/cuda/CUDADataType.h>
|
||||
#include <ATen/cuda/CUDASparse.h>
|
||||
#include <ATen/cuda/CUDASparseBlas.h>
|
||||
#include <ATen/cuda/CUDASparseDescriptors.h>
|
||||
#include <ATen/native/LinearAlgebraUtils.h>
|
||||
#include <ATen/native/cuda/MiscUtils.h>
|
||||
#include <ATen/native/sparse/cuda/SparseBlasImpl.h>
|
||||
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
|
||||
|
||||
|
|
@ -452,6 +454,326 @@ void addmv_out_sparse_csr(
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
Computes C = alpha * A + beta * B
|
||||
|
||||
Args:
|
||||
* `A` - [in] sparse Tensor of size m × n.
|
||||
* `B` - [in] sparse Tensor of size m × n.
|
||||
* `C` - [out] sparse Tensor of size m × n.
|
||||
*/
|
||||
void add_out_sparse_csr(
|
||||
const at::sparse_csr::SparseCsrTensor& A,
|
||||
const at::sparse_csr::SparseCsrTensor& B,
|
||||
const Scalar& alpha,
|
||||
const Scalar& beta,
|
||||
const at::sparse_csr::SparseCsrTensor& C) {
|
||||
IntArrayRef A_sizes = A.sizes();
|
||||
auto ndim = A.dim();
|
||||
int m = at::native::cuda_int_cast(A_sizes[ndim - 2], "m");
|
||||
int n = at::native::cuda_int_cast(A_sizes[ndim - 1], "n");
|
||||
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.sizes().equals(B.sizes()) && A.sizes().equals(C.sizes()));
|
||||
|
||||
// Only 32-bit indices are supported
|
||||
auto A_32 = at::native::_sparse_csr_tensor_unsafe(
|
||||
A.crow_indices().to(kInt),
|
||||
A.col_indices().to(kInt),
|
||||
A.values(),
|
||||
A.sizes(),
|
||||
A.scalar_type(),
|
||||
A.layout(),
|
||||
A.device());
|
||||
auto B_32 = at::native::_sparse_csr_tensor_unsafe(
|
||||
B.crow_indices().to(kInt),
|
||||
B.col_indices().to(kInt),
|
||||
B.values(),
|
||||
B.sizes(),
|
||||
B.scalar_type(),
|
||||
B.layout(),
|
||||
B.device());
|
||||
|
||||
// Modify C tensor in-place to swap indices tensors with 32-bit variants
|
||||
indices_to_32_bit_inplace(C);
|
||||
|
||||
int nnzA = at::native::cuda_int_cast(A_32._nnz(), "nnzA");
|
||||
int nnzB = at::native::cuda_int_cast(B_32._nnz(), "nnzB");
|
||||
|
||||
auto desc = at::cuda::sparse::CuSparseMatDescriptor();
|
||||
|
||||
auto A_crow_indices = A_32.crow_indices();
|
||||
auto B_crow_indices = B_32.crow_indices();
|
||||
auto C_crow_indices = C.crow_indices();
|
||||
auto A_crow_indices_ptr = A_crow_indices.data_ptr<int>();
|
||||
auto B_crow_indices_ptr = B_crow_indices.data_ptr<int>();
|
||||
auto C_crow_indices_ptr = C_crow_indices.data_ptr<int>();
|
||||
|
||||
auto A_col_indices = A_32.col_indices();
|
||||
auto B_col_indices = B_32.col_indices();
|
||||
auto C_col_indices = C.col_indices();
|
||||
auto A_col_indices_ptr = A_col_indices.data_ptr<int>();
|
||||
auto B_col_indices_ptr = B_col_indices.data_ptr<int>();
|
||||
auto C_col_indices_ptr = C_col_indices.data_ptr<int>();
|
||||
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
|
||||
C.scalar_type(), "add_out_sparse_csr_cuda_impl", [&] {
|
||||
auto beta_ = beta.to<scalar_t>();
|
||||
auto alpha_ = alpha.to<scalar_t>();
|
||||
|
||||
auto A_values = A_32.values();
|
||||
auto B_values = B_32.values();
|
||||
auto C_values = C.values();
|
||||
auto A_values_ptr = A_values.data_ptr<scalar_t>();
|
||||
auto B_values_ptr = B_values.data_ptr<scalar_t>();
|
||||
auto C_values_ptr = C_values.data_ptr<scalar_t>();
|
||||
|
||||
auto handle = at::cuda::getCurrentCUDASparseHandle();
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
|
||||
|
||||
size_t buffer_size;
|
||||
at::cuda::sparse::csrgeam2_bufferSizeExt<scalar_t>(
|
||||
handle,
|
||||
m,
|
||||
n,
|
||||
&alpha_,
|
||||
desc.descriptor(),
|
||||
nnzA,
|
||||
A_values_ptr,
|
||||
A_crow_indices_ptr,
|
||||
A_col_indices_ptr,
|
||||
&beta_,
|
||||
desc.descriptor(),
|
||||
nnzB,
|
||||
B_values_ptr,
|
||||
B_crow_indices_ptr,
|
||||
B_col_indices_ptr,
|
||||
desc.descriptor(),
|
||||
C_values_ptr,
|
||||
C_crow_indices_ptr,
|
||||
C_col_indices_ptr,
|
||||
&buffer_size // output
|
||||
);
|
||||
|
||||
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
|
||||
auto work_data = allocator.allocate(buffer_size);
|
||||
|
||||
int nnzC = -1;
|
||||
at::cuda::sparse::csrgeam2Nnz<scalar_t>(
|
||||
handle,
|
||||
m,
|
||||
n,
|
||||
desc.descriptor(),
|
||||
nnzA,
|
||||
A_crow_indices_ptr,
|
||||
A_col_indices_ptr,
|
||||
desc.descriptor(),
|
||||
nnzB,
|
||||
B_crow_indices_ptr,
|
||||
B_col_indices_ptr,
|
||||
desc.descriptor(),
|
||||
C_crow_indices_ptr,
|
||||
&nnzC,
|
||||
work_data.get());
|
||||
|
||||
// Resize result using nnz information from cusparse
|
||||
col_indices_and_values_resize_(C, nnzC);
|
||||
C_col_indices = C.col_indices();
|
||||
C_values = C.values();
|
||||
|
||||
C_col_indices_ptr = C_col_indices.data_ptr<int>();
|
||||
C_values_ptr = C_values.data_ptr<scalar_t>();
|
||||
|
||||
at::cuda::sparse::csrgeam2<scalar_t>(
|
||||
handle,
|
||||
m,
|
||||
n,
|
||||
&alpha_,
|
||||
desc.descriptor(),
|
||||
nnzA,
|
||||
A_values_ptr,
|
||||
A_crow_indices_ptr,
|
||||
A_col_indices_ptr,
|
||||
&beta_,
|
||||
desc.descriptor(),
|
||||
nnzB,
|
||||
B_values_ptr,
|
||||
B_crow_indices_ptr,
|
||||
B_col_indices_ptr,
|
||||
desc.descriptor(),
|
||||
C_values_ptr,
|
||||
C_crow_indices_ptr,
|
||||
C_col_indices_ptr,
|
||||
work_data.get());
|
||||
});
|
||||
}
|
||||
|
||||
/*
|
||||
Solves a system of linear equations whose coefficients are represented in a sparse triangular matrix A:
|
||||
op(A) X = B.
|
||||
|
||||
Args:
|
||||
* `A` - sparse Tensor of size m × m.
|
||||
* `B` - dense Tensor of size m × nrhs.
|
||||
* `X` - dense Tensor of size m × nrhs.
|
||||
* `upper` - controls whether upper or lower triangular part of A is considered in computations.
|
||||
* `transpose` - if true then op(A) = A^T.
|
||||
* `unitriangular` - if true then the diagonal elements of A are assumed to be one.
|
||||
*/
|
||||
void triangular_solve_out_sparse_csr(
|
||||
const at::sparse_csr::SparseCsrTensor& A,
|
||||
const Tensor& B,
|
||||
const Tensor& X,
|
||||
bool upper,
|
||||
bool transpose,
|
||||
bool unitriangular) {
|
||||
#if !AT_USE_CUSPARSE_GENERIC_SPSV()
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Calling triangular solve on a sparse GPU tensor requires compiling ",
|
||||
"PyTorch with at least CUDA 11.3. ",
|
||||
"Please use PyTorch built with newer CUDA version.");
|
||||
#else
|
||||
if (B.numel() == 0 || X.numel() == 0 || A._nnz() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
c10::MaybeOwned<Tensor> B_ = prepare_dense_matrix_for_cusparse(B);
|
||||
c10::MaybeOwned<Tensor> X_ = prepare_dense_matrix_for_cusparse(X);
|
||||
|
||||
// TODO: update this to support COO sparse layout
|
||||
auto descA = at::cuda::sparse::CuSparseSpMatCsrDescriptor(A);
|
||||
descA.set_mat_fill_mode(upper);
|
||||
descA.set_mat_diag_type(unitriangular);
|
||||
cusparseOperation_t opA = transpose ? CUSPARSE_OPERATION_TRANSPOSE
|
||||
: CUSPARSE_OPERATION_NON_TRANSPOSE;
|
||||
|
||||
if (B.size(-1) == 1) {
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
|
||||
X.scalar_type(), "triangular_solve_out_sparse_csr_cuda_impl", [&] {
|
||||
scalar_t alpha = 1;
|
||||
cudaDataType compute_type = at::cuda::getCudaDataType<scalar_t>();
|
||||
auto handle = at::cuda::getCurrentCUDASparseHandle();
|
||||
size_t buffer_size;
|
||||
|
||||
auto desc_spsv = at::cuda::sparse::CuSparseSpSVDescriptor();
|
||||
auto descB = at::cuda::sparse::CuSparseDnVecDescriptor(*B_);
|
||||
auto descX = at::cuda::sparse::CuSparseDnVecDescriptor(*X_);
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSV_bufferSize(
|
||||
handle,
|
||||
opA,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSV_ALG_DEFAULT,
|
||||
desc_spsv.descriptor(),
|
||||
&buffer_size // output
|
||||
));
|
||||
|
||||
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
|
||||
auto work_data = allocator.allocate(buffer_size);
|
||||
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSV_analysis(
|
||||
handle,
|
||||
opA,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSV_ALG_DEFAULT,
|
||||
desc_spsv.descriptor(),
|
||||
work_data.get()));
|
||||
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSV_solve(
|
||||
handle,
|
||||
opA,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSV_ALG_DEFAULT,
|
||||
desc_spsv.descriptor()));
|
||||
});
|
||||
} else {
|
||||
#if !AT_USE_CUSPARSE_GENERIC_SPSM()
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Calling triangular solve on a sparse GPU tensor requires compiling ",
|
||||
"PyTorch with at least CUDA 11.3.1. ",
|
||||
"Please use PyTorch built with newer CUDA version.");
|
||||
#else
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
|
||||
X.scalar_type(), "triangular_solve_out_sparse_csr_cuda_impl", [&] {
|
||||
scalar_t alpha = 1;
|
||||
cudaDataType compute_type = at::cuda::getCudaDataType<scalar_t>();
|
||||
auto handle = at::cuda::getCurrentCUDASparseHandle();
|
||||
size_t buffer_size;
|
||||
|
||||
// TODO: support mixed memory format
|
||||
IntArrayRef X_strides = X_->strides();
|
||||
IntArrayRef B_strides = B_->strides();
|
||||
auto ndim = X_->dim();
|
||||
bool is_X_row_major = (X_strides[ndim - 1] == 1);
|
||||
bool is_B_row_major = (B_strides[ndim - 1] == 1);
|
||||
TORCH_INTERNAL_ASSERT(is_X_row_major && is_B_row_major);
|
||||
|
||||
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
|
||||
auto desc_spsm = at::cuda::sparse::CuSparseSpSMDescriptor();
|
||||
auto descB = at::cuda::sparse::CuSparseDnMatDescriptor(*B_);
|
||||
auto descX = at::cuda::sparse::CuSparseDnMatDescriptor(*X_);
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSM_bufferSize(
|
||||
handle,
|
||||
opA,
|
||||
opB,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSM_ALG_DEFAULT,
|
||||
desc_spsm.descriptor(),
|
||||
&buffer_size // output
|
||||
));
|
||||
|
||||
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
|
||||
auto work_data = allocator.allocate(buffer_size);
|
||||
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSM_analysis(
|
||||
handle,
|
||||
opA,
|
||||
opB,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSM_ALG_DEFAULT,
|
||||
desc_spsm.descriptor(),
|
||||
work_data.get()));
|
||||
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSM_solve(
|
||||
handle,
|
||||
opA,
|
||||
opB,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSM_ALG_DEFAULT,
|
||||
desc_spsm.descriptor()));
|
||||
});
|
||||
#endif // !AT_USE_CUSPARSE_GENERIC_SPSM()
|
||||
}
|
||||
if (!X.is_same(*X_)) {
|
||||
X.copy_(*X_);
|
||||
}
|
||||
#endif // !AT_USE_CUSPARSE_GENERIC_SPSV()
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace impl
|
||||
} // namespace sparse
|
||||
|
|
|
|||
|
|
@@ -24,6 +24,21 @@ void addmv_out_sparse_csr(
const Scalar& alpha,
const Tensor& result);

void add_out_sparse_csr(
const at::sparse_csr::SparseCsrTensor& mat1,
const at::sparse_csr::SparseCsrTensor& mat2,
const Scalar& alpha,
const Scalar& beta,
const at::sparse_csr::SparseCsrTensor& result);

void triangular_solve_out_sparse_csr(
const at::sparse_csr::SparseCsrTensor& A,
const Tensor& B,
const Tensor& X,
bool upper,
bool transpose,
bool unitriangular);

} // namespace cuda
} // namespace impl
} // namespace sparse
@@ -19,6 +19,7 @@
#include <ATen/cuda/ThrustAllocator.h>
#include <c10/cuda/CUDACachingAllocator.h>

#include <ATen/native/sparse/cuda/SparseBlasImpl.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <ATen/native/sparse/cuda/SparseCUDATensorMath.cuh>

@@ -187,11 +188,16 @@ Tensor& add_out_sparse_csr_cuda(
const Scalar& alpha,
SparseCsrTensor& out) {
if (self.layout() == kStrided) {
return add_out_dense_sparse_csr_cuda(out, self, other, alpha);
add_out_dense_sparse_csr_cuda(out, self, other, alpha);
} else {
TORCH_CHECK(
false,
"NotImplementedError: Addition of sparse CSR tensors is not yet implemented.")
self.sizes().equals(other.sizes()),
"torch.add: Expected input tensors to have the same shape, but got tensor `self` with shape ",
self.sizes(),
" and tensor `other` with shape ",
other.sizes());
at::native::resize_as_sparse_csr_(out, self);
sparse::impl::cuda::add_out_sparse_csr(self, other, Scalar(1), alpha, out);
}
return out;
}
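Note how the scalars are threaded through: torch.add(self, other, alpha=a) computes self + a * other, so the kernel above is invoked with alpha = Scalar(1) for self and beta = alpha for other in its C = alpha * A + beta * B convention. A hedged usage sketch of the path this enables (assumes a CUDA build with cuSPARSE, and that CSR tensors can be moved to the GPU in that build):

  import torch

  a = torch.tensor([[0., 1.], [2., 0.]]).to_sparse_csr().cuda()
  b = torch.tensor([[3., 0.], [0., 4.]]).to_sparse_csr().cuda()
  c = torch.add(a, b, alpha=2.0)  # dispatches to add_out_sparse_csr_cuda above
  # c.to_dense() == [[6., 1.], [2., 8.]]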
@@ -145,6 +145,23 @@ void Context::flush() {
command().pool.purge();
}

void Context::wait(const at::Tensor& src) {
// wait only if Vulkan tensor
if (at::kVulkan == src.device().type()) {
api::Command::Pool& command_pool = command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();

using Future = ops::vTensor::Future<const void, ops::vTensor::Access::Read>;
const ops::vTensor& v_src = ops::convert(src);
const Future v_src_future = v_src.host<const void>(command_buffer);

// This wait() is a no-op if data is not out of sync. More often than
// not though, waits here are expected as the GPU catches up with
// compute submitted from CPU.
v_src_future.wait();
}
}

bool available() {
return context();
}
@@ -56,6 +56,10 @@ class Context final {

void flush();

// Use this function only for debugging and testing when you want to make sure
// all GPU operations get finished before calling flush(). Otherwise, it may crash.
void wait(const at::Tensor& src);

 private:
VkDevice device();
VkQueue queue();
@@ -12,17 +12,23 @@ void copy_texture_to_texture(
api::Command::Buffer& command_buffer,
api::Resource::Image::Object& src_image,
api::Resource::Image::Object& dst_image,
api::utils::uvec3 src_extents,
api::utils::uvec3 copy_extents,
api::utils::uvec3 src_offset,
api::utils::uvec3 dst_offset) {
VkImageCopy copy_info{};
copy_info.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
copy_info.srcSubresource.layerCount = 1;
copy_info.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
copy_info.dstSubresource.layerCount = 1;
copy_info.extent.width = src_extents.data[0u];
copy_info.extent.height = src_extents.data[1u];
copy_info.extent.depth = src_extents.data[2u];
copy_info.extent.width = copy_extents.data[0u];
copy_info.extent.height = copy_extents.data[1u];
copy_info.extent.depth = copy_extents.data[2u];
copy_info.srcOffset.x = src_offset.data[0u];
copy_info.srcOffset.y = src_offset.data[1u];
copy_info.srcOffset.z = src_offset.data[2u];
copy_info.dstOffset.x = dst_offset.data[0u];
copy_info.dstOffset.y = dst_offset.data[1u];
copy_info.dstOffset.z = dst_offset.data[2u];

// To use vkCmdCopyImage, the stage of src & dst image must be set to vTensor::Stage::Transfer.
vkCmdCopyImage(
@@ -18,7 +18,8 @@ void copy_texture_to_texture(
api::Command::Buffer& command_buffer,
api::Resource::Image::Object& src_image,
api::Resource::Image::Object& dst_image,
api::utils::uvec3 src_extents,
api::utils::uvec3 copy_extents,
api::utils::uvec3 src_offset,
api::utils::uvec3 dst_offset);

} // namespace utils
@ -96,6 +96,61 @@ Tensor cat_feature(const TensorList tensors, vTensor& v_output) {
|
|||
return convert(v_output);
|
||||
}
|
||||
|
||||
Tensor cat_feature_mult4ch(const TensorList tensors, vTensor& v_output) {
|
||||
api::Context* const context = api::context();
|
||||
api::Command::Pool& command_pool = context->command().pool;
|
||||
api::Command::Buffer& command_buffer = command_pool.stream();
|
||||
|
||||
int64_t depth_size_allprior = 0;
|
||||
int64_t ch_interval = 0;
|
||||
for (const auto& tensor : tensors) {
|
||||
ch_interval += tensor.sizes()[1];
|
||||
}
|
||||
const int64_t depth_interval = ch_interval / 4;
|
||||
|
||||
auto dst_image = v_output.image(
|
||||
command_buffer,
|
||||
vTensor::Stage::Transfer,
|
||||
vTensor::Access::Write);
|
||||
uvec3 src_offset{};
|
||||
uvec3 dst_offset{};
|
||||
|
||||
for (const auto& tensor : tensors) {
|
||||
const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan();
|
||||
const vTensor& v_self = convert(self);
|
||||
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
|
||||
auto src_image = v_self.image(
|
||||
command_buffer,
|
||||
vTensor::Stage::Transfer);
|
||||
|
||||
const uint32_t depth_slice = safe_downcast<uint32_t>(tensor.sizes()[1] / 4);
|
||||
uvec3 copy_extents {v_self.extents().data[0u],
|
||||
v_self.extents().data[1u],
|
||||
depth_slice};
|
||||
|
||||
for (int b = 0; b < tensor.sizes()[0]; ++b) {
|
||||
src_offset.data[2u] = safe_downcast<uint32_t>(depth_slice * b);
|
||||
dst_offset.data[2u] = depth_size_allprior + safe_downcast<uint32_t>(depth_interval * b);
|
||||
api::helper::copy_texture_to_texture(command_buffer,
|
||||
src_image,
|
||||
dst_image,
|
||||
copy_extents,
|
||||
src_offset,
|
||||
dst_offset);
|
||||
}
|
||||
|
||||
depth_size_allprior += depth_slice;
|
||||
}
|
||||
else {
|
||||
TORCH_CHECK(false, "Not implemented!");
|
||||
}
|
||||
}
|
||||
|
||||
command_pool.submit(context->gpu().queue, command_buffer);
|
||||
|
||||
return convert(v_output);
|
||||
}
|
||||
|
||||
Tensor cat_width(const TensorList tensors, vTensor& v_output) {
|
||||
TORCH_CHECK(false, "Vulkan cat not implemented for width dimension!");
|
||||
}
|
||||
|
|
@ -110,6 +165,7 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) {
|
|||
vTensor::Stage::Transfer,
|
||||
vTensor::Access::Write);
|
||||
|
||||
uvec3 src_offset{};
|
||||
uvec3 dst_offset{};
|
||||
for (const auto& tensor : tensors) {
|
||||
const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan();
|
||||
|
|
@ -123,6 +179,7 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) {
|
|||
src_image,
|
||||
dst_image,
|
||||
v_self.extents(),
|
||||
src_offset,
|
||||
dst_offset);
|
||||
|
||||
// Increment by height
|
||||
|
|
@ -148,11 +205,16 @@ Tensor cat(
|
|||
|
||||
at::Tensor tensor = tensors[0];
|
||||
int64_t cat_dim_size = 0;
|
||||
bool is_mult4ch = true;
|
||||
|
||||
for (const auto & t : tensors) {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
t.dim() == 4, "Vulkan cat expects 4 dimensional inputs");
|
||||
|
||||
if (t.sizes()[1] % 4 != 0) {
|
||||
is_mult4ch = false;
|
||||
}
|
||||
|
||||
for (int d = 0; d < 4; ++d) {
|
||||
if (d == dim) {
|
||||
continue;
|
||||
|
|
@@ -179,6 +241,9 @@ Tensor cat(
return cat_height(tensors, v_output);
}
else if (dim == 1) {
if (is_mult4ch) {
return cat_feature_mult4ch(tensors, v_output);
}
return cat_feature(tensors, v_output);
}
return cat_batch(tensors, v_output);
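The dispatch above routes channel-dimension concatenation to the new cat_feature_mult4ch path whenever every input's channel count is a multiple of 4 (the is_mult4ch check earlier in this file's hunk). A hedged usage sketch (requires a Vulkan-enabled build; shapes are illustrative):

  import torch

  a = torch.rand(1, 4, 8, 8)
  b = torch.rand(1, 8, 8, 8)
  # both channel counts are multiples of 4, so the mult4ch fast path is taken
  out = torch.cat([a.vulkan(), b.vulkan()], dim=1).cpu()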