Update on "Add type refinement for Union"
Differential Revision: [D28572811](https://our.internmc.facebook.com/intern/diff/D28572811) [ghstack-poisoned]
This commit is contained in: commit b3ece930a1
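For context, "type refinement for Union" refers to TorchScript narrowing a Union-typed value after an isinstance check. A minimal sketch of the user-facing behavior (hypothetical example, not taken from this diff; requires a PyTorch build where Union support in TorchScript has landed):

    from typing import Union
    import torch

    @torch.jit.script
    def describe(x: Union[int, str]) -> str:
        # After the isinstance check, the compiler refines `x` from
        # Union[int, str] to int in this branch, so int-only ops are allowed.
        if isinstance(x, int):
            return str(x + 1)
        # In the else branch, `x` is refined to str.
        return x.upper()

    print(describe(3), describe("abc"))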
.bazelrc (6 changed lines)

@@ -3,7 +3,11 @@ build --copt=-I.
build --copt=-isystem --copt bazel-out/k8-fastbuild/bin

# Configuration to disable tty features for environments like CI
build:no-tty --curses no
build:no-tty --progress_report_interval 10
build:no-tty --show_progress_rate_limit 10

# Configuration to build with GPU support
build:gpu --define=cuda=true
# define a separate build folder for faster switching between configs
build:gpu --platform_suffix=-gpu
@@ -14,6 +14,9 @@ chmod +x "$build_script"
# Build
cat >"$build_script" <<EOL
export PATH="$workdir/miniconda/bin:$PATH"
if [[ "$CIRCLE_BRANCH" == "nightly" ]]; then
export USE_PYTORCH_METAL_EXPORT=1
fi
if [[ "$PACKAGE_TYPE" == conda ]]; then
"$workdir/builder/conda/build_pytorch.sh"
else
@@ -10,18 +10,27 @@ pt_checkout="/var/lib/jenkins/workspace"
# Since we're cat-ing this file, we need to escape all $'s
echo "cpp_doc_push_script.sh: Invoked with $*"

# Argument 1: Where to copy the built documentation for Python API to
# (pytorch.github.io/$install_path)
install_path="$1"
if [ -z "$install_path" ]; then
echo "error: cpp_doc_push_script.sh: install_path (arg1) not specified"
# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
# the order of operations goes:
# 1. Check if there's an argument $1
# 2. If no argument check for environment var DOCS_INSTALL_PATH
# 3. If no environment var fall back to default 'docs/'

# NOTE: It might seem weird to gather the second argument before gathering the first argument
# but since DOCS_INSTALL_PATH can be derived from DOCS_VERSION it's probably better to
# try and gather it first, just so we don't potentially break people who rely on this script
# Argument 2: What version of the Python API docs we are building.
version="${2:-${DOCS_VERSION:-master}}"
if [ -z "$version" ]; then
echo "error: cpp_doc_push_script.sh: version (arg2) not specified"
exit 1
fi

# Argument 2: What version of the Python API docs we are building.
version="$2"
if [ -z "$version" ]; then
echo "error: cpp_doc_push_script.sh: version (arg2) not specified"
# Argument 1: Where to copy the built documentation for Python API to
# (pytorch.github.io/$install_path)
install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
if [ -z "$install_path" ]; then
echo "error: cpp_doc_push_script.sh: install_path (arg1) not specified"
exit 1
fi
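The comment block added above documents the `${1:-${DOCS_INSTALL_PATH:-docs/}}` fallback chain. For readers less familiar with bash parameter expansion, the same precedence expressed as a small Python sketch (illustrative helper, not part of the repository):

    import os
    import sys

    def resolve(arg_index: int, env_var: str, default: str) -> str:
        """Positional argument wins, then the environment variable, then the default."""
        args = sys.argv[1:]
        if arg_index < len(args) and args[arg_index]:
            return args[arg_index]
        return os.environ.get(env_var) or default

    # Mirrors version="${2:-${DOCS_VERSION:-master}}" and
    # install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
    version = resolve(1, "DOCS_VERSION", "master")
    install_path = resolve(0, "DOCS_INSTALL_PATH", f"docs/{version}")
    print(version, install_path)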
@@ -13,18 +13,27 @@ echo "python_doc_push_script.sh: Invoked with $*"

set -ex

# Argument 1: Where to copy the built documentation to
# (pytorch.github.io/$install_path)
install_path="$1"
if [ -z "$install_path" ]; then
echo "error: python_doc_push_script.sh: install_path (arg1) not specified"
# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
# the order of operations goes:
# 1. Check if there's an argument $1
# 2. If no argument check for environment var DOCS_INSTALL_PATH
# 3. If no environment var fall back to default 'docs/'

# NOTE: It might seem weird to gather the second argument before gathering the first argument
# but since DOCS_INSTALL_PATH can be derived from DOCS_VERSION it's probably better to
# try and gather it first, just so we don't potentially break people who rely on this script
# Argument 2: What version of the docs we are building.
version="${2:-${DOCS_VERSION:-master}}"
if [ -z "$version" ]; then
echo "error: python_doc_push_script.sh: version (arg2) not specified"
exit 1
fi

# Argument 2: What version of the docs we are building.
version="$2"
if [ -z "$version" ]; then
echo "error: python_doc_push_script.sh: version (arg2) not specified"
# Argument 1: Where to copy the built documentation to
# (pytorch.github.io/$install_path)
install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
if [ -z "$install_path" ]; then
echo "error: python_doc_push_script.sh: install_path (arg1) not specified"
exit 1
fi

@@ -34,7 +43,7 @@ if [ "$version" == "master" ]; then
fi

# Argument 3: The branch to push to. Usually is "site"
branch="$3"
branch="${3:-${DOCS_BRANCH:-site}}"
if [ -z "$branch" ]; then
echo "error: python_doc_push_script.sh: branch (arg3) not specified"
exit 1
.github/scripts/generate_ci_workflows.py (vendored, 6 changed lines)

@@ -148,6 +148,8 @@ class CIWorkflow:
enable_nogpu_no_avx_test: YamlShellBool = "''"
enable_nogpu_no_avx2_test: YamlShellBool = "''"
enable_slow_test: YamlShellBool = "''"
enable_docs_test: YamlShellBool = "''"
enable_backwards_compat_test: YamlShellBool = "''"

def __post_init__(self) -> None:
if self.is_libtorch:

@@ -266,6 +268,8 @@ LINUX_WORKFLOWS = [
test_runner_type=LINUX_CPU_TEST_RUNNER,
on_pull_request=True,
enable_doc_jobs=True,
enable_docs_test=1,
enable_backwards_compat_test=1,
num_test_shards=2,
ciflow_config=CIFlowConfig(
enabled=True,

@@ -497,7 +501,7 @@ BAZEL_WORKFLOWS = [
CIWorkflow(
arch="linux",
build_environment="linux-xenial-py3.6-gcc7-bazel-test",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
test_runner_type=LINUX_CPU_TEST_RUNNER,
on_pull_request=True,
ciflow_config=CIFlowConfig(
@@ -55,6 +55,10 @@ def main() -> None:
configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
if os.getenv('ENABLE_SLOW_TEST'):
configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
if os.getenv('ENABLE_DOCS_TEST'):
configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'):
configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
matrix = {
'include': [
{
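The hunk above extends a test-matrix generator so extra configs are appended when the corresponding ENABLE_* environment variables are set. A rough sketch of how such a matrix ends up shaped (the exact keys and runner values in the real script may differ):

    import os

    TEST_RUNNER_TYPE = "linux.2xlarge"  # placeholder value for illustration

    configs = {}
    if os.getenv("ENABLE_DOCS_TEST"):
        configs["docs_test"] = {"num_shards": 1, "runner": TEST_RUNNER_TYPE}
    if os.getenv("ENABLE_BACKWARDS_COMPAT_TEST"):
        configs["backwards_compat"] = {"num_shards": 1, "runner": TEST_RUNNER_TYPE}

    # Each config becomes one entry per shard in the GitHub Actions matrix.
    matrix = {
        "include": [
            {"config": name, "shard": shard, "num_shards": cfg["num_shards"], "runner": cfg["runner"]}
            for name, cfg in configs.items()
            for shard in range(1, cfg["num_shards"] + 1)
        ]
    }
    print(matrix)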
.github/templates/linux_ci_workflow.yml.j2 (vendored, 45 changed lines)

@@ -254,6 +254,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }}
ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }}
ENABLE_SLOW_TEST: !{{ enable_slow_test }}
ENABLE_DOCS_TEST: !{{ enable_docs_test }}
ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }}
NUM_TEST_SHARDS: !{{ num_test_shards }}
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge

@@ -346,7 +348,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: !{{ build_environment }}-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

@@ -441,11 +442,15 @@ jobs:
{% endblock %}
{%- endif -%}
{%- if enable_doc_jobs %}
pytorch_python_doc_build:
pytorch_doc_build:
runs-on: linux.2xlarge
strategy:
matrix:
docs_type: [cpp, python]
needs: [calculate-docker-image, build, !{{ ciflow_config.root_job_name }}]
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
DOCS_TYPE: ${{ matrix.docs_type }}
steps:
- name: Log in to ECR
run: |

@@ -483,7 +488,7 @@ jobs:
- name: Unzip artifacts
run: |
unzip -o artifacts.zip
- name: Build Python Doc in Docker
- name: Build ${{ matrix.docs_type }} docs
run: |
set -ex
time docker pull "${DOCKER_IMAGE}" > /dev/null

@@ -496,6 +501,8 @@ jobs:
-e IN_CI \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e CIRCLE_SHA1="$GITHUB_SHA" \
-e DOCS_VERSION="${target}" \
-e DOCS_TYPE \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \

@@ -505,34 +512,36 @@ jobs:
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site"
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh"
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be
if: ${{ github.event_name == 'pull_request' }}
name: Upload Docs Preview
- uses: seemethere/upload-artifact-s3@v3
name: Upload Python Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }}
with:
name: deploy
retention-days: 14
if-no-files-found: error
path: pytorch.github.io/docs/merge
- name: Show Docs Preview URL (Click Me)
if: ${{ github.event_name == 'pull_request' }}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/"
path: pytorch.github.io/docs/merge/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}
- uses: seemethere/upload-artifact-s3@v3
name: Upload C++ Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }}
with:
retention-days: 14
if-no-files-found: error
path: cppdocs/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}/cppdocs
- name: Archive artifacts into zip
run: |
zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io"
zip -r "docs_${DOCS_TYPE}.zip" "${GITHUB_WORKSPACE}/pytorch.github.io" "${GITHUB_WORKSPACE}/cppdocs"
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: pytorch_github_io
name: docs_${{ matrix.docs_type }}
path: docs_${{ matrix.docs_type }}.zip
if-no-files-found: error
path: pytorch_github_io.zip
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml (generated, vendored, 3 changed lines)

@@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge

@@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then
.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml (generated, vendored, 3 changed lines)

@@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge

@@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-bionic-py3.8-gcc9-coverage-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then
.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml (generated, vendored, 3 changed lines)

@@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: 1
ENABLE_NOGPU_NO_AVX2_TEST: 1
ENABLE_SLOW_TEST: 1
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge

@@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then
.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml (generated, vendored, 3 changed lines)

@@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge

@@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then
.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml (generated, vendored, 45 changed lines)

@@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: 1
ENABLE_BACKWARDS_COMPAT_TEST: 1
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge

@@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc5.4-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

@@ -429,11 +430,15 @@ jobs:
# Prune all of the docker images
docker system prune -af

pytorch_python_doc_build:
pytorch_doc_build:
runs-on: linux.2xlarge
strategy:
matrix:
docs_type: [cpp, python]
needs: [calculate-docker-image, build, ciflow_should_run]
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
DOCS_TYPE: ${{ matrix.docs_type }}
steps:
- name: Log in to ECR
run: |

@@ -474,7 +479,7 @@ jobs:
- name: Unzip artifacts
run: |
unzip -o artifacts.zip
- name: Build Python Doc in Docker
- name: Build ${{ matrix.docs_type }} docs
run: |
set -ex
time docker pull "${DOCKER_IMAGE}" > /dev/null

@@ -487,6 +492,8 @@ jobs:
-e IN_CI \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e CIRCLE_SHA1="$GITHUB_SHA" \
-e DOCS_VERSION="${target}" \
-e DOCS_TYPE \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \

@@ -496,34 +503,36 @@ jobs:
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site"
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh"
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be
if: ${{ github.event_name == 'pull_request' }}
name: Upload Docs Preview
- uses: seemethere/upload-artifact-s3@v3
name: Upload Python Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }}
with:
name: deploy
retention-days: 14
if-no-files-found: error
path: pytorch.github.io/docs/merge
- name: Show Docs Preview URL (Click Me)
if: ${{ github.event_name == 'pull_request' }}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/"
path: pytorch.github.io/docs/merge/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}
- uses: seemethere/upload-artifact-s3@v3
name: Upload C++ Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }}
with:
retention-days: 14
if-no-files-found: error
path: cppdocs/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}/cppdocs
- name: Archive artifacts into zip
run: |
zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io"
zip -r "docs_${DOCS_TYPE}.zip" "${GITHUB_WORKSPACE}/pytorch.github.io" "${GITHUB_WORKSPACE}/cppdocs"
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: pytorch_github_io
name: docs_${{ matrix.docs_type }}
path: docs_${{ matrix.docs_type }}.zip
if-no-files-found: error
path: pytorch_github_io.zip
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml (generated, vendored, 2 changed lines)

@@ -15,7 +15,7 @@ on:

env:
BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7-bazel-test
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml (generated, vendored, 3 changed lines)

@@ -228,6 +228,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge

@@ -323,7 +325,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then
@@ -224,7 +224,11 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then

get_bazel

# first build the whole torch for CPU-only
tools/bazel build --config=no-tty :torch
# then build selected set of targets with GPU-support.
# TODO: eventually this should converge to building the whole :torch with GPU-support
tools/bazel build --config=no-tty --config=gpu :c10
else
# check that setup.py would fail with bad arguments
echo "The next three invocations are expected to fail with invalid command error messages."
@@ -494,6 +494,10 @@ test_torch_deploy() {
assert_git_not_dirty
}

test_docs_test() {
.jenkins/pytorch/docs-test.sh
}

if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")

@@ -517,7 +521,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || "$
test_without_numpy
install_torchvision
test_python_shard1
test_distributed
test_aten
elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || "${SHARD_NUMBER}" == 2 ]]; then
install_torchvision

@@ -533,6 +536,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
elif [[ "${BUILD_ENVIRONMENT}" == *distributed* ]]; then
test_distributed
test_rpc
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
else
install_torchvision
install_monkeytype
@@ -214,6 +214,7 @@ option(USE_LMDB "Use LMDB" OFF)
option(USE_MAGMA "Use MAGMA" ON)
option(USE_METAL "Use Metal for Caffe2 iOS build" ON)
option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(
USE_MLCOMPUTE "Use ML Compute for macOS build" ON

@@ -688,6 +689,10 @@ if(USE_PYTORCH_METAL)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL")
endif()

if(USE_PYTORCH_METAL_EXPORT)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL_EXPORT")
endif()

if(USE_SOURCE_DEBUG_ON_MOBILE)
string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
endif()
WORKSPACE (13 changed lines)

@@ -1,7 +1,7 @@
workspace(name = "pytorch")

load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("//tools/rules:workspace.bzl", "new_patched_local_repository")
load("//tools/rules:workspace.bzl", "new_patched_local_repository", "new_empty_repository")

http_archive(
name = "bazel_skylib",

@@ -170,3 +170,14 @@ protobuf_deps()
load("@rules_python//python:repositories.bzl", "py_repositories")

py_repositories()

local_repository(
name = "local_config_cuda",
path = "third_party/tensorflow_cuda_bazel_build",
)

# Wrapper to expose local_config_cuda in an agnostic way
new_empty_repository(
name = "cuda",
build_file = "//third_party:cuda.BUILD",
)
@@ -167,13 +167,12 @@ else()
endif()

# Metal
if(USE_PYTORCH_METAL)
if(APPLE)
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${native_metal_srcs})
else()
# Add files needed from optimized_for_mobile
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${metal_prepack_cpp})
endif()
if(USE_PYTORCH_METAL_EXPORT)
# Add files needed from exporting metal models(optimized_for_mobile)
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${metal_prepack_cpp})
elseif(APPLE AND USE_PYTORCH_METAL)
# Compile Metal kernels
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${native_metal_srcs})
else()
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp})
endif()

@@ -450,13 +449,21 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS})
if(NOT INTERN_BUILD_MOBILE)
list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${miopen_h})
# Metal
if(USE_PYTORCH_METAL_EXPORT)
# Add files needed from exporting metal models(optimized_for_mobile)
list(APPEND INSTALL_HEADERS ${metal_h} ${metal_prepack_h})
elseif(APPLE AND USE_PYTORCH_METAL)
# Needed by Metal kernels
list(APPEND INSTALL_HEADERS ${metal_h} ${native_metal_h})
else()
list(APPEND INSTALL_HEADERS ${metal_h})
endif()
else()
if(USE_PYTORCH_METAL)
if(IOS)
if(IOS AND USE_PYTORCH_METAL)
list(APPEND INSTALL_HEADERS ${metal_h} ${native_metal_h})
else()
else()
list(APPEND INSTALL_HEADERS ${metal_h} ${metal_prepack_h})
endif()
endif()
endif()
@@ -10,7 +10,7 @@ namespace at {
struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
// Constructors
CPUGeneratorImpl(uint64_t seed_in = default_rng_seed_val);
~CPUGeneratorImpl() = default;
~CPUGeneratorImpl() override = default;

// CPUGeneratorImpl methods
std::shared_ptr<CPUGeneratorImpl> clone() const;
@@ -7,6 +7,9 @@
namespace at {

static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar=true) {
// if dim_post_expr is 0 and wrap_scalar is true, then dim must be in the range [-1, 0].
// This is a special case for scalar tensors and manifests in e.g. torch.sum(scalar_tensor, 0)
// Otherwise, dim should be in the range [-dim_post_expr, dim_post_expr-1].
return c10::maybe_wrap_dim(dim, dim_post_expr, wrap_scalar);
}
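The added comment spells out the dimension-wrapping rule; here is a small Python rendering of that rule, for illustration only:

    def maybe_wrap_dim(dim: int, dim_post_expr: int, wrap_scalar: bool = True) -> int:
        # Scalar tensors are treated as 1-d for wrapping purposes, so dim may be -1 or 0.
        if dim_post_expr == 0:
            if not wrap_scalar:
                raise IndexError(f"dimension specified as {dim} but tensor has no dimensions")
            dim_post_expr = 1
        if dim < -dim_post_expr or dim > dim_post_expr - 1:
            raise IndexError(f"dim {dim} out of range [{-dim_post_expr}, {dim_post_expr - 1}]")
        return dim + dim_post_expr if dim < 0 else dim

    assert maybe_wrap_dim(-1, 3) == 2
    assert maybe_wrap_dim(0, 0) == 0   # scalar case: valid range is [-1, 0]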
@@ -123,7 +123,7 @@ struct BuiltinOpFunction : public Function {
return *this;
}

~BuiltinOpFunction() {}
~BuiltinOpFunction() override {}

private:
c10::QualifiedName name_;
@@ -344,6 +344,10 @@ public:
c10::Dispatcher::singleton().callBoxed(*this, stack);
}

void callBoxed(Stack& stack) const {
callBoxed(&stack);
}

void redispatchBoxed(DispatchKeySet ks, Stack* stack) const {
c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack);
}
@@ -34,6 +34,9 @@ struct Argument {
default_value_(std::move(default_value)),
kwarg_only_(kwarg_only),
alias_info_(std::move(alias_info)) {
// this is an softly-enforced invariant for out arguments.
bool is_alias = alias_info_.has_value() && alias_info_.value().isWrite();
is_out_ = kwarg_only_ && is_alias;
}
const std::string& name() const {
return name_;

@@ -50,6 +53,11 @@ struct Argument {
bool kwarg_only() const {
return kwarg_only_;
}

bool is_out() const {
return is_out_;
}

const c10::optional<AliasInfo>& alias_info() const {
return alias_info_;
}

@@ -116,6 +124,8 @@ struct Argument {
// is this only specifiable as a keyword argument?
bool kwarg_only_;
c10::optional<AliasInfo> alias_info_;
// marks if the argument is out variant of the schema
bool is_out_;
};

inline bool operator==(const Argument& lhs, const Argument& rhs) {
@@ -51,6 +51,16 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
return out;
}

inline size_t findFirstOutArg(const std::vector<Argument>& args) {
// find the start of out args in the schema
for (size_t out_start_idx = 0; out_start_idx < args.size(); out_start_idx++) {
if (args.at(out_start_idx).is_out()) {
return out_start_idx;
}
}
return args.size();
}

inline bool Argument::isBackwardCompatibleWith(
const Argument& old,
std::ostream* why_not) const {

@@ -121,17 +131,20 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
}
}

// Make sure that all the old arguments have their corresponding backward
// compatible arguments in this schema.
for (size_t i = 0; i < old.arguments().size(); ++i) {
// we want to test both out and default args seperately
size_t old_out_start_idx = findFirstOutArg(old.arguments());
size_t new_out_start_idx = findFirstOutArg(arguments());

// make sure among the default args, they are backward compatible
for (size_t i = 0; i < old_out_start_idx; i++) {
if (!arguments().at(i).isBackwardCompatibleWith(
old.arguments().at(i), why_not)) {
return false;
}
}

// Validate that all new arguments provided a default value.
for (size_t i = old.arguments().size(); i < arguments().size(); ++i) {
// // Validate that all new arguments provided has a default value
for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) {
if (!arguments().at(i).default_value()) {
if (why_not) {
*why_not

@@ -144,6 +157,15 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
}
}

// now compare the out args
for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) {
if (!arguments()
.at(i - old_out_start_idx + new_out_start_idx)
.isBackwardCompatibleWith(old.arguments().at(i), why_not)) {
return false;
}
}

return true;
}
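The new findFirstOutArg helper splits a schema's arguments into regular and out arguments so the three checks can treat them separately. A simplified Python sketch of the same flow (Argument here is a stand-in, not the real C++ type):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Argument:
        name: str
        is_out: bool = False
        default_value: Optional[object] = None

        def is_backward_compatible_with(self, old: "Argument") -> bool:
            # The real check also compares types, defaults and alias info.
            return self.name == old.name and self.is_out == old.is_out

    def find_first_out_arg(args):
        for i, a in enumerate(args):
            if a.is_out:
                return i
        return len(args)

    def is_backward_compatible(new_args, old_args) -> bool:
        old_out = find_first_out_arg(old_args)
        new_out = find_first_out_arg(new_args)
        # 1. Existing (non-out) arguments must stay compatible.
        if any(not new_args[i].is_backward_compatible_with(old_args[i]) for i in range(old_out)):
            return False
        # 2. Newly added non-out arguments need a default value.
        if any(new_args[i].default_value is None for i in range(old_out, new_out)):
            return False
        # 3. Out arguments are compared position-for-position at the tail.
        return all(
            new_args[i - old_out + new_out].is_backward_compatible_with(old_args[i])
            for i in range(old_out, len(old_args))
        )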
@@ -1,6 +1,9 @@
#pragma once

#include <type_traits>

#include <ATen/core/ivalue.h>
#include <c10/util/Deprecated.h>

// TODO move this to c10 namespace

@@ -9,7 +12,42 @@ namespace jit {

using c10::IValue;
using Stack = std::vector<IValue>;
using Operation = std::function<void(Stack*)>;

class Operation {
template <typename F, typename Arg>
using accepts = std::is_constructible<std::function<void(Arg)>, F&&>;

public:
template <typename F,
std::enable_if_t<accepts<F, Stack*>::value, int> = 0>
C10_DEPRECATED_MESSAGE("Please use void(Stack&) to register operator instead.")
Operation(F&& raw): op_([raw = std::forward<F>(raw)](Stack& stack) {
raw(&stack);
}) {}

template <typename F,
std::enable_if_t<accepts<F, Stack&>::value &&
!std::is_same<std::decay_t<F>, Operation>::value, int> = 0>
Operation(F&& op): op_(std::forward<F>(op)) {}

Operation(std::nullptr_t) noexcept {}

explicit operator bool() const noexcept {
return op_ ? true : false;
}

void operator()(Stack& stack) {
op_(stack);
}

template <typename T>
T* target() noexcept {
return op_.target<T>();
}

private:
std::function<void(Stack&)> op_;
};

// An operation with N inputs and M outputs pops the last N inputs off
// the stack and pushes its M inputs onto the stack
@@ -459,9 +459,10 @@ TORCH_IMPL_FUNC(nll_loss_backward_out_cpu)

Tensor cross_entropy_loss_prob_target(
const Tensor& self,
const Tensor& target,
const Tensor& target_,
const Tensor& weight,
int64_t reduction) {
int64_t reduction,
double label_smoothing) {
const auto n_classes = self.size(1);
TORCH_CHECK(
!weight.defined() || (weight.dim() == 1 && weight.numel() == n_classes),

@@ -472,6 +473,15 @@ Tensor cross_entropy_loss_prob_target(
weight.sizes());

auto input = at::log_softmax(self, 1, self.scalar_type());
Tensor target;

if (label_smoothing > 0.0) {
TORCH_CHECK(label_smoothing <= 1.0, "label_smoothing must be between 0.0 and 1.0. Got: ", label_smoothing);
target = target_ * (1 - label_smoothing) + label_smoothing / n_classes;
} else {
target = target_;
}

if (weight.defined()) {
// Expand weight to the correct number of dims for broadcasting with input / target
auto weight_broadcast_shape = SmallBuffer<int64_t, 5>(input.dim());

@@ -503,12 +513,66 @@ Tensor cross_entropy_loss_prob_target(
}
}

Tensor cross_entropy_loss_label_smoothing(
const Tensor& self,
const Tensor& target,
const Tensor& weight,
int64_t reduction,
int64_t ignore_index,
double label_smoothing) {

auto input = at::log_softmax(self, 1, self.scalar_type());
auto nllloss = at::nll_loss_nd(input, target, weight, reduction, ignore_index);

auto n_classes = input.size(1);

Tensor smooth_loss;
if (weight.defined()) {
// Expand weight to the correct number of dims for broadcasting with input / target
auto weight_broadcast_shape = SmallBuffer<int64_t, 5>(input.dim());
std::fill(weight_broadcast_shape.begin(), weight_broadcast_shape.end(), 1);
weight_broadcast_shape[1] = weight.size(0);
Tensor weight_ = weight.view(weight_broadcast_shape);

smooth_loss = -(input * weight_).sum(1);
} else {
smooth_loss = -input.sum(1);
}

if (ignore_index >= 0) {
smooth_loss.index_put_({target == ignore_index}, 0.0);
}

Tensor ret;
switch (reduction) {
case Reduction::Mean:
if (weight.defined()) {
// TODO: This code can path can be removed if #61309 is resolved
// loss is normalized by the weights to be consistent with nll_loss_nd
ret = smooth_loss.sum() / weight.gather(0, target.flatten()).sum();
} else {
ret = smooth_loss.mean();
}
break;
case Reduction::Sum:
ret = smooth_loss.sum();
break;
case Reduction::None:
ret = smooth_loss;
break;
default:
TORCH_CHECK(false, "Invalid reduction type encountered in cross_entropy: ", reduction);
}
return (1 - label_smoothing) * nllloss + ret * (label_smoothing / n_classes);
}

Tensor cross_entropy_loss(
const Tensor& self,
const Tensor& target,
const c10::optional<Tensor>& weight,
int64_t reduction,
int64_t ignore_index) {
int64_t ignore_index,
double label_smoothing) {
Tensor ret;
if (self.sizes() == target.sizes()) {
// Assume soft targets when input and target shapes are the same

@@ -519,7 +583,14 @@ Tensor cross_entropy_loss(
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight);
const Tensor& weight_ = *weight_maybe_owned;
ret = cross_entropy_loss_prob_target(self, target, weight_, reduction);
ret = cross_entropy_loss_prob_target(self, target, weight_, reduction, label_smoothing);
} else if (label_smoothing > 0.0) {
TORCH_CHECK(label_smoothing <= 1.0, "label_smoothing must be between 0.0 and 1.0. Got: ", label_smoothing);

// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight);
const Tensor& weight_ = *weight_maybe_owned;
ret = cross_entropy_loss_label_smoothing(self, target, weight_, reduction, ignore_index, label_smoothing);
} else {
ret = at::nll_loss_nd(
at::log_softmax(self, 1, self.scalar_type()),
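The label-smoothing path above mixes the hard-target NLL term with a uniform-distribution term and combines them as (1 - label_smoothing) * nll + (label_smoothing / n_classes) * smooth. A small PyTorch sketch of that formula (assumes class-index targets, mean reduction, no class weights):

    import torch
    import torch.nn.functional as F

    def cross_entropy_with_smoothing(logits, target, label_smoothing=0.1):
        n_classes = logits.size(1)
        log_probs = F.log_softmax(logits, dim=1)
        nll = F.nll_loss(log_probs, target)        # standard hard-target term
        smooth = -log_probs.sum(dim=1).mean()      # uniform-target term
        return (1 - label_smoothing) * nll + (label_smoothing / n_classes) * smooth

    logits = torch.randn(4, 10)
    target = torch.randint(0, 10, (4,))
    print(cross_entropy_with_smoothing(logits, target))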
@@ -1,90 +1,17 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
#include <tuple>
#include <ATen/native/cpu/MaxUnpoolKernel.h>

namespace at {
namespace native {

template <typename scalar_t>
Tensor max_unpooling2d_forward_out_cpu_frame(
Tensor& output,
const Tensor& input,
const Tensor& indices,
int64_t oheight,
int64_t owidth) {
int64_t numBatch = 1;
int64_t dimc = 0;
int64_t dimh = 1;
int64_t dimw = 2;
if (input.ndimension() == 4) {
numBatch = input.size(0);
dimc++;
dimh++;
dimw++;
}
int64_t numChannels = input.size(dimc);
int64_t inputHeight = input.size(dimh);
int64_t inputWidth = input.size(dimw);

auto* rawInput = input.data_ptr<scalar_t>();
auto* rawIndices = indices.data_ptr<int64_t>();
auto* rawOutput = output.data_ptr<scalar_t>();

at::internal::lazy_init_num_threads();

for (int64_t n = 0; n < numBatch; n++) {
int64_t nOutputOffset = n * numChannels * owidth * oheight;
int64_t nInputOffset = n * numChannels * inputWidth * inputHeight;
int64_t k = 0;
bool has_error = false;
int64_t error_index = 0;
#pragma omp parallel for private(k)
for (k = 0; k < numChannels; k++) {
int64_t finalOutputOffset = nOutputOffset + k * owidth * oheight;
int64_t finalInputOffset = nInputOffset + k * inputWidth * inputHeight;
scalar_t* output_p_k = rawOutput + finalOutputOffset;
scalar_t* input_p_k = rawInput + finalInputOffset;
int64_t* ind_p_k = rawIndices + finalInputOffset;

// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t maxp;
for (int64_t i = 0; i < inputHeight; i++) {
for (int64_t j = 0; j < inputWidth; j++) {
maxp = ind_p_k[i * inputWidth + j];
if (maxp < 0 || maxp >= owidth * oheight) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
output_p_k[maxp] = input_p_k[i * inputWidth + j];
}
}
}
}
if (has_error) {
AT_ERROR(
"Found an invalid max index: ",
error_index,
" (output volumes are of size ",
oheight,
"x",
owidth);
(void)error_index;
}
}
return output;
}

Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
Tensor& max_unpooling2d_forward_out_cpu(
const Tensor& self_,
const Tensor& indices_,
IntArrayRef output_size,
Tensor& output) {
auto oheight = output_size[0];
auto owidth = output_size[1];
TORCH_CHECK(output.is_contiguous(), "output must be contiguous");
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64");

@@ -100,8 +27,9 @@ Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,

TORCH_CHECK(self_.numel() > 0, "Input must be non-empty");

auto self = self_.contiguous();
auto indices = indices_.contiguous();
auto memory_format = self_.suggest_memory_format();
auto self = self_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);

if (self.ndimension() == 3) {
int64_t numChannels = self.size(0);

@@ -109,15 +37,11 @@ Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
} else {
int64_t numBatch = self.size(0);
int64_t numChannels = self.size(1);
output.resize_({numBatch, numChannels, oheight, owidth});
output.resize_({numBatch, numChannels, oheight, owidth}, memory_format);
}
output.zero_();

AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling2d_forward_out_cpu_frame", ([&] {
max_unpooling2d_forward_out_cpu_frame<scalar_t>(
output, self, indices, oheight, owidth);
}));
max_unpool2d_kernel(kCPU, output, self, indices);
return output;
};

@@ -130,87 +54,6 @@ Tensor max_unpooling2d_forward_cpu(
return output;
}

template <typename scalar_t>
Tensor max_unpooling3d_forward_out_cpu_frame(
Tensor& output,
const Tensor& input,
const Tensor& indices,
int64_t oT,
int64_t oH,
int64_t oW) {
int64_t nBatch = 1;
int64_t dimw = 3;
int64_t dimh = 2;
int64_t dimt = 1;

if (input.ndimension() == 5) {
nBatch = input.size(0);
dimw++;
dimh++;
dimt++;
}

int64_t nSlices = input.size(dimt - 1);
int64_t iT = input.size(dimt);
int64_t iH = input.size(dimh);
int64_t iW = input.size(dimw);

scalar_t* input_data = input.data_ptr<scalar_t>();
scalar_t* output_data = output.data_ptr<scalar_t>();
int64_t* indices_data = indices.data_ptr<int64_t>();

at::internal::lazy_init_num_threads();

for (int64_t p = 0; p < nBatch; p++) {
int64_t inputOffset = p * nSlices * iT * iW * iH;
int64_t outputOffset = p * nSlices * oT * oW * oH;
int64_t k = 0;
bool has_error = false;
int error_index = 0;
#pragma omp parallel for private(k)
for (k = 0; k < nSlices; k++) {
int64_t finalInputOffset = inputOffset + k * iT * iW * iH;
int64_t finalOutputOffset = outputOffset + k * oT * oW * oH;

scalar_t* output_p_k = output_data + finalOutputOffset;
scalar_t* input_p_k = input_data + finalInputOffset;
int64_t* ind_p_k = indices_data + finalInputOffset;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int maxp;
for (int64_t t = 0; t < iT; t++) {
for (int64_t i = 0; i < iH; i++) {
for (int64_t j = 0; j < iW; j++) {
int64_t index = t * iH * iW + i * iW + j;
maxp = ind_p_k[index];
if (maxp < 0 || maxp >= oT * oW * oH) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
output_p_k[maxp] = input_p_k[index];
}
}
}
}
if (has_error) {
AT_ERROR(
"found an invalid max index ",
error_index,
" (output volumes are of size ",
oT,
"x",
oH,
"x",
oW);
(void)error_index;
}
}
}
return output;
}

static void max_unpooling3d_shape_check(
const Tensor& input,
const Tensor& gradOutput,

@@ -310,16 +153,7 @@ Tensor& max_unpooling3d_forward_out_cpu(const Tensor& self_,
}
output.zero_();

AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling3d_forward_out_cpu_frame", ([&] {
max_unpooling3d_forward_out_cpu_frame<scalar_t>(
output,
self,
indices,
oT,
oH,
oW);
}));
max_unpool3d_kernel(kCPU, output, self, indices);
return output;
}

@@ -335,59 +169,6 @@ Tensor max_unpooling3d_forward_cpu(
return output;
}

template <typename scalar_t>
static void max_unpooling2d_backward_out_cpu_frame(
scalar_t* gradInput_p,
scalar_t* gradOutput_p,
int64_t* ind_p,
int64_t nslices,
int64_t iheight,
int64_t iwidth,
int64_t oheight,
int64_t owidth) {
bool has_error = false;
int64_t error_index = 0;
int64_t k = 0;

at::internal::lazy_init_num_threads();
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++) {
scalar_t* gradInput_p_k = gradInput_p + k * iwidth * iheight;
scalar_t* gradOutput_p_k = gradOutput_p + k * owidth * oheight;
int64_t* ind_p_k = ind_p + k * iwidth * iheight;

// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t i, j;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t maxp;

for (i = 0; i < iheight; i++) {
for (j = 0; j < iwidth; j++) {
maxp = ind_p_k[i * iwidth + j]; /* retrieve position of max */
if (maxp < 0 || maxp >= owidth * oheight) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
}
gradInput_p_k[i * iwidth + j] =
gradOutput_p_k[maxp]; /* update gradient */
}
}
}
if (has_error) {
AT_ERROR(
"invalid max index ",
error_index,
", owidth= ",
owidth,
", oheight= ",
oheight);
(void)error_index;
}
}

Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
const Tensor& self,
const Tensor& indices_,

@@ -396,42 +177,24 @@ Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
int dimw = 2;
int dimh = 1;
int nbatch = 1;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int nslices;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iheight;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iwidth;
int64_t ndim = self.ndimension();
int64_t dimh = ndim == 3 ? 1 : 2;
int64_t dimw = ndim == 3 ? 2 : 3;

TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64");
TORCH_CHECK(
self.sizes() == indices_.sizes(), "Input shape must match indices shape");

TORCH_CHECK(output_size.size() == 2, "Output size must be 2");

/* get contiguous gradOutput and indices */
auto grad_output = grad_output_.contiguous();
auto indices = indices_.contiguous();
auto memory_format = self.suggest_memory_format();
auto grad_output = grad_output_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);

/* resize */
grad_input.resize_as_(self);
grad_input.resize_(self.sizes(), memory_format);
grad_input.zero_();

if (self.ndimension() == 4) {
nbatch = self.size(0);
dimw++;
dimh++;
}

/* sizes */
nslices = self.size(dimh - 1);
iheight = self.size(dimh);
iwidth = self.size(dimw);

if (owidth != grad_output.size(dimw) || oheight != grad_output.size(dimh)) {
AT_ERROR(
"Inconsistent gradOutput size. output height = ",

@@ -443,23 +206,8 @@ Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
"x",
grad_output.size(dimw));
}
AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling2d_backward_out_cpu_frame", ([&] {
int p;
for (p = 0; p < nbatch; p++) {
auto inputOffset = p * nslices * iheight * iwidth;
auto outputOffset = p * nslices * oheight * owidth;
max_unpooling2d_backward_out_cpu_frame<scalar_t>(
grad_input.data_ptr<scalar_t>() + inputOffset,
grad_output.data_ptr<scalar_t>() + outputOffset,
indices.data_ptr<int64_t>() + inputOffset,
nslices,
iheight,
iwidth,
oheight,
owidth);
}
}));

max_unpool2d_backward_kernel(kCPU, grad_input, grad_output, indices);
return grad_input;
}

@@ -468,72 +216,14 @@ Tensor max_unpooling2d_backward_cpu(
const Tensor& self,
const Tensor& indices,
IntArrayRef output_size) {
auto grad_input = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
at::native::max_unpooling2d_backward_out_cpu(
auto grad_input = at::empty({0}, self.options());
max_unpooling2d_backward_out_cpu(
grad_output, self, indices, output_size, grad_input);
return grad_input;
}

template <typename scalar_t>
static void max_unpooling3d_backward_out_cpu_frame(
scalar_t* gradInput_p,
scalar_t* gradOutput_p,
int64_t* ind_p,
int64_t nslices,
int64_t iT,
int64_t iH,
int64_t iW,
int64_t oT,
int64_t oH,
int64_t oW) {
int64_t k = 0;
bool has_error = false;
int error_index = 0;

at::internal::lazy_init_num_threads();

#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++) {
scalar_t* gradInput_p_k = gradInput_p + k * iT * iH * iW;
scalar_t* gradOutput_p_k = gradOutput_p + k * oT * oH * oW;
int64_t* ind_p_k = ind_p + k * iT * iH * iW;

// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t t, i, j, index;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t maxp;
for (t = 0; t < iT; t++) {
for (i = 0; i < iH; i++) {
for (j = 0; j < iW; j++) {
index = t * iH * iW + i * iW + j;
maxp = ind_p_k[index]; /* retrieve position of max */
if (maxp < 0 || maxp >= oT * oH * oW) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
}
gradInput_p_k[index] = gradOutput_p_k[maxp]; /* update gradient */
}
}
}
}
if (has_error) {
AT_ERROR(
"invalid max index ",
error_index,
", oT= ",
oT,
", oW= ",
oW,
",oH= ",
oH);
(void)error_index;
}
}

Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
Tensor& max_unpooling3d_backward_out_cpu(
const Tensor& grad_output_,
const Tensor& self,
const Tensor& indices_,
IntArrayRef output_size,

@@ -541,26 +231,17 @@ Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
IntArrayRef padding,
Tensor& grad_input) {
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
auto oT = output_size[0];
auto oH = output_size[1];
auto oW = output_size[2];
int dimw = 3;
int dimh = 2;
int dimt = 1;
int nbatch = 1;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int nslices;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iT;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iH;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iW;
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
int64_t ndim = self.ndimension();
int64_t dimt = ndim == 4 ? 1 : 2;
int64_t dimh = ndim == 4 ? 2 : 3;
int64_t dimw = ndim == 4 ? 3 : 4;

max_unpooling3d_shape_check(
self, grad_output_, indices_, output_size, stride, padding);

// TODO (from THNN): check gradOutput shape
/* get contiguous gradOutput */
auto grad_output = grad_output_.contiguous();
auto indices = indices_.contiguous();

@@ -568,39 +249,24 @@ Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
/* resize */
grad_input.resize_as_(self);
grad_input.zero_();
if (self.ndimension() == 5) {
nbatch = self.size(0);
dimt++;
dimw++;
dimh++;

if (oW != grad_output.size(dimw) || oH != grad_output.size(dimh) || oT != grad_output.size(dimt)) {
AT_ERROR(
"Inconsistent gradOutput size. output depth = ",
oT,
", output height = ",
oH,
", output width = ",
oW,
", gradOutput: ",
grad_output.size(dimt),
"x",
grad_output.size(dimh),
"x",
grad_output.size(dimw));
}

/* sizes */
nslices = self.size(dimt - 1);
iT = self.size(dimt);
iH = self.size(dimh);
iW = self.size(dimw);

/* backprop */
AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling3d_backward_out_cpu_frame", ([&] {
int p;
for (p = 0; p < nbatch; p++) {
int inputOffset = p * nslices * iT * iH * iW;
int outputOffset = p * nslices * oT * oH * oW;
max_unpooling3d_backward_out_cpu_frame<scalar_t>(
grad_input.data_ptr<scalar_t>() + inputOffset,
grad_output.data_ptr<scalar_t>() + outputOffset,
indices.data_ptr<int64_t>() + inputOffset,
nslices,
iT,
iH,
iW,
oT,
oH,
oW);
}
}));
max_unpool3d_backward_kernel(kCPU, grad_input, grad_output, indices);
return grad_input;
}

@@ -611,10 +277,16 @@ Tensor max_unpooling3d_backward_cpu(
IntArrayRef output_size,
IntArrayRef stride,
IntArrayRef padding) {
auto grad_input = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto grad_input = at::empty({0}, self.options());
at::native::max_unpooling3d_backward_out_cpu(
grad_output, self, indices, output_size, stride, padding, grad_input);
return grad_input;
}

DEFINE_DISPATCH(max_unpool2d_kernel);
DEFINE_DISPATCH(max_unpool2d_backward_kernel);
DEFINE_DISPATCH(max_unpool3d_kernel);
DEFINE_DISPATCH(max_unpool3d_backward_kernel);

} // namespace native
} // namespace at
@@ -1209,12 +1209,15 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)

if (dim < sparse_dim) {

auto dim_indices = indices[dim];
auto cpu_dim_indices = indices[dim].to(c10::kCPU).contiguous();
int64_t* cpu_dim_indices_ptr = cpu_dim_indices.data_ptr<int64_t>();
auto cpu_index = index.to(c10::kCPU).contiguous();
int64_t* cpu_index_ptr = cpu_index.data_ptr<int64_t>();
std::vector<int64_t> zindices;
std::vector<int64_t> iindices;
int64_t new_nnz = 0;
for (const auto i : c10::irange(new_sizes[dim])) {
auto idx = index[i].item<int64_t>();
for (int64_t i = 0; i < new_sizes[dim]; i++) {
int64_t idx = cpu_index_ptr[i];
if (idx < -size || idx >= size) {
TORCH_CHECK_INDEX(false, "index_select(): index contains ", idx, " that is out of range for tensor of size ",
self.sizes(), " at dimension ", dim);

@@ -1222,8 +1225,8 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)
if (idx < 0) {
idx += size;
}
for (const auto j : c10::irange(nnz)) {
auto jdx = dim_indices[j].item<int64_t>();
for (int64_t j = 0; j < nnz; j++) {
int64_t jdx = cpu_dim_indices_ptr[j];
if (idx == jdx) {
new_nnz++;
iindices.push_back(i);
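The change above copies the index tensors to CPU once and reads plain int64 values, instead of calling .item() inside the nested loops (each .item() is a separate per-element read, and a synchronization for CUDA tensors). A rough Python illustration of the pattern:

    import torch

    def matching_pairs(dim_indices: torch.Tensor, index: torch.Tensor, size: int):
        # One transfer up front instead of one .item() call per element.
        dim_indices_list = dim_indices.cpu().tolist()
        index_list = index.cpu().tolist()
        pairs = []
        for i, idx in enumerate(index_list):
            if idx < 0:
                idx += size
            for j, jdx in enumerate(dim_indices_list):
                if idx == jdx:
                    pairs.append((i, j))
        return pairs

    print(matching_pairs(torch.tensor([0, 2, 2, 5]), torch.tensor([2, -1]), size=6))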
@ -251,12 +251,16 @@ static inline scalar_t area_pixel_compute_scale(
bool align_corners,
const c10::optional<double> scale) {
// see Note [area_pixel_compute_scale]
if (output_size > 1) {
return align_corners
? static_cast<scalar_t>(input_size - 1) / (output_size - 1)
: compute_scales_value<scalar_t>(scale, input_size, output_size);
} else {
return scalar_t(0);
if(align_corners){
if(output_size > 1) {
return static_cast<scalar_t>(input_size - 1) / (output_size - 1);
}
else {
return static_cast<scalar_t>(0);
}
}
else{
return compute_scales_value<scalar_t>(scale, input_size, output_size);
}
}
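The reordered branches change one case: with align_corners=false and a single output element, the helper now still goes through compute_scales_value instead of returning 0. A standalone sketch of the new rule, assuming the usual convention that compute_scales_value inverts an explicitly supplied scale and otherwise falls back to input_size / output_size:

```cpp
#include <cstdint>
#include <optional>

// Hypothetical, self-contained restatement of the resampling-scale rule above.
double area_pixel_scale(int64_t input_size, int64_t output_size,
                        bool align_corners, std::optional<double> scale) {
  if (align_corners) {
    // Corner pixels are pinned to each other; with a single output element
    // there is no spacing to compute, so the scale degenerates to 0.
    return output_size > 1
        ? static_cast<double>(input_size - 1) / (output_size - 1)
        : 0.0;
  }
  // Without align_corners, prefer the user-supplied scale (assumed to be
  // stored as output/input, hence the inversion), else derive it from sizes.
  return (scale.has_value() && *scale > 0)
      ? 1.0 / *scale
      : static_cast<double>(input_size) / output_size;
}
```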
|
||||
|
||||
|
|
|
|||
aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp (new file, 385 lines)
|
|
@ -0,0 +1,385 @@
|
|||
#include <ATen/ATen.h>
|
||||
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/native/Pool.h>
|
||||
#include <ATen/native/cpu/utils.h>
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename scalar_t, bool is_3d = false>
|
||||
void cpu_max_unpool(
|
||||
Tensor& output_,
|
||||
const Tensor& input,
|
||||
const Tensor& indices) {
|
||||
auto output = output_.contiguous();
|
||||
|
||||
auto input_data = input.data_ptr<scalar_t>();
|
||||
auto indices_data = indices.data_ptr<int64_t>();
|
||||
auto output_data = output.data_ptr<scalar_t>();
|
||||
|
||||
// NB: input tensor dimensions:
|
||||
// MaxUnpool2d:
|
||||
// dim = 3: CHW
|
||||
// dim = 4: NCHW
|
||||
// MaxUnpool3d:
|
||||
// dim = 4: CDHW
|
||||
// dim = 5: NCDHW
|
||||
|
||||
int64_t numel = input.numel();
|
||||
int64_t ndim = input.ndimension();
|
||||
|
||||
// treat batch size and channels as one dimension
|
||||
// and the feature map as another dimension
|
||||
int64_t channels, output_depth, output_height, output_width;
|
||||
if (is_3d) {
|
||||
TORCH_CHECK(ndim == 4 || ndim == 5, "MaxUnpool3d: expect input to be 4d or 5d tensor.");
|
||||
channels = ndim == 4 ? input.size(0) : input.size(0) * input.size(1);
|
||||
output_depth = output.size(-3);
|
||||
output_height = output.size(-2);
|
||||
output_width = output.size(-1);
|
||||
} else {
|
||||
TORCH_CHECK(ndim == 3 || ndim == 4, "MaxUnpool2d: expect input to be 3d or 4d tensor.");
|
||||
channels = ndim == 3 ? input.size(0) : input.size(0) * input.size(1);
|
||||
output_depth = 1;
|
||||
output_height = output.size(-2);
|
||||
output_width = output.size(-1);
|
||||
}
|
||||
int64_t input_image_size = numel / channels;
|
||||
int64_t output_image_size = output.numel() / channels;
|
||||
|
||||
bool has_error = false;
|
||||
int64_t error_index = 0;
|
||||
|
||||
// parallel on dim N, C, D, H, W: [channels, input_image_size]
|
||||
at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
|
||||
int64_t c = 0;
|
||||
int64_t ip = 0;
|
||||
data_index_init(begin, c, channels, ip, input_image_size);
|
||||
|
||||
for (int64_t i = begin; i < end; i++) {
|
||||
scalar_t* output_ptr = output_data + c * output_image_size;
|
||||
|
||||
int64_t maxp = indices_data[i];
|
||||
if (maxp < 0 || maxp >= output_image_size) {
|
||||
#pragma omp critical
|
||||
{
|
||||
has_error = true;
|
||||
error_index = maxp;
|
||||
}
|
||||
} else {
|
||||
output_ptr[maxp] = input_data[i];
|
||||
}
|
||||
|
||||
// move on to next input index
|
||||
data_index_step(c, channels, ip, input_image_size);
|
||||
}
|
||||
});
|
||||
|
||||
if (has_error) {
|
||||
if (is_3d) {
|
||||
AT_ERROR("Found an invalid max index: ", error_index,
|
||||
" (output volumes are of size ", output_depth,
|
||||
"x", output_height, "x", output_width);
|
||||
(void)error_index;
|
||||
} else {
|
||||
AT_ERROR("Found an invalid max index: ", error_index,
|
||||
" (output volumes are of size ", output_height,
|
||||
"x", output_width);
|
||||
(void)error_index;
|
||||
}
|
||||
}
|
||||
|
||||
if (!output_.is_contiguous()) {
|
||||
output_.copy_(output);
|
||||
}
|
||||
}
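The parallel_for above walks one flat index over [channels, input_image_size] and relies on data_index_init / data_index_step to keep (c, ip) in sync with it. A hypothetical re-implementation of that bookkeeping (not the actual ATen/native/cpu/utils.h code) makes the split-and-carry pattern explicit:

```cpp
#include <cstdint>
#include <vector>

// Walk the flat range [begin, end): `c` is the (batch * channel) slice the
// index currently sits in, `ip` the offset inside that slice's feature map.
std::vector<int64_t> count_per_slice(int64_t begin, int64_t end,
                                     int64_t channels, int64_t image_size) {
  std::vector<int64_t> counts(channels, 0);
  int64_t c = begin / image_size;   // what data_index_init is assumed to do
  int64_t ip = begin % image_size;
  for (int64_t i = begin; i < end; ++i) {
    counts[c] += 1;                 // the kernel body uses c and indices[i]
    if (++ip == image_size) {       // what data_index_step is assumed to do
      ip = 0;
      ++c;
    }
  }
  return counts;
}
```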
|
||||
|
||||
template <typename scalar_t>
|
||||
void cpu_max_unpool_channels_last(
|
||||
Tensor& output_,
|
||||
const Tensor& input,
|
||||
const Tensor& indices) {
|
||||
TORCH_CHECK(input.ndimension() == 4,
|
||||
"max_unpool2d with channels last format supports tensors with 4 dims");
|
||||
auto memory_format = at::MemoryFormat::ChannelsLast;
|
||||
auto output = output_.contiguous(memory_format);
|
||||
|
||||
auto input_data = input.data_ptr<scalar_t>();
|
||||
auto indices_data = indices.data_ptr<int64_t>();
|
||||
auto output_data = output.data_ptr<scalar_t>();
|
||||
|
||||
int64_t nbatch = input.size(0);
|
||||
int64_t channels = input.size(1);
|
||||
int64_t input_height = input.size(2);
|
||||
int64_t input_width = input.size(3);
|
||||
int64_t output_height = output.size(2);
|
||||
int64_t output_width = output.size(3);
|
||||
int64_t input_image_size = input_height * input_width;
|
||||
int64_t output_image_size = output_height * output_width;
|
||||
|
||||
bool has_error = false;
|
||||
int64_t error_index = 0;
|
||||
|
||||
// parallel on dim N, H, W
|
||||
at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) {
|
||||
int64_t n = 0;
|
||||
int64_t ip = 0;
|
||||
data_index_init(begin, n, nbatch, ip, input_image_size);
|
||||
|
||||
for (int64_t i = begin; i < end; i++) {
|
||||
scalar_t* input_ptr = input_data + i * channels;
|
||||
int64_t* indices_ptr = indices_data + i * channels;
|
||||
scalar_t* output_ptr = output_data + n * output_image_size * channels;
|
||||
|
||||
// can't do scatter on avx2 (only available on avx512)
|
||||
for (int64_t c = 0; c < channels; c++) {
|
||||
int64_t maxp = indices_ptr[c];
|
||||
if (maxp < 0 || maxp >= output_image_size) {
|
||||
#pragma omp critical
|
||||
{
|
||||
has_error = true;
|
||||
error_index = maxp;
|
||||
}
|
||||
} else {
|
||||
output_ptr[maxp * channels + c] = input_ptr[c];
|
||||
}
|
||||
}
|
||||
|
||||
// move on to next input index
|
||||
data_index_step(n, nbatch, ip, input_image_size);
|
||||
}
|
||||
});
|
||||
|
||||
if (has_error) {
|
||||
AT_ERROR("Found an invalid max index: ", error_index,
|
||||
" (output volumes are of size ", output_height,
|
||||
"x", output_width);
|
||||
(void)error_index;
|
||||
}
|
||||
|
||||
if (!output_.is_contiguous(memory_format)) {
|
||||
output_.copy_(output);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, bool is_3d = false>
|
||||
void cpu_max_unpool_backward(
|
||||
Tensor& grad_input_,
|
||||
const Tensor& grad_output,
|
||||
const Tensor& indices) {
|
||||
auto grad_input = grad_input_.contiguous();
|
||||
|
||||
auto grad_output_data = grad_output.data_ptr<scalar_t>();
|
||||
auto indices_data = indices.data_ptr<int64_t>();
|
||||
auto grad_input_data = grad_input.data_ptr<scalar_t>();
|
||||
|
||||
int64_t numel = grad_input.numel();
|
||||
int64_t ndim = grad_output.ndimension();
|
||||
|
||||
// treat batch size and channels as one dimension
|
||||
// and the feature map as another dimension
|
||||
int64_t channels, output_depth, output_height, output_width;
|
||||
if (is_3d) {
|
||||
TORCH_CHECK(ndim == 4 || ndim == 5, "MaxUnpool3d_backward: expect grad_output to be 4d or 5d tensor.");
|
||||
channels = ndim == 4 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
|
||||
output_depth = grad_output.size(-3);
|
||||
output_height = grad_output.size(-2);
|
||||
output_width = grad_output.size(-1);
|
||||
} else {
|
||||
TORCH_CHECK(ndim == 3 || ndim == 4, "MaxUnpool2d_backward: expect grad_output to be 3d or 4d tensor.");
|
||||
channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
|
||||
output_depth = 1;
|
||||
output_height = grad_output.size(-2);
|
||||
output_width = grad_output.size(-1);
|
||||
}
|
||||
int64_t input_image_size = numel / channels;
|
||||
int64_t output_image_size = grad_output.numel() / channels;
|
||||
|
||||
bool has_error = false;
|
||||
int64_t error_index = 0;
|
||||
|
||||
// parallel on dim N, C, D, H, W
|
||||
at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
|
||||
int64_t c = 0;
|
||||
int64_t ip = 0;
|
||||
data_index_init(begin, c, channels, ip, input_image_size);
|
||||
|
||||
for (int64_t i = begin; i < end; i++) {
|
||||
scalar_t* grad_output_ptr = grad_output_data + c * output_image_size;
|
||||
|
||||
int64_t maxp = indices_data[i];
|
||||
if (maxp < 0 || maxp >= output_image_size) {
|
||||
#pragma omp critical
|
||||
{
|
||||
has_error = true;
|
||||
error_index = maxp;
|
||||
}
|
||||
} else {
|
||||
grad_input_data[i] = grad_output_ptr[maxp];
|
||||
}
|
||||
|
||||
// move on to next input index
|
||||
data_index_step(c, channels, ip, input_image_size);
|
||||
}
|
||||
});
|
||||
|
||||
if (has_error) {
|
||||
if (is_3d) {
|
||||
AT_ERROR("invalid max index ", error_index,
|
||||
", odepth= ", output_depth,
|
||||
", owidth= ", output_width,
|
||||
", oheight= ", output_height);
|
||||
(void)error_index;
|
||||
} else {
|
||||
AT_ERROR("invalid max index ", error_index,
|
||||
", owidth= ", output_width,
|
||||
", oheight= ", output_height);
|
||||
(void)error_index;
|
||||
}
|
||||
}
|
||||
|
||||
if (!grad_input_.is_contiguous()) {
|
||||
grad_input_.copy_(grad_input);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
void cpu_max_unpool_backward_channels_last(
|
||||
Tensor& grad_input_,
|
||||
const Tensor& grad_output,
|
||||
const Tensor& indices) {
|
||||
TORCH_CHECK(grad_output.ndimension() == 4,
|
||||
"max_unpool2d backward with channels last format supports tensors with 4 dims.");
|
||||
auto memory_format = at::MemoryFormat::ChannelsLast;
|
||||
auto grad_input = grad_input_.contiguous(memory_format);
|
||||
|
||||
auto grad_input_data = grad_input.data_ptr<scalar_t>();
|
||||
auto grad_output_data = grad_output.data_ptr<scalar_t>();
|
||||
auto indices_data = indices.data_ptr<int64_t>();
|
||||
|
||||
int64_t nbatch = grad_input.size(0);
|
||||
int64_t channels = grad_input.size(1);
|
||||
int64_t input_height = grad_input.size(2);
|
||||
int64_t input_width = grad_input.size(3);
|
||||
int64_t output_height = grad_output.size(2);
|
||||
int64_t output_width = grad_output.size(3);
|
||||
int64_t input_image_size = input_height * input_width;
|
||||
int64_t output_image_size = output_height * output_width;
|
||||
|
||||
bool has_error = false;
|
||||
int64_t error_index = 0;
|
||||
|
||||
// parallel on dim N, H, W
|
||||
at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) {
|
||||
int64_t n = 0;
|
||||
int64_t ip = 0;
|
||||
data_index_init(begin, n, nbatch, ip, input_image_size);
|
||||
|
||||
for (int64_t i = begin; i < end; i++) {
|
||||
scalar_t* grad_output_ptr = grad_output_data + n * output_image_size * channels;
|
||||
scalar_t* grad_input_ptr = grad_input_data + i * channels;
|
||||
int64_t* indices_ptr = indices_data + i * channels;
|
||||
|
||||
for (int64_t c = 0; c < channels; c++) {
|
||||
int64_t maxp = indices_ptr[c];
|
||||
if (maxp < 0 || maxp >= output_image_size) {
|
||||
#pragma omp critical
|
||||
{
|
||||
has_error = true;
|
||||
error_index = maxp;
|
||||
}
|
||||
} else {
|
||||
grad_input_ptr[c] = grad_output_ptr[maxp * channels + c];
|
||||
}
|
||||
}
|
||||
|
||||
// move on to next input index
|
||||
data_index_step(n, nbatch, ip, input_image_size);
|
||||
}
|
||||
});
|
||||
|
||||
if (has_error) {
|
||||
AT_ERROR("invalid max index ", error_index,
|
||||
", owidth= ", output_width,
|
||||
", oheight= ", output_height);
|
||||
(void)error_index;
|
||||
}
|
||||
|
||||
if (!grad_input_.is_contiguous(memory_format)) {
|
||||
grad_input_.copy_(grad_input);
|
||||
}
|
||||
}
|
||||
|
||||
void max_unpool2d_kernel_impl(
|
||||
Tensor& output,
|
||||
const Tensor& input,
|
||||
const Tensor& indices) {
|
||||
switch(input.suggest_memory_format()) {
|
||||
case at::MemoryFormat::Contiguous: {
|
||||
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool2d", [&] {
|
||||
cpu_max_unpool<scalar_t, /*is_3d*/false>(output, input, indices);
|
||||
});
|
||||
break;
|
||||
}
|
||||
case at::MemoryFormat::ChannelsLast: {
|
||||
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool2d_channels_last", [&] {
|
||||
cpu_max_unpool_channels_last<scalar_t>(output, input, indices);
|
||||
});
|
||||
break;
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
|
||||
}
|
||||
}
|
||||
|
||||
void max_unpool3d_kernel_impl(
|
||||
Tensor& output,
|
||||
const Tensor& input,
|
||||
const Tensor& indices) {
|
||||
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool3d", [&] {
|
||||
cpu_max_unpool<scalar_t, /*is_3d*/true>(output, input, indices);
|
||||
});
|
||||
}
|
||||
|
||||
void max_unpool2d_backward_kernel_impl(
|
||||
Tensor& grad_input,
|
||||
const Tensor& grad_output,
|
||||
const Tensor& indices) {
|
||||
switch(grad_output.suggest_memory_format()) {
|
||||
case at::MemoryFormat::Contiguous: {
|
||||
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward", [&] {
|
||||
cpu_max_unpool_backward<scalar_t, /*is_3d*/false>(grad_input, grad_output, indices);
|
||||
});
|
||||
break;
|
||||
}
|
||||
case at::MemoryFormat::ChannelsLast: {
|
||||
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward_channels_last", [&] {
|
||||
cpu_max_unpool_backward_channels_last<scalar_t>(grad_input, grad_output, indices);
|
||||
});
|
||||
break;
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
|
||||
}
|
||||
}
|
||||
|
||||
void max_unpool3d_backward_kernel_impl(
|
||||
Tensor& grad_input,
|
||||
const Tensor& grad_output,
|
||||
const Tensor& indices) {
|
||||
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool3d_backward", [&] {
|
||||
cpu_max_unpool_backward<scalar_t, /*is_3d*/true>(grad_input, grad_output, indices);
|
||||
});
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
REGISTER_DISPATCH(max_unpool2d_kernel, &max_unpool2d_kernel_impl);
|
||||
REGISTER_DISPATCH(max_unpool2d_backward_kernel, &max_unpool2d_backward_kernel_impl);
|
||||
REGISTER_DISPATCH(max_unpool3d_kernel, &max_unpool3d_kernel_impl);
|
||||
REGISTER_DISPATCH(max_unpool3d_backward_kernel, &max_unpool3d_backward_kernel_impl);
|
||||
|
||||
}} // at::native
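A hedged usage sketch of how these stubs are reached through the public ATen API; the pooling round trip and the zero-filled background are inferred from the kernel comments above rather than shown in this diff:

```cpp
#include <ATen/ATen.h>
#include <iostream>
#include <tuple>

int main() {
  at::Tensor x = at::randn({1, 3, 8, 8});                 // NCHW input
  auto pooled = at::max_pool2d_with_indices(x, {2, 2});   // values + flat indices
  at::Tensor y = std::get<0>(pooled);                     // 1x3x4x4
  at::Tensor idx = std::get<1>(pooled);
  // max_unpool2d scatters y back to the remembered positions of an 8x8 plane;
  // all other positions are expected to stay zero.
  at::Tensor z = at::max_unpool2d(y, idx, {8, 8});
  std::cout << z.sizes() << std::endl;                    // [1, 3, 8, 8]
  return 0;
}
```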
|
||||
aten/src/ATen/native/cpu/MaxUnpoolKernel.h (new file, 16 lines)
|
|
@ -0,0 +1,16 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/DispatchStub.h>

#pragma once

namespace at { namespace native {

using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&);

DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_backward_kernel);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_backward_kernel);

}} // at::native
|
||||
|
|
@ -163,24 +163,29 @@ static void std_var_kernel_impl(TensorIterator& iter, int64_t correction, bool t
|
|||
}
|
||||
|
||||
static void prod_kernel_impl(TensorIterator& iter) {
|
||||
// Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context]
|
||||
// Workaround for the error: '*' in boolean context, suggest '&&' instead
|
||||
// [-Werror=int-in-bool-context]
|
||||
if (iter.dtype() == ScalarType::Bool) {
|
||||
using scalar_t = bool;
|
||||
binary_kernel_reduce_vec(
|
||||
iter,
|
||||
[=](scalar_t a, scalar_t b) -> scalar_t { return a && b; },
|
||||
[=](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return a && b; },
|
||||
// NOLINTNEXTLINE(bugprone-argument-comment)
|
||||
/*identity=*/1);
|
||||
iter,
|
||||
[=](scalar_t a, scalar_t b)
|
||||
__ubsan_ignore_undefined__ -> scalar_t { return a && b; },
|
||||
[=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
|
||||
__ubsan_ignore_undefined__ { return a && b; },
|
||||
// NOLINTNEXTLINE(bugprone-argument-comment)
|
||||
/*identity=*/1);
|
||||
} else {
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] {
|
||||
binary_kernel_reduce_vec(
|
||||
iter,
|
||||
[=](scalar_t a, scalar_t b) -> scalar_t { return a * b; },
|
||||
[=](Vectorized <scalar_t> a, Vectorized <scalar_t> b) { return a * b; },
|
||||
// NOLINTNEXTLINE(bugprone-argument-comment)
|
||||
/*identity=*/1);
|
||||
});
|
||||
iter,
|
||||
[=](scalar_t a, scalar_t b)
|
||||
__ubsan_ignore_undefined__ -> scalar_t { return a * b; },
|
||||
[=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
|
||||
__ubsan_ignore_undefined__ { return a * b; },
|
||||
// NOLINTNEXTLINE(bugprone-argument-comment)
|
||||
/*identity=*/1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -648,7 +648,9 @@ Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, c
|
|||
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
|
||||
const Tensor& weight = *weight_maybe_owned;
|
||||
|
||||
if (at::cuda::detail::canUse32BitIndexMath(self) && batch_norm_use_channels_last_kernels(self)){
|
||||
if (at::cuda::detail::canUse32BitIndexMath(self) &&
|
||||
batch_norm_use_channels_last_kernels(self) &&
|
||||
batch_norm_use_channels_last_kernels(input)) {
|
||||
return batch_norm_backward_elemt_channels_last_cuda_template(self, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1649,7 +1649,8 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
|
|||
const auto stride = input.sizes()[1];
|
||||
const auto reduction_size = input.numel() / stride;
|
||||
|
||||
at::Tensor grad_input = at::empty_like(input, input.suggest_memory_format());
|
||||
// Input is guaranteed to be channels-last compatible
|
||||
at::Tensor grad_input = at::empty_like(input);
|
||||
|
||||
dim3 block;
|
||||
dim3 grid;
|
||||
|
|
@ -1716,7 +1717,8 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
|
|||
const auto reduction_size = input.numel() / stride;
|
||||
auto norm_fct = 1.0 / reduction_size;
|
||||
|
||||
at::Tensor grad_input = at::empty_like(input, input.suggest_memory_format());
|
||||
// Input is guaranteed to be channels-last compatible
|
||||
at::Tensor grad_input = at::empty_like(input);
|
||||
|
||||
dim3 block;
|
||||
dim3 grid;
|
||||
|
|
|
|||
|
|
@ -207,6 +207,87 @@ struct offset_t {
|
|||
|
||||
}
|
||||
|
||||
namespace {

// Segmented sort by full sort algorithm:
// Say we are sorting a (2, 3) tensor. We have in flattened form:
// values     0.4 1.2 5.3 6.2 1.3 2.3
// indices      0   1   2   0   1   2
// segment_id   0   0   0   1   1   1

// First we sort by values, globally:
// values     6.2 5.3 2.3 1.2 1.3 0.4
// indices      0   2   2   1   1   0
// segment_id   1   0   1   0   1   0

// Then we stable sort by segment id:
// values     5.3 1.2 0.4 6.2 2.3 1.3
// indices      2   1   0   0   2   1
// segment_id   0   0   0   1   1   1

// This method can only work if the slice we are sorting (`dim`) is
// innermost, and both values and indices are contiguous. We do this
// by re-arranging the input into this form as needed, which will
// unfortunately allocate memory if the request is not in this form.
// Vectorized sort is slower than iterated sort if the number of
// slices is small (since we're sorting twice, instead of invoking a
// smaller sort `numSlices` times), but the cub sort
// implementation here is a catch-all, so we're not looking for
// efficiency, but instead correctness.
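The same two-pass trick restated on the host with std::sort and std::stable_sort, as a self-contained sketch (the real implementation below stays on the GPU and packs segment id and index into an int2):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// `values` holds rows of `nsort` elements, flattened row-major; `indices`
// must already have values.size() elements. After the call each row is
// sorted and `indices` holds the original in-row positions.
void segmented_sort_by_full_sort(std::vector<float>& values,
                                 std::vector<int64_t>& indices,
                                 int64_t nsort, bool descending) {
  struct Entry { float v; int64_t segment; int64_t idx; };
  std::vector<Entry> e(values.size());
  for (int64_t i = 0; i < static_cast<int64_t>(values.size()); ++i) {
    e[i] = {values[i], i / nsort, i % nsort};
  }
  // Pass 1: one global sort on the values, ignoring segment boundaries.
  std::sort(e.begin(), e.end(), [&](const Entry& a, const Entry& b) {
    return descending ? a.v > b.v : a.v < b.v;
  });
  // Pass 2: a stable sort on segment id regroups the rows while preserving
  // the per-row value order established by pass 1.
  std::stable_sort(e.begin(), e.end(), [](const Entry& a, const Entry& b) {
    return a.segment < b.segment;
  });
  for (int64_t i = 0; i < static_cast<int64_t>(e.size()); ++i) {
    values[i] = e[i].v;
    indices[i] = e[i].idx;
  }
}
```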
|
||||
|
||||
template<typename scalar_t>
|
||||
__global__ void sort_postprocess_kernel(const scalar_t *in, scalar_t *out, int64_t *index, const int2 *i_s_ptr, int nsegments, int nsort) {
|
||||
CUDA_KERNEL_LOOP(i, nsegments * nsort) {
|
||||
int segment = i / nsort;
|
||||
int j = i % nsort;
|
||||
|
||||
int offset = segment * nsort;
|
||||
const scalar_t *in_ = in + offset;
|
||||
scalar_t *out_ = out + offset;
|
||||
int64_t *index_ = index + offset;
|
||||
const int2 *i_s_ptr_ = i_s_ptr + offset;
|
||||
|
||||
int idx = i_s_ptr_[j].y;
|
||||
index_[j] = idx;
|
||||
out_[j] = in_[idx];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename scalar_t>
|
||||
inline void segmented_sort_pairs_by_full_sort(
|
||||
int64_t nsegments, int64_t nsort, int64_t n, bool descending, const Tensor &indices,
|
||||
const scalar_t *self_ptr, scalar_t *values_ptr, int64_t *indices_ptr
|
||||
) {
|
||||
int64_t segment_bits = std::max<int64_t>(1L, static_cast<int64_t>(std::ceil(std::log2(nsegments))));
|
||||
|
||||
auto int_options = indices.options().dtype(kInt);
|
||||
auto indices_and_segment = at::empty({nsegments, nsort, 2}, int_options);
|
||||
indices_and_segment.select(-1, 0).copy_( // segment id
|
||||
at::arange(nsegments, int_options).view({nsegments, 1}).expand({nsegments, nsort}));
|
||||
indices_and_segment.select(-1, 1).copy_( // reverse indices
|
||||
at::arange(nsort, int_options).view({1, nsort}).expand({nsegments, nsort}));
|
||||
|
||||
auto i_s_ptr = reinterpret_cast<int2 *>(indices_and_segment.data_ptr<int>());
|
||||
auto indices_and_segment2 = at::empty_like(indices_and_segment);
|
||||
auto i_s_ptr2 = reinterpret_cast<int2 *>(indices_and_segment2.data_ptr<int>());
|
||||
|
||||
at::cuda::cub::sort_pairs<scalar_t, int2>(
|
||||
self_ptr, nullptr, i_s_ptr, i_s_ptr2,
|
||||
n, descending);
|
||||
|
||||
TORCH_INTERNAL_ASSERT(segment_bits <= 32);
|
||||
|
||||
// sort on lower 32bits, i.e. segment index
|
||||
at::cuda::cub::sort_keys<int64_t>(
|
||||
reinterpret_cast<int64_t *>(i_s_ptr2), reinterpret_cast<int64_t *>(i_s_ptr),
|
||||
n, false, 0, segment_bits);
|
||||
|
||||
sort_postprocess_kernel<<<(n + 511) / 512, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
|
||||
self_ptr, values_ptr, indices_ptr, i_s_ptr, nsegments, nsort);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// We perform a segmented sort in cub with inputs that have
|
||||
// more than 1024/2048 elements along the selected dimension.
|
||||
// Otherwise, we do an inplace bitonic sort (see sortKeyValueInplace).
|
||||
|
|
@ -349,11 +430,15 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
int64_t n = std::min(remaining, nbatch);
int64_t nsegments = n / nsort;

auto reverse_indices = at::arange(nsort, indices.options()).view({1, nsort}).expand({nsegments, nsort}).contiguous();

at::cuda::cub::segmented_sort_pairs(self_ptr, values_ptr,
reverse_indices.data_ptr<int64_t>(), indices_ptr, n, nsegments,
offset_t{(int)nsort, 0}, offset_t{(int)nsort, 1}, descending);
if (nsegments < 128) {
segmented_sort_pairs_by_full_sort(nsegments, nsort, n, descending,
indices, self_ptr, values_ptr, indices_ptr);
} else {
auto reverse_indices = at::arange(nsort, indices.options()).view({1, nsort}).expand({nsegments, nsort}).contiguous();
at::cuda::cub::segmented_sort_pairs(self_ptr, values_ptr,
reverse_indices.data_ptr<int64_t>(), indices_ptr, n, nsegments,
offset_t{(int)nsort, 0}, offset_t{(int)nsort, 1}, descending);
}

remaining -= n;
self_ptr += n;
|
||||
|
|
|
|||
|
|
@ -94,11 +94,16 @@ __host__ __forceinline__ static accscalar_t area_pixel_compute_scale(
int output_size,
bool align_corners,
const c10::optional<double> scale) {
if (output_size > 1) {
return align_corners ? (accscalar_t)(input_size - 1) / (output_size - 1)
: compute_scales_value<accscalar_t>(scale, input_size, output_size);
} else {
return static_cast<accscalar_t>(0);
if(align_corners) {
if(output_size > 1) {
return (accscalar_t)(input_size - 1) / (output_size - 1);
}
else {
return static_cast<accscalar_t>(0);
}
}
else{
return compute_scales_value<accscalar_t>(scale, input_size, output_size);
}
}
|
||||
|
||||
|
|
|
|||
|
|
@ -393,31 +393,32 @@ kernel void clamp(texture2d_array<half, access::read> in_arr[[texture(0), functi
|
|||
}
|
||||
}
|
||||
|
||||
kernel void hardswish(texture2d_array<half, access::read> in[[texture(0)]],
|
||||
texture2d_array<half, access::write> out[[texture(1)]],
|
||||
constant bool hardswish_is_arr = (ushort_arg_0 > 1 || ushort_arg_1 > 4);
|
||||
constant bool hardswish_is_tex = !hardswish_is_arr;
|
||||
kernel void hardswish(texture2d_array<half, access::read> in_arr[[texture(0), function_constant(hardswish_is_arr)]],
|
||||
texture2d<half, access::read> in_tex[[texture(0), function_constant(hardswish_is_tex)]],
|
||||
texture2d_array<half, access::write> out_arr[[texture(1), function_constant(hardswish_is_arr)]],
|
||||
texture2d<half, access::write> out_tex[[texture(1), function_constant(hardswish_is_tex)]],
|
||||
ushort3 gid[[thread_position_in_grid]]) {
|
||||
if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
|
||||
const ushort oH = ushort_arg_2;
|
||||
const ushort oW = ushort_arg_3;
|
||||
if (gid.x >= oW || gid.y >= oH) {
|
||||
return;
|
||||
}
|
||||
ushort2 gid_ = gid.xy;
|
||||
half4 value = in.read(gid_, gid.z);
|
||||
half4 mask1 = half4(value < 3.0);
|
||||
half4 mask2 = half4(value > -3.0);
|
||||
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
|
||||
out.write(outval, gid_, gid.z);
|
||||
}
|
||||
|
||||
kernel void hardswish_nonarray(texture2d<half, access::read> in[[texture(0)]],
|
||||
texture2d<half, access::write> out[[texture(1)]],
|
||||
ushort2 gid[[thread_position_in_grid]]) {
|
||||
if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
|
||||
return;
|
||||
if (hardswish_is_arr) {
|
||||
half4 value = in_arr.read(gid_, gid.z);
|
||||
half4 mask1 = half4(value < 3.0);
|
||||
half4 mask2 = half4(value > -3.0);
|
||||
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
|
||||
out_arr.write(outval, gid_, gid.z);
|
||||
} else {
|
||||
half4 value = in_tex.read(gid_);
|
||||
half4 mask1 = half4(value < 3);
|
||||
half4 mask2 = half4(value > -3.0);
|
||||
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
|
||||
out_tex.write(outval, gid_);
|
||||
}
|
||||
half4 value = in.read(gid);
|
||||
half4 mask1 = half4(value < 3);
|
||||
half4 mask2 = half4(value > -3.0);
|
||||
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
|
||||
out.write(outval, gid);
|
||||
}
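The mask expression in these kernels is a branch-free hardswish; collapsed to a single scalar lane it is the familiar piecewise definition:

```cpp
// hardswish(x) = 0 for x <= -3, x for x >= 3, and x * (x + 3) / 6 in between.
float hardswish_scalar(float x) {
  float mask1 = x < 3.0f ? 1.0f : 0.0f;    // the "value < 3.0" mask
  float mask2 = x > -3.0f ? 1.0f : 0.0f;   // the "value > -3.0" mask
  return mask2 * (mask1 * (x * (x + 3.0f) / 6.0f) + (1.0f - mask1) * x);
}
```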
|
||||
|
||||
constant bool out_is_arr = (ushort_arg_3 > 1 || ushort_arg_2 > 4);
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@ bool test_softmax();
|
|||
bool test_sigmoid();
|
||||
bool test_hardsigmoid();
|
||||
bool test_hardswish();
|
||||
bool test_hardswish2();
|
||||
bool test_upsampling_nearest2d_vec();
|
||||
bool test_upsampling_nearest2d_vec2();
|
||||
bool test_adaptive_avg_pool2d();
|
||||
|
|
|
|||
|
|
@ -262,6 +262,18 @@ bool test_hardswish() {
|
|||
});
|
||||
}
|
||||
|
||||
bool test_hardswish2() {
|
||||
__block std::vector<int64_t> size{1, 3, 44, 44};
|
||||
return TEST(size, __PRETTY_FUNCTION__, ^bool {
|
||||
auto X =
|
||||
at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat)) * 12 - 6;
|
||||
auto X2 = X.metal();
|
||||
auto Y1 = at::hardswish_(X);
|
||||
auto Y2 = at::hardswish_(X2).cpu();
|
||||
return almostEqual(Y1, Y2);
|
||||
});
|
||||
}
|
||||
|
||||
bool test_addmm() {
|
||||
bool result = true;
|
||||
for (int i = 0; i < ITER_COUNT; ++i) {
|
||||
|
|
|
|||
|
|
@ -69,6 +69,7 @@
|
|||
REG_TEST("test_sigmoid", test_sigmoid);
|
||||
REG_TEST("test_hardsigmoid", test_hardsigmoid);
|
||||
REG_TEST("test_hardswish", test_hardswish);
|
||||
REG_TEST("test_hardswish2", test_hardswish2);
|
||||
REG_TEST("test_upsampling_nearest2d_vec", test_upsampling_nearest2d_vec);
|
||||
REG_TEST("test_upsampling_nearest2d_vec2", test_upsampling_nearest2d_vec2);
|
||||
REG_TEST("test_adaptive_avg_pool2d", test_adaptive_avg_pool2d);
|
||||
|
|
|
|||
|
|
@ -24,9 +24,9 @@ Tensor& hardswish_(Tensor& input) {
|
|||
id<MTLComputeCommandEncoder> encoder =
|
||||
[commandBuffer.buffer computeCommandEncoder];
|
||||
id<MTLComputePipelineState> state = [[MetalContext sharedInstance]
|
||||
specializedPipelineState:mpscnn::kernelFor(
|
||||
X, "hardswish", "hardswish_nonarray")
|
||||
specializedPipelineState:"hardswish"
|
||||
Constants:@[
|
||||
@(X.numberOfImages),
|
||||
@(X.featureChannels),
|
||||
@(X.height),
|
||||
@(X.width)
|
||||
|
|
|
|||
|
|
@ -6652,7 +6652,7 @@
|
|||
device_check: NoCheck # TensorIterator
|
||||
variants: method
|
||||
|
||||
- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor
|
||||
- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, float label_smoothing=0.0) -> Tensor
|
||||
python_module: nn
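The new label_smoothing argument follows the standard convention (the diff only changes the schema; the formula is stated here for orientation, not quoted from the commit): with smoothing $\varepsilon$ and $C$ classes the hard target is blended toward uniform before the cross entropy is taken,

$$\tilde{y}_c = (1-\varepsilon)\,\mathbf{1}[c = y] + \frac{\varepsilon}{C}, \qquad \mathcal{L} = -\sum_{c=1}^{C} \tilde{y}_c \log p_c,$$

so label_smoothing=0.0 reproduces the previous behaviour exactly.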
|
||||
|
||||
- func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR)
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
#include <ATen/native/quantized/cpu/qembeddingbag_prepack.h>
|
||||
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/Parallel.h>
|
||||
|
|
@ -122,7 +124,6 @@ c10::intrusive_ptr<EmbeddingPackedParamsBase> PackedEmbeddingBagWeight::prepack(
|
|||
|
||||
namespace at {
|
||||
namespace native {
|
||||
namespace {
|
||||
|
||||
// Note - This is a temporary pack function for embedding bag which quantizes
|
||||
// and packs the float weight tensor. In the next step it will be replaced by a
|
||||
|
|
@ -184,7 +185,7 @@ namespace {
|
|||
//
|
||||
// [[50. , 60.00000035],
|
||||
// [70. , 80.00000035]]])
|
||||
Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
|
||||
Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
|
||||
// The "last" dimension of an N-Dimensioned batch of embedding bags is
|
||||
// quantization channel. E.g. for a 2D embedding bag, this has
|
||||
// [ row, col ] dimensions, for batched of embedding bags, dimensions might be
|
||||
|
|
@ -208,17 +209,12 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
|
|||
const int32_t embedding_cols = weight_sizes[cols_dim];
|
||||
// Add 8 bytes per column to store FP32 scale and zero_point per row.
|
||||
const int32_t output_columns = embedding_cols + 2 * sizeof(float);
|
||||
Tensor weight_contig = weight.contiguous(weight.suggest_memory_format());
|
||||
const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format());
|
||||
|
||||
// Adjust output dimensions to account for FP32 scale and zero_points.
|
||||
std::vector<int64_t> output_shape = weight_sizes.vec();
|
||||
output_shape[cols_dim] = output_columns;
|
||||
|
||||
// Allocate output packed weights
|
||||
auto output = at::empty(
|
||||
output_shape,
|
||||
weight_contig.options().dtype(at::kByte),
|
||||
weight_contig.suggest_memory_format());
|
||||
at::native::resize_(output, output_shape, c10::nullopt);
|
||||
auto* output_data = output.data_ptr<uint8_t>();
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
|
|
@ -246,10 +242,9 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
|
|||
}
|
||||
|
||||
#else
|
||||
const auto float_weight = weight_contig.scalar_type() == at::ScalarType::Half
|
||||
? weight_contig.to(at::ScalarType::Float)
|
||||
: weight_contig;
|
||||
const auto weight_data = float_weight.data_ptr<float>();
|
||||
const auto weight_data = weight_contig->scalar_type() == at::ScalarType::Half
|
||||
? weight_contig->to(at::ScalarType::Float).data_ptr<float>()
|
||||
: weight_contig->data_ptr<float>();
|
||||
constexpr float kEpsilon = 1e-8f;
|
||||
for (auto row: c10::irange(embedding_rows)) {
|
||||
const float* input_row = weight_data + row * embedding_cols;
|
||||
|
|
@ -276,6 +271,21 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
|
|||
return output;
|
||||
}
|
||||
|
||||
Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
|
||||
const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format());
|
||||
auto output = at::detail::empty_cpu(
|
||||
{0},
|
||||
at::kByte,
|
||||
weight_contig->layout(),
|
||||
weight_contig->device(),
|
||||
c10::nullopt,
|
||||
c10::nullopt);
|
||||
qembeddingbag_byte_prepack_out(output, weight);
|
||||
return output;
|
||||
}
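For orientation, the fused 8-bit row-wise format this prepack path targets is assumed to be: each row stored as uint8 codes followed by two trailing float32 values (scale and row minimum), which is why output_columns above is embedding_cols + 2 * sizeof(float). A rough per-row sketch under that assumption (hypothetical helper, not the FBGEMM or reference code itself):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<uint8_t> byte_prepack_row(const float* row, int64_t cols) {
  const float min_v = *std::min_element(row, row + cols);
  const float max_v = *std::max_element(row, row + cols);
  const float range = std::max(max_v - min_v, 1e-8f);  // mirrors the kEpsilon guard
  const float scale = range / 255.0f;
  std::vector<uint8_t> out(cols + 2 * sizeof(float));
  for (int64_t c = 0; c < cols; ++c) {
    out[c] = static_cast<uint8_t>(std::lrint((row[c] - min_v) / scale));
  }
  std::memcpy(out.data() + cols, &scale, sizeof(float));                  // FP32 scale
  std::memcpy(out.data() + cols + sizeof(float), &min_v, sizeof(float));  // FP32 bias
  return out;
}
```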
|
||||
|
||||
namespace {
|
||||
|
||||
// TODO: Extend support to N-D batched embeddings, similar to qembeddingbag_byte_prepack
|
||||
Tensor _qembeddingbag_nbit_prepack_helper(
|
||||
const Tensor& weight,
|
||||
|
|
|
|||
aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h (new file, 11 lines)
|
|
@ -0,0 +1,11 @@
#include <ATen/ATen.h>

namespace at {
namespace native {

Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight);

Tensor qembeddingbag_byte_prepack(const Tensor& weight);

} // namespace native
} // namespace at
|
||||
|
|
@ -11,7 +11,7 @@
|
|||
#include <ATen/cuda/CUDAApplyUtils.cuh>
|
||||
#include <ATen/cuda/detail/IndexUtils.cuh>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
|
||||
|
||||
#include <THC/THCThrustAllocator.cuh>
|
||||
|
||||
|
|
|
|||
aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp (new file, 74 lines)
|
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
Functions here use deprecated cuSPARSE API that was removed in CUDA 11.
|
||||
This file will be removed eventually.
|
||||
*/
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/SparseTensorUtils.h>
|
||||
#include <ATen/native/LinearAlgebraUtils.h>
|
||||
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
|
||||
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, const Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, const Tensor& dense) {
|
||||
TORCH_INTERNAL_ASSERT(nnz > 0);
|
||||
|
||||
// No half support, so we don't have to use CUDATypeConversion
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
|
||||
values.scalar_type(), "addmm_sparse_cuda", [&] {
|
||||
scalar_t cast_beta = beta.to<scalar_t>();
|
||||
scalar_t cast_alpha = alpha.to<scalar_t>();
|
||||
Tensor r__;
|
||||
if (cast_beta == scalar_t(0)) {
|
||||
r_.zero_();
|
||||
} else if (!at::sparse::is_same_tensor(t, r_)) {
|
||||
r_.copy_(t);
|
||||
}
|
||||
if (r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) {
|
||||
r__ = r_;
|
||||
} else {
|
||||
// Note: This storage arrangement is preferred due to most of the CUDA kernels handle only contiguous tensors
|
||||
r__ = r_.transpose(0, 1).clone(at::MemoryFormat::Contiguous);
|
||||
r__.transpose_(0, 1);
|
||||
}
|
||||
TORCH_INTERNAL_ASSERT(r__.transpose(-1, -2).is_contiguous());
|
||||
Tensor dense_;
|
||||
char transpose_dense;
|
||||
if (dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) {
|
||||
transpose_dense = 'n';
|
||||
dense_ = dense;
|
||||
} else if (dense.stride(1) == 1 && dense.stride(0) == dense.size(1)) {
|
||||
transpose_dense = 't';
|
||||
dense_ = dense;
|
||||
} else {
|
||||
transpose_dense = 't';
|
||||
dense_ = dense.contiguous();
|
||||
}
|
||||
|
||||
sparse::cuda::csrmm2(
|
||||
'n',
|
||||
transpose_dense,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
nnz,
|
||||
cast_alpha,
|
||||
values.data_ptr<scalar_t>(),
|
||||
crow_indices.data_ptr<int32_t>(),
|
||||
col_indices.data_ptr<int32_t>(),
|
||||
dense_.data_ptr<scalar_t>(),
|
||||
(transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)),
|
||||
cast_beta,
|
||||
r__.data_ptr<scalar_t>(),
|
||||
r__.stride(1));
|
||||
|
||||
if (!at::sparse::is_same_tensor(r__, r_)) {
|
||||
r_.copy_(r__);
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
} // namespace native
|
||||
} // namespace at
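In matrix terms the worker above evaluates, via cusparse csrmm2 (standard cuSPARSE semantics, stated here for orientation),

$$r \leftarrow \beta\, t + \alpha\, A_{\mathrm{csr}} D,$$

where $A_{\mathrm{csr}}$ is the $m \times k$ sparse matrix described by crow_indices / col_indices / values and $D$ is the $k \times n$ dense operand; the transpose flags and the r__ round trip exist only to hand cuSPARSE contiguous, column-major buffers.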
|
||||
aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.h (new file, 18 lines)
|
|
@ -0,0 +1,18 @@
#pragma once

#include <ATen/Tensor.h>
#include <ATen/core/Scalar.h>

/*
Functions here use deprecated cuSPARSE API that was removed in CUDA 11.
Here only 32-bit sparse indices are supported.
This file will be removed eventually.
*/

namespace at {
namespace native {

void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, const Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, const Tensor& dense);

} // namespace native
} // namespace at
|
||||
|
|
@ -2,7 +2,7 @@
|
|||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
|
||||
#include <cusparse.h>
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
// Using these APIs in any other systems will result in compile-time or run-time failures.
|
||||
// Their support will be extended in the next releases.
|
||||
|
||||
#if defined(__CUDACC__) && (CUSPARSE_VERSION >= 11000 || (!defined(_MSC_VER) && CUSPARSE_VERSION >= 10301))
|
||||
#if defined(CUDART_VERSION) && (CUSPARSE_VERSION >= 11000 || (!defined(_MSC_VER) && CUSPARSE_VERSION >= 10301))
|
||||
#define IS_SPMM_AVAILABLE() 1
|
||||
#else
|
||||
#define IS_SPMM_AVAILABLE() 0
|
||||
|
|
@ -5,8 +5,9 @@
|
|||
#include <ATen/NativeFunctions.h>
|
||||
#include <ATen/SparseTensorUtils.h>
|
||||
#include <ATen/native/sparse/SparseTensorMath.h>
|
||||
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
|
||||
#include <ATen/cuda/CUDAApplyUtils.cuh>
|
||||
#include <ATen/cuda/CUDAUtils.h>
|
||||
#include <ATen/cuda/detail/IndexUtils.cuh>
|
||||
|
|
@ -50,64 +51,6 @@ namespace {
|
|||
}
|
||||
}
|
||||
|
||||
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& crow_indices, Tensor& col_indices, Tensor& values, const Tensor& dense) {
|
||||
TORCH_INTERNAL_ASSERT(nnz > 0);
|
||||
|
||||
// No half support, so we don't have to use CUDATypeConversion
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
|
||||
values.scalar_type(), "addmm_sparse_cuda", [&] {
|
||||
scalar_t cast_beta = beta.to<scalar_t>();
|
||||
scalar_t cast_alpha = alpha.to<scalar_t>();
|
||||
Tensor r__;
|
||||
if (cast_beta == scalar_t(0)) {
|
||||
r_.zero_();
|
||||
} else if (!is_same_tensor(t, r_)) {
|
||||
r_.copy_(t);
|
||||
}
|
||||
if(r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) {
|
||||
r__ = r_;
|
||||
} else {
|
||||
// Note: This storage arrangement is preferred due to most of the CUDA kernels handle only contiguous tensors
|
||||
r__ = r_.transpose(0, 1).clone(at::MemoryFormat::Contiguous);
|
||||
r__.transpose_(0, 1);
|
||||
}
|
||||
Tensor dense_;
|
||||
char transpose_dense;
|
||||
if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) {
|
||||
transpose_dense = 'n';
|
||||
dense_ = dense;
|
||||
} else if(dense.stride(1) == 1 && dense.stride(0) == dense.size(1)) {
|
||||
transpose_dense = 't';
|
||||
dense_ = dense;
|
||||
} else {
|
||||
transpose_dense = 't';
|
||||
dense_ = dense.contiguous();
|
||||
}
|
||||
|
||||
sparse::cuda::csrmm2(
|
||||
'n',
|
||||
transpose_dense,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
nnz,
|
||||
cast_alpha,
|
||||
values.data_ptr<scalar_t>(),
|
||||
crow_indices.data_ptr<int32_t>(),
|
||||
col_indices.data_ptr<int32_t>(),
|
||||
dense_.data_ptr<scalar_t>(),
|
||||
(transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)),
|
||||
cast_beta,
|
||||
r__.data_ptr<scalar_t>(),
|
||||
r__.stride(1));
|
||||
|
||||
if (!is_same_tensor(r__, r_)) {
|
||||
r_.copy_(r__);
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// NB: Deleted spaddcmul (aka addcmul_, but not actually wired up), spaddcdiv (not
|
||||
// wired at all)
|
||||
|
||||
|
|
|
|||
|
|
@ -6,8 +6,6 @@
|
|||
|
||||
namespace at { namespace native {
|
||||
|
||||
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& crow_indices, Tensor& col_indices, Tensor& values, const Tensor& dense);
|
||||
|
||||
void s_addmm_out_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& indices, Tensor& values, const Tensor& dense);
|
||||
|
||||
}} // namespace at::native
|
||||
|
|
|
|||
|
|
@ -19,7 +19,8 @@
|
|||
#include <ATen/cuda/CUDAUtils.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
|
||||
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDATensorMath.cuh>
|
||||
|
||||
#include <thrust/device_ptr.h>
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@
|
|||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <ATen/cuda/CUDAUtils.h>
|
||||
#include <cusparse.h>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
|
||||
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
|
||||
#include <thrust/device_vector.h>
|
||||
|
|
|
|||
aten/src/ATen/native/vulkan/glsl/leaky_relu.glsl (new file, 28 lines)
|
|
@ -0,0 +1,28 @@
#version 450 core
#define PRECISION $precision

layout(std430) buffer;

/* Qualifiers: layout - storage - precision - memory */

layout(set = 0, binding = 0) uniform PRECISION restrict writeonly image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
  ivec4 size;
  float negative_slope;
} uBlock;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  if (all(lessThan(pos, uBlock.size.xyz))) {
    const vec4 inval = texelFetch(uInput, pos, 0);
    const vec4 negative_values = vec4(lessThan(inval, vec4(0.0f)));
    const vec4 positive_values = vec4(1.0) - negative_values;
    const vec4 mask = negative_values * vec4(uBlock.negative_slope) + positive_values;
    const vec4 outval = inval * mask;
    imageStore(uOutput, pos, outval);
  }
}
|
||||
aten/src/ATen/native/vulkan/glsl/leaky_relu_.glsl (new file, 27 lines)
|
|
@ -0,0 +1,27 @@
#version 450 core
#define PRECISION $precision

layout(std430) buffer;

/* Qualifiers: layout - storage - precision - memory */

layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION restrict Block {
  ivec4 size;
  float negative_slope;
} uBlock;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  if (all(lessThan(pos, uBlock.size.xyz))) {
    const vec4 inval = imageLoad(uOutput, pos);
    const vec4 negative_values = vec4(lessThan(inval, vec4(0.0f)));
    const vec4 positive_values = vec4(1.0) - negative_values;
    const vec4 mask = negative_values * vec4(uBlock.negative_slope) + positive_values;
    const vec4 outval = inval * mask;
    imageStore(uOutput, pos, outval);
  }
}
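Both shaders use the same branch-free mask; collapsed from vec4 lanes to a single scalar it reads:

```cpp
// leaky_relu(x) = x for x >= 0, negative_slope * x otherwise.
float leaky_relu_scalar(float x, float negative_slope) {
  float is_negative = x < 0.0f ? 1.0f : 0.0f;            // lessThan() in GLSL
  float mask = is_negative * negative_slope + (1.0f - is_negative);
  return x * mask;
}
```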
|
||||
|
|
@ -404,6 +404,121 @@ Tensor& hardshrink_(
|
|||
return self;
|
||||
}
|
||||
|
||||
Tensor leaky_relu(
|
||||
const Tensor& self_arg,
|
||||
const Scalar& negative_slope) {
|
||||
api::Context* const context = api::context();
|
||||
|
||||
const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan();
|
||||
const vTensor& v_self = convert(self);
|
||||
|
||||
vTensor v_output{
|
||||
context,
|
||||
v_self.sizes(),
|
||||
v_self.options(),
|
||||
};
|
||||
|
||||
api::Command::Pool& command_pool = context->command().pool;
|
||||
api::Command::Buffer& command_buffer = command_pool.stream();
|
||||
{
|
||||
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
|
||||
const struct Block final {
|
||||
uvec3 extents;
|
||||
uint32_t _;
|
||||
float negative_slope;
|
||||
} block {
|
||||
v_output.extents(),
|
||||
0u,
|
||||
negative_slope.to<float>(),
|
||||
};
|
||||
|
||||
context->dispatch(
|
||||
command_buffer,
|
||||
{
|
||||
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
|
||||
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
|
||||
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
|
||||
},
|
||||
VK_KERNEL(leaky_relu),
|
||||
v_output.extents(),
|
||||
context->gpu().adapter->local_work_group_size(),
|
||||
// Write-only access bypasses synchronization but inserts appropriate
|
||||
// barriers if necessary.
|
||||
v_output.image(
|
||||
command_buffer,
|
||||
vTensor::Stage::Compute,
|
||||
vTensor::Access::Write),
|
||||
// Read-only access is implied on const tensors and triggers an async
|
||||
// synchronization if necessary.
|
||||
v_self.image(
|
||||
command_buffer,
|
||||
vTensor::Stage::Compute),
|
||||
// Object lifetime is managed by the resource pool.
|
||||
// It is OK not to keep track of the handle.
|
||||
context->resource().pool.uniform(block).object);
|
||||
}
|
||||
else {
|
||||
TORCH_CHECK(false, "Not implemented!");
|
||||
}
|
||||
}
|
||||
command_pool.submit(context->gpu().queue, command_buffer);
|
||||
|
||||
return convert(v_output);
|
||||
}
|
||||
|
||||
Tensor& leaky_relu_(
|
||||
Tensor& self,
|
||||
const Scalar& negative_slope) {
|
||||
api::Context* const context = api::context();
|
||||
|
||||
TORCH_CHECK(
|
||||
self.is_vulkan(),
|
||||
"Vulkan: In-place leaky relu is only supported on Vulkan tensors.");
|
||||
|
||||
vTensor& v_self = convert(self);
|
||||
|
||||
api::Command::Pool& command_pool = context->command().pool;
|
||||
api::Command::Buffer& command_buffer = command_pool.stream();
|
||||
{
|
||||
if C10_LIKELY(v_self.has_image()) {
|
||||
const struct Block final {
|
||||
uvec3 extents;
|
||||
uint32_t _;
|
||||
float negative_slope;
|
||||
} block {
|
||||
v_self.extents(),
|
||||
0u,
|
||||
negative_slope.to<float>(),
|
||||
};
|
||||
|
||||
context->dispatch(
|
||||
command_buffer,
|
||||
{
|
||||
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
|
||||
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
|
||||
},
|
||||
VK_KERNEL(leaky_relu_),
|
||||
v_self.extents(),
|
||||
context->gpu().adapter->local_work_group_size(),
|
||||
// Read-Write access triggers an async synchronization if necessary
|
||||
// and inserts appropriate barriers if hazards are detected.
|
||||
v_self.image(
|
||||
command_buffer,
|
||||
vTensor::Stage::Compute,
|
||||
vTensor::Access::Read | vTensor::Access::Write),
|
||||
// Object lifetime is managed by the resource pool.
|
||||
// It is OK not to keep track of the handle.
|
||||
context->resource().pool.uniform(block).object);
|
||||
}
|
||||
else {
|
||||
TORCH_CHECK(false, "Not implemented!");
|
||||
}
|
||||
}
|
||||
command_pool.submit(context->gpu().queue, command_buffer);
|
||||
|
||||
return self;
|
||||
}
|
||||
|
||||
Tensor sigmoid(const Tensor& self) {
|
||||
return ops::activation(self, VK_KERNEL(sigmoid));
|
||||
}
|
||||
|
|
@ -433,6 +548,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
|
|||
m.impl(TORCH_SELECTIVE_NAME("aten::hardswish_"), hardswish_);
|
||||
m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh"), hardtanh);
|
||||
m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh_"), hardtanh_);
|
||||
m.impl(TORCH_SELECTIVE_NAME("aten::leaky_relu"), leaky_relu);
|
||||
m.impl(TORCH_SELECTIVE_NAME("aten::leaky_relu_"), leaky_relu_);
|
||||
m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid"), sigmoid);
|
||||
m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid_"), sigmoid_);
|
||||
m.impl(TORCH_SELECTIVE_NAME("aten::tanh"), tanh);
|
||||
|
|
|
|||
|
|
@ -43,6 +43,8 @@ namespace at {
|
|||
// at namespace already.
|
||||
namespace {
|
||||
|
||||
${dispatch_helpers}
|
||||
|
||||
${dispatch_anonymous_definitions}
|
||||
|
||||
TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) {
|
||||
|
|
|
|||
|
|
@ -979,6 +979,49 @@ TEST(VulkanAPITest, hardshrink_) {
|
|||
}
|
||||
}
|
||||
|
||||
TEST(VulkanAPITest, leaky_relu) {
|
||||
if (!at::is_vulkan_available()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) {
|
||||
const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
|
||||
const auto in_vulkan = in_cpu.vulkan();
|
||||
|
||||
const auto out_cpu = at::leaky_relu(in_cpu, negative_slope);
|
||||
const auto out_vulkan = at::leaky_relu(in_vulkan, negative_slope);
|
||||
|
||||
const auto check = almostEqual(out_cpu, out_vulkan.cpu());
|
||||
|
||||
if (!check) {
|
||||
showRtol(out_cpu, out_vulkan.cpu());
|
||||
}
|
||||
|
||||
ASSERT_TRUE(check);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(VulkanAPITest, leaky_relu_) {
|
||||
if (!at::is_vulkan_available()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) {
|
||||
auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
|
||||
auto vulkan = cpu.vulkan();
|
||||
|
||||
at::leaky_relu_(cpu, negative_slope);
|
||||
at::leaky_relu_(vulkan, negative_slope);
|
||||
|
||||
const auto check = almostEqual(cpu, vulkan.cpu());
|
||||
if (!check) {
|
||||
showRtol(cpu, vulkan.cpu());
|
||||
}
|
||||
|
||||
ASSERT_TRUE(check);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(VulkanAPITest, hardswish) {
|
||||
if (!at::is_vulkan_available()) {
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ add_executable(
|
|||
bench_batchnorm.cpp
|
||||
bench_concat.cpp
|
||||
bench_compile.cpp
|
||||
bench_signed_log1p.cpp
|
||||
bench_fuser_overhead.cpp
|
||||
bench_gemm.cpp
|
||||
bench_parallel.cpp
|
||||
|
|
|
|||
benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp (new file, 120 lines)
|
|
@ -0,0 +1,120 @@
|
|||
#include <benchmark/benchmark.h>
|
||||
|
||||
#include <torch/csrc/jit/jit_log.h>
|
||||
#include <torch/csrc/jit/tensorexpr/ir.h>
|
||||
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
|
||||
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
|
||||
#include <torch/csrc/jit/tensorexpr/loopnest.h>
|
||||
#include <torch/csrc/jit/tensorexpr/tensor.h>
|
||||
#include <torch/torch.h>
|
||||
|
||||
using namespace torch::jit::tensorexpr;
|
||||
|
||||
namespace {
|
||||
|
||||
class SignedLog1pBench : public benchmark::Fixture {
|
||||
public:
|
||||
void SetUp(const benchmark::State& state) override {
|
||||
input_size_ = {state.range(0), state.range(1)};
|
||||
input_size_int_ = {state.range(0), state.range(1)};
|
||||
input_ = torch::rand(input_size_);
|
||||
ref_ = signedLog1p(input_);
|
||||
}
|
||||
|
||||
void TearDown(benchmark::State& state) override {
|
||||
TORCH_CHECK(at::allclose(ref_, output_));
|
||||
state.counters["GB/s"] = benchmark::Counter(
|
||||
uint64_t(state.iterations()) * 2 * output_.nbytes(),
|
||||
benchmark::Counter::kIsRate);
|
||||
}
|
||||
|
||||
at::Tensor signedLog1p(const at::Tensor& inp) {
|
||||
auto sign = at::sign(inp);
|
||||
auto log1p = at::log1p(at::abs(inp));
|
||||
return sign * log1p;
|
||||
}
|
||||
|
||||
void runATen(benchmark::State& state) {
|
||||
for (auto _ : state) {
|
||||
output_ = signedLog1p(input_);
|
||||
}
|
||||
}
|
||||
|
||||
void runNNC(benchmark::State& state) {
|
||||
Placeholder input_ph(
|
||||
"input", kFloat, {input_size_int_[0], input_size_int_[1]});
|
||||
Tensor abs_result = Compute(
|
||||
"aten_abs",
|
||||
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
|
||||
[&](const VarHandle& m, const VarHandle& n) {
|
||||
return abs(input_ph.load(m, n));
|
||||
});
|
||||
Tensor log1p_result = Compute(
|
||||
"aten_log1p",
|
||||
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
|
||||
[&](const VarHandle& m, const VarHandle& n) {
|
||||
return log1p(abs_result.load(m, n));
|
||||
});
|
||||
Tensor sign = Compute(
|
||||
"aten_sign",
|
||||
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
|
||||
[&](const VarHandle& m, const VarHandle& n) {
|
||||
return CompareSelect::make(
|
||||
input_ph.load(m, n),
|
||||
ExprHandle(0.0f),
|
||||
ExprHandle(-1),
|
||||
ExprHandle(1),
|
||||
kLT);
|
||||
});
|
||||
Tensor output = Compute(
|
||||
"aten_mul",
|
||||
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
|
||||
[&](const VarHandle& m, const VarHandle& n) {
|
||||
return sign.load(m, n) * log1p_result.load(m, n);
|
||||
});
|
||||
LoopNest nest({output}, {abs_result, log1p_result, sign, output});
|
||||
GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
|
||||
nest.inlineIntermediateBufs(true);
|
||||
nest.prepareForCodegen();
|
||||
nest.simplify();
|
||||
nest.vectorizeInnerLoops();
|
||||
nest.simplify();
|
||||
GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
|
||||
|
||||
// StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
|
||||
std::vector<CodeGen::BufferArg> buf_args;
|
||||
buf_args.push_back(input_ph);
|
||||
buf_args.push_back(output);
|
||||
LLVMCodeGen cg(nest.root_stmt(), buf_args);
|
||||
|
||||
std::vector<CodeGen::CallArg> call_args;
|
||||
for (auto _ : state) {
|
||||
output_ = at::empty_like(ref_);
|
||||
call_args.clear();
|
||||
call_args.push_back(input_.data_ptr<float>());
|
||||
call_args.push_back(output_.data_ptr<float>());
|
||||
cg.call(call_args);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<long> input_size_;
|
||||
std::vector<int> input_size_int_;
|
||||
at::Tensor input_;
|
||||
at::Tensor output_;
|
||||
at::Tensor ref_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
BENCHMARK_DEFINE_F(SignedLog1pBench, ATen)(benchmark::State& state) {
|
||||
runATen(state);
|
||||
}
|
||||
|
||||
BENCHMARK_DEFINE_F(SignedLog1pBench, NNC)(benchmark::State& state) {
|
||||
runNNC(state);
|
||||
}
|
||||
|
||||
BENCHMARK_REGISTER_F(SignedLog1pBench, ATen)->Args({10, 1467});
|
||||
|
||||
BENCHMARK_REGISTER_F(SignedLog1pBench, NNC)->Args({10, 1467});
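Both fixtures compute the same symmetric log transform, $f(x) = \operatorname{sign}(x)\,\log(1 + |x|)$; the comparison is purely about whether NNC can inline the three pointwise stages (abs, log1p, sign and multiply) into one vectorized loop instead of materializing intermediates the way the ATen reference does.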
|
||||
|
|
@ -762,3 +762,42 @@ const std::string quantize_script = R"IR(
|
|||
%1249: Tensor = aten::dequantize(%1254)
|
||||
return (%1249)
|
||||
)IR";
|
||||
|
||||
const auto fmod_tensor = R"JIT(
|
||||
def forward(self, a: Tensor, b: Tensor):
|
||||
return torch.fmod(a, b).clone()
|
||||
)JIT";
|
||||
|
||||
const auto fmod_scalar = R"JIT(
|
||||
def forward(self, a: Tensor, b: int):
|
||||
return torch.fmod(a, b).clone()
|
||||
)JIT";
|
||||
|
||||
const std::string embedding_bag_byte_prepack_script = R"IR(
|
||||
graph(%input: Tensor):
|
||||
%none : None = prim::Constant()
|
||||
%output: Tensor = quantized::embedding_bag_byte_prepack(%input)
|
||||
%res: Tensor = aten::clone(%output, %none)
|
||||
return (%res)
|
||||
)IR";
|
||||
|
||||
const auto linalg_norm_ord_scalar = R"JIT(
|
||||
def forward(self, a: Tensor, ord: int, dim: List[int], keepdim: bool, dtype: int):
|
||||
return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone()
|
||||
)JIT";
|
||||
|
||||
const auto linalg_norm_ord_str = R"JIT(
|
||||
def forward(self, a: Tensor, ord: str, dim: List[int], keepdim: bool, dtype: int):
|
||||
return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone()
|
||||
)JIT";
|
||||
|
||||
const std::string cat_script = R"IR(
|
||||
graph(%a: Tensor, %b: Tensor, %dim: int):
|
||||
%ten_list: Tensor[] = prim::ListConstruct(%a, %b)
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=1]()
|
||||
%3 : int = prim::Constant[value=1]()
|
||||
%ten_list2 : Tensor[] = aten::slice(%ten_list, %1, %2, %3)
|
||||
%ret: Tensor = aten::cat(%ten_list2, %dim)
|
||||
return (%ret)
|
||||
)IR";
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include <gtest/gtest.h>
|
||||
#include <torch/csrc/jit/ir/alias_analysis.h>
|
||||
#include <torch/csrc/jit/ir/irparser.h>
|
||||
#include <torch/csrc/jit/runtime/static/fusion.h>
|
||||
#include <torch/csrc/jit/runtime/static/impl.h>
|
||||
#include <torch/csrc/jit/runtime/static/passes.h>
|
||||
|
|
@ -209,6 +210,13 @@ TEST(StaticRuntime, EmbeddingBag) {
|
|||
}
|
||||
|
||||
TEST(StaticRuntime, LayerNorm) {
|
||||
#ifdef FBCODE_CAFFE2
|
||||
script::Module module("module");
|
||||
module.define(layer_norm_with_weights);
|
||||
torch::jit::StaticModule smodule(module);
|
||||
ASSERT_EQ(getNodeWithKind(smodule, "aten::layer_norm"), nullptr);
|
||||
ASSERT_NE(getNodeWithKind(smodule, "static_runtime::layer_norm"), nullptr);
|
||||
#endif
|
||||
const auto a = torch::rand({1, 2, 2, 2});
|
||||
const auto b = torch::rand({3, 2, 2, 2});
|
||||
for (int normalized_size : {2, 3}) {
|
||||
|
|
@ -1223,3 +1231,83 @@ TEST(StaticRuntime, IndividualOps_VarStack) {
|
|||
|
||||
testStaticRuntime(var_stack_script, args1, args2);
|
||||
}
|
||||
|
||||
TEST(StaticRuntime, IndividualOps_FmodTensor) {
|
||||
// fmod tensor version
|
||||
auto a = at::randn({2, 3});
|
||||
auto b = at::randn({2, 3});
|
||||
std::vector<IValue> args0{a, b};
|
||||
testStaticRuntime(fmod_tensor, args0);
|
||||
|
||||
// check for dynamic shapes
|
||||
auto c = at::randn({4, 3, 2});
|
||||
auto d = at::randn({4, 3, 2});
|
||||
std::vector<IValue> args1{c, d};
|
||||
testStaticRuntime(fmod_tensor, args0, args1);
|
||||
}
|
||||
|
||||
TEST(StaticRuntime, IndividualOps_FmodScalar) {
|
||||
auto a = at::randn({2, 3});
|
||||
|
||||
// fmod scalar version
|
||||
std::vector<IValue> args2{a, 3};
|
||||
testStaticRuntime(fmod_scalar, args2);
|
||||
|
||||
// check for dynamic shapes
|
||||
auto c = at::randn({4, 3, 2});
|
||||
std::vector<IValue> args3{c, 4};
|
||||
testStaticRuntime(fmod_scalar, args2, args3);
|
||||
}
|
||||
|
||||
TEST(StaticRuntime, QEmbeddingBagByteUnpack) {
|
||||
auto a = torch::randn({8, 16}, at::ScalarType::Float);
|
||||
auto b = torch::randn({8*2, 16*2}, at::ScalarType::Float);
|
||||
|
||||
testStaticRuntime(embedding_bag_byte_prepack_script, {a});
|
||||
testStaticRuntime(embedding_bag_byte_prepack_script, {a}, {b});
|
||||
}
|
||||
|
||||
TEST(StaticRuntime, IndividualOps_LinalgNorm_ScalarOrd) {
|
||||
auto a = at::randn({2, 3});
|
||||
auto dim = std::vector<int64_t>({1});
|
||||
auto dtype = at::ScalarType::Float;
|
||||
|
||||
std::vector<IValue> args0{a, 4, dim, true, dtype};
|
||||
testStaticRuntime(linalg_norm_ord_scalar, args0);
|
||||
|
||||
auto b = at::randn({4, 5});
|
||||
std::vector<IValue> args1{b, 4, dim, true, dtype};
|
||||
testStaticRuntime(linalg_norm_ord_scalar, args0, args1);
|
||||
}
|
||||
|
||||
TEST(StaticRuntime, IndividualOps_LinalgNorm_StringOrd) {
|
||||
auto a = at::randn({2, 3});
|
||||
auto dim = std::vector<int64_t>({0, 1});
|
||||
auto dtype = at::ScalarType::Float;
|
||||
|
||||
std::vector<IValue> args0{a, "fro", dim, true, dtype};
|
||||
testStaticRuntime(linalg_norm_ord_str, args0);
|
||||
|
||||
auto b = at::randn({4, 5});
|
||||
std::vector<IValue> args1{b, "fro", dim, true, dtype};
|
||||
testStaticRuntime(linalg_norm_ord_str, args0, args1);
|
||||
}
|
||||
|
||||
TEST(StaticRuntime, IndividualOps_Cat) {
|
||||
auto graph = std::make_shared<Graph>();
|
||||
std::unordered_map<std::string, Value*> vmap;
|
||||
parseIR(cat_script, graph.get(), vmap);
|
||||
torch::jit::StaticModule smodule(graph);
|
||||
ASSERT_TRUE(getNodeWithKind(smodule, "aten::cat"));
|
||||
|
||||
auto a = at::randn({2, 4});
|
||||
auto b = at::randn({3, 4});
|
||||
std::vector<IValue> args0{a, b, 0};
|
||||
|
||||
testStaticRuntime(cat_script, args0);
|
||||
|
||||
auto c = at::randn({3, 4});
|
||||
auto d = at::randn({3, 5});
|
||||
std::vector<IValue> args1{c, d, 1};
|
||||
testStaticRuntime(cat_script, args0, args1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,8 +4,10 @@ namespace c10 {
|
|||
|
||||
namespace {
|
||||
// By default, grad mode is enabled and inference mode is disabled
|
||||
thread_local AutogradState autograd_state_tls =
|
||||
AutogradState(/* grad_mode */ true, /* inference_mode */ false);
|
||||
thread_local AutogradState autograd_state_tls = AutogradState(
|
||||
/* grad_mode */ true,
|
||||
/* inference_mode */ false,
|
||||
/* fw_grad_mode */ true);
|
||||
} // namespace
|
||||
|
||||
AutogradState& AutogradState::get_tls_state() {
|
||||
|
|
|
|||
|
|
@ -12,13 +12,19 @@ struct C10_API AutogradState {
|
|||
static AutogradState& get_tls_state();
|
||||
static void set_tls_state(AutogradState state);
|
||||
|
||||
AutogradState(bool grad_mode, bool inference_mode)
|
||||
: grad_mode_(grad_mode), inference_mode_(inference_mode) {}
|
||||
AutogradState(bool grad_mode, bool inference_mode, bool fw_grad_mode)
|
||||
: grad_mode_(grad_mode),
|
||||
inference_mode_(inference_mode),
|
||||
fw_grad_mode_(fw_grad_mode) {}
|
||||
|
||||
void set_grad_mode(bool enabled) {
|
||||
grad_mode_ = enabled;
|
||||
}
|
||||
|
||||
void set_fw_grad_mode(bool enabled) {
|
||||
fw_grad_mode_ = enabled;
|
||||
}
|
||||
|
||||
void set_inference_mode(bool enabled) {
|
||||
inference_mode_ = enabled;
|
||||
}
|
||||
|
|
@ -27,6 +33,10 @@ struct C10_API AutogradState {
|
|||
return grad_mode_;
|
||||
}
|
||||
|
||||
bool get_fw_grad_mode() const {
|
||||
return fw_grad_mode_;
|
||||
}
|
||||
|
||||
bool get_inference_mode() const {
|
||||
return inference_mode_;
|
||||
}
|
||||
|
|
@ -34,6 +44,7 @@ struct C10_API AutogradState {
|
|||
private:
|
||||
bool grad_mode_ : 1;
|
||||
bool inference_mode_ : 1;
|
||||
bool fw_grad_mode_ : 1;
|
||||
};
|
||||
|
||||
} // namespace c10
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
#include <c10/core/AutogradState.h>
|
||||
#include <c10/core/GradMode.h>
|
||||
|
||||
#include <stdexcept>
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include <c10/core/AutogradState.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
namespace c10 {
|
||||
|
|
@ -27,4 +28,17 @@ struct TORCH_API NoGradGuard : public AutoGradMode {
|
|||
NoGradGuard() : AutoGradMode(/*enabled=*/false) {}
|
||||
};
|
||||
|
||||
// A RAII, thread local (!) guard that enables or disables forward grad mode
|
||||
// upon construction, and sets it back to the original value upon destruction.
|
||||
struct TORCH_API AutoFwGradMode {
|
||||
AutoFwGradMode(bool enabled)
|
||||
: prev_mode(AutogradState::get_tls_state().get_fw_grad_mode()) {
|
||||
AutogradState::get_tls_state().set_fw_grad_mode(enabled);
|
||||
}
|
||||
~AutoFwGradMode() {
|
||||
AutogradState::get_tls_state().set_fw_grad_mode(prev_mode);
|
||||
}
|
||||
bool prev_mode;
|
||||
};
|
||||
|
||||
} // namespace c10
|
||||
|
|
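The RAII guard added in the hunk above is easiest to read with a short usage sketch. This is not part of the diff; the header that declares AutoFwGradMode is the one shown above, and the example function is hypothetical.

// Hedged sketch (not from this commit): exercising the AutoFwGradMode guard.
#include <c10/core/AutogradState.h>
// ... plus the header shown above that declares AutoFwGradMode.

void run_without_forward_grad() {
  const bool before = c10::AutogradState::get_tls_state().get_fw_grad_mode();
  {
    c10::AutoFwGradMode guard(/*enabled=*/false);
    // Forward-mode AD bookkeeping is disabled for the current thread only;
    // the state lives in the thread_local AutogradState (see the hunk above).
  }
  // The destructor restores the previous value, so this holds again here:
  // c10::AutogradState::get_tls_state().get_fw_grad_mode() == before
}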
|
|||
|
|
@ -53,10 +53,12 @@ struct TORCH_API InferenceMode {
|
|||
InferenceMode(bool enabled = true)
|
||||
: prev_mode(AutogradState::get_tls_state()),
|
||||
prev_keyset(c10::impl::tls_local_dispatch_key_set()) {
|
||||
// Enabling inference mode means disabling grad mode
|
||||
// And disabling inference mode means enabling grad mode
|
||||
AutogradState::set_tls_state(
|
||||
AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled));
|
||||
// Enabling inference mode means disabling grad modes
|
||||
// And disabling inference mode means enabling grad modes
|
||||
AutogradState::set_tls_state(AutogradState(
|
||||
/* grad_mode */ !enabled,
|
||||
/* inference_mode */ enabled,
|
||||
/* fw_grad_mode */ !enabled));
|
||||
DispatchKeySet included = enabled
|
||||
? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView)
|
||||
: prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView);
|
||||
|
|
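For context, a minimal sketch (not part of the diff) of how the modified constructor is typically exercised; model_forward and x are hypothetical placeholders.

// Hedged sketch: with this change, entering inference mode also disables
// forward grad mode for the current thread, in addition to grad mode and
// the ADInplaceOrView dispatch key.
{
  c10::InferenceMode guard(/*enabled=*/true);
  auto y = model_forward(x);  // hypothetical call; y is an inference tensor
}
// On destruction the previous AutogradState and dispatch key set are restored.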
|
|||
|
|
@ -68,7 +68,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
|
|||
StorageImpl() = delete;
|
||||
StorageImpl(StorageImpl&& other) = default;
|
||||
StorageImpl(const StorageImpl&) = delete;
|
||||
~StorageImpl() = default;
|
||||
~StorageImpl() override = default;
|
||||
|
||||
void reset() {
|
||||
data_ptr_.clear();
|
||||
|
|
|
|||
|
|
@ -6,12 +6,13 @@
|
|||
#include <initializer_list>
|
||||
#include <ostream>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "c10/util/Registry.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/proto/caffe2_pb.h"
|
||||
#include "caffe2/utils/filler.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
|
|
@ -273,8 +274,8 @@ class TORCH_API OpSchema {
|
|||
OpSchema&
|
||||
Arg(const char* name, const char* description, bool required = false);
|
||||
|
||||
#define DECLARE_STANDARD_ARG(name, str) \
|
||||
static const char* Arg_##name; \
|
||||
#define DECLARE_STANDARD_ARG(name, str) \
|
||||
static const char* Arg_##name; \
|
||||
OpSchema& Arg##name(const char* description);
|
||||
|
||||
DECLARE_STANDARD_ARG(IsTest, is_test)
|
||||
|
|
@ -339,7 +340,9 @@ class TORCH_API OpSchema {
|
|||
return inplace_enforced_(x, y);
|
||||
}
|
||||
|
||||
TORCH_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema);
|
||||
TORCH_API friend std::ostream& operator<<(
|
||||
std::ostream& out,
|
||||
const OpSchema& schema);
|
||||
|
||||
const std::vector<Argument>& args() const {
|
||||
return args_;
|
||||
|
|
@ -562,8 +565,10 @@ OpSchema::Cost PointwiseCostInference(
|
|||
}
|
||||
|
||||
c.flops = nElemX * OpsPerPoint;
|
||||
c.bytes_read = nElemRead * sizeof(X.data_type());
|
||||
c.bytes_written = nElemX * sizeof(X.data_type());
|
||||
auto const& X_element_size_byte =
|
||||
DataTypeToTypeMeta(X.data_type()).itemsize();
|
||||
c.bytes_read = nElemRead * X_element_size_byte;
|
||||
c.bytes_written = nElemX * X_element_size_byte;
|
||||
return c;
|
||||
}
|
||||
|
||||
|
|
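A note on the pattern used in this and the following cost-inference hunks (my reading of the Caffe2 type system, not something the diff states): data_type() returns a TensorProto::DataType enum value, so sizeof(X.data_type()) measures the enum itself rather than the tensor's element width. A minimal sketch of the corrected accounting:

// Hedged sketch, not from the diff.
// sizeof(X.data_type()) == sizeof(TensorProto::DataType), typically 4 bytes,
// regardless of the actual dtype.
auto const& element_size_byte =
    DataTypeToTypeMeta(X.data_type()).itemsize();  // FLOAT16 -> 2, UINT8 -> 1, DOUBLE -> 8
c.bytes_read = nElemRead * element_size_byte;      // e.g. float16: nElemRead * 2, not * 4
c.bytes_written = nElemX * element_size_byte;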
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "caffe2/operators/batch_matmul_op.h"
|
||||
|
||||
#include "caffe2/core/operator_schema.h"
|
||||
#include "caffe2/core/types.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
|
|
@ -116,9 +117,13 @@ OpSchema::Cost CostInferenceForBatchMatMul(
|
|||
K = in[0].dims(ndims_A - 1);
|
||||
}
|
||||
|
||||
auto const& A_element_size_byte =
|
||||
DataTypeToTypeMeta(A.data_type()).itemsize();
|
||||
auto const& Y_element_size_byte =
|
||||
DataTypeToTypeMeta(Y.data_type()).itemsize();
|
||||
c.flops = 2 * nElemY * K;
|
||||
c.bytes_read = (nElemA + nElemB) * sizeof(A.data_type());
|
||||
c.bytes_written = nElemY * sizeof(Y.data_type());
|
||||
c.bytes_read = (nElemA + nElemB) * A_element_size_byte;
|
||||
c.bytes_written = nElemY * Y_element_size_byte;
|
||||
c.params_bytes = 0;
|
||||
return c;
|
||||
}
|
||||
|
|
@ -180,72 +185,76 @@ class GetBatchMatMulGradient : public GradientMakerBase {
|
|||
auto no_trans_arg = vector<Argument>();
|
||||
auto trans_a_arg = vector<Argument>{MakeArgument<int>("trans_a", 1)};
|
||||
auto trans_b_arg = vector<Argument>{MakeArgument<int>("trans_b", 1)};
|
||||
auto trans_both_arg = vector<Argument>{MakeArgument<int>("trans_a", 1),
|
||||
MakeArgument<int>("trans_b", 1)};
|
||||
auto trans_both_arg = vector<Argument>{
|
||||
MakeArgument<int>("trans_a", 1), MakeArgument<int>("trans_b", 1)};
|
||||
|
||||
if (trans_a) {
|
||||
if (trans_b) {
|
||||
// A'B':
|
||||
// dA = B'G', dB = G'A'
|
||||
return vector<OperatorDef>{CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{I(1), GO(0)},
|
||||
vector<string>{GI(0)},
|
||||
trans_both_arg),
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{GO(0), I(0)},
|
||||
vector<string>{GI(1)},
|
||||
trans_both_arg)};
|
||||
return vector<OperatorDef>{
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{I(1), GO(0)},
|
||||
vector<string>{GI(0)},
|
||||
trans_both_arg),
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{GO(0), I(0)},
|
||||
vector<string>{GI(1)},
|
||||
trans_both_arg)};
|
||||
} else {
|
||||
// A'B:
|
||||
// dA = BG', dB = AG
|
||||
return vector<OperatorDef>{CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{I(1), GO(0)},
|
||||
vector<string>{GI(0)},
|
||||
trans_b_arg),
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{I(0), GO(0)},
|
||||
vector<string>{GI(1)},
|
||||
no_trans_arg)};
|
||||
return vector<OperatorDef>{
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{I(1), GO(0)},
|
||||
vector<string>{GI(0)},
|
||||
trans_b_arg),
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{I(0), GO(0)},
|
||||
vector<string>{GI(1)},
|
||||
no_trans_arg)};
|
||||
}
|
||||
} else {
|
||||
if (trans_b) {
|
||||
// AB':
|
||||
// dA = GB, dB = G'A
|
||||
return vector<OperatorDef>{CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{GO(0), I(1)},
|
||||
vector<string>{GI(0)},
|
||||
no_trans_arg),
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{GO(0), I(0)},
|
||||
vector<string>{GI(1)},
|
||||
trans_a_arg)};
|
||||
return vector<OperatorDef>{
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{GO(0), I(1)},
|
||||
vector<string>{GI(0)},
|
||||
no_trans_arg),
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{GO(0), I(0)},
|
||||
vector<string>{GI(1)},
|
||||
trans_a_arg)};
|
||||
} else {
|
||||
// AB:
|
||||
// dA = GB', dB = A'G
|
||||
return vector<OperatorDef>{CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{GO(0), I(1)},
|
||||
vector<string>{GI(0)},
|
||||
trans_b_arg),
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{I(0), GO(0)},
|
||||
vector<string>{GI(1)},
|
||||
trans_a_arg)};
|
||||
return vector<OperatorDef>{
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{GO(0), I(1)},
|
||||
vector<string>{GI(0)},
|
||||
trans_b_arg),
|
||||
CreateOperatorDef(
|
||||
"BatchMatMul",
|
||||
"",
|
||||
vector<string>{I(0), GO(0)},
|
||||
vector<string>{GI(1)},
|
||||
trans_a_arg)};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -101,9 +101,12 @@ OpSchema::Cost CostInferenceForSplit(
|
|||
CAFFE_ENFORCE_GT(in.size(), 0);
|
||||
struct OpSchema::Cost cost;
|
||||
cost.flops = 0;
|
||||
auto input_bytes_count = nElemFromDim(in[0]) * sizeof(in[0].data_type());
|
||||
auto split_bytes_count =
|
||||
(in.size() == 1) ? 0 : nElemFromDim(in[1]) * sizeof(in[1].data_type());
|
||||
auto const& input_0_element_size_byte =
|
||||
DataTypeToTypeMeta(in[0].data_type()).itemsize();
|
||||
auto const& input_1_element_size_byte =
|
||||
(in.size() > 1) ? DataTypeToTypeMeta(in[1].data_type()).itemsize() : 0;
|
||||
auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte;
|
||||
auto split_bytes_count = nElemFromDim(in[1]) * input_1_element_size_byte;
|
||||
// There can be two input blobs:
|
||||
// (1) actual tensor to be split
|
||||
// (2) lengths of outputs along split axis
|
||||
|
|
@ -329,11 +332,13 @@ OpSchema::Cost CostInferenceForConcat(
|
|||
}
|
||||
auto split_info_bytes_count = in.size() * sizeof(int);
|
||||
|
||||
auto const& input_0_element_size_byte =
|
||||
DataTypeToTypeMeta(in[0].data_type()).itemsize();
|
||||
struct OpSchema::Cost cost;
|
||||
cost.flops = 0;
|
||||
cost.bytes_read = nElemRead * sizeof(in[0].data_type());
|
||||
cost.bytes_read = nElemRead * input_0_element_size_byte;
|
||||
cost.bytes_written =
|
||||
size * sizeof(in[0].data_type()) + split_info_bytes_count;
|
||||
size * input_0_element_size_byte + split_info_bytes_count;
|
||||
cost.params_bytes = 0;
|
||||
return cost;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/proto/caffe2_legacy.pb.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
|
|
@ -519,14 +520,20 @@ class ConvPoolOpBase : public Operator<Context> {
|
|||
uint64_t nElemW = nElemFromDim(W);
|
||||
uint64_t nElemBias = inputs.size() > 2 ? nElemFromDim(inputs[2]) : 0;
|
||||
|
||||
auto const& X_element_size_byte =
|
||||
DataTypeToTypeMeta(X.data_type()).itemsize();
|
||||
auto const& Y_element_size_byte =
|
||||
DataTypeToTypeMeta(Y.data_type()).itemsize();
|
||||
auto const& W_element_size_byte =
|
||||
DataTypeToTypeMeta(W.data_type()).itemsize();
|
||||
|
||||
// grouping is NOT properly handled yet
|
||||
c.flops = N * Y_t * Y_h * Y_w * kernel_t * kernel_w * kernel_h *
|
||||
in_channels * out_channels * 2;
|
||||
c.bytes_read = (nElemX + nElemW + nElemBias) * sizeof(X.data_type());
|
||||
c.bytes_written =
|
||||
N * out_channels * Y_t * Y_h * Y_w * sizeof(Y.data_type());
|
||||
c.bytes_read = (nElemX + nElemW + nElemBias) * X_element_size_byte;
|
||||
c.bytes_written = N * out_channels * Y_t * Y_h * Y_w * Y_element_size_byte;
|
||||
c.params_bytes = out_channels * in_channels * kernel_t * kernel_h *
|
||||
kernel_w * sizeof(W.data_type());
|
||||
kernel_w * W_element_size_byte;
|
||||
return c;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
#include "caffe2/operators/distance_op.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/utils/eigen_utils.h"
|
||||
#ifdef CAFFE2_USE_MKLDNN
|
||||
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
|
||||
|
|
@ -7,7 +8,7 @@
|
|||
|
||||
namespace caffe2 {
|
||||
|
||||
template<>
|
||||
template <>
|
||||
bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& Y = Input(1);
|
||||
|
|
@ -257,7 +258,9 @@ OpSchema::Cost CostInferenceForDotProduct(
|
|||
CAFFE_ENFORCE_EQ(out[0].dims().size(), 1);
|
||||
|
||||
struct OpSchema::Cost c = PointwiseCostInference<2>(def, in);
|
||||
c.bytes_written = out[0].dims(0) * sizeof(out[0].data_type());
|
||||
auto const& out_0_element_size_byte =
|
||||
DataTypeToTypeMeta(out[0].data_type()).itemsize();
|
||||
c.bytes_written = out[0].dims(0) * out_0_element_size_byte;
|
||||
c.params_bytes = 0;
|
||||
return c;
|
||||
}
|
||||
|
|
@ -379,10 +382,12 @@ bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
|
|||
}
|
||||
|
||||
// L2
|
||||
REGISTER_CPU_OPERATOR(SquaredL2Distance,
|
||||
SquaredL2DistanceOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
|
||||
SquaredL2DistanceGradientOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SquaredL2Distance,
|
||||
SquaredL2DistanceOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SquaredL2DistanceGradient,
|
||||
SquaredL2DistanceGradientOp<float, CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(SquaredL2Distance)
|
||||
.NumInputs(2)
|
||||
|
|
@ -402,7 +407,8 @@ class GetSquaredL2DistanceGradient : public GradientMakerBase {
|
|||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
"SquaredL2DistanceGradient", "",
|
||||
"SquaredL2DistanceGradient",
|
||||
"",
|
||||
vector<string>{I(0), I(1), GO(0)},
|
||||
vector<string>{GI(0), GI(1)});
|
||||
}
|
||||
|
|
@ -762,9 +768,9 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
|
|||
replicate = GetArgument(Def(), "replicate").i();
|
||||
}
|
||||
|
||||
const auto dot_arg =
|
||||
vector<Argument>{MakeArgument<float>("pad_value", pad_value),
|
||||
MakeArgument<bool>("replicate", replicate)};
|
||||
const auto dot_arg = vector<Argument>{
|
||||
MakeArgument<float>("pad_value", pad_value),
|
||||
MakeArgument<bool>("replicate", replicate)};
|
||||
|
||||
return SingleGradientDef(
|
||||
"DotProductWithPaddingGradient",
|
||||
|
|
@ -775,4 +781,4 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
|
|||
}
|
||||
};
|
||||
REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient);
|
||||
} // namespace caffe2
|
||||
} // namespace caffe2
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
#include "caffe2/operators/fc_inference.h"
|
||||
#include "caffe2/core/types.h"
|
||||
|
||||
namespace caffe2 {
|
||||
std::vector<TensorShape> FCShapeInference(
|
||||
|
|
@ -51,11 +52,12 @@ OpSchema::Cost CostInferenceForFC(
|
|||
? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
|
||||
: size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));
|
||||
|
||||
const auto& X = in[0];
|
||||
auto const& X_element_size_byte =
|
||||
DataTypeToTypeMeta(in[0].data_type()).itemsize();
|
||||
c.flops = M * N * (2 * K + 1);
|
||||
c.bytes_read = (K * (M + N) + N) * sizeof(X.data_type());
|
||||
c.bytes_written = M * N * sizeof(X.data_type());
|
||||
c.params_bytes = (K * N + N) * sizeof(X.data_type());
|
||||
c.bytes_read = (K * (M + N) + N) * X_element_size_byte;
|
||||
c.bytes_written = M * N * X_element_size_byte;
|
||||
c.params_bytes = (K * N + N) * X_element_size_byte;
|
||||
return c;
|
||||
}
|
||||
|
||||
|
|
@ -94,7 +96,11 @@ OpSchema::Cost CostInferenceForFCGradient(
|
|||
|
||||
CAFFE_ENFORCE_LT(0, out.size());
|
||||
const TensorShape dW = out[0];
|
||||
auto const& dW_element_size_byte =
|
||||
DataTypeToTypeMeta(dW.data_type()).itemsize();
|
||||
const TensorShape db = out[1];
|
||||
auto const& db_element_size_byte =
|
||||
DataTypeToTypeMeta(db.data_type()).itemsize();
|
||||
|
||||
auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
|
||||
const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
|
||||
|
|
@ -111,15 +117,17 @@ OpSchema::Cost CostInferenceForFCGradient(
|
|||
uint64_t size_db = nElemFromDim(db);
|
||||
|
||||
c.flops = M * N * (2 * K + 1);
|
||||
c.bytes_written = (size_dW + size_db) * sizeof(float);
|
||||
c.bytes_written =
|
||||
size_dW * dW_element_size_byte + size_db * db_element_size_byte;
|
||||
c.params_bytes = (K * N + N) * sizeof(float);
|
||||
|
||||
if (out.size() == 3) {
|
||||
const TensorShape dX = out[2];
|
||||
uint64_t size_dX = nElemFromDim(dX);
|
||||
|
||||
auto const& dX_element_size_byte =
|
||||
DataTypeToTypeMeta(dX.data_type()).itemsize();
|
||||
c.flops += 2 * M * N * K;
|
||||
c.bytes_written += size_dX * sizeof(float);
|
||||
c.bytes_written += size_dX * dX_element_size_byte;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
#include "caffe2/core/types.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
|
|
@ -78,12 +79,21 @@ OpSchema::Cost CostInferenceForBatchOneHot(
|
|||
const auto& length = in[1];
|
||||
const auto& values = in[2];
|
||||
|
||||
uint64_t nBytesData = nElemFromDim(data) * sizeof(data.data_type());
|
||||
uint64_t nBytesLength = nElemFromDim(length) * sizeof(length.data_type());
|
||||
uint64_t nBytesValues = nElemFromDim(values) * sizeof(values.data_type());
|
||||
auto const& data_element_size_byte =
|
||||
DataTypeToTypeMeta(data.data_type()).itemsize();
|
||||
auto const& length_element_size_byte =
|
||||
DataTypeToTypeMeta(length.data_type()).itemsize();
|
||||
auto const& values_element_size_byte =
|
||||
DataTypeToTypeMeta(values.data_type()).itemsize();
|
||||
auto const& output_element_size_byte =
|
||||
DataTypeToTypeMeta(output.data_type()).itemsize();
|
||||
|
||||
uint64_t nBytesData = nElemFromDim(data) * data_element_size_byte;
|
||||
uint64_t nBytesLength = nElemFromDim(length) * length_element_size_byte;
|
||||
uint64_t nBytesValues = nElemFromDim(values) * values_element_size_byte;
|
||||
c.flops = 0;
|
||||
c.bytes_read = nBytesData + nBytesLength + nBytesValues;
|
||||
c.bytes_written = nElemFromDim(output) * sizeof(output.data_type());
|
||||
c.bytes_written = nElemFromDim(output) * output_element_size_byte;
|
||||
c.params_bytes = 0;
|
||||
return c;
|
||||
}
|
||||
|
|
@ -145,15 +155,15 @@ bool BatchBucketOneHotOp<CPUContext>::RunOnDevice() {
|
|||
for (int64_t j = 0; j < D; j++) {
|
||||
// here we assume the boundary values for each feature are sorted
|
||||
int64_t lower_bucket_idx = std::lower_bound(
|
||||
boundaries_offset,
|
||||
boundaries_offset + lens_data[j],
|
||||
input_data[pos]) -
|
||||
boundaries_offset,
|
||||
boundaries_offset + lens_data[j],
|
||||
input_data[pos]) -
|
||||
boundaries_offset;
|
||||
|
||||
int64_t upper_bucket_idx = std::upper_bound(
|
||||
boundaries_offset,
|
||||
boundaries_offset + lens_data[j],
|
||||
input_data[pos]) -
|
||||
boundaries_offset,
|
||||
boundaries_offset + lens_data[j],
|
||||
input_data[pos]) -
|
||||
boundaries_offset;
|
||||
|
||||
int64_t bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "caffe2/operators/utility_ops.h"
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/utils/eigen_utils.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
|
@ -34,9 +35,11 @@ OpSchema::Cost CostInferenceForWeightedSum(
|
|||
const auto& nElem = nElemFromDim(X0);
|
||||
const auto& nInputs = in.size();
|
||||
c.flops = (nInputs - 1) * nElem;
|
||||
c.bytes_read = (nInputs / 2) * (nElem + 1) * sizeof(X0.data_type());
|
||||
c.bytes_written = nElem * sizeof(X0.data_type());
|
||||
c.params_bytes = (nInputs / 2) * sizeof(X0.data_type());
|
||||
auto const& X0_element_size_byte =
|
||||
DataTypeToTypeMeta(X0.data_type()).itemsize();
|
||||
c.bytes_read = (nInputs / 2) * (nElem + 1) * X0_element_size_byte;
|
||||
c.bytes_written = nElem * X0_element_size_byte;
|
||||
c.params_bytes = (nInputs / 2) * X0_element_size_byte;
|
||||
return c;
|
||||
}
|
||||
|
||||
|
|
@ -48,9 +51,7 @@ REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
|
|||
REGISTER_CPU_OPERATOR(SumInt, SumOp<CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp<CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
ScatterWeightedSum,
|
||||
ScatterWeightedSumOp<CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(ScatterWeightedSum, ScatterWeightedSumOp<CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(Scatter, ScatterOp<CPUContext>);
|
||||
|
||||
|
|
|
|||
|
|
@ -7,33 +7,39 @@ from caffe2.python.test_util import TestCase
|
|||
|
||||
class TestConcatOpCost(TestCase):
|
||||
def test_columnwise_concat(self):
|
||||
workspace.ResetWorkspace()
|
||||
workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
|
||||
workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=np.int32))
|
||||
concat_op = core.CreateOperator(
|
||||
"Concat",
|
||||
["input_1", "input_2"],
|
||||
["output", "split_info"],
|
||||
)
|
||||
workspace.RunOperatorOnce(concat_op)
|
||||
def _test_columnwise_concat_for_type(dtype):
|
||||
workspace.ResetWorkspace()
|
||||
workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype))
|
||||
workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=dtype))
|
||||
concat_op = core.CreateOperator(
|
||||
"Concat",
|
||||
["input_1", "input_2"],
|
||||
["output", "split_info"],
|
||||
)
|
||||
workspace.RunOperatorOnce(concat_op)
|
||||
|
||||
output = workspace.FetchBlob("output")
|
||||
self.assertTupleEqual(output.shape, (2, 4))
|
||||
np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
|
||||
output = workspace.FetchBlob("output")
|
||||
self.assertTupleEqual(output.shape, (2, 4))
|
||||
np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
|
||||
|
||||
flops, bytes_written, bytes_read = workspace.GetOperatorCost(
|
||||
concat_op, concat_op.input
|
||||
)
|
||||
flops, bytes_written, bytes_read = workspace.GetOperatorCost(
|
||||
concat_op, concat_op.input
|
||||
)
|
||||
|
||||
self.assertEqual(flops, 0)
|
||||
self.assertEqual(
|
||||
bytes_read,
|
||||
sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
|
||||
)
|
||||
self.assertEqual(
|
||||
bytes_written,
|
||||
sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
|
||||
)
|
||||
self.assertEqual(flops, 0)
|
||||
self.assertEqual(
|
||||
bytes_read,
|
||||
sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
|
||||
)
|
||||
self.assertEqual(
|
||||
bytes_written,
|
||||
sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
|
||||
)
|
||||
|
||||
[
|
||||
_test_columnwise_concat_for_type(t)
|
||||
for t in [np.int64, np.float, np.half, np.int8]
|
||||
]
|
||||
|
||||
def test_split_then_concat(self):
|
||||
workspace.ResetWorkspace()
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ class TestWorkspace(unittest.TestCase):
|
|||
self.assertTupleEqual(
|
||||
op_cost,
|
||||
namedtuple("Cost", ["flops", "bytes_written", "bytes_read"])(
|
||||
1152, 256, 2084
|
||||
1152, 256, 4168
|
||||
),
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ class TORCH_API IStreamAdapter final : public ReadAdapterInterface {
|
|||
size_t size() const override;
|
||||
size_t read(uint64_t pos, void* buf, size_t n, const char* what = "")
|
||||
const override;
|
||||
~IStreamAdapter();
|
||||
~IStreamAdapter() override;
|
||||
|
||||
private:
|
||||
std::istream* istream_;
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
#include "adagrad_op.h"
|
||||
#include "caffe2/core/types.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
|
|
@ -23,22 +24,30 @@ static OpSchema::Cost CostInferenceForAdagrad(
|
|||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
|
||||
c.flops = grad_size * 10;
|
||||
|
||||
auto const& moment_element_size_byte =
|
||||
DataTypeToTypeMeta(moment.data_type()).itemsize();
|
||||
auto const& param_element_size_byte =
|
||||
DataTypeToTypeMeta(param.data_type()).itemsize();
|
||||
auto const& grad_element_size_byte =
|
||||
DataTypeToTypeMeta(grad.data_type()).itemsize();
|
||||
auto const& lr_element_size_byte =
|
||||
DataTypeToTypeMeta(lr.data_type()).itemsize();
|
||||
uint64_t bytes_written =
|
||||
grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
|
||||
grad_size * (param_element_size_byte + moment_element_size_byte);
|
||||
|
||||
if (output_size == 3) {
|
||||
// also need to output effective learning rate in this case
|
||||
// assume it's the same data type as lr
|
||||
bytes_written += grad_size * sizeof(lr.data_type());
|
||||
bytes_written += grad_size * lr_element_size_byte;
|
||||
} else if (output_size == 4) {
|
||||
// also need to output effective learning rate and updates in this case
|
||||
// assume update is the same data type as param
|
||||
bytes_written +=
|
||||
grad_size * (sizeof(lr.data_type()) + sizeof(param.data_type()));
|
||||
grad_size * (lr_element_size_byte + param_element_size_byte);
|
||||
}
|
||||
c.bytes_written = bytes_written;
|
||||
c.bytes_read = c.bytes_written +
|
||||
grad_size * (sizeof(grad.data_type()) + sizeof(lr.data_type()));
|
||||
grad_size * (grad_element_size_byte + lr_element_size_byte);
|
||||
|
||||
return c;
|
||||
}
|
||||
|
|
@ -102,10 +111,18 @@ static OpSchema::Cost CostInferenceForSparseAdagrad(
|
|||
// (optimistically count sqrt as one flop).
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
|
||||
c.flops = grad_size * 7;
|
||||
auto const& param_element_size_byte =
|
||||
DataTypeToTypeMeta(param.data_type()).itemsize();
|
||||
auto const& moment_element_size_byte =
|
||||
DataTypeToTypeMeta(moment.data_type()).itemsize();
|
||||
c.bytes_written =
|
||||
grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
|
||||
c.bytes_read = c.bytes_written + grad_size * sizeof(grad.data_type()) +
|
||||
n * sizeof(indices.data_type());
|
||||
grad_size * (param_element_size_byte + moment_element_size_byte);
|
||||
auto const& grad_element_size_byte =
|
||||
DataTypeToTypeMeta(grad.data_type()).itemsize();
|
||||
auto const& indices_element_size_byte =
|
||||
DataTypeToTypeMeta(indices.data_type()).itemsize();
|
||||
c.bytes_read = c.bytes_written + grad_size * grad_element_size_byte +
|
||||
n * indices_element_size_byte;
|
||||
|
||||
return c;
|
||||
}
|
||||
|
|
@ -153,6 +170,16 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
|
|||
OpSchema::Cost c;
|
||||
|
||||
if (n > 0) {
|
||||
auto const& param_element_size_byte =
|
||||
DataTypeToTypeMeta(param.data_type()).itemsize();
|
||||
auto const& moment_element_size_byte =
|
||||
DataTypeToTypeMeta(moment.data_type()).itemsize();
|
||||
auto const& grad_element_size_byte =
|
||||
DataTypeToTypeMeta(grad.data_type()).itemsize();
|
||||
auto const& indices_element_size_byte =
|
||||
DataTypeToTypeMeta(indices.data_type()).itemsize();
|
||||
auto const& lr_element_size_byte =
|
||||
DataTypeToTypeMeta(lr.data_type()).itemsize();
|
||||
auto block_size = grad_size / n;
|
||||
if (block_size == 1) {
|
||||
// +2: applying weight decay and add to grads
|
||||
|
|
@ -161,22 +188,22 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
|
|||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
|
||||
c.flops = n * 9;
|
||||
c.bytes_written =
|
||||
n * (sizeof(param.data_type()) + sizeof(moment.data_type()));
|
||||
n * (param_element_size_byte + moment_element_size_byte);
|
||||
c.bytes_read = c.bytes_written +
|
||||
n *
|
||||
(sizeof(grad.data_type()) + sizeof(indices.data_type()) +
|
||||
sizeof(lr.data_type()));
|
||||
(grad_element_size_byte + indices_element_size_byte +
|
||||
lr_element_size_byte);
|
||||
} else {
|
||||
// 5 per block (not counting index transforms)
|
||||
// 8 for each value of a block
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
|
||||
c.flops = n * (5 + (block_size * 8));
|
||||
c.bytes_written =
|
||||
n * sizeof(moment.data_type()) + n * block_size * (param.data_type());
|
||||
c.bytes_written = n * moment_element_size_byte +
|
||||
n * block_size * param_element_size_byte;
|
||||
|
||||
c.bytes_read = c.bytes_written + n * (sizeof(lr.data_type())) +
|
||||
c.bytes_read = c.bytes_written + n * lr_element_size_byte +
|
||||
2 * n * block_size *
|
||||
(sizeof(grad.data_type()) + sizeof(param.data_type()));
|
||||
(grad_element_size_byte + param_element_size_byte);
|
||||
}
|
||||
}
|
||||
return c;
|
||||
|
|
|
|||
|
|
@ -131,6 +131,7 @@ function(caffe2_print_configuration_summary)
|
|||
endif()
|
||||
message(STATUS " USE_METAL : ${USE_METAL}")
|
||||
message(STATUS " USE_PYTORCH_METAL : ${USE_PYTORCH_METAL}")
|
||||
message(STATUS " USE_PYTORCH_METAL_EXPORT : ${USE_PYTORCH_METAL_EXPORT}")
|
||||
message(STATUS " USE_FFTW : ${USE_FFTW}")
|
||||
message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}")
|
||||
message(STATUS " USE_MKLDNN : ${USE_MKLDNN}")
|
||||
|
|
|
|||
|
|
@ -389,6 +389,7 @@ in :func:`torch.nn.utils.parameterize.register_parametrization`.
|
|||
:toctree: generated
|
||||
:nosignatures:
|
||||
|
||||
parametrizations.orthogonal
|
||||
parametrizations.spectral_norm
|
||||
|
||||
Utility functions to parametrize Tensors on existing Modules.
|
||||
|
|
@ -396,7 +397,7 @@ Note that these functions can be used to parametrize a given Parameter
|
|||
or Buffer given a specific function that maps from an input space to the
|
||||
parametrized space. They are not parameterizations that would transform
|
||||
an object into a parameter. See the
|
||||
`Parametrizations <https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html>`__ tutorial
|
||||
`Parametrizations tutorial <https://pytorch.org/tutorials/intermediate/parametrizations.html>`_
|
||||
for more information on how to implement your own parametrizations.
|
||||
|
||||
.. autosummary::
|
||||
|
|
|
|||
|
|
@ -117,7 +117,7 @@ multiple modules:
|
|||
|
||||
Note that :class:`~torch.nn.Sequential` automatically feeds the output of the first ``MyLinear`` module as input
|
||||
into the :class:`~torch.nn.ReLU`, and the output of that as input into the second ``MyLinear`` module. As
|
||||
shown, it is limited to in-order chaining of modules.
|
||||
shown, it is limited to in-order chaining of modules with a single input and output.
|
||||
|
||||
In general, it is recommended to define a custom module for anything beyond the simplest use cases, as this gives
|
||||
full flexibility on how submodules are used for a module's computation.
|
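For illustration, a minimal sketch of the in-order chaining described above (``MyLinear`` is the custom module defined earlier in this document; the snippet itself is not part of the diff):

.. code-block:: python

    net = nn.Sequential(
        MyLinear(4, 3),
        nn.ReLU(),
        MyLinear(3, 1)
    )
    sample_input = torch.randn(4)
    output = net(sample_input)  # output of each module feeds the next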
||||
|
|
@ -258,16 +258,32 @@ It's also easy to move all parameters to a different device or change their prec
|
|||
dynamic_net(torch.randn(5, device='cuda', dtype=torch.float64))
|
||||
: tensor([6.5166], device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
|
||||
|
||||
These examples show how elaborate neural networks can be formed through module composition. To allow for
|
||||
quick and easy construction of neural networks with minimal boilerplate, PyTorch provides a large library of
|
||||
performant modules within the :mod:`torch.nn` namespace that perform computation commonly found within neural
|
||||
networks, including pooling, convolutions, loss functions, etc.
|
||||
More generally, an arbitrary function can be applied to a module and its submodules recursively by
|
||||
using the :func:`~torch.nn.Module.apply` function. For example, to apply custom initialization to parameters
|
||||
of a module and its submodules:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Define a function to initialize Linear weights.
|
||||
# Note that no_grad() is used here to avoid tracking this computation in the autograd graph.
|
||||
@torch.no_grad()
|
||||
def init_weights(m):
|
||||
if isinstance(m, nn.Linear):
|
||||
nn.init.xavier_normal_(m.weight)
|
||||
m.bias.fill_(0.0)
|
||||
|
||||
# Apply the function recursively on the module and its submodules.
|
||||
dynamic_net.apply(init_weights)
|
||||
|
||||
These examples show how elaborate neural networks can be formed through module composition and conveniently
|
||||
manipulated. To allow for quick and easy construction of neural networks with minimal boilerplate, PyTorch
|
||||
provides a large library of performant modules within the :mod:`torch.nn` namespace that perform common neural
|
||||
network operations like pooling, convolutions, loss functions, etc.
|
||||
|
||||
In the next section, we give a full example of training a neural network.
|
||||
|
||||
For more information, check out:
|
||||
|
||||
* Recursively :func:`~torch.nn.Module.apply` a function to a module and its submodules
|
||||
* Library of PyTorch-provided modules: `torch.nn <https://pytorch.org/docs/stable/nn.html>`_
|
||||
* Defining neural net modules: https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_module.html
|
||||
|
||||
|
|
@ -295,6 +311,12 @@ Optimizers from :mod:`torch.optim`:
|
|||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
# After training, switch the module to eval mode to do inference, compute performance metrics, etc.
|
||||
# (see discussion below for a description of training and evaluation modes)
|
||||
...
|
||||
net.eval()
|
||||
...
|
||||
|
||||
In this simplified example, the network learns to simply output zero, as any non-zero output is "penalized" according
|
||||
to its absolute value by employing :func:`torch.abs` as a loss function. While this is not a very interesting task, the
|
||||
key parts of training are present:
|
||||
|
|
@ -321,6 +343,38 @@ value of ``l1``\ 's ``weight`` parameter shows that its values are now much clos
|
|||
[ 0.0030],
|
||||
[-0.0008]], requires_grad=True)
|
||||
|
||||
Note that the above process is done entirely while the network module is in "training mode". Modules default to
|
||||
training mode and can be switched between training and evaluation modes using :func:`~torch.nn.Module.train` and
|
||||
:func:`~torch.nn.Module.eval`. They can behave differently depending on which mode they are in. For example, the
|
||||
:class:`~torch.nn.BatchNorm2d` module maintains a running mean and variance during training that are not updated
|
||||
when the module is in evaluation mode. In general, modules should be in training mode during training
|
||||
and only switched to evaluation mode for inference or evaluation. Below is an example of a custom module
|
||||
that behaves differently between the two modes:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class ModalModule(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def forward(self, x):
|
||||
if self.training:
|
||||
# Add a constant only in training mode.
|
||||
return x + 1.
|
||||
else:
|
||||
return x
|
||||
|
||||
|
||||
m = ModalModule()
|
||||
x = torch.randn(4)
|
||||
|
||||
print('training mode output: {}'.format(m(x)))
|
||||
: tensor([1.6614, 1.2669, 1.0617, 1.6213, 0.5481])
|
||||
|
||||
m.eval()
|
||||
print('evaluation mode output: {}'.format(m(x)))
|
||||
: tensor([ 0.6614, 0.2669, 0.0617, 0.6213, -0.4519])
|
||||
|
||||
Training neural networks can often be tricky. For more information, check out:
|
||||
|
||||
* Using Optimizers: https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_optim.html.
|
||||
|
|
@ -409,12 +463,127 @@ Both persistent and non-persistent buffers are affected by model-wide device / d
|
|||
Buffers of a module can be iterated over using :func:`~torch.nn.Module.buffers` or
|
||||
:func:`~torch.nn.Module.named_buffers`.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
for buffer in m.named_buffers():
|
||||
print(buffer)
|
||||
|
||||
The following class demonstrates the various ways of registering parameters and buffers within a module:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class StatefulModule(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# Setting a nn.Parameter as an attribute of the module automatically registers the tensor
|
||||
# as a parameter of the module.
|
||||
self.param1 = nn.Parameter(torch.randn(2))
|
||||
|
||||
# Alternative string-based way to register a parameter.
|
||||
self.register_parameter('param2', nn.Parameter(torch.randn(3)))
|
||||
|
||||
# Reserves the "param3" attribute as a parameter, preventing it from being set to anything
|
||||
# except a parameter. "None" entries like this will not be present in the module's state_dict.
|
||||
self.register_parameter('param3', None)
|
||||
|
||||
# Registers a list of parameters.
|
||||
self.param_list = nn.ParameterList([nn.Parameter(torch.randn(2)) for i in range(3)])
|
||||
|
||||
# Registers a dictionary of parameters.
|
||||
self.param_dict = nn.ParameterDict({
|
||||
'foo': nn.Parameter(torch.randn(3)),
|
||||
'bar': nn.Parameter(torch.randn(4))
|
||||
})
|
||||
|
||||
# Registers a persistent buffer (one that appears in the module's state_dict).
|
||||
self.register_buffer('buffer1', torch.randn(4), persistent=True)
|
||||
|
||||
# Registers a non-persistent buffer (one that does not appear in the module's state_dict).
|
||||
self.register_buffer('buffer2', torch.randn(5), persistent=False)
|
||||
|
||||
# Reserves the "buffer3" attribute as a buffer, preventing it from being set to anything
|
||||
# except a buffer. "None" entries like this will not be present in the module's state_dict.
|
||||
self.register_buffer('buffer3', None)
|
||||
|
||||
# Adding a submodule registers its parameters as parameters of the module.
|
||||
self.linear = nn.Linear(2, 3)
|
||||
|
||||
m = StatefulModule()
|
||||
|
||||
# Save and load state_dict.
|
||||
torch.save(m.state_dict(), 'state.pt')
|
||||
m_loaded = StatefulModule()
|
||||
m_loaded.load_state_dict(torch.load('state.pt'))
|
||||
|
||||
# Note that non-persistent buffer "buffer2" and reserved attributes "param3" and "buffer3" do
|
||||
# not appear in the state_dict.
|
||||
print(m_loaded.state_dict())
|
||||
: OrderedDict([('param1', tensor([-0.0322, 0.9066])),
|
||||
('param2', tensor([-0.4472, 0.1409, 0.4852])),
|
||||
('buffer1', tensor([ 0.6949, -0.1944, 1.2911, -2.1044])),
|
||||
('param_list.0', tensor([ 0.4202, -0.1953])),
|
||||
('param_list.1', tensor([ 1.5299, -0.8747])),
|
||||
('param_list.2', tensor([-1.6289, 1.4898])),
|
||||
('param_dict.bar', tensor([-0.6434, 1.5187, 0.0346, -0.4077])),
|
||||
('param_dict.foo', tensor([-0.0845, -1.4324, 0.7022])),
|
||||
('linear.weight', tensor([[-0.3915, -0.6176],
|
||||
[ 0.6062, -0.5992],
|
||||
[ 0.4452, -0.2843]])),
|
||||
('linear.bias', tensor([-0.3710, -0.0795, -0.3947]))])
|
||||
|
||||
For more information, check out:
|
||||
|
||||
* Saving and loading: https://pytorch.org/tutorials/beginner/saving_loading_models.html
|
||||
* Serialization semantics: https://pytorch.org/docs/master/notes/serialization.html
|
||||
* What is a state dict? https://pytorch.org/tutorials/recipes/recipes/what_is_state_dict.html
|
||||
|
||||
Module Initialization
|
||||
---------------------
|
||||
|
||||
By default, parameters and floating-point buffers for modules provided by :mod:`torch.nn` are initialized during
|
||||
module instantiation as 32-bit floating point values on the CPU using an initialization scheme determined to
|
||||
perform well historically for the module type. For certain use cases, it may be desired to initialize with a different
|
||||
dtype, device (e.g. GPU), or initialization technique.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Initialize module directly onto GPU.
|
||||
m = nn.Linear(5, 3, device='cuda')
|
||||
|
||||
# Initialize module with 16-bit floating point parameters.
|
||||
m = nn.Linear(5, 3, dtype=torch.half)
|
||||
|
||||
# Skip default parameter initialization and perform custom (e.g. orthogonal) initialization.
|
||||
m = torch.nn.utils.skip_init(nn.Linear, 5, 3)
|
||||
nn.init.orthogonal_(m.weight)
|
||||
|
||||
Note that the device and dtype options demonstrated above also apply to any floating-point buffers registered
|
||||
for the module:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
m = nn.BatchNorm2d(3, dtype=torch.half)
|
||||
print(m.running_mean)
|
||||
: tensor([0., 0., 0.], dtype=torch.float16)
|
||||
|
||||
While module writers can use any device or dtype to initialize parameters in their custom modules, good practice is
|
||||
to use ``dtype=torch.float`` and ``device='cpu'`` by default as well. Optionally, you can provide full flexibility
|
||||
in these areas for your custom module by conforming to the convention demonstrated above that all
|
||||
:mod:`torch.nn` modules follow (a minimal sketch follows the list below):
|
||||
|
||||
* Provide a ``device`` constructor kwarg that applies to any parameters / buffers registered by the module.
|
||||
* Provide a ``dtype`` constructor kwarg that applies to any parameters / floating-point buffers registered by
|
||||
the module.
|
||||
* Only use initialization functions (i.e. functions from :mod:`torch.nn.init`) on parameters and buffers within the
|
||||
module's constructor. Note that this is only required to use :func:`~torch.nn.utils.skip_init`; see
|
||||
`this page <https://pytorch.org/tutorials/prototype/skip_param_init.html#updating-modules-to-support-skipping-initialization>`_ for an explanation.
|
||||
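Below is a minimal sketch of a custom module that follows these conventions. It is not from the diff; ``ScaledLinear`` and its buffer are hypothetical.

.. code-block:: python

    class ScaledLinear(nn.Module):
        def __init__(self, in_features, out_features, device=None, dtype=None):
            super().__init__()
            factory_kwargs = {'device': device, 'dtype': dtype}
            # Forward device / dtype to every parameter and buffer registered here.
            self.weight = nn.Parameter(torch.empty(out_features, in_features, **factory_kwargs))
            self.register_buffer('scale', torch.ones(1, **factory_kwargs))
            self.reset_parameters()

        @torch.no_grad()
        def reset_parameters(self):
            # Only torch.nn.init functions touch the parameters, which keeps the
            # module compatible with torch.nn.utils.skip_init.
            nn.init.kaiming_uniform_(self.weight)

        def forward(self, x):
            return nn.functional.linear(x, self.weight) * self.scale

    # Usage mirrors the torch.nn examples above.
    m = ScaledLinear(5, 3, dtype=torch.float64)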
|
||||
For more information, check out:
|
||||
|
||||
* Skipping module parameter initialization: https://pytorch.org/tutorials/prototype/skip_param_init.html
|
||||
|
||||
Module Hooks
|
||||
------------
|
||||
|
||||
|
|
@ -443,16 +612,137 @@ All hooks allow the user to return an updated value that will be used throughout
|
|||
Thus, these hooks can be used to either execute arbitrary code along the regular module forward/backward or
|
||||
modify some inputs/outputs without having to change the module's ``forward()`` function.
|
||||
|
||||
Below is an example demonstrating usage of forward and backward hooks:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
torch.manual_seed(1)
|
||||
|
||||
def forward_pre_hook(m, inputs):
|
||||
# Allows for examination and modification of the input before the forward pass.
|
||||
# Note that inputs are always wrapped in a tuple.
|
||||
input = inputs[0]
|
||||
return input + 1.
|
||||
|
||||
def forward_hook(m, inputs, output):
|
||||
# Allows for examination of inputs / outputs and modification of the outputs
|
||||
# after the forward pass. Note that inputs are always wrapped in a tuple while outputs
|
||||
# are passed as-is.
|
||||
|
||||
# Residual computation a la ResNet.
|
||||
return output + inputs[0]
|
||||
|
||||
def backward_hook(m, grad_inputs, grad_outputs):
|
||||
# Allows for examination of grad_inputs / grad_outputs and modification of
|
||||
# grad_inputs used in the rest of the backwards pass. Note that grad_inputs and
|
||||
# grad_outputs are always wrapped in tuples.
|
||||
new_grad_inputs = [torch.ones_like(gi) * 42. for gi in grad_inputs]
|
||||
return new_grad_inputs
|
||||
|
||||
# Create sample module & input.
|
||||
m = nn.Linear(3, 3)
|
||||
x = torch.randn(2, 3, requires_grad=True)
|
||||
|
||||
# ==== Demonstrate forward hooks. ====
|
||||
# Run input through module before and after adding hooks.
|
||||
print('output with no forward hooks: {}'.format(m(x)))
|
||||
: output with no forward hooks: tensor([[-0.5059, -0.8158, 0.2390],
|
||||
[-0.0043, 0.4724, -0.1714]], grad_fn=<AddmmBackward>)
|
||||
|
||||
# Note that the modified input results in a different output.
|
||||
forward_pre_hook_handle = m.register_forward_pre_hook(forward_pre_hook)
|
||||
print('output with forward pre hook: {}'.format(m(x)))
|
||||
: output with forward pre hook: tensor([[-0.5752, -0.7421, 0.4942],
|
||||
[-0.0736, 0.5461, 0.0838]], grad_fn=<AddmmBackward>)
|
||||
|
||||
# Note the modified output.
|
||||
forward_hook_handle = m.register_forward_hook(forward_hook)
|
||||
print('output with both forward hooks: {}'.format(m(x)))
|
||||
: output with both forward hooks: tensor([[-1.0980, 0.6396, 0.4666],
|
||||
[ 0.3634, 0.6538, 1.0256]], grad_fn=<AddBackward0>)
|
||||
|
||||
# Remove hooks; note that the output here matches the output before adding hooks.
|
||||
forward_pre_hook_handle.remove()
|
||||
forward_hook_handle.remove()
|
||||
print('output after removing forward hooks: {}'.format(m(x)))
|
||||
: output after removing forward hooks: tensor([[-0.5059, -0.8158, 0.2390],
|
||||
[-0.0043, 0.4724, -0.1714]], grad_fn=<AddmmBackward>)
|
||||
|
||||
# ==== Demonstrate backward hooks. ====
|
||||
m(x).sum().backward()
|
||||
print('x.grad with no backwards hook: {}'.format(x.grad))
|
||||
: x.grad with no backwards hook: tensor([[ 0.4497, -0.5046, 0.3146],
|
||||
[ 0.4497, -0.5046, 0.3146]])
|
||||
|
||||
# Clear gradients before running backward pass again.
|
||||
m.zero_grad()
|
||||
x.grad.zero_()
|
||||
|
||||
m.register_full_backward_hook(backward_hook)
|
||||
m(x).sum().backward()
|
||||
print('x.grad with backwards hook: {}'.format(x.grad))
|
||||
: x.grad with backwards hook: tensor([[42., 42., 42.],
|
||||
[42., 42., 42.]])
|
||||
|
||||
Advanced Features
|
||||
-----------------
|
||||
|
||||
PyTorch also provides several more advanced features that are designed to work with modules. All these functionalities
|
||||
are "inherited" when writing a new module. In-depth discussion of these features can be found in the links below.
|
||||
are available for custom-written modules, with the small caveat that certain features may require modules to conform
|
||||
to particular constraints in order to be supported. In-depth discussion of these features and the corresponding
|
||||
requirements can be found in the links below.
|
||||
|
||||
For more information, check out:
|
||||
Distributed Training
|
||||
********************
|
||||
|
||||
* Profiling: https://pytorch.org/tutorials/beginner/profiler.html
|
||||
* Pruning: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html
|
||||
* Quantization: https://pytorch.org/tutorials/recipes/quantization.html
|
||||
* Exporting modules to TorchScript (e.g. for usage from C++):
|
||||
https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html
|
||||
Various methods for distributed training exist within PyTorch, both for scaling up training using multiple GPUs
|
||||
and for training across multiple machines. Check out the
|
||||
`distributed training overview page <https://pytorch.org/tutorials/beginner/dist_overview.html>`_ for
|
||||
detailed information on how to utilize these.
|
||||
|
||||
Profiling Performance
|
||||
*********************
|
||||
|
||||
The `PyTorch Profiler <https://pytorch.org/tutorials/beginner/profiler.html>`_ can be useful for identifying
|
||||
performance bottlenecks within your models. It measures and outputs performance characteristics for
|
||||
both memory usage and time spent.
|
||||
|
||||
Improving Performance with Quantization
|
||||
***************************************
|
||||
|
||||
Applying quantization techniques to modules can improve performance and memory usage by utilizing lower
|
||||
bitwidths than floating-point precision. Check out the various PyTorch-provided mechanisms for quantization
|
||||
`here <https://pytorch.org/docs/stable/quantization.html>`_.
|
||||
|
||||
Improving Memory Usage with Pruning
|
||||
***********************************
|
||||
|
||||
Large deep learning models are often over-parametrized, resulting in high memory usage. To combat this, PyTorch
|
||||
provides mechanisms for model pruning, which can help reduce memory usage while maintaining task accuracy. The
|
||||
`Pruning tutorial <https://pytorch.org/tutorials/intermediate/pruning_tutorial.html>`_ describes how to utilize
|
||||
the pruning techniques PyTorch provides or define custom pruning techniques as necessary.
|
||||
|
||||
Deploying with TorchScript
|
||||
**************************
|
||||
|
||||
When deploying a model for use in production, the overhead of Python can be unacceptable due to its poor
|
||||
performance characteristics. For cases like this,
|
||||
`TorchScript <https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html>`_ provides a way to load
|
||||
and run an optimized model program from outside of Python, such as within a C++ program.
|
||||
|
||||
Parametrizations
|
||||
****************
|
||||
|
||||
For certain applications, it can be beneficial to constrain the parameter space during model training. For example,
|
||||
enforcing orthogonality of the learned parameters can improve convergence for RNNs. PyTorch provides a mechanism for
|
||||
applying `parametrizations <https://pytorch.org/tutorials/intermediate/parametrizations.html>`_ such as this, and
|
||||
further allows for custom constraints to be defined.
|
||||
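A minimal sketch (not part of the diff) of applying the ``parametrizations.orthogonal`` utility listed in the autosummary above to a layer's weight:

.. code-block:: python

    from torch.nn.utils import parametrizations

    layer = nn.Linear(5, 5)
    layer = parametrizations.orthogonal(layer, name='weight')

    # The re-parametrized weight is kept (approximately) orthogonal.
    w = layer.weight
    print(torch.allclose(w @ w.T, torch.eye(5), atol=1e-5))
    : True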
|
||||
Transforming Modules with FX
|
||||
****************************
|
||||
|
||||
The `FX <https://pytorch.org/docs/stable/fx.html>`_ component of PyTorch provides a flexible way to transform
|
||||
modules by operating directly on module computation graphs. This can be used to programmatically generate or
|
||||
manipulate modules for a broad array of use cases. To explore FX, check out these examples of using FX for
|
||||
`convolution + batch norm fusion <https://pytorch.org/tutorials/intermediate/fx_conv_bn_fuser.html>`_ and
|
||||
`CPU performance analysis <https://pytorch.org/tutorials/intermediate/fx_profiling_tutorial.html>`_.
|
||||
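As a concrete (hedged) sketch of operating on a module's computation graph with FX; ``Small`` is a hypothetical module and the snippet is not part of the diff:

.. code-block:: python

    from torch import fx

    class Small(nn.Module):
        def forward(self, x):
            return torch.relu(x + 1)

    traced = fx.symbolic_trace(Small())
    print(traced.graph)  # the captured ops: placeholder, add, relu, output
    print(traced.code)   # generated Python for the transformed module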
|
|
|
|||
|
|
@ -6,10 +6,6 @@ torch.special
|
|||
|
||||
The torch.special module, modeled after SciPy's `special <https://docs.scipy.org/doc/scipy/reference/special.html>`_ module.
|
||||
|
||||
This module is in BETA. New functions are still being added, and some
|
||||
functions may change in future PyTorch releases. See the documentation of each
|
||||
function for details.
|
||||
|
||||
.. automodule:: torch.special
|
||||
:noindex:
|
||||
|
||||
|
|
|
|||
|
|
@ -9,3 +9,4 @@ torch.testing
|
|||
.. automodule:: torch.testing
|
||||
|
||||
.. autofunction:: assert_close
|
||||
.. autofunction:: make_tensor
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
#include <gtest/gtest.h>
|
||||
#include <ATen/core/boxing/impl/test_helpers.h>
|
||||
|
||||
#include <torch/torch.h>
|
||||
#include <ATen/core/op_registration/op_registration.h>
|
||||
|
||||
#include <torch/csrc/autograd/functions/basic_ops.h>
|
||||
|
||||
|
|
@ -869,6 +871,261 @@ TEST(CustomAutogradTest, BackwardWithCreateGraphWarns) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests for AutogradNotImplementedFallback
|
||||
* - Check that we created the NotImplemented kernel when inputs require grad
|
||||
* but when no inputs require grad, we should not create this node
|
||||
* - check_inplace logic
|
||||
* - view ops (TODO: not an official view yet, update this once InplaceOrView kernel is landed)
|
||||
* - TODO: Tests for NDEBUG checks?
|
||||
* - tensorlist input and output
|
||||
* - multiple outputs / non-tensor output
|
||||
* - rebase_history vs set_history
|
||||
*/
|
||||
namespace {
|
||||
|
||||
torch::Tensor inplace_op(const torch::Tensor& self, const torch::Tensor& other) {
|
||||
return self.add_(other);
|
||||
}
|
||||
|
||||
std::tuple<torch::Tensor, torch::Tensor> two_arg_inplace_op(const torch::Tensor& self, const torch::Tensor& other) {
|
||||
other.add_(self);
|
||||
self.add_(other);
|
||||
return std::tuple<torch::Tensor, torch::Tensor>(self, other);
|
||||
}
|
||||
|
||||
std::tuple<torch::Tensor, torch::Tensor> two_pairs_of_view_op(const torch::Tensor& self, const torch::Tensor& other) {
|
||||
// This is not allowed. We test below that calling into the boxed kernel with this op raises an error
|
||||
auto self_view = self.view(-1);
|
||||
auto other_view = other.view(-1);
|
||||
return std::tuple<torch::Tensor, torch::Tensor>(self_view, other_view);
|
||||
}
|
||||
|
||||
int64_t ret_single_non_tensor(const torch::Tensor& self, const torch::Tensor& other) {
|
||||
return 12;
|
||||
}
|
||||
|
||||
torch::Tensor opt_op(const torch::Tensor& self, const c10::optional<at::Tensor>& other) {
|
||||
if (other.has_value()) {
|
||||
return self + other.value();
|
||||
} else {
|
||||
return self.clone();
|
||||
}
|
||||
}
|
||||
|
||||
torch::Tensor my_custom_op(const torch::Tensor& self, const torch::Tensor& other) {
|
||||
return self + other;
|
||||
}
|
||||
|
||||
std::tuple<torch::Tensor, torch::Tensor, int64_t> ret_tuple_non_tensor(const torch::Tensor& self, const torch::Tensor& other) {
|
||||
auto a = self - other;
|
||||
auto b = self + other;
|
||||
return std::tuple<torch::Tensor, torch::Tensor, int64_t>(a, b, 12);
|
||||
}
|
||||
|
||||
torch::Tensor view_op(const torch::Tensor& self, const torch::Tensor& other) {
|
||||
return self.view(-1);
|
||||
}
|
||||
|
||||
std::vector<at::Tensor> ret_tensor_vector(const torch::Tensor& self, const torch::Tensor& other) {
|
||||
std::vector<at::Tensor> out;
|
||||
out.push_back(self + other);
|
||||
out.push_back(self - other);
|
||||
return out;
|
||||
}
|
||||
|
||||
torch::Tensor tensorlist_op(const torch::Tensor& self, at::TensorList other) {
|
||||
const auto& res = self.clone();
|
||||
for (const auto& t : other) {
|
||||
res.add_(t);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
#define REGISTER_TEST_OP(name, schema, fn) \
|
||||
auto m = MAKE_TORCH_LIBRARY(_test); \
|
||||
m.def(schema); \
|
||||
auto m_autograd = MAKE_TORCH_LIBRARY_IMPL(_test, Autograd); \
|
||||
auto m_cpu = MAKE_TORCH_LIBRARY_IMPL(_test, CPU); \
|
||||
m_cpu.impl(name, c10::DispatchKey::CPU, TORCH_FN(fn)); \
|
||||
m_autograd.impl(name, c10::DispatchKey::Autograd, autogradNotImplementedFallback());
|
||||
|
||||
template <typename F>
|
||||
void assertBasicChecks(F op) {
|
||||
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
|
||||
auto b = torch::tensor({1.}, {torch::kFloat32});
|
||||
auto c = torch::tensor({1.}, {torch::kFloat32});
|
||||
|
||||
// If any inputs require grad,
|
||||
auto out1 = op(a, b);
|
||||
ASSERT_THROWS_WITH(out1.backward(), "is not implemented");
|
||||
|
||||
// # Should not have grad_fn if none require grad
|
||||
auto out2 = op(b, c);
|
||||
ASSERT_THROWS_WITH(out2.backward(), "element 0 of tensors does not require grad and does not have a grad_fn");
|
||||
|
||||
// TODO: Forward AD Tests?
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST(TestAutogradNotImplementedFallback, RetSingleNonTensor) {
|
||||
REGISTER_TEST_OP("ret_single_non_tensor", "_test::ret_single_non_tensor(Tensor self, Tensor other) -> int", ret_single_non_tensor);
|
||||
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_single_non_tensor", "");
|
||||
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
|
||||
return callOpUnboxed<int64_t, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
|
||||
};
|
||||
|
||||
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
|
||||
auto b = torch::tensor({1.}, {torch::kFloat32});
|
||||
|
||||
ASSERT_EQ(op(a, b), ret_single_non_tensor(a, b));
|
||||
}
|
||||
|
||||
TEST(TestAutogradNotImplementedFallback, DoubleViewOP) {
|
||||
REGISTER_TEST_OP("two_pairs_of_view_op", "_test::two_pairs_of_view_op(Tensor(a) self, Tensor(b) other) -> (Tensor(a), Tensor(b))", two_pairs_of_view_op);
|
||||
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::two_pairs_of_view_op", "");
|
||||
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
|
||||
return callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
|
||||
};
|
||||
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
|
||||
auto b = torch::tensor({1.}, {torch::kFloat32});
|
||||
ASSERT_THROWS_WITH(op(a, b),
|
||||
"Expected only a single output in the operator schema to have a non-write alias annotation");
|
||||
}
|
||||
|
||||
TEST(TestAutogradNotImplementedFallback, InplaceOp) {
|
||||
REGISTER_TEST_OP("inplace_op", "_test::inplace_op(Tensor(a!) self, Tensor other) -> Tensor(a!)", inplace_op);
|
||||
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::inplace_op", "");
|
||||
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
|
||||
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
|
||||
};
|
||||
|
||||
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
|
||||
auto b = torch::tensor({1.}, {torch::kFloat32});
|
||||
|
||||
// Check in-place
|
||||
ASSERT_THROWS_WITH(op(a, b),
|
||||
"a leaf Variable that requires grad is being used in an in-place operation");
|
||||
op(b, a);
|
||||
a = a.clone();
|
||||
b = b.clone();
|
||||
auto c = op(a, b);
|
||||
ASSERT_TRUE(torch::allclose(c, inplace_op(a, b)));
|
||||
|
||||
// Test in-place on view
|
||||
auto base = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true).clone();
|
||||
auto view = base.view(-1);
|
||||
auto t = torch::tensor({1.}, {torch::kFloat32});
|
||||
|
||||
torch::Tensor v_nograd;
|
||||
{
|
||||
c10::NoGradGuard guard;
|
||||
v_nograd = base.view(-1);
|
||||
op(v_nograd, t);
|
||||
}
|
||||
|
||||
ASSERT_THROWS_WITH(op(v_nograd, t), "A view was created in no_grad mode");
|
||||
ASSERT_EQ(op(view, t).unsafeGetTensorImpl(), view.unsafeGetTensorImpl());
|
||||
|
||||
// TODO: once we have InplaceOrView kernel, renable this since version counter would actually
|
||||
// be incremented
|
||||
// ASSERT_THAT(op(view, t).grad_fn()->name(), ::testing::HasSubstr("AsStridedBackward"));
|
||||
}
|
||||
|
||||
TEST(TestAutogradNotImplementedFallback, DoubleInplaceOp) {
|
||||
REGISTER_TEST_OP("two_arg_inplace_op", "_test::two_arg_inplace_op(Tensor(a!) self, Tensor(b!) other) -> (Tensor(a!), Tensor(b!))", two_arg_inplace_op);
|
||||
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::two_arg_inplace_op", "");
|
||||
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
|
||||
return callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
|
||||
};
|
||||
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
|
||||
auto b = torch::tensor({1.}, {torch::kFloat32});
|
||||
|
||||
// Both are modified in-place!
|
||||
ASSERT_THROWS_WITH(op(a, b),
|
||||
"a leaf Variable that requires grad is being used in an in-place operation");
|
||||
ASSERT_THROWS_WITH(op(b, a),
|
||||
"a leaf Variable that requires grad is being used in an in-place operation");
|
||||
}
|
||||
|
||||
TEST(TestAutogradNotImplementedFallback, OptOp) {
|
||||
REGISTER_TEST_OP("opt_op", "_test::opt_op(Tensor self, Tensor? other) -> Tensor", opt_op);
|
||||
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::opt_op", "");
|
||||
auto op = [&](const torch::Tensor& _1, const c10::optional<torch::Tensor>& _2) {
|
||||
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const c10::optional<torch::Tensor>&>(opHandle, _1, _2);
|
||||
};
|
||||
|
||||
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
|
||||
auto b = torch::tensor({1.}, {torch::kFloat32});
|
||||
|
||||
ASSERT_TRUE(torch::allclose(op(a, b), opt_op(a, b)));
|
||||
ASSERT_TRUE(torch::allclose(op(a, {}), opt_op(a, {})));
|
||||
}
|
||||
|
||||
TEST(TestAutogradNotImplementedFallback, OutOfPlaceAddition) {
|
||||
REGISTER_TEST_OP("my_custom_op", "_test::my_custom_op(Tensor self, Tensor other) -> Tensor", my_custom_op);
|
||||
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::my_custom_op", "");
|
||||
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
|
||||
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
|
||||
};
|
||||
|
||||
assertBasicChecks(op);
|
||||
}
|
||||
|
||||
TEST(TestAutogradNotImplementedFallback, RetTupleNonTensor) {
|
||||
REGISTER_TEST_OP("ret_tuple_non_tensor", "_test::ret_tuple_non_tensor(Tensor self, Tensor other) -> (Tensor, Tensor, int)", ret_tuple_non_tensor);
|
||||
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_tuple_non_tensor", "");
|
||||
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
|
||||
torch::Tensor out0;
|
||||
torch::Tensor out1;
|
||||
int64_t out2;
|
||||
auto out = callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor, int64_t>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
|
||||
std::tie(out0, out1, out2) = std::move(out);
|
||||
return out0;
|
||||
};
|
||||
|
||||
assertBasicChecks(op);
|
||||
}
|
||||
|
||||
TEST(TestAutogradNotImplementedFallback, ViewOp) {
|
||||
REGISTER_TEST_OP("view_op", "_test::view_op(Tensor(a) self, Tensor other) -> Tensor(a)", view_op);
|
||||
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::view_op", "");
|
||||
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
|
||||
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
|
||||
};
|
||||
assertBasicChecks(op);
|
||||
}
|
||||
|
||||
TEST(TestAutogradNotImplementedFallback, RetTensorVector) {
|
||||
REGISTER_TEST_OP("ret_tensor_vector", "_test::ret_tensor_vector(Tensor self, Tensor other) -> Tensor[]", ret_tensor_vector);
|
||||
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_tensor_vector", "");
|
||||
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
|
||||
return callOpUnboxed<std::vector<at::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2)[0];
|
||||
};
|
||||
assertBasicChecks(op);
|
||||
}
|
||||
|
||||
TEST(TestAutogradNotImplementedFallback, TensorlistOp) {
|
||||
REGISTER_TEST_OP("tensorlist_op", "_test::tensorlist_op(Tensor self, Tensor[] other) -> Tensor", tensorlist_op);
|
||||
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::tensorlist_op", "");
|
||||
auto op = [&](torch::Tensor _1, at::TensorList _2) {
|
||||
return callOpUnboxed<torch::Tensor, const torch::Tensor&, at::TensorList>(opHandle, _1, _2);
|
||||
};
|
||||
|
||||
auto a = torch::tensor({1.}, {torch::kFloat32});
|
||||
auto b = torch::tensor({1.}, {torch::kFloat32});
|
||||
auto c = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
|
||||
std::vector<torch::Tensor> vec = {b, c};
|
||||
auto out = op(a, vec);
|
||||
|
||||
ASSERT_THROWS_WITH(torch::autograd::grad({out}, {vec[0]}), "One of the differentiated Tensors does not require grad");
|
||||
ASSERT_THROWS_WITH(torch::autograd::grad({out}, {vec[1]}), "is not implemented");
|
||||
|
||||
ASSERT_TRUE(at::allclose(op(a, vec), tensorlist_op(a, vec)));
|
||||
}
|
||||
|
||||
|
||||
// TODO add these tests if needed
|
||||
// test_once_differentiable
|
||||
// test_sparse_backward
|
||||
|
|
|
|||
|
|

@ -792,6 +792,20 @@ TEST_F(FunctionalTest, CrossEntropy) {

  ASSERT_TRUE(output.allclose(expected, 1e-04));
  ASSERT_TRUE(F::cross_entropy(input, target).allclose(expected, 1e-04));

  // label smoothing with class indices
  input = torch::tensor({{3., 1.}, {1., 2.}}, torch::kFloat);
  output = F::cross_entropy(
      input, target, F::CrossEntropyFuncOptions().label_smoothing(0.15).reduction(torch::kMean));
  expected = torch::tensor(0.3326, torch::kFloat);
  ASSERT_TRUE(output.allclose(expected, 1e-04));

  // label smoothing with target probabilities
  target = torch::tensor({{0.8, 0.2}, {0.1, 0.9}}, torch::kFloat);
  output = F::cross_entropy(
      input, target, F::CrossEntropyFuncOptions().label_smoothing(0.2).reduction(torch::kMean));
  expected = torch::tensor(0.5701, torch::kFloat);
  ASSERT_TRUE(output.allclose(expected, 1e-04));
}

TEST_F(FunctionalTest, MaxUnpool1d) {

@ -28,6 +28,9 @@ TEST(IMethodTest, CallMethod) {
  auto pyModel = package.load_pickle("model", "model.pkl");
  torch::deploy::PythonMethodWrapper pyMethod(pyModel, "forward");

  EXPECT_EQ(scriptMethod.name(), "forward");
  EXPECT_EQ(pyMethod.name(), "forward");

  auto input = torch::ones({10, 20});
  auto outputPy = pyMethod({input});
  auto outputScript = scriptMethod({input});

@ -2315,6 +2315,31 @@ TEST_F(ModulesTest, CrossEntropyLoss) {
  ASSERT_TRUE(
      CrossEntropyLoss(CrossEntropyLossOptions().ignore_index(-100).reduction(torch::kMean))
      ->forward(input, target).allclose(expected, 1e-04));

  // label smoothing with class indices
  loss = CrossEntropyLoss(CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kMean));
  input = torch::tensor({{3., 1.}, {1., 2.}}, torch::dtype(torch::kFloat).requires_grad(true));
  target = torch::tensor({0, 1}, torch::kLong);
  output = loss->forward(input, target);
  expected = torch::tensor(0.3326, torch::kFloat);
  s = output.sum();
  s.backward();

  ASSERT_TRUE(output.allclose(expected, 1e-04));
  ASSERT_EQ(input.sizes(), input.grad().sizes());

  // label smoothing with target probabilities
  loss = CrossEntropyLoss(CrossEntropyLossOptions().label_smoothing(0.2).reduction(torch::kMean));
  input = torch::tensor({{3., 1.}, {1., 2.}}, torch::dtype(torch::kFloat).requires_grad(true));
  target = torch::tensor({{0.8, 0.2}, {0.1, 0.9}}, torch::kFloat);
  output = loss->forward(input, target);
  expected = torch::tensor(0.5701, torch::kFloat);
  s = output.sum();
  s.backward();

  ASSERT_TRUE(output.allclose(expected, 1e-04));
  ASSERT_EQ(input.sizes(), input.grad().sizes());
}

TEST_F(ModulesTest, CosineSimilarity) {

@ -1,11 +1,11 @@
#include <gtest/gtest.h>

#include <torch/csrc/autograd/generated/variable_factories.h>
#include <torch/csrc/jit/frontend/ir_emitter.h>
#include <torch/csrc/jit/ir/alias_analysis.h>
#include <torch/csrc/jit/ir/irparser.h>
#include "torch/csrc/jit/frontend/ir_emitter.h"
#include "torch/csrc/jit/ir/alias_analysis.h"
#include "torch/csrc/jit/runtime/custom_operator.h"
#include "torch/csrc/utils/memory.h"
#include <torch/csrc/jit/runtime/custom_operator.h>
#include <torch/csrc/utils/memory.h>

namespace torch {
namespace jit {

@ -484,7 +484,7 @@ TEST(AliasAnalysisTest, SafeToChangeAliasingRelationship) {
TEST(WriteTrackingTest, Basic) {
  RegisterOperators reg({Operator(
      "prim::creates_alias(Tensor(a) x) -> Tensor(a)",
      [](Stack* s) {},
      [](Stack&) {},
      aliasAnalysisFromSchema())});
  const auto creates_alias = Symbol::fromQualString("prim::creates_alias");
  auto graph = std::make_shared<Graph>();

@ -949,11 +949,11 @@ TEST(WildcardsTest, Basic) {
  RegisterOperators reg(
      {Operator(
           "prim::returns_wildcard(Tensor a) -> Tensor(*)",
           [](Stack* stack) {},
           [](Stack&) {},
           aliasAnalysisFromSchema()),
       Operator(
           "prim::writes(Tensor(z!) a) -> Tensor(a)",
           [](Stack* stack) {},
           [](Stack&) {},
           aliasAnalysisFromSchema())});
  const auto returns_wildcard =
      Symbol::fromQualString("prim::returns_wildcard");

@ -31,7 +31,7 @@ TEST(CustomOperatorTest, InferredSchema) {

  Stack stack;
  push(stack, 2.0f, at::ones(5));
  op->getOperation()(&stack);
  op->getOperation()(stack);
  at::Tensor output;
  pop(stack, output);

@ -61,7 +61,7 @@ TEST(CustomOperatorTest, ExplicitSchema) {

  Stack stack;
  push(stack, 2.0f, at::ones(5));
  op->getOperation()(&stack);
  op->getOperation()(stack);
  at::Tensor output;
  pop(stack, output);

@ -109,7 +109,7 @@ TEST(CustomOperatorTest, ListParameters) {
      c10::List<c10::complex<double>>(
          {c10::complex<double>(2.4, -5.5), c10::complex<double>(-1.3, 2)}));
  push(stack, c10::List<at::Tensor>({at::ones(5)}));
  op->getOperation()(&stack);
  op->getOperation()(stack);
  c10::List<double> output;
  pop(stack, output);

@ -140,7 +140,7 @@ TEST(CustomOperatorTest, ListParameters2) {

  Stack stack;
  push(stack, c10::List<at::Tensor>({at::ones(5)}));
  op->getOperation()(&stack);
  op->getOperation()(stack);
  c10::List<at::Tensor> output;
  pop(stack, output);

@ -204,7 +204,7 @@ TEST(TestCustomOperator, OperatorGeneratorUndeclared) {
  torch::jit::RegisterOperators reg({OperatorGenerator(
      TORCH_SELECTIVE_NAME_IN_SCHEMA(
          op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"),
      [](Stack* stack) {
      [](Stack& stack) {
        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
        double a;
        at::Tensor b;

@ -223,7 +223,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
  torch::jit::RegisterOperators reg({OperatorGenerator(
      TORCH_SELECTIVE_NAME_IN_SCHEMA(
          op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"),
      [](Stack* stack) {
      [](Stack& stack) {
        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
        double a;
        at::Tensor b;

@ -249,7 +249,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {

  Stack stack;
  push(stack, 2.0f, at::ones(5));
  op->getOperation()(&stack);
  op->getOperation()(stack);
  at::Tensor output;
  pop(stack, output);

@ -175,6 +175,15 @@ TEST(InterpreterTest, IgnorableArgsInSchema) {
  ASSERT_TRUE(op_to_specified_args_non_const["aten::conv2d"] == 6);
}

TEST(InterpreterTest, IgnorableArgsInSchemaWithOut) {
  auto graph = build_mobile_export_with_out();
  MobileCode function(graph, "");
  auto op_to_specified_args = function.op_to_num_specified_args();
  ASSERT_TRUE(op_to_specified_args.size() == 1);
  // this should be 3 when the add_out flag is set to True
  ASSERT_TRUE(op_to_specified_args["aten::add.out"] == 4);
}

TEST(InterpreterTest, runAsyncBasicTest) {
  /*
    TODO: there are some problems with C++ parsing script programs involving

@ -520,6 +520,28 @@ TEST(SchemaParserTest, NestedArrays) {
      .getElementType()));
}

TEST(SchemaParserTest, OutVariant) {
  auto schema_with_out = parseSchema(
      "at::foo(Tensor self, *, Tensor(a!) f, Tensor(b!) l) -> (Tensor(a!) f, Tensor(b!) l)");
  ASSERT_TRUE(schema_with_out.arguments().at(1).is_out());
  ASSERT_TRUE(schema_with_out.arguments().at(2).is_out());

  auto schema_without_out =
      parseSchema("at::foo(Tensor self, *, int scalar) -> (int)");

  for (const auto& arg : schema_without_out.arguments()) {
    ASSERT_TRUE(!arg.is_out());
  }

  auto schema_with_is_write = parseSchema(
      "aten::ne_.Scalar(Tensor(a!) self, Scalar other) -> (Tensor(a!))");

  for (const auto& arg : schema_with_is_write.arguments()) {
    ASSERT_TRUE(!arg.is_out());
  }
}

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(SchemaParserTest, NamedReturns) {
  // named returns
  parseSchema("at::what(Tensor! i_will_be_written_to) -> ()");

@ -1471,11 +1493,11 @@ TEST(NoneSchemaMatchTest, Basic) {
  RegisterOperators reg({
      Operator(
          "prim::test_none() -> int?",
          [](Stack* stack) { push(stack, IValue()); },
          [](Stack& stack) { push(stack, IValue()); },
          aliasAnalysisFromSchema()),
      Operator(
          "prim::is_none(int? a) -> bool",
          [](Stack* stack) {
          [](Stack& stack) {
            IValue a = pop(stack);
            if (a.isNone()) {
              push(stack, true);

@ -15,7 +15,7 @@ TEST(SchemaMatchingTest, VarType) {
  RegisterOperators reg({
      Operator(
          "aten::test_vartype(t[] a, t b) -> (t)",
          [](Stack* stack) {
          [](Stack& stack) {
            c10::List<double> list;
            // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
            double a;

@ -54,7 +54,7 @@ TEST(SchemaMatchingTest, VarType2) {
  RegisterOperators reg({
      Operator(
          "aten::test_vartype2(t a, t[] b) -> (t[])",
          [](Stack* stack) {
          [](Stack& stack) {
            // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
            double a;
            c10::List<double> list;

@ -123,6 +123,21 @@ std::shared_ptr<Graph> build_mobile_export_analysis_graph() {
  return g;
}

std::shared_ptr<Graph> build_mobile_export_with_out() {
  const auto graph_string = R"IR(
      graph(%x.1 : Tensor,
            %y.1 : Tensor):
        %8 : NoneType = prim::Constant()
        %6 : int = prim::Constant[value=1]()
        %7 : Tensor = aten::add(%x.1, %y.1, %6, %y.1)
        return (%8))IR";

  auto g = std::make_shared<Graph>();
  torch::jit::parseIR(graph_string, g.get());
  g->lint();
  return g;
}

std::shared_ptr<Graph> build_mobile_export_analysis_graph_nested() {
  // this is pretty much the same test as build_mobile_export_analysis_graph(),
  // but some aten::slice operators are hidden under a block statement to check

@ -258,7 +273,7 @@ RegisterOperators reg({
    // because it always produces empty Tensors.
    Operator(
        "prim::MakeTestTensor() -> Tensor",
        [](Stack* stack) { push(stack, at::Tensor()); },
        [](Stack& stack) { push(stack, at::Tensor()); },
        aliasAnalysisFromSchema()),
});
} // namespace

@ -74,6 +74,7 @@ std::pair<tensor_list, tensor_list> runGradient(

std::shared_ptr<Graph> build_lstm();
std::shared_ptr<Graph> build_mobile_export_analysis_graph();
std::shared_ptr<Graph> build_mobile_export_with_out();
std::shared_ptr<Graph> build_mobile_export_analysis_graph_with_vararg();
std::shared_ptr<Graph> build_mobile_export_analysis_graph_nested();
std::shared_ptr<Graph> build_mobile_export_analysis_graph_non_const();

@ -198,6 +198,22 @@ TEST_F(Kernel, _3) {
  }
}

TEST_F(Kernel, Huge) {
  const auto graph_string = R"IR(
      graph(%x.1 : Float(4000000000, strides=[1], requires_grad=0, device=cpu)):
        %1 : int = prim::Constant[value=0]()
        %2 : Float(1, 4000000000, strides=[4000000000, 1], requires_grad=0, device=cpu) = aten::unsqueeze(%x.1, %1)
        %3 : Float(1, 4000000000, strides=[4000000000, 1], requires_grad=0, device=cpu) = aten::relu(%2)
        return (%3))IR";
  auto graph = std::make_shared<Graph>();
  parseIR(graph_string, &*graph);
  TensorExprKernel k(graph);
  std::ostringstream oss;
  oss << *k.getCodeGenStmt();
  const std::string& verification_pattern = "# CHECK: 4000000000";
  torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
}

TEST_F(Kernel, ParallelStrided) {
  const auto graph_string = R"IR(
      graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu),

@ -786,9 +802,9 @@ TEST_F(Kernel, SumOneAxis) {
  // Check the IR we produced
  const std::string& verification_pattern =
      R"IR(
# CHECK: for (int v = 0; v <
# CHECK: for (int64_t v = 0ll; v <
# CHECK-NEXT: sum
# CHECK-NEXT: for (int v_1 = 0; v_1 <
# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 <
# CHECK-NEXT: sum)IR";
  torch::jit::testing::FileCheck().run(verification_pattern, oss.str());

@ -847,10 +863,10 @@ TEST_F(Kernel, SumMultipleAxes) {
  // Check the IR we produced
  const std::string& verification_pattern =
      R"IR(
# CHECK: int v = 0
# CHECK: int v_1 = 0
# CHECK: int v_2 = 0
# CHECK: int v_3 = 0
# CHECK: int64_t v = 0
# CHECK: int64_t v_1 = 0
# CHECK: int64_t v_2 = 0
# CHECK: int64_t v_3 = 0
# CHECK: sum)IR";
  torch::jit::testing::FileCheck().run(verification_pattern, oss.str());

@ -1115,8 +1131,8 @@ TEST_F(Kernel, InlineProducerIntoReduction) {
  // We should have only one loop in the end.
  const std::string& verification_pattern =
      R"IR(
# CHECK: for (int v = 0; v < 5;
# CHECK-NEXT: for (int v_1 = 0; v_1 < 3;
# CHECK: for (int64_t v = 0ll; v < 5
# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < 3
# CHECK-NEXT: sum
# CHECK-NOT: for)IR";
  torch::jit::testing::FileCheck().run(verification_pattern, oss.str());

@ -1154,11 +1170,11 @@ TEST_F(Kernel, InlineReductionIntoConsumer) {
  // We should have two loops in the end.
  const std::string& verification_pattern =
      R"IR(
# CHECK: for (int v = 0; v < 5;
# CHECK-NEXT: for (int v_1 = 0; v_1 < 3;
# CHECK: for (int64_t v = 0ll; v < 5
# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < 3
# CHECK-NEXT: sum
# CHECK: for (int v_2 = 0; v_2 < 5;
# CHECK-NEXT: for (int v_3 = 0; v_3 < 3;
# CHECK: for (int64_t v_2 = 0ll; v_2 < 5
# CHECK-NEXT: for (int64_t v_3 = 0ll; v_3 < 3
# CHECK-NEXT: aten_mul
# CHECK-NOT: for)IR";
  torch::jit::testing::FileCheck().run(verification_pattern, oss.str());

@ -1501,42 +1501,54 @@ TEST(LLVM, RFactorVectorizedReduction) {
  ExpectAllNear(b_v, b_ref, 1e-5);
}

TEST(LLVM, SimpleParallel) {
  for (int test_cfg = 0; test_cfg < 4; test_cfg++) {
    // Compute a simple operation, and try all loop-axis combination to be
    // parallel or sequential.
    const int M = 4;
    const int N = 6;
    Tensor f = Compute(
        "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) {
          return cast<float>(m + n);
        });
    LoopNest loop_nest({f});
    auto const& loops = loop_nest.getLoopStmtsFor(f);
    ForPtr m = loops[0];
    ForPtr n = loops[1];
    if (test_cfg & 0x1) {
      m->set_parallel();
    }
    if (test_cfg & 0x2) {
      n->set_parallel();
    }
    loop_nest.prepareForCodegen();
    StmtPtr stmt = loop_nest.root_stmt();
    LLVMCodeGen cg(stmt, {f});

    PaddedBuffer<float> f_v(M, N, "f_v");
    std::vector<void*> args({f_v.data()});
    int value = cg.value<int>(args);
    ASSERT_EQ(value, 0);
    PaddedBuffer<float> f_ref(M, N, "f_ref");
    for (int m = 0; m < M; m++) {
      for (int n = 0; n < N; n++) {
        f_ref(m, n) = m + n;
      }
    }
    ExpectAllNear(f_v, f_ref, 1e-5);
template <bool outer, bool inner>
static void testSimpleParallel() {
  // Compute a simple operation, and try all loop-axis combination to be
  // parallel or sequential.
  const int M = 4;
  const int N = 6;
  Tensor f = Compute(
      "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) {
        return cast<float>(m + n);
      });
  LoopNest loop_nest({f});
  auto const& loops = loop_nest.getLoopStmtsFor(f);
  ForPtr m = loops[0];
  ForPtr n = loops[1];
  if (outer) {
    m->set_parallel();
  }
  if (inner) {
    n->set_parallel();
  }
  loop_nest.prepareForCodegen();
  StmtPtr stmt = loop_nest.root_stmt();
  LLVMCodeGen cg(stmt, {f});

  PaddedBuffer<float> f_v(M, N, "f_v");
  std::vector<void*> args({f_v.data()});
  int value = cg.value<int>(args);
  ASSERT_EQ(value, 0);
  PaddedBuffer<float> f_ref(M, N, "f_ref");
  for (int m = 0; m < M; m++) {
    for (int n = 0; n < N; n++) {
      f_ref(m, n) = m + n;
    }
  }
  ExpectAllNear(f_v, f_ref, 1e-5);
}

TEST(LLVM, SimpleParallelSS) {
  testSimpleParallel<false, false>();
}
TEST(LLVM, SimpleParallelSP) {
  testSimpleParallel<false, true>();
}
TEST(LLVM, SimpleParallelPS) {
  testSimpleParallel<true, false>();
}
TEST(LLVM, SimpleParallelPP) {
  testSimpleParallel<true, true>();
}

TEST(LLVM, CompositeParallel) {

@ -4734,8 +4734,8 @@ TEST(LoopNest, VectorizeUse) {
}

const char* int64Loop = R"IR(
# CHECK: for (int64_t n = 0; n < 12; n++) {
# CHECK:   b[n] = (a[n]) + 1;
# CHECK: for (int64_t n = 0ll; n < 12ll; n++) {
# CHECK:   b[n] = (a[n]) + 1ll;
# CHECK: }
)IR";

@ -4744,7 +4744,8 @@ TEST(LoopNest, Int64Direct) {
  Placeholder a("a", kLong, {N});
  Placeholder b("b", kLong, {N});
  VarHandle n("n", kLong);
  StmtPtr s = For::make(n, 0, N, b.store({n}, a.load({n}) + LongImm::make(1l)));
  StmtPtr s = For::make(
      n, LongImm::make(0l), N, b.store({n}, a.load({n}) + LongImm::make(1l)));
  s = IRSimplifier::simplify(s);
  std::ostringstream oss;
  oss << *s;