Update on "Add type refinement for Union"

Differential Revision: [D28572811](https://our.internmc.facebook.com/intern/diff/D28572811)

[ghstack-poisoned]
Ansley Ussery 2021-08-30 14:29:59 -07:00
commit b3ece930a1
284 changed files with 11376 additions and 5248 deletions

View File

@ -3,7 +3,11 @@ build --copt=-I.
build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
# Configuration to disable tty features for environments like CI
build:no-tty --curses no
build:no-tty --progress_report_interval 10
build:no-tty --show_progress_rate_limit 10
# Configuration to build with GPU support
build:gpu --define=cuda=true
# define a separate build folder for faster switching between configs
build:gpu --platform_suffix=-gpu

View File

@ -14,6 +14,9 @@ chmod +x "$build_script"
# Build
cat >"$build_script" <<EOL
export PATH="$workdir/miniconda/bin:$PATH"
if [[ "$CIRCLE_BRANCH" == "nightly" ]]; then
export USE_PYTORCH_METAL_EXPORT=1
fi
if [[ "$PACKAGE_TYPE" == conda ]]; then
"$workdir/builder/conda/build_pytorch.sh"
else

View File

@ -10,18 +10,27 @@ pt_checkout="/var/lib/jenkins/workspace"
# Since we're cat-ing this file, we need to escape all $'s
echo "cpp_doc_push_script.sh: Invoked with $*"
# Argument 1: Where to copy the built documentation for Python API to
# (pytorch.github.io/$install_path)
install_path="$1"
if [ -z "$install_path" ]; then
echo "error: cpp_doc_push_script.sh: install_path (arg1) not specified"
# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
# the order of operations goes:
# 1. Check if there's an argument $1
# 2. If no argument check for environment var DOCS_INSTALL_PATH
# 3. If no environment var fall back to default 'docs/'
# NOTE: It might seem odd to gather the second argument before the first, but
# since DOCS_INSTALL_PATH can be derived from DOCS_VERSION it's better to
# gather the version first so we don't break people who rely on this script
# Argument 2: What version of the Python API docs we are building.
version="${2:-${DOCS_VERSION:-master}}"
if [ -z "$version" ]; then
echo "error: cpp_doc_push_script.sh: version (arg2) not specified"
exit 1
fi
# Argument 2: What version of the Python API docs we are building.
version="$2"
if [ -z "$version" ]; then
echo "error: cpp_doc_push_script.sh: version (arg2) not specified"
# Argument 1: Where to copy the built documentation for Python API to
# (pytorch.github.io/$install_path)
install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
if [ -z "$install_path" ]; then
echo "error: cpp_doc_push_script.sh: install_path (arg1) not specified"
exit 1
fi
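The nested expansion ${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}} is just a three-level fallback. A minimal C++ analog of that precedence, purely for illustration (the function name and signature are made up; the script itself stays in bash):

#include <cstdlib>
#include <string>

// Hypothetical analog of install_path="${1:-${DOCS_INSTALL_PATH:-docs/...}}":
// prefer the positional argument, then the environment variable, then a default.
std::string resolve_install_path(int argc, char** argv) {
  if (argc > 1) {
    return argv[1];                                    // 1. explicit argument
  }
  if (const char* env = std::getenv("DOCS_INSTALL_PATH")) {
    return env;                                        // 2. environment variable
  }
  return "docs/";                                      // 3. built-in default
}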

View File

@ -13,18 +13,27 @@ echo "python_doc_push_script.sh: Invoked with $*"
set -ex
# Argument 1: Where to copy the built documentation to
# (pytorch.github.io/$install_path)
install_path="$1"
if [ -z "$install_path" ]; then
echo "error: python_doc_push_script.sh: install_path (arg1) not specified"
# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
# the order of operations goes:
# 1. Check if there's an argument $1
# 2. If no argument check for environment var DOCS_INSTALL_PATH
# 3. If no environment var fall back to default 'docs/'
# NOTE: It might seem odd to gather the second argument before the first, but
# since DOCS_INSTALL_PATH can be derived from DOCS_VERSION it's better to
# gather the version first so we don't break people who rely on this script
# Argument 2: What version of the docs we are building.
version="${2:-${DOCS_VERSION:-master}}"
if [ -z "$version" ]; then
echo "error: python_doc_push_script.sh: version (arg2) not specified"
exit 1
fi
# Argument 2: What version of the docs we are building.
version="$2"
if [ -z "$version" ]; then
echo "error: python_doc_push_script.sh: version (arg2) not specified"
# Argument 1: Where to copy the built documentation to
# (pytorch.github.io/$install_path)
install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
if [ -z "$install_path" ]; then
echo "error: python_doc_push_script.sh: install_path (arg1) not specified"
exit 1
fi
@ -34,7 +43,7 @@ if [ "$version" == "master" ]; then
fi
# Argument 3: The branch to push to. Usually is "site"
branch="$3"
branch="${3:-${DOCS_BRANCH:-site}}"
if [ -z "$branch" ]; then
echo "error: python_doc_push_script.sh: branch (arg3) not specified"
exit 1

View File

@ -148,6 +148,8 @@ class CIWorkflow:
enable_nogpu_no_avx_test: YamlShellBool = "''"
enable_nogpu_no_avx2_test: YamlShellBool = "''"
enable_slow_test: YamlShellBool = "''"
enable_docs_test: YamlShellBool = "''"
enable_backwards_compat_test: YamlShellBool = "''"
def __post_init__(self) -> None:
if self.is_libtorch:
@ -266,6 +268,8 @@ LINUX_WORKFLOWS = [
test_runner_type=LINUX_CPU_TEST_RUNNER,
on_pull_request=True,
enable_doc_jobs=True,
enable_docs_test=1,
enable_backwards_compat_test=1,
num_test_shards=2,
ciflow_config=CIFlowConfig(
enabled=True,
@ -497,7 +501,7 @@ BAZEL_WORKFLOWS = [
CIWorkflow(
arch="linux",
build_environment="linux-xenial-py3.6-gcc7-bazel-test",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
test_runner_type=LINUX_CPU_TEST_RUNNER,
on_pull_request=True,
ciflow_config=CIFlowConfig(

View File

@ -55,6 +55,10 @@ def main() -> None:
configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
if os.getenv('ENABLE_SLOW_TEST'):
configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
if os.getenv('ENABLE_DOCS_TEST'):
configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'):
configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
matrix = {
'include': [
{

View File

@ -254,6 +254,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }}
ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }}
ENABLE_SLOW_TEST: !{{ enable_slow_test }}
ENABLE_DOCS_TEST: !{{ enable_docs_test }}
ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }}
NUM_TEST_SHARDS: !{{ num_test_shards }}
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -346,7 +348,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: !{{ build_environment }}-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then
@ -441,11 +442,15 @@ jobs:
{% endblock %}
{%- endif -%}
{%- if enable_doc_jobs %}
pytorch_python_doc_build:
pytorch_doc_build:
runs-on: linux.2xlarge
strategy:
matrix:
docs_type: [cpp, python]
needs: [calculate-docker-image, build, !{{ ciflow_config.root_job_name }}]
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
DOCS_TYPE: ${{ matrix.docs_type }}
steps:
- name: Log in to ECR
run: |
@ -483,7 +488,7 @@ jobs:
- name: Unzip artifacts
run: |
unzip -o artifacts.zip
- name: Build Python Doc in Docker
- name: Build ${{ matrix.docs_type }} docs
run: |
set -ex
time docker pull "${DOCKER_IMAGE}" > /dev/null
@ -496,6 +501,8 @@ jobs:
-e IN_CI \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e CIRCLE_SHA1="$GITHUB_SHA" \
-e DOCS_VERSION="${target}" \
-e DOCS_TYPE \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
@ -505,34 +512,36 @@ jobs:
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site"
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh"
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be
if: ${{ github.event_name == 'pull_request' }}
name: Upload Docs Preview
- uses: seemethere/upload-artifact-s3@v3
name: Upload Python Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }}
with:
name: deploy
retention-days: 14
if-no-files-found: error
path: pytorch.github.io/docs/merge
- name: Show Docs Preview URL (Click Me)
if: ${{ github.event_name == 'pull_request' }}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/"
path: pytorch.github.io/docs/merge/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}
- uses: seemethere/upload-artifact-s3@v3
name: Upload C++ Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }}
with:
retention-days: 14
if-no-files-found: error
path: cppdocs/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}/cppdocs
- name: Archive artifacts into zip
run: |
zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io"
zip -r "docs_${DOCS_TYPE}.zip" "${GITHUB_WORKSPACE}/pytorch.github.io" "${GITHUB_WORKSPACE}/cppdocs"
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: pytorch_github_io
name: docs_${{ matrix.docs_type }}
path: docs_${{ matrix.docs_type }}.zip
if-no-files-found: error
path: pytorch_github_io.zip
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()

View File

@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

View File

@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-bionic-py3.8-gcc9-coverage-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

View File

@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: 1
ENABLE_NOGPU_NO_AVX2_TEST: 1
ENABLE_SLOW_TEST: 1
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

View File

@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

View File

@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: 1
ENABLE_BACKWARDS_COMPAT_TEST: 1
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc5.4-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then
@ -429,11 +430,15 @@ jobs:
# Prune all of the docker images
docker system prune -af
pytorch_python_doc_build:
pytorch_doc_build:
runs-on: linux.2xlarge
strategy:
matrix:
docs_type: [cpp, python]
needs: [calculate-docker-image, build, ciflow_should_run]
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
DOCS_TYPE: ${{ matrix.docs_type }}
steps:
- name: Log in to ECR
run: |
@ -474,7 +479,7 @@ jobs:
- name: Unzip artifacts
run: |
unzip -o artifacts.zip
- name: Build Python Doc in Docker
- name: Build ${{ matrix.docs_type }} docs
run: |
set -ex
time docker pull "${DOCKER_IMAGE}" > /dev/null
@ -487,6 +492,8 @@ jobs:
-e IN_CI \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e CIRCLE_SHA1="$GITHUB_SHA" \
-e DOCS_VERSION="${target}" \
-e DOCS_TYPE \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
@ -496,34 +503,36 @@ jobs:
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site"
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh"
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be
if: ${{ github.event_name == 'pull_request' }}
name: Upload Docs Preview
- uses: seemethere/upload-artifact-s3@v3
name: Upload Python Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }}
with:
name: deploy
retention-days: 14
if-no-files-found: error
path: pytorch.github.io/docs/merge
- name: Show Docs Preview URL (Click Me)
if: ${{ github.event_name == 'pull_request' }}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/"
path: pytorch.github.io/docs/merge/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}
- uses: seemethere/upload-artifact-s3@v3
name: Upload C++ Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }}
with:
retention-days: 14
if-no-files-found: error
path: cppdocs/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}/cppdocs
- name: Archive artifacts into zip
run: |
zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io"
zip -r "docs_${DOCS_TYPE}.zip" "${GITHUB_WORKSPACE}/pytorch.github.io" "${GITHUB_WORKSPACE}/cppdocs"
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: pytorch_github_io
name: docs_${{ matrix.docs_type }}
path: docs_${{ matrix.docs_type }}.zip
if-no-files-found: error
path: pytorch_github_io.zip
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()

View File

@ -15,7 +15,7 @@ on:
env:
BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7-bazel-test
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1

View File

@ -228,6 +228,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -323,7 +325,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

View File

@ -224,7 +224,11 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
get_bazel
# first build the whole torch for CPU-only
tools/bazel build --config=no-tty :torch
# then build a selected set of targets with GPU support.
# TODO: eventually this should converge to building the whole :torch with GPU support
tools/bazel build --config=no-tty --config=gpu :c10
else
# check that setup.py would fail with bad arguments
echo "The next three invocations are expected to fail with invalid command error messages."

View File

@ -494,6 +494,10 @@ test_torch_deploy() {
assert_git_not_dirty
}
test_docs_test() {
.jenkins/pytorch/docs-test.sh
}
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@ -517,7 +521,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || "$
test_without_numpy
install_torchvision
test_python_shard1
test_distributed
test_aten
elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || "${SHARD_NUMBER}" == 2 ]]; then
install_torchvision
@ -533,6 +536,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
elif [[ "${BUILD_ENVIRONMENT}" == *distributed* ]]; then
test_distributed
test_rpc
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
else
install_torchvision
install_monkeytype

View File

@ -214,6 +214,7 @@ option(USE_LMDB "Use LMDB" OFF)
option(USE_MAGMA "Use MAGMA" ON)
option(USE_METAL "Use Metal for Caffe2 iOS build" ON)
option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(
USE_MLCOMPUTE "Use ML Compute for macOS build" ON
@ -688,6 +689,10 @@ if(USE_PYTORCH_METAL)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL")
endif()
if(USE_PYTORCH_METAL_EXPORT)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL_EXPORT")
endif()
if(USE_SOURCE_DEBUG_ON_MOBILE)
string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
endif()

View File

@ -1,7 +1,7 @@
workspace(name = "pytorch")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("//tools/rules:workspace.bzl", "new_patched_local_repository")
load("//tools/rules:workspace.bzl", "new_patched_local_repository", "new_empty_repository")
http_archive(
name = "bazel_skylib",
@ -170,3 +170,14 @@ protobuf_deps()
load("@rules_python//python:repositories.bzl", "py_repositories")
py_repositories()
local_repository(
name = "local_config_cuda",
path = "third_party/tensorflow_cuda_bazel_build",
)
# Wrapper to expose local_config_cuda in an agnostic way
new_empty_repository(
name = "cuda",
build_file = "//third_party:cuda.BUILD",
)

View File

@ -167,13 +167,12 @@ else()
endif()
# Metal
if(USE_PYTORCH_METAL)
if(APPLE)
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${native_metal_srcs})
else()
# Add files needed from optimized_for_mobile
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${metal_prepack_cpp})
endif()
if(USE_PYTORCH_METAL_EXPORT)
# Add files needed for exporting Metal models (optimized_for_mobile)
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${metal_prepack_cpp})
elseif(APPLE AND USE_PYTORCH_METAL)
# Compile Metal kernels
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${native_metal_srcs})
else()
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp})
endif()
@ -450,13 +449,21 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS})
if(NOT INTERN_BUILD_MOBILE)
list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${miopen_h})
# Metal
if(USE_PYTORCH_METAL_EXPORT)
# Add files needed for exporting Metal models (optimized_for_mobile)
list(APPEND INSTALL_HEADERS ${metal_h} ${metal_prepack_h})
elseif(APPLE AND USE_PYTORCH_METAL)
# Needed by Metal kernels
list(APPEND INSTALL_HEADERS ${metal_h} ${native_metal_h})
else()
list(APPEND INSTALL_HEADERS ${metal_h})
endif()
else()
if(USE_PYTORCH_METAL)
if(IOS)
if(IOS AND USE_PYTORCH_METAL)
list(APPEND INSTALL_HEADERS ${metal_h} ${native_metal_h})
else()
else()
list(APPEND INSTALL_HEADERS ${metal_h} ${metal_prepack_h})
endif()
endif()
endif()

View File

@ -10,7 +10,7 @@ namespace at {
struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
// Constructors
CPUGeneratorImpl(uint64_t seed_in = default_rng_seed_val);
~CPUGeneratorImpl() = default;
~CPUGeneratorImpl() override = default;
// CPUGeneratorImpl methods
std::shared_ptr<CPUGeneratorImpl> clone() const;

View File

@ -7,6 +7,9 @@
namespace at {
static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar=true) {
// if dim_post_expr is 0 and wrap_scalar is true, then dim must be in the range [-1, 0].
// This is a special case for scalar tensors and manifests in e.g. torch.sum(scalar_tensor, 0)
// Otherwise, dim should be in the range [-dim_post_expr, dim_post_expr-1].
return c10::maybe_wrap_dim(dim, dim_post_expr, wrap_scalar);
}
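Worked values for the contract described in the comment above, assuming the header is included as <ATen/WrapDimUtils.h> (the helper function below is made up for illustration):

#include <ATen/WrapDimUtils.h>
#include <cstdint>

int64_t wrap_dim_examples() {
  // Negative dims index from the end: for a 4-d tensor, -1 wraps to 3.
  int64_t a = at::maybe_wrap_dim(-1, /*dim_post_expr=*/4);
  // Scalar special case: dim_post_expr == 0 with wrap_scalar == true accepts
  // dims in [-1, 0], and both wrap to 0.
  int64_t b = at::maybe_wrap_dim(-1, /*dim_post_expr=*/0, /*wrap_scalar=*/true);
  return a + b;  // 3 + 0
}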

View File

@ -123,7 +123,7 @@ struct BuiltinOpFunction : public Function {
return *this;
}
~BuiltinOpFunction() {}
~BuiltinOpFunction() override {}
private:
c10::QualifiedName name_;

View File

@ -344,6 +344,10 @@ public:
c10::Dispatcher::singleton().callBoxed(*this, stack);
}
void callBoxed(Stack& stack) const {
callBoxed(&stack);
}
void redispatchBoxed(DispatchKeySet ks, Stack* stack) const {
c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack);
}

View File

@ -34,6 +34,9 @@ struct Argument {
default_value_(std::move(default_value)),
kwarg_only_(kwarg_only),
alias_info_(std::move(alias_info)) {
// This is a softly-enforced invariant for out arguments.
bool is_alias = alias_info_.has_value() && alias_info_.value().isWrite();
is_out_ = kwarg_only_ && is_alias;
}
const std::string& name() const {
return name_;
@ -50,6 +53,11 @@ struct Argument {
bool kwarg_only() const {
return kwarg_only_;
}
bool is_out() const {
return is_out_;
}
const c10::optional<AliasInfo>& alias_info() const {
return alias_info_;
}
@ -116,6 +124,8 @@ struct Argument {
// is this only specifiable as a keyword argument?
bool kwarg_only_;
c10::optional<AliasInfo> alias_info_;
// marks whether the argument is an out argument of the schema
bool is_out_;
};
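In other words, an argument counts as an out argument exactly when it is kwarg-only and its alias info marks a write (e.g. a `Tensor(a!) out` parameter). A small sketch using the new accessor (the helper below is hypothetical, not part of the patch):

#include <ATen/core/function_schema.h>

// Count how many arguments of a schema are classified as out arguments.
size_t count_out_args(const c10::FunctionSchema& schema) {
  size_t n = 0;
  for (const auto& arg : schema.arguments()) {
    if (arg.is_out()) {  // kwarg-only and aliasing write => out argument
      ++n;
    }
  }
  return n;
}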
inline bool operator==(const Argument& lhs, const Argument& rhs) {

View File

@ -51,6 +51,16 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
return out;
}
inline size_t findFirstOutArg(const std::vector<Argument>& args) {
// find the start of out args in the schema
for (size_t out_start_idx = 0; out_start_idx < args.size(); out_start_idx++) {
if (args.at(out_start_idx).is_out()) {
return out_start_idx;
}
}
return args.size();
}
inline bool Argument::isBackwardCompatibleWith(
const Argument& old,
std::ostream* why_not) const {
@ -121,17 +131,20 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
}
}
// Make sure that all the old arguments have their corresponding backward
// compatible arguments in this schema.
for (size_t i = 0; i < old.arguments().size(); ++i) {
// we want to test the out args and the default args separately
size_t old_out_start_idx = findFirstOutArg(old.arguments());
size_t new_out_start_idx = findFirstOutArg(arguments());
// make sure the arguments before the out args are backward compatible
for (size_t i = 0; i < old_out_start_idx; i++) {
if (!arguments().at(i).isBackwardCompatibleWith(
old.arguments().at(i), why_not)) {
return false;
}
}
// Validate that all new arguments provided a default value.
for (size_t i = old.arguments().size(); i < arguments().size(); ++i) {
// Validate that all new arguments provide a default value
for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) {
if (!arguments().at(i).default_value()) {
if (why_not) {
*why_not
@ -144,6 +157,15 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
}
}
// now compare the out args
for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) {
if (!arguments()
.at(i - old_out_start_idx + new_out_start_idx)
.isBackwardCompatibleWith(old.arguments().at(i), why_not)) {
return false;
}
}
return true;
}
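A sketch of what the split buys us, assuming the JIT schema parser torch::jit::parseSchema is available (the schemas below are made up for illustration): adding a defaulted non-out argument in front of the out arguments is now treated as backward compatible, because the default args and the out args are compared as separate groups.

#include <ATen/core/function_schema.h>
#include <torch/csrc/jit/frontend/function_schema_parser.h>

bool example_bc_check() {
  auto old_schema = torch::jit::parseSchema(
      "foo(Tensor self, *, Tensor(a!) out) -> Tensor(a!)");
  auto new_schema = torch::jit::parseSchema(
      "foo(Tensor self, int k=1, *, Tensor(a!) out) -> Tensor(a!)");
  // k is a new non-out argument with a default, and the out args still line
  // up, so with the new grouping this is expected to report compatibility.
  return new_schema.isBackwardCompatibleWith(old_schema);
}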

View File

@ -1,6 +1,9 @@
#pragma once
#include <type_traits>
#include <ATen/core/ivalue.h>
#include <c10/util/Deprecated.h>
// TODO move this to c10 namespace
@ -9,7 +12,42 @@ namespace jit {
using c10::IValue;
using Stack = std::vector<IValue>;
using Operation = std::function<void(Stack*)>;
class Operation {
template <typename F, typename Arg>
using accepts = std::is_constructible<std::function<void(Arg)>, F&&>;
public:
template <typename F,
std::enable_if_t<accepts<F, Stack*>::value, int> = 0>
C10_DEPRECATED_MESSAGE("Please use void(Stack&) to register operator instead.")
Operation(F&& raw): op_([raw = std::forward<F>(raw)](Stack& stack) {
raw(&stack);
}) {}
template <typename F,
std::enable_if_t<accepts<F, Stack&>::value &&
!std::is_same<std::decay_t<F>, Operation>::value, int> = 0>
Operation(F&& op): op_(std::forward<F>(op)) {}
Operation(std::nullptr_t) noexcept {}
explicit operator bool() const noexcept {
return op_ ? true : false;
}
void operator()(Stack& stack) {
op_(stack);
}
template <typename T>
T* target() noexcept {
return op_.target<T>();
}
private:
std::function<void(Stack&)> op_;
};
// An operation with N inputs and M outputs pops the last N inputs off
// the stack and pushes its M inputs onto the stack

View File

@ -459,9 +459,10 @@ TORCH_IMPL_FUNC(nll_loss_backward_out_cpu)
Tensor cross_entropy_loss_prob_target(
const Tensor& self,
const Tensor& target,
const Tensor& target_,
const Tensor& weight,
int64_t reduction) {
int64_t reduction,
double label_smoothing) {
const auto n_classes = self.size(1);
TORCH_CHECK(
!weight.defined() || (weight.dim() == 1 && weight.numel() == n_classes),
@ -472,6 +473,15 @@ Tensor cross_entropy_loss_prob_target(
weight.sizes());
auto input = at::log_softmax(self, 1, self.scalar_type());
Tensor target;
if (label_smoothing > 0.0) {
TORCH_CHECK(label_smoothing <= 1.0, "label_smoothing must be between 0.0 and 1.0. Got: ", label_smoothing);
target = target_ * (1 - label_smoothing) + label_smoothing / n_classes;
} else {
target = target_;
}
if (weight.defined()) {
// Expand weight to the correct number of dims for broadcasting with input / target
auto weight_broadcast_shape = SmallBuffer<int64_t, 5>(input.dim());
@ -503,12 +513,66 @@ Tensor cross_entropy_loss_prob_target(
}
}
Tensor cross_entropy_loss_label_smoothing(
const Tensor& self,
const Tensor& target,
const Tensor& weight,
int64_t reduction,
int64_t ignore_index,
double label_smoothing) {
auto input = at::log_softmax(self, 1, self.scalar_type());
auto nllloss = at::nll_loss_nd(input, target, weight, reduction, ignore_index);
auto n_classes = input.size(1);
Tensor smooth_loss;
if (weight.defined()) {
// Expand weight to the correct number of dims for broadcasting with input / target
auto weight_broadcast_shape = SmallBuffer<int64_t, 5>(input.dim());
std::fill(weight_broadcast_shape.begin(), weight_broadcast_shape.end(), 1);
weight_broadcast_shape[1] = weight.size(0);
Tensor weight_ = weight.view(weight_broadcast_shape);
smooth_loss = -(input * weight_).sum(1);
} else {
smooth_loss = -input.sum(1);
}
if (ignore_index >= 0) {
smooth_loss.index_put_({target == ignore_index}, 0.0);
}
Tensor ret;
switch (reduction) {
case Reduction::Mean:
if (weight.defined()) {
// TODO: This code path can be removed if #61309 is resolved
// loss is normalized by the weights to be consistent with nll_loss_nd
ret = smooth_loss.sum() / weight.gather(0, target.flatten()).sum();
} else {
ret = smooth_loss.mean();
}
break;
case Reduction::Sum:
ret = smooth_loss.sum();
break;
case Reduction::None:
ret = smooth_loss;
break;
default:
TORCH_CHECK(false, "Invalid reduction type encountered in cross_entropy: ", reduction);
}
return (1 - label_smoothing) * nllloss + ret * (label_smoothing / n_classes);
}
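In math form (a restatement of the code above, not extra behavior): with K = n_classes and smoothing factor \varepsilon = label_smoothing, the per-sample loss before reduction is

\ell_i = (1 - \varepsilon)\,\mathrm{nll}_i \;+\; \frac{\varepsilon}{K} \sum_{k=1}^{K} \bigl(-\log \mathrm{softmax}(x_i)_k\bigr)

with the class weights folded into both terms when given, ignore_index positions zeroed in the smoothing term, and the selected reduction applied as in the switch above.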
Tensor cross_entropy_loss(
const Tensor& self,
const Tensor& target,
const c10::optional<Tensor>& weight,
int64_t reduction,
int64_t ignore_index) {
int64_t ignore_index,
double label_smoothing) {
Tensor ret;
if (self.sizes() == target.sizes()) {
// Assume soft targets when input and target shapes are the same
@ -519,7 +583,14 @@ Tensor cross_entropy_loss(
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight);
const Tensor& weight_ = *weight_maybe_owned;
ret = cross_entropy_loss_prob_target(self, target, weight_, reduction);
ret = cross_entropy_loss_prob_target(self, target, weight_, reduction, label_smoothing);
} else if (label_smoothing > 0.0) {
TORCH_CHECK(label_smoothing <= 1.0, "label_smoothing must be between 0.0 and 1.0. Got: ", label_smoothing);
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight);
const Tensor& weight_ = *weight_maybe_owned;
ret = cross_entropy_loss_label_smoothing(self, target, weight_, reduction, ignore_index, label_smoothing);
} else {
ret = at::nll_loss_nd(
at::log_softmax(self, 1, self.scalar_type()),

View File

@ -1,90 +1,17 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
#include <tuple>
#include <ATen/native/cpu/MaxUnpoolKernel.h>
namespace at {
namespace native {
template <typename scalar_t>
Tensor max_unpooling2d_forward_out_cpu_frame(
Tensor& output,
const Tensor& input,
const Tensor& indices,
int64_t oheight,
int64_t owidth) {
int64_t numBatch = 1;
int64_t dimc = 0;
int64_t dimh = 1;
int64_t dimw = 2;
if (input.ndimension() == 4) {
numBatch = input.size(0);
dimc++;
dimh++;
dimw++;
}
int64_t numChannels = input.size(dimc);
int64_t inputHeight = input.size(dimh);
int64_t inputWidth = input.size(dimw);
auto* rawInput = input.data_ptr<scalar_t>();
auto* rawIndices = indices.data_ptr<int64_t>();
auto* rawOutput = output.data_ptr<scalar_t>();
at::internal::lazy_init_num_threads();
for (int64_t n = 0; n < numBatch; n++) {
int64_t nOutputOffset = n * numChannels * owidth * oheight;
int64_t nInputOffset = n * numChannels * inputWidth * inputHeight;
int64_t k = 0;
bool has_error = false;
int64_t error_index = 0;
#pragma omp parallel for private(k)
for (k = 0; k < numChannels; k++) {
int64_t finalOutputOffset = nOutputOffset + k * owidth * oheight;
int64_t finalInputOffset = nInputOffset + k * inputWidth * inputHeight;
scalar_t* output_p_k = rawOutput + finalOutputOffset;
scalar_t* input_p_k = rawInput + finalInputOffset;
int64_t* ind_p_k = rawIndices + finalInputOffset;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t maxp;
for (int64_t i = 0; i < inputHeight; i++) {
for (int64_t j = 0; j < inputWidth; j++) {
maxp = ind_p_k[i * inputWidth + j];
if (maxp < 0 || maxp >= owidth * oheight) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
output_p_k[maxp] = input_p_k[i * inputWidth + j];
}
}
}
}
if (has_error) {
AT_ERROR(
"Found an invalid max index: ",
error_index,
" (output volumes are of size ",
oheight,
"x",
owidth);
(void)error_index;
}
}
return output;
}
Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
Tensor& max_unpooling2d_forward_out_cpu(
const Tensor& self_,
const Tensor& indices_,
IntArrayRef output_size,
Tensor& output) {
auto oheight = output_size[0];
auto owidth = output_size[1];
TORCH_CHECK(output.is_contiguous(), "output must be contiguous");
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64");
@ -100,8 +27,9 @@ Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
TORCH_CHECK(self_.numel() > 0, "Input must be non-empty");
auto self = self_.contiguous();
auto indices = indices_.contiguous();
auto memory_format = self_.suggest_memory_format();
auto self = self_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);
if (self.ndimension() == 3) {
int64_t numChannels = self.size(0);
@ -109,15 +37,11 @@ Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
} else {
int64_t numBatch = self.size(0);
int64_t numChannels = self.size(1);
output.resize_({numBatch, numChannels, oheight, owidth});
output.resize_({numBatch, numChannels, oheight, owidth}, memory_format);
}
output.zero_();
AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling2d_forward_out_cpu_frame", ([&] {
max_unpooling2d_forward_out_cpu_frame<scalar_t>(
output, self, indices, oheight, owidth);
}));
max_unpool2d_kernel(kCPU, output, self, indices);
return output;
};
@ -130,87 +54,6 @@ Tensor max_unpooling2d_forward_cpu(
return output;
}
template <typename scalar_t>
Tensor max_unpooling3d_forward_out_cpu_frame(
Tensor& output,
const Tensor& input,
const Tensor& indices,
int64_t oT,
int64_t oH,
int64_t oW) {
int64_t nBatch = 1;
int64_t dimw = 3;
int64_t dimh = 2;
int64_t dimt = 1;
if (input.ndimension() == 5) {
nBatch = input.size(0);
dimw++;
dimh++;
dimt++;
}
int64_t nSlices = input.size(dimt - 1);
int64_t iT = input.size(dimt);
int64_t iH = input.size(dimh);
int64_t iW = input.size(dimw);
scalar_t* input_data = input.data_ptr<scalar_t>();
scalar_t* output_data = output.data_ptr<scalar_t>();
int64_t* indices_data = indices.data_ptr<int64_t>();
at::internal::lazy_init_num_threads();
for (int64_t p = 0; p < nBatch; p++) {
int64_t inputOffset = p * nSlices * iT * iW * iH;
int64_t outputOffset = p * nSlices * oT * oW * oH;
int64_t k = 0;
bool has_error = false;
int error_index = 0;
#pragma omp parallel for private(k)
for (k = 0; k < nSlices; k++) {
int64_t finalInputOffset = inputOffset + k * iT * iW * iH;
int64_t finalOutputOffset = outputOffset + k * oT * oW * oH;
scalar_t* output_p_k = output_data + finalOutputOffset;
scalar_t* input_p_k = input_data + finalInputOffset;
int64_t* ind_p_k = indices_data + finalInputOffset;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int maxp;
for (int64_t t = 0; t < iT; t++) {
for (int64_t i = 0; i < iH; i++) {
for (int64_t j = 0; j < iW; j++) {
int64_t index = t * iH * iW + i * iW + j;
maxp = ind_p_k[index];
if (maxp < 0 || maxp >= oT * oW * oH) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
output_p_k[maxp] = input_p_k[index];
}
}
}
}
if (has_error) {
AT_ERROR(
"found an invalid max index ",
error_index,
" (output volumes are of size ",
oT,
"x",
oH,
"x",
oW);
(void)error_index;
}
}
}
return output;
}
static void max_unpooling3d_shape_check(
const Tensor& input,
const Tensor& gradOutput,
@ -310,16 +153,7 @@ Tensor& max_unpooling3d_forward_out_cpu(const Tensor& self_,
}
output.zero_();
AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling3d_forward_out_cpu_frame", ([&] {
max_unpooling3d_forward_out_cpu_frame<scalar_t>(
output,
self,
indices,
oT,
oH,
oW);
}));
max_unpool3d_kernel(kCPU, output, self, indices);
return output;
}
@ -335,59 +169,6 @@ Tensor max_unpooling3d_forward_cpu(
return output;
}
template <typename scalar_t>
static void max_unpooling2d_backward_out_cpu_frame(
scalar_t* gradInput_p,
scalar_t* gradOutput_p,
int64_t* ind_p,
int64_t nslices,
int64_t iheight,
int64_t iwidth,
int64_t oheight,
int64_t owidth) {
bool has_error = false;
int64_t error_index = 0;
int64_t k = 0;
at::internal::lazy_init_num_threads();
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++) {
scalar_t* gradInput_p_k = gradInput_p + k * iwidth * iheight;
scalar_t* gradOutput_p_k = gradOutput_p + k * owidth * oheight;
int64_t* ind_p_k = ind_p + k * iwidth * iheight;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t i, j;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t maxp;
for (i = 0; i < iheight; i++) {
for (j = 0; j < iwidth; j++) {
maxp = ind_p_k[i * iwidth + j]; /* retrieve position of max */
if (maxp < 0 || maxp >= owidth * oheight) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
}
gradInput_p_k[i * iwidth + j] =
gradOutput_p_k[maxp]; /* update gradient */
}
}
}
if (has_error) {
AT_ERROR(
"invalid max index ",
error_index,
", owidth= ",
owidth,
", oheight= ",
oheight);
(void)error_index;
}
}
Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
const Tensor& self,
const Tensor& indices_,
@ -396,42 +177,24 @@ Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
int dimw = 2;
int dimh = 1;
int nbatch = 1;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int nslices;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iheight;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iwidth;
int64_t ndim = self.ndimension();
int64_t dimh = ndim == 3 ? 1 : 2;
int64_t dimw = ndim == 3 ? 2 : 3;
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64");
TORCH_CHECK(
self.sizes() == indices_.sizes(), "Input shape must match indices shape");
TORCH_CHECK(output_size.size() == 2, "Output size must be 2");
/* get contiguous gradOutput and indices */
auto grad_output = grad_output_.contiguous();
auto indices = indices_.contiguous();
auto memory_format = self.suggest_memory_format();
auto grad_output = grad_output_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);
/* resize */
grad_input.resize_as_(self);
grad_input.resize_(self.sizes(), memory_format);
grad_input.zero_();
if (self.ndimension() == 4) {
nbatch = self.size(0);
dimw++;
dimh++;
}
/* sizes */
nslices = self.size(dimh - 1);
iheight = self.size(dimh);
iwidth = self.size(dimw);
if (owidth != grad_output.size(dimw) || oheight != grad_output.size(dimh)) {
AT_ERROR(
"Inconsistent gradOutput size. output height = ",
@ -443,23 +206,8 @@ Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
"x",
grad_output.size(dimw));
}
AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling2d_backward_out_cpu_frame", ([&] {
int p;
for (p = 0; p < nbatch; p++) {
auto inputOffset = p * nslices * iheight * iwidth;
auto outputOffset = p * nslices * oheight * owidth;
max_unpooling2d_backward_out_cpu_frame<scalar_t>(
grad_input.data_ptr<scalar_t>() + inputOffset,
grad_output.data_ptr<scalar_t>() + outputOffset,
indices.data_ptr<int64_t>() + inputOffset,
nslices,
iheight,
iwidth,
oheight,
owidth);
}
}));
max_unpool2d_backward_kernel(kCPU, grad_input, grad_output, indices);
return grad_input;
}
@ -468,72 +216,14 @@ Tensor max_unpooling2d_backward_cpu(
const Tensor& self,
const Tensor& indices,
IntArrayRef output_size) {
auto grad_input = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
at::native::max_unpooling2d_backward_out_cpu(
auto grad_input = at::empty({0}, self.options());
max_unpooling2d_backward_out_cpu(
grad_output, self, indices, output_size, grad_input);
return grad_input;
}
template <typename scalar_t>
static void max_unpooling3d_backward_out_cpu_frame(
scalar_t* gradInput_p,
scalar_t* gradOutput_p,
int64_t* ind_p,
int64_t nslices,
int64_t iT,
int64_t iH,
int64_t iW,
int64_t oT,
int64_t oH,
int64_t oW) {
int64_t k = 0;
bool has_error = false;
int error_index = 0;
at::internal::lazy_init_num_threads();
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++) {
scalar_t* gradInput_p_k = gradInput_p + k * iT * iH * iW;
scalar_t* gradOutput_p_k = gradOutput_p + k * oT * oH * oW;
int64_t* ind_p_k = ind_p + k * iT * iH * iW;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t t, i, j, index;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t maxp;
for (t = 0; t < iT; t++) {
for (i = 0; i < iH; i++) {
for (j = 0; j < iW; j++) {
index = t * iH * iW + i * iW + j;
maxp = ind_p_k[index]; /* retrieve position of max */
if (maxp < 0 || maxp >= oT * oH * oW) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
}
gradInput_p_k[index] = gradOutput_p_k[maxp]; /* update gradient */
}
}
}
}
if (has_error) {
AT_ERROR(
"invalid max index ",
error_index,
", oT= ",
oT,
", oW= ",
oW,
",oH= ",
oH);
(void)error_index;
}
}
Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
Tensor& max_unpooling3d_backward_out_cpu(
const Tensor& grad_output_,
const Tensor& self,
const Tensor& indices_,
IntArrayRef output_size,
@ -541,26 +231,17 @@ Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
IntArrayRef padding,
Tensor& grad_input) {
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
auto oT = output_size[0];
auto oH = output_size[1];
auto oW = output_size[2];
int dimw = 3;
int dimh = 2;
int dimt = 1;
int nbatch = 1;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int nslices;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iT;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iH;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iW;
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
int64_t ndim = self.ndimension();
int64_t dimt = ndim == 4 ? 1 : 2;
int64_t dimh = ndim == 4 ? 2 : 3;
int64_t dimw = ndim == 4 ? 3 : 4;
max_unpooling3d_shape_check(
self, grad_output_, indices_, output_size, stride, padding);
// TODO (from THNN): check gradOutput shape
/* get contiguous gradOutput */
auto grad_output = grad_output_.contiguous();
auto indices = indices_.contiguous();
@ -568,39 +249,24 @@ Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
/* resize */
grad_input.resize_as_(self);
grad_input.zero_();
if (self.ndimension() == 5) {
nbatch = self.size(0);
dimt++;
dimw++;
dimh++;
if (oW != grad_output.size(dimw) || oH != grad_output.size(dimh) || oT != grad_output.size(dimt)) {
AT_ERROR(
"Inconsistent gradOutput size. output depth = ",
oT,
", output height = ",
oH,
", output width = ",
oW,
", gradOutput: ",
grad_output.size(dimt),
"x",
grad_output.size(dimh),
"x",
grad_output.size(dimw));
}
/* sizes */
nslices = self.size(dimt - 1);
iT = self.size(dimt);
iH = self.size(dimh);
iW = self.size(dimw);
/* backprop */
AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling3d_backward_out_cpu_frame", ([&] {
int p;
for (p = 0; p < nbatch; p++) {
int inputOffset = p * nslices * iT * iH * iW;
int outputOffset = p * nslices * oT * oH * oW;
max_unpooling3d_backward_out_cpu_frame<scalar_t>(
grad_input.data_ptr<scalar_t>() + inputOffset,
grad_output.data_ptr<scalar_t>() + outputOffset,
indices.data_ptr<int64_t>() + inputOffset,
nslices,
iT,
iH,
iW,
oT,
oH,
oW);
}
}));
max_unpool3d_backward_kernel(kCPU, grad_input, grad_output, indices);
return grad_input;
}
@ -611,10 +277,16 @@ Tensor max_unpooling3d_backward_cpu(
IntArrayRef output_size,
IntArrayRef stride,
IntArrayRef padding) {
auto grad_input = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto grad_input = at::empty({0}, self.options());
at::native::max_unpooling3d_backward_out_cpu(
grad_output, self, indices, output_size, stride, padding, grad_input);
return grad_input;
}
DEFINE_DISPATCH(max_unpool2d_kernel);
DEFINE_DISPATCH(max_unpool2d_backward_kernel);
DEFINE_DISPATCH(max_unpool3d_kernel);
DEFINE_DISPATCH(max_unpool3d_backward_kernel);
} // namespace native
} // namespace at

View File

@ -1209,12 +1209,15 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)
if (dim < sparse_dim) {
auto dim_indices = indices[dim];
auto cpu_dim_indices = indices[dim].to(c10::kCPU).contiguous();
int64_t* cpu_dim_indices_ptr = cpu_dim_indices.data_ptr<int64_t>();
auto cpu_index = index.to(c10::kCPU).contiguous();
int64_t* cpu_index_ptr = cpu_index.data_ptr<int64_t>();
std::vector<int64_t> zindices;
std::vector<int64_t> iindices;
int64_t new_nnz = 0;
for (const auto i : c10::irange(new_sizes[dim])) {
auto idx = index[i].item<int64_t>();
for (int64_t i = 0; i < new_sizes[dim]; i++) {
int64_t idx = cpu_index_ptr[i];
if (idx < -size || idx >= size) {
TORCH_CHECK_INDEX(false, "index_select(): index contains ", idx, " that is out of range for tensor of size ",
self.sizes(), " at dimension ", dim);
@ -1222,8 +1225,8 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)
if (idx < 0) {
idx += size;
}
for (const auto j : c10::irange(nnz)) {
auto jdx = dim_indices[j].item<int64_t>();
for (int64_t j = 0; j < nnz; j++) {
int64_t jdx = cpu_dim_indices_ptr[j];
if (idx == jdx) {
new_nnz++;
iindices.push_back(i);

View File

@ -251,12 +251,16 @@ static inline scalar_t area_pixel_compute_scale(
bool align_corners,
const c10::optional<double> scale) {
// see Note [area_pixel_compute_scale]
if (output_size > 1) {
return align_corners
? static_cast<scalar_t>(input_size - 1) / (output_size - 1)
: compute_scales_value<scalar_t>(scale, input_size, output_size);
} else {
return scalar_t(0);
if(align_corners){
if(output_size > 1) {
return static_cast<scalar_t>(input_size - 1) / (output_size - 1);
}
else {
return static_cast<scalar_t>(0);
}
}
else{
return compute_scales_value<scalar_t>(scale, input_size, output_size);
}
}
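As a piecewise formula (note the behavioral change: when align_corners is false, the code now always defers to compute_scales_value, which uses the explicit scales value if provided and input_size / output_size otherwise, instead of returning 0 for output_size <= 1):

\mathrm{scale} =
\begin{cases}
  (\mathrm{input\_size} - 1)/(\mathrm{output\_size} - 1) & \text{align\_corners},\ \mathrm{output\_size} > 1 \\
  0 & \text{align\_corners},\ \mathrm{output\_size} \le 1 \\
  \mathrm{compute\_scales\_value}(\mathrm{scale}, \mathrm{input\_size}, \mathrm{output\_size}) & \text{otherwise}
\end{cases}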

View File

@ -0,0 +1,385 @@
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/native/Pool.h>
#include <ATen/native/cpu/utils.h>
namespace at { namespace native {
namespace {
template <typename scalar_t, bool is_3d = false>
void cpu_max_unpool(
Tensor& output_,
const Tensor& input,
const Tensor& indices) {
auto output = output_.contiguous();
auto input_data = input.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
auto output_data = output.data_ptr<scalar_t>();
// NB: input tensor dimensions:
// MaxUnpool2d:
// dim = 3: CHW
// dim = 4: NCHW
// MaxUnpool3d:
// dim = 4: CDHW
// dim = 5: NCDHW
int64_t numel = input.numel();
int64_t ndim = input.ndimension();
// treat batch size and channels as one dimension
// and the feature map as another dimension
int64_t channels, output_depth, output_height, output_width;
if (is_3d) {
TORCH_CHECK(ndim == 4 || ndim == 5, "MaxUnpool3d: expect input to be 4d or 5d tensor.");
channels = ndim == 4 ? input.size(0) : input.size(0) * input.size(1);
output_depth = output.size(-3);
output_height = output.size(-2);
output_width = output.size(-1);
} else {
TORCH_CHECK(ndim == 3 || ndim == 4, "MaxUnpool2d: expect input to be 3d or 4d tensor.");
channels = ndim == 3 ? input.size(0) : input.size(0) * input.size(1);
output_depth = 1;
output_height = output.size(-2);
output_width = output.size(-1);
}
int64_t input_image_size = numel / channels;
int64_t output_image_size = output.numel() / channels;
bool has_error = false;
int64_t error_index = 0;
// parallel on dim N, C, D, H, W: [channels, input_image_size]
at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
int64_t c = 0;
int64_t ip = 0;
data_index_init(begin, c, channels, ip, input_image_size);
for (int64_t i = begin; i < end; i++) {
scalar_t* output_ptr = output_data + c * output_image_size;
int64_t maxp = indices_data[i];
if (maxp < 0 || maxp >= output_image_size) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
output_ptr[maxp] = input_data[i];
}
// move on to next input index
data_index_step(c, channels, ip, input_image_size);
}
});
if (has_error) {
if (is_3d) {
AT_ERROR("Found an invalid max index: ", error_index,
" (output volumes are of size ", output_depth,
"x", output_height, "x", output_width);
(void)error_index;
} else {
AT_ERROR("Found an invalid max index: ", error_index,
" (output volumes are of size ", output_height,
"x", output_width);
(void)error_index;
}
}
if (!output_.is_contiguous()) {
output_.copy_(output);
}
}
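The parallel loop above walks a flattened range over [channels, input_image_size] while keeping a (c, ip) pair in step with the linear index. A self-contained sketch of that bookkeeping (walk_range is a hypothetical stand-in for the data_index_init / data_index_step helpers):

#include <cstdint>

int64_t walk_range(int64_t begin, int64_t end, int64_t input_image_size) {
  int64_t c = begin / input_image_size;     // what data_index_init computes
  int64_t ip = begin % input_image_size;
  int64_t checksum = 0;
  for (int64_t i = begin; i < end; i++) {
    checksum += c * input_image_size + ip;  // always equals i
    if (++ip == input_image_size) {         // what data_index_step does
      ip = 0;
      ++c;
    }
  }
  return checksum;
}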
template <typename scalar_t>
void cpu_max_unpool_channels_last(
Tensor& output_,
const Tensor& input,
const Tensor& indices) {
TORCH_CHECK(input.ndimension() == 4,
"max_unpool2d with channels last format supports tensors with 4 dims");
auto memory_format = at::MemoryFormat::ChannelsLast;
auto output = output_.contiguous(memory_format);
auto input_data = input.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
auto output_data = output.data_ptr<scalar_t>();
int64_t nbatch = input.size(0);
int64_t channels = input.size(1);
int64_t input_height = input.size(2);
int64_t input_width = input.size(3);
int64_t output_height = output.size(2);
int64_t output_width = output.size(3);
int64_t input_image_size = input_height * input_width;
int64_t output_image_size = output_height * output_width;
bool has_error = false;
int64_t error_index = 0;
// parallel on dim N, H, W
at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) {
int64_t n = 0;
int64_t ip = 0;
data_index_init(begin, n, nbatch, ip, input_image_size);
for (int64_t i = begin; i < end; i++) {
scalar_t* input_ptr = input_data + i * channels;
int64_t* indices_ptr = indices_data + i * channels;
scalar_t* output_ptr = output_data + n * output_image_size * channels;
// can't do scatter on avx2 (only available on avx512)
for (int64_t c = 0; c < channels; c++) {
int64_t maxp = indices_ptr[c];
if (maxp < 0 || maxp >= output_image_size) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
output_ptr[maxp * channels + c] = input_ptr[c];
}
}
// move on to next input index
data_index_step(n, nbatch, ip, input_image_size);
}
});
if (has_error) {
AT_ERROR("Found an invalid max index: ", error_index,
" (output volumes are of size ", output_height,
"x", output_width);
(void)error_index;
}
if (!output_.is_contiguous(memory_format)) {
output_.copy_(output);
}
}
template <typename scalar_t, bool is_3d = false>
void cpu_max_unpool_backward(
Tensor& grad_input_,
const Tensor& grad_output,
const Tensor& indices) {
auto grad_input = grad_input_.contiguous();
auto grad_output_data = grad_output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
auto grad_input_data = grad_input.data_ptr<scalar_t>();
int64_t numel = grad_input.numel();
int64_t ndim = grad_output.ndimension();
// treat batch size and channels as one dimension
// and the feature map as another dimension
int64_t channels, output_depth, output_height, output_width;
if (is_3d) {
TORCH_CHECK(ndim == 4 || ndim == 5, "MaxUnpool3d_backward: expect grad_output to be 4d or 5d tensor.");
channels = ndim == 4 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
output_depth = grad_output.size(-3);
output_height = grad_output.size(-2);
output_width = grad_output.size(-1);
} else {
TORCH_CHECK(ndim == 3 || ndim == 4, "MaxUnpool2d_backward: expect grad_output to be 3d or 4d tensor.");
channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
output_depth = 1;
output_height = grad_output.size(-2);
output_width = grad_output.size(-1);
}
int64_t input_image_size = numel / channels;
int64_t output_image_size = grad_output.numel() / channels;
bool has_error = false;
int64_t error_index = 0;
// parallel on dim N, C, D, H, W
at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
int64_t c = 0;
int64_t ip = 0;
data_index_init(begin, c, channels, ip, input_image_size);
for (int64_t i = begin; i < end; i++) {
scalar_t* grad_output_ptr = grad_output_data + c * output_image_size;
int64_t maxp = indices_data[i];
if (maxp < 0 || maxp >= output_image_size) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
grad_input_data[i] = grad_output_ptr[maxp];
}
// move on to next input index
data_index_step(c, channels, ip, input_image_size);
}
});
if (has_error) {
if (is_3d) {
AT_ERROR("invalid max index ", error_index,
", odepth= ", output_depth,
", owidth= ", output_width,
", oheight= ", output_height);
(void)error_index;
} else {
AT_ERROR("invalid max index ", error_index,
", owidth= ", output_width,
", oheight= ", output_height);
(void)error_index;
}
}
if (!grad_input_.is_contiguous()) {
grad_input_.copy_(grad_input);
}
}
template <typename scalar_t>
void cpu_max_unpool_backward_channels_last(
Tensor& grad_input_,
const Tensor& grad_output,
const Tensor& indices) {
TORCH_CHECK(grad_output.ndimension() == 4,
"max_unpool2d backward with channels last format supports tensors with 4 dims.");
auto memory_format = at::MemoryFormat::ChannelsLast;
auto grad_input = grad_input_.contiguous(memory_format);
auto grad_input_data = grad_input.data_ptr<scalar_t>();
auto grad_output_data = grad_output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
int64_t nbatch = grad_input.size(0);
int64_t channels = grad_input.size(1);
int64_t input_height = grad_input.size(2);
int64_t input_width = grad_input.size(3);
int64_t output_height = grad_output.size(2);
int64_t output_width = grad_output.size(3);
int64_t input_image_size = input_height * input_width;
int64_t output_image_size = output_height * output_width;
bool has_error = false;
int64_t error_index = 0;
// parallel on dim N, H, W
at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) {
int64_t n = 0;
int64_t ip = 0;
data_index_init(begin, n, nbatch, ip, input_image_size);
for (int64_t i = begin; i < end; i++) {
scalar_t* grad_output_ptr = grad_output_data + n * output_image_size * channels;
scalar_t* grad_input_ptr = grad_input_data + i * channels;
int64_t* indices_ptr = indices_data + i * channels;
for (int64_t c = 0; c < channels; c++) {
int64_t maxp = indices_ptr[c];
if (maxp < 0 || maxp >= output_image_size) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
grad_input_ptr[c] = grad_output_ptr[maxp * channels + c];
}
}
// move on to next input index
data_index_step(n, nbatch, ip, input_image_size);
}
});
if (has_error) {
AT_ERROR("invalid max index ", error_index,
", owidth= ", output_width,
", oheight= ", output_height);
(void)error_index;
}
if (!grad_input_.is_contiguous(memory_format)) {
grad_input_.copy_(grad_input);
}
}
void max_unpool2d_kernel_impl(
Tensor& output,
const Tensor& input,
const Tensor& indices) {
switch(input.suggest_memory_format()) {
case at::MemoryFormat::Contiguous: {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool2d", [&] {
cpu_max_unpool<scalar_t, /*is_3d*/false>(output, input, indices);
});
break;
}
case at::MemoryFormat::ChannelsLast: {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool2d_channels_last", [&] {
cpu_max_unpool_channels_last<scalar_t>(output, input, indices);
});
break;
}
default:
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
}
void max_unpool3d_kernel_impl(
Tensor& output,
const Tensor& input,
const Tensor& indices) {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool3d", [&] {
cpu_max_unpool<scalar_t, /*is_3d*/true>(output, input, indices);
});
}
void max_unpool2d_backward_kernel_impl(
Tensor& grad_input,
const Tensor& grad_output,
const Tensor& indices) {
switch(grad_output.suggest_memory_format()) {
case at::MemoryFormat::Contiguous: {
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward", [&] {
cpu_max_unpool_backward<scalar_t, /*is_3d*/false>(grad_input, grad_output, indices);
});
break;
}
case at::MemoryFormat::ChannelsLast: {
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward_channels_last", [&] {
cpu_max_unpool_backward_channels_last<scalar_t>(grad_input, grad_output, indices);
});
break;
}
default:
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
}
void max_unpool3d_backward_kernel_impl(
Tensor& grad_input,
const Tensor& grad_output,
const Tensor& indices) {
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool3d_backward", [&] {
cpu_max_unpool_backward<scalar_t, /*is_3d*/true>(grad_input, grad_output, indices);
});
}
} // anonymous namespace
REGISTER_DISPATCH(max_unpool2d_kernel, &max_unpool2d_kernel_impl);
REGISTER_DISPATCH(max_unpool2d_backward_kernel, &max_unpool2d_backward_kernel_impl);
REGISTER_DISPATCH(max_unpool3d_kernel, &max_unpool3d_kernel_impl);
REGISTER_DISPATCH(max_unpool3d_backward_kernel, &max_unpool3d_backward_kernel_impl);
}} // at::native

View File

@ -0,0 +1,16 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/DispatchStub.h>
#pragma once
namespace at { namespace native {
using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_backward_kernel);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_backward_kernel);
}} // at::native

View File

@ -163,24 +163,29 @@ static void std_var_kernel_impl(TensorIterator& iter, int64_t correction, bool t
}
static void prod_kernel_impl(TensorIterator& iter) {
// Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context]
// Workaround for the error: '*' in boolean context, suggest '&&' instead
// [-Werror=int-in-bool-context]
if (iter.dtype() == ScalarType::Bool) {
using scalar_t = bool;
binary_kernel_reduce_vec(
iter,
[=](scalar_t a, scalar_t b) -> scalar_t { return a && b; },
[=](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return a && b; },
// NOLINTNEXTLINE(bugprone-argument-comment)
/*identity=*/1);
iter,
[=](scalar_t a, scalar_t b)
__ubsan_ignore_undefined__ -> scalar_t { return a && b; },
[=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
__ubsan_ignore_undefined__ { return a && b; },
// NOLINTNEXTLINE(bugprone-argument-comment)
/*identity=*/1);
} else {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] {
binary_kernel_reduce_vec(
iter,
[=](scalar_t a, scalar_t b) -> scalar_t { return a * b; },
[=](Vectorized <scalar_t> a, Vectorized <scalar_t> b) { return a * b; },
// NOLINTNEXTLINE(bugprone-argument-comment)
/*identity=*/1);
});
iter,
[=](scalar_t a, scalar_t b)
__ubsan_ignore_undefined__ -> scalar_t { return a * b; },
[=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
__ubsan_ignore_undefined__ { return a * b; },
// NOLINTNEXTLINE(bugprone-argument-comment)
/*identity=*/1);
});
}
}
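As the comment notes, multiplying bools trips -Werror=int-in-bool-context, so the bool specialization reduces with && and identity 1 (true), which is just an all-of. A tiny stand-alone check of that equivalence (generic C++, not the TensorIterator path):

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<bool> v{true, true, false, true};
  // Product over bools with identity "true" reduces to a logical AND of all elements.
  bool prod = std::accumulate(v.begin(), v.end(), true,
                              [](bool a, bool b) { return a && b; });
  assert(prod == std::all_of(v.begin(), v.end(), [](bool b) { return b; }));
  return 0;
}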

View File

@ -648,7 +648,9 @@ Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, c
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
if (at::cuda::detail::canUse32BitIndexMath(self) && batch_norm_use_channels_last_kernels(self)){
if (at::cuda::detail::canUse32BitIndexMath(self) &&
batch_norm_use_channels_last_kernels(self) &&
batch_norm_use_channels_last_kernels(input)) {
return batch_norm_backward_elemt_channels_last_cuda_template(self, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count);
}

View File

@ -1649,7 +1649,8 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto stride = input.sizes()[1];
const auto reduction_size = input.numel() / stride;
at::Tensor grad_input = at::empty_like(input, input.suggest_memory_format());
// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;
dim3 grid;
@ -1716,7 +1717,8 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto reduction_size = input.numel() / stride;
auto norm_fct = 1.0 / reduction_size;
at::Tensor grad_input = at::empty_like(input, input.suggest_memory_format());
// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;
dim3 grid;

View File

@ -207,6 +207,87 @@ struct offset_t {
}
namespace {
// Segmented sort by full sort algorithm:
// Say we are sorting a (2, 3) tensor. We have in flattened form:
// values 0.4 1.2 5.3 6.2 1.3 2.3
// indices 0 1 2 0 1 2
// segment_id 0 0 0 1 1 1
// First we sort by values, globally:
// values 6.2 5.3 2.3 1.3 1.2 0.4
// indices 0 2 2 1 1 0
// segment_id 1 0 1 1 0 0
// Then we stable sort by segment id:
// values 5.3 1.2 0.4 6.2 2.3 1.3
// indices 2 1 0 0 2 1
// segment_id 0 0 0 1 1 1
// This method can only work if the slice we are sorting (`dim`) is
// innermost, and both values and indices are contiguous. We do this
// by re-arranging the input into this form as needed, which will
// unfortunately allocate memory if the request is not in this form.
// Vectorized sort is slower than iterated sort if the number of
// slices is small (since we're sorting twice, instead of invoking a
// smaller sort `numSlices` times), but the cub sort
// implementation here is a catch-all, so we're not looking for
// efficiency, but instead correctness.
template<typename scalar_t>
__global__ void sort_postprocess_kernel(const scalar_t *in, scalar_t *out, int64_t *index, const int2 *i_s_ptr, int nsegments, int nsort) {
CUDA_KERNEL_LOOP(i, nsegments * nsort) {
int segment = i / nsort;
int j = i % nsort;
int offset = segment * nsort;
const scalar_t *in_ = in + offset;
scalar_t *out_ = out + offset;
int64_t *index_ = index + offset;
const int2 *i_s_ptr_ = i_s_ptr + offset;
int idx = i_s_ptr_[j].y;
index_[j] = idx;
out_[j] = in_[idx];
}
}
template<typename scalar_t>
inline void segmented_sort_pairs_by_full_sort(
int64_t nsegments, int64_t nsort, int64_t n, bool descending, const Tensor &indices,
const scalar_t *self_ptr, scalar_t *values_ptr, int64_t *indices_ptr
) {
int64_t segment_bits = std::max<int64_t>(1L, static_cast<int64_t>(std::ceil(std::log2(nsegments))));
auto int_options = indices.options().dtype(kInt);
auto indices_and_segment = at::empty({nsegments, nsort, 2}, int_options);
indices_and_segment.select(-1, 0).copy_( // segment id
at::arange(nsegments, int_options).view({nsegments, 1}).expand({nsegments, nsort}));
indices_and_segment.select(-1, 1).copy_( // reverse indices
at::arange(nsort, int_options).view({1, nsort}).expand({nsegments, nsort}));
auto i_s_ptr = reinterpret_cast<int2 *>(indices_and_segment.data_ptr<int>());
auto indices_and_segment2 = at::empty_like(indices_and_segment);
auto i_s_ptr2 = reinterpret_cast<int2 *>(indices_and_segment2.data_ptr<int>());
at::cuda::cub::sort_pairs<scalar_t, int2>(
self_ptr, nullptr, i_s_ptr, i_s_ptr2,
n, descending);
TORCH_INTERNAL_ASSERT(segment_bits <= 32);
// sort on lower 32bits, i.e. segment index
at::cuda::cub::sort_keys<int64_t>(
reinterpret_cast<int64_t *>(i_s_ptr2), reinterpret_cast<int64_t *>(i_s_ptr),
n, false, 0, segment_bits);
sort_postprocess_kernel<<<(n + 511) / 512, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
self_ptr, values_ptr, indices_ptr, i_s_ptr, nsegments, nsort);
}
} // namespace
// We perform a segmented sort in cub with inputs that have
// more than 1024/2048 elements along the selected dimension.
// Otherwise, we do an inplace bitonic sort (see sortKeyValueInplace).
@ -349,11 +430,15 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
int64_t n = std::min(remaining, nbatch);
int64_t nsegments = n / nsort;
auto reverse_indices = at::arange(nsort, indices.options()).view({1, nsort}).expand({nsegments, nsort}).contiguous();
at::cuda::cub::segmented_sort_pairs(self_ptr, values_ptr,
reverse_indices.data_ptr<int64_t>(), indices_ptr, n, nsegments,
offset_t{(int)nsort, 0}, offset_t{(int)nsort, 1}, descending);
if (nsegments < 128) {
segmented_sort_pairs_by_full_sort(nsegments, nsort, n, descending,
indices, self_ptr, values_ptr, indices_ptr);
} else {
auto reverse_indices = at::arange(nsort, indices.options()).view({1, nsort}).expand({nsegments, nsort}).contiguous();
at::cuda::cub::segmented_sort_pairs(self_ptr, values_ptr,
reverse_indices.data_ptr<int64_t>(), indices_ptr, n, nsegments,
offset_t{(int)nsort, 0}, offset_t{(int)nsort, 1}, descending);
}
remaining -= n;
self_ptr += n;

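The two-pass trick described in the comment of this hunk — one global sort by value, then a stable sort by segment id — can be reproduced on the CPU with std::sort and std::stable_sort. The sketch below replays the (2, 3) example from the comment for a descending sort; it is illustrative only and has nothing to do with the cub kernels:

#include <algorithm>
#include <cstdio>
#include <vector>

struct Entry { float value; int index; int segment; };

int main() {
  // Flattened (2, 3) tensor from the comment: values, per-slice indices, segment ids.
  std::vector<Entry> e = {
      {0.4f, 0, 0}, {1.2f, 1, 0}, {5.3f, 2, 0},
      {6.2f, 0, 1}, {1.3f, 1, 1}, {2.3f, 2, 1}};
  // Pass 1: global sort by value (descending).
  std::sort(e.begin(), e.end(),
            [](const Entry& a, const Entry& b) { return a.value > b.value; });
  // Pass 2: stable sort by segment id. Stability preserves the descending order
  // inside each segment, so every slice comes out sorted.
  std::stable_sort(e.begin(), e.end(),
                   [](const Entry& a, const Entry& b) { return a.segment < b.segment; });
  for (const Entry& x : e)
    std::printf("%.1f (index %d, segment %d)\n", x.value, x.index, x.segment);
  // Prints 5.3 1.2 0.4 | 6.2 2.3 1.3, matching the comment's final rows.
  return 0;
}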
View File

@ -94,11 +94,16 @@ __host__ __forceinline__ static accscalar_t area_pixel_compute_scale(
int output_size,
bool align_corners,
const c10::optional<double> scale) {
if (output_size > 1) {
return align_corners ? (accscalar_t)(input_size - 1) / (output_size - 1)
: compute_scales_value<accscalar_t>(scale, input_size, output_size);
} else {
return static_cast<accscalar_t>(0);
if(align_corners) {
if(output_size > 1) {
return (accscalar_t)(input_size - 1) / (output_size - 1);
}
else {
return static_cast<accscalar_t>(0);
}
}
else{
return compute_scales_value<accscalar_t>(scale, input_size, output_size);
}
}
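The reshuffled helper above only special-cases output_size <= 1 when align_corners is set; otherwise it always defers to compute_scales_value. A host-side sketch of the same control flow, with compute_scales_value replaced by a hypothetical fallback that uses an explicit scale when one is supplied and the size ratio otherwise:

#include <cstdio>

// Hypothetical stand-in for compute_scales_value.
double fallback_scale(double explicit_scale, int input_size, int output_size) {
  return explicit_scale > 0.0 ? 1.0 / explicit_scale
                              : static_cast<double>(input_size) / output_size;
}

double area_pixel_scale(int input_size, int output_size,
                        bool align_corners, double explicit_scale) {
  if (align_corners) {
    // Corners map onto corners; with a single output element there is no
    // spacing to compute, so the scale collapses to zero.
    return output_size > 1
        ? static_cast<double>(input_size - 1) / (output_size - 1)
        : 0.0;
  }
  return fallback_scale(explicit_scale, input_size, output_size);
}

int main() {
  std::printf("%f\n", area_pixel_scale(8, 4, /*align_corners=*/true, -1.0));   // 2.333...
  std::printf("%f\n", area_pixel_scale(8, 4, /*align_corners=*/false, -1.0));  // 2.0
  return 0;
}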

View File

@ -393,31 +393,32 @@ kernel void clamp(texture2d_array<half, access::read> in_arr[[texture(0), functi
}
}
kernel void hardswish(texture2d_array<half, access::read> in[[texture(0)]],
texture2d_array<half, access::write> out[[texture(1)]],
constant bool hardswish_is_arr = (ushort_arg_0 > 1 || ushort_arg_1 > 4);
constant bool hardswish_is_tex = !hardswish_is_arr;
kernel void hardswish(texture2d_array<half, access::read> in_arr[[texture(0), function_constant(hardswish_is_arr)]],
texture2d<half, access::read> in_tex[[texture(0), function_constant(hardswish_is_tex)]],
texture2d_array<half, access::write> out_arr[[texture(1), function_constant(hardswish_is_arr)]],
texture2d<half, access::write> out_tex[[texture(1), function_constant(hardswish_is_tex)]],
ushort3 gid[[thread_position_in_grid]]) {
if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
const ushort oH = ushort_arg_2;
const ushort oW = ushort_arg_3;
if (gid.x >= oW || gid.y >= oH) {
return;
}
ushort2 gid_ = gid.xy;
half4 value = in.read(gid_, gid.z);
half4 mask1 = half4(value < 3.0);
half4 mask2 = half4(value > -3.0);
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
out.write(outval, gid_, gid.z);
}
kernel void hardswish_nonarray(texture2d<half, access::read> in[[texture(0)]],
texture2d<half, access::write> out[[texture(1)]],
ushort2 gid[[thread_position_in_grid]]) {
if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
return;
if (hardswish_is_arr) {
half4 value = in_arr.read(gid_, gid.z);
half4 mask1 = half4(value < 3.0);
half4 mask2 = half4(value > -3.0);
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
out_arr.write(outval, gid_, gid.z);
} else {
half4 value = in_tex.read(gid_);
half4 mask1 = half4(value < 3);
half4 mask2 = half4(value > -3.0);
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
out_tex.write(outval, gid_);
}
half4 value = in.read(gid);
half4 mask1 = half4(value < 3);
half4 mask2 = half4(value > -3.0);
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
out.write(outval, gid);
}
constant bool out_is_arr = (ushort_arg_3 > 1 || ushort_arg_2 > 4);
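Both texture paths of the merged kernel evaluate the same branch-free hardswish: one mask keeps the x*(x+3)/6 branch for |x| < 3, a second mask zeroes everything at or below -3, and values at or above 3 pass through. A scalar C++ sketch of that mask arithmetic checked against the piecewise definition (illustrative only, not the Metal code):

#include <cassert>
#include <cmath>

// Mask form used by the shader.
float hardswish_masked(float x) {
  float mask1 = x < 3.0f ? 1.0f : 0.0f;   // inner-branch selector
  float mask2 = x > -3.0f ? 1.0f : 0.0f;  // zero everything at or below -3
  return mask2 * (mask1 * (x * (x + 3.0f) / 6.0f) + (1.0f - mask1) * x);
}

// Piecewise reference definition.
float hardswish_ref(float x) {
  if (x <= -3.0f) return 0.0f;
  if (x >= 3.0f) return x;
  return x * (x + 3.0f) / 6.0f;
}

int main() {
  for (float x = -6.0f; x <= 6.0f; x += 0.25f)
    assert(std::fabs(hardswish_masked(x) - hardswish_ref(x)) < 1e-6f);
  return 0;
}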

View File

@ -41,6 +41,7 @@ bool test_softmax();
bool test_sigmoid();
bool test_hardsigmoid();
bool test_hardswish();
bool test_hardswish2();
bool test_upsampling_nearest2d_vec();
bool test_upsampling_nearest2d_vec2();
bool test_adaptive_avg_pool2d();

View File

@ -262,6 +262,18 @@ bool test_hardswish() {
});
}
bool test_hardswish2() {
__block std::vector<int64_t> size{1, 3, 44, 44};
return TEST(size, __PRETTY_FUNCTION__, ^bool {
auto X =
at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat)) * 12 - 6;
auto X2 = X.metal();
auto Y1 = at::hardswish_(X);
auto Y2 = at::hardswish_(X2).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_addmm() {
bool result = true;
for (int i = 0; i < ITER_COUNT; ++i) {

View File

@ -69,6 +69,7 @@
REG_TEST("test_sigmoid", test_sigmoid);
REG_TEST("test_hardsigmoid", test_hardsigmoid);
REG_TEST("test_hardswish", test_hardswish);
REG_TEST("test_hardswish2", test_hardswish2);
REG_TEST("test_upsampling_nearest2d_vec", test_upsampling_nearest2d_vec);
REG_TEST("test_upsampling_nearest2d_vec2", test_upsampling_nearest2d_vec2);
REG_TEST("test_adaptive_avg_pool2d", test_adaptive_avg_pool2d);

View File

@ -24,9 +24,9 @@ Tensor& hardswish_(Tensor& input) {
id<MTLComputeCommandEncoder> encoder =
[commandBuffer.buffer computeCommandEncoder];
id<MTLComputePipelineState> state = [[MetalContext sharedInstance]
specializedPipelineState:mpscnn::kernelFor(
X, "hardswish", "hardswish_nonarray")
specializedPipelineState:"hardswish"
Constants:@[
@(X.numberOfImages),
@(X.featureChannels),
@(X.height),
@(X.width)

View File

@ -6652,7 +6652,7 @@
device_check: NoCheck # TensorIterator
variants: method
- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor
- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, float label_smoothing=0.0) -> Tensor
python_module: nn
- func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR)

View File

@ -1,3 +1,5 @@
#include <ATen/native/quantized/cpu/qembeddingbag_prepack.h>
#include <c10/core/ScalarType.h>
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
@ -122,7 +124,6 @@ c10::intrusive_ptr<EmbeddingPackedParamsBase> PackedEmbeddingBagWeight::prepack(
namespace at {
namespace native {
namespace {
// Note - This is a temporary pack function for embedding bag which quantizes
// and packs the float weight tensor. In the next step it will be replaced by a
@ -184,7 +185,7 @@ namespace {
//
// [[50. , 60.00000035],
// [70. , 80.00000035]]])
Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
// The "last" dimension of an N-Dimensioned batch of embedding bags is
// quantization channel. E.g. for a 2D embedding bag, this has
// [ row, col ] dimensions, for batched of embedding bags, dimensions might be
@ -208,17 +209,12 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
const int32_t embedding_cols = weight_sizes[cols_dim];
// Add 8 bytes per column to store FP32 scale and zero_point per row.
const int32_t output_columns = embedding_cols + 2 * sizeof(float);
Tensor weight_contig = weight.contiguous(weight.suggest_memory_format());
const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format());
// Adjust output dimensions to account for FP32 scale and zero_points.
std::vector<int64_t> output_shape = weight_sizes.vec();
output_shape[cols_dim] = output_columns;
// Allocate output packed weights
auto output = at::empty(
output_shape,
weight_contig.options().dtype(at::kByte),
weight_contig.suggest_memory_format());
at::native::resize_(output, output_shape, c10::nullopt);
auto* output_data = output.data_ptr<uint8_t>();
#ifdef USE_FBGEMM
@ -246,10 +242,9 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
}
#else
const auto float_weight = weight_contig.scalar_type() == at::ScalarType::Half
? weight_contig.to(at::ScalarType::Float)
: weight_contig;
const auto weight_data = float_weight.data_ptr<float>();
const auto weight_data = weight_contig->scalar_type() == at::ScalarType::Half
? weight_contig->to(at::ScalarType::Float).data_ptr<float>()
: weight_contig->data_ptr<float>();
constexpr float kEpsilon = 1e-8f;
for (auto row: c10::irange(embedding_rows)) {
const float* input_row = weight_data + row * embedding_cols;
@ -276,6 +271,21 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
return output;
}
Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format());
auto output = at::detail::empty_cpu(
{0},
at::kByte,
weight_contig->layout(),
weight_contig->device(),
c10::nullopt,
c10::nullopt);
qembeddingbag_byte_prepack_out(output, weight);
return output;
}
namespace {
// TODO: Extend support to N-D batched embeddings, similar to qembeddingbag_byte_prepack
Tensor _qembeddingbag_nbit_prepack_helper(
const Tensor& weight,

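The byte-prepack path above quantizes each embedding row to uint8 and appends the row's FP32 scale and minimum after the data, which is where output_columns = embedding_cols + 2 * sizeof(float) comes from. A minimal CPU sketch of that per-row layout, assuming the same min/max scheme as the non-FBGEMM branch (it skips half inputs and other details of the real op):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Pack a [rows, cols] float matrix row by row into uint8, appending the row's
// float scale and minimum, so each packed row is cols + 2 * sizeof(float) bytes.
std::vector<uint8_t> byte_prepack(const std::vector<float>& w, int rows, int cols) {
  const int out_cols = cols + 2 * static_cast<int>(sizeof(float));
  std::vector<uint8_t> out(static_cast<size_t>(rows) * out_cols);
  for (int r = 0; r < rows; ++r) {
    const float* row = w.data() + static_cast<size_t>(r) * cols;
    const float mn = *std::min_element(row, row + cols);
    const float mx = *std::max_element(row, row + cols);
    const float range = std::max(mx - mn, 1e-8f);  // guard against constant rows
    const float scale = range / 255.0f;
    uint8_t* out_row = out.data() + static_cast<size_t>(r) * out_cols;
    for (int c = 0; c < cols; ++c)
      out_row[c] = static_cast<uint8_t>(std::lrintf((row[c] - mn) / scale));
    std::memcpy(out_row + cols, &scale, sizeof(float));
    std::memcpy(out_row + cols + sizeof(float), &mn, sizeof(float));
  }
  return out;
}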
View File

@ -0,0 +1,11 @@
#include <ATen/ATen.h>
namespace at {
namespace native {
Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight);
Tensor qembeddingbag_byte_prepack(const Tensor& weight);
} // namespace native
} // namespace at

View File

@ -11,7 +11,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <THC/THCThrustAllocator.cuh>

View File

@ -0,0 +1,74 @@
/*
Functions here use deprecated cuSPARSE API that was removed in CUDA 11.
This file will be removed eventually.
*/
#include <ATen/Dispatch.h>
#include <ATen/SparseTensorUtils.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
namespace at {
namespace native {
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, const Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, const Tensor& dense) {
TORCH_INTERNAL_ASSERT(nnz > 0);
// No half support, so we don't have to use CUDATypeConversion
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
values.scalar_type(), "addmm_sparse_cuda", [&] {
scalar_t cast_beta = beta.to<scalar_t>();
scalar_t cast_alpha = alpha.to<scalar_t>();
Tensor r__;
if (cast_beta == scalar_t(0)) {
r_.zero_();
} else if (!at::sparse::is_same_tensor(t, r_)) {
r_.copy_(t);
}
if (r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) {
r__ = r_;
} else {
// Note: This storage arrangement is preferred because most of the CUDA kernels handle only contiguous tensors
r__ = r_.transpose(0, 1).clone(at::MemoryFormat::Contiguous);
r__.transpose_(0, 1);
}
TORCH_INTERNAL_ASSERT(r__.transpose(-1, -2).is_contiguous());
Tensor dense_;
char transpose_dense;
if (dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) {
transpose_dense = 'n';
dense_ = dense;
} else if (dense.stride(1) == 1 && dense.stride(0) == dense.size(1)) {
transpose_dense = 't';
dense_ = dense;
} else {
transpose_dense = 't';
dense_ = dense.contiguous();
}
sparse::cuda::csrmm2(
'n',
transpose_dense,
m,
n,
k,
nnz,
cast_alpha,
values.data_ptr<scalar_t>(),
crow_indices.data_ptr<int32_t>(),
col_indices.data_ptr<int32_t>(),
dense_.data_ptr<scalar_t>(),
(transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)),
cast_beta,
r__.data_ptr<scalar_t>(),
r__.stride(1));
if (!at::sparse::is_same_tensor(r__, r_)) {
r_.copy_(r__);
}
}
);
}
} // namespace native
} // namespace at

View File

@ -0,0 +1,18 @@
#pragma once
#include <ATen/Tensor.h>
#include <ATen/core/Scalar.h>
/*
Functions here use deprecated cuSPARSE API that was removed in CUDA 11.
Here only 32-bit sparse indices are supported.
This file will be removed eventually.
*/
namespace at {
namespace native {
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, const Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, const Tensor& dense);
} // namespace native
} // namespace at

View File

@ -2,7 +2,7 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/util/Exception.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <cusparse.h>
@ -14,7 +14,7 @@
// Using these APIs in any other systems will result in compile-time or run-time failures.
// Their support will be extended in the next releases.
#if defined(__CUDACC__) && (CUSPARSE_VERSION >= 11000 || (!defined(_MSC_VER) && CUSPARSE_VERSION >= 10301))
#if defined(CUDART_VERSION) && (CUSPARSE_VERSION >= 11000 || (!defined(_MSC_VER) && CUSPARSE_VERSION >= 10301))
#define IS_SPMM_AVAILABLE() 1
#else
#define IS_SPMM_AVAILABLE() 0

View File

@ -5,8 +5,9 @@
#include <ATen/NativeFunctions.h>
#include <ATen/SparseTensorUtils.h>
#include <ATen/native/sparse/SparseTensorMath.h>
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
#include <ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/CUDAUtils.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
@ -50,64 +51,6 @@ namespace {
}
}
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& crow_indices, Tensor& col_indices, Tensor& values, const Tensor& dense) {
TORCH_INTERNAL_ASSERT(nnz > 0);
// No half support, so we don't have to use CUDATypeConversion
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
values.scalar_type(), "addmm_sparse_cuda", [&] {
scalar_t cast_beta = beta.to<scalar_t>();
scalar_t cast_alpha = alpha.to<scalar_t>();
Tensor r__;
if (cast_beta == scalar_t(0)) {
r_.zero_();
} else if (!is_same_tensor(t, r_)) {
r_.copy_(t);
}
if(r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) {
r__ = r_;
} else {
// Note: This storage arrangement is preferred because most of the CUDA kernels handle only contiguous tensors
r__ = r_.transpose(0, 1).clone(at::MemoryFormat::Contiguous);
r__.transpose_(0, 1);
}
Tensor dense_;
char transpose_dense;
if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) {
transpose_dense = 'n';
dense_ = dense;
} else if(dense.stride(1) == 1 && dense.stride(0) == dense.size(1)) {
transpose_dense = 't';
dense_ = dense;
} else {
transpose_dense = 't';
dense_ = dense.contiguous();
}
sparse::cuda::csrmm2(
'n',
transpose_dense,
m,
n,
k,
nnz,
cast_alpha,
values.data_ptr<scalar_t>(),
crow_indices.data_ptr<int32_t>(),
col_indices.data_ptr<int32_t>(),
dense_.data_ptr<scalar_t>(),
(transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)),
cast_beta,
r__.data_ptr<scalar_t>(),
r__.stride(1));
if (!is_same_tensor(r__, r_)) {
r_.copy_(r__);
}
}
);
}
// NB: Deleted spaddcmul (aka addcmul_, but not actually wired up), spaddcdiv (not
// wired at all)

View File

@ -6,8 +6,6 @@
namespace at { namespace native {
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& crow_indices, Tensor& col_indices, Tensor& values, const Tensor& dense);
void s_addmm_out_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& indices, Tensor& values, const Tensor& dense);
}} // namespace at::native

View File

@ -19,7 +19,8 @@
#include <ATen/cuda/CUDAUtils.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <ATen/native/sparse/cuda/SparseCUDATensorMath.cuh>
#include <thrust/device_ptr.h>

View File

@ -18,7 +18,7 @@
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>
#include <cusparse.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <thrust/device_vector.h>

View File

@ -0,0 +1,28 @@
#version 450 core
#define PRECISION $precision
layout(std430) buffer;
/* Qualifiers: layout - storage - precision - memory */
layout(set = 0, binding = 0) uniform PRECISION restrict writeonly image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
ivec4 size;
float negative_slope;
} uBlock;
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
if (all(lessThan(pos, uBlock.size.xyz))) {
const vec4 inval = texelFetch(uInput, pos, 0);
const vec4 negative_values = vec4(lessThan(inval, vec4(0.0f)));
const vec4 positive_values = vec4(1.0) - negative_values;
const vec4 mask = negative_values * vec4(uBlock.negative_slope) + positive_values;
const vec4 outval = inval * mask;
imageStore(uOutput, pos, outval);
}
}

View File

@ -0,0 +1,27 @@
#version 450 core
#define PRECISION $precision
layout(std430) buffer;
/* Qualifiers: layout - storage - precision - memory */
layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION restrict Block {
ivec4 size;
float negative_slope;
} uBlock;
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
if (all(lessThan(pos, uBlock.size.xyz))) {
const vec4 inval = imageLoad(uOutput, pos);
const vec4 negative_values = vec4(lessThan(inval, vec4(0.0f)));
const vec4 positive_values = vec4(1.0) - negative_values;
const vec4 mask = negative_values * vec4(uBlock.negative_slope) + positive_values;
const vec4 outval = inval * mask;
imageStore(uOutput, pos, outval);
}
}
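Both shaders compute leaky ReLU without branching: a 0/1 mask of the negative lanes scales them by negative_slope while positive lanes keep a factor of 1. The same arithmetic in scalar C++, checked against the piecewise definition (a sketch, not the Vulkan op):

#include <cassert>
#include <cmath>

float leaky_relu_masked(float x, float slope) {
  float negative = x < 0.0f ? 1.0f : 0.0f;   // lessThan() mask in the shader
  float positive = 1.0f - negative;
  float mask = negative * slope + positive;  // slope for negatives, 1 otherwise
  return x * mask;
}

float leaky_relu_ref(float x, float slope) {
  return x < 0.0f ? slope * x : x;
}

int main() {
  for (float x = -5.0f; x <= 5.0f; x += 0.5f)
    for (float slope : {0.01f, 1.0f, -0.001f})
      assert(std::fabs(leaky_relu_masked(x, slope) - leaky_relu_ref(x, slope)) < 1e-6f);
  return 0;
}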

View File

@ -404,6 +404,121 @@ Tensor& hardshrink_(
return self;
}
Tensor leaky_relu(
const Tensor& self_arg,
const Scalar& negative_slope) {
api::Context* const context = api::context();
const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan();
const vTensor& v_self = convert(self);
vTensor v_output{
context,
v_self.sizes(),
v_self.options(),
};
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
float negative_slope;
} block {
v_output.extents(),
0u,
negative_slope.to<float>(),
};
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(leaky_relu),
v_output.extents(),
context->gpu().adapter->local_work_group_size(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_self.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
Tensor& leaky_relu_(
Tensor& self,
const Scalar& negative_slope) {
api::Context* const context = api::context();
TORCH_CHECK(
self.is_vulkan(),
"Vulkan: In-place leaky relu is only supported on Vulkan tensors.");
vTensor& v_self = convert(self);
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
float negative_slope;
} block {
v_self.extents(),
0u,
negative_slope.to<float>(),
};
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(leaky_relu_),
v_self.extents(),
context->gpu().adapter->local_work_group_size(),
// Read-Write access triggers an async synchronization if necessary
// and inserts appropriate barriers if hazards are detected.
v_self.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Read | vTensor::Access::Write),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_pool.submit(context->gpu().queue, command_buffer);
return self;
}
Tensor sigmoid(const Tensor& self) {
return ops::activation(self, VK_KERNEL(sigmoid));
}
@ -433,6 +548,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
m.impl(TORCH_SELECTIVE_NAME("aten::hardswish_"), hardswish_);
m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh"), hardtanh);
m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh_"), hardtanh_);
m.impl(TORCH_SELECTIVE_NAME("aten::leaky_relu"), leaky_relu);
m.impl(TORCH_SELECTIVE_NAME("aten::leaky_relu_"), leaky_relu_);
m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid"), sigmoid);
m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid_"), sigmoid_);
m.impl(TORCH_SELECTIVE_NAME("aten::tanh"), tanh);

View File

@ -43,6 +43,8 @@ namespace at {
// at namespace already.
namespace {
${dispatch_helpers}
${dispatch_anonymous_definitions}
TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) {

View File

@ -979,6 +979,49 @@ TEST(VulkanAPITest, hardshrink_) {
}
}
TEST(VulkanAPITest, leaky_relu) {
if (!at::is_vulkan_available()) {
return;
}
for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) {
const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
const auto in_vulkan = in_cpu.vulkan();
const auto out_cpu = at::leaky_relu(in_cpu, negative_slope);
const auto out_vulkan = at::leaky_relu(in_vulkan, negative_slope);
const auto check = almostEqual(out_cpu, out_vulkan.cpu());
if (!check) {
showRtol(out_cpu, out_vulkan.cpu());
}
ASSERT_TRUE(check);
}
}
TEST(VulkanAPITest, leaky_relu_) {
if (!at::is_vulkan_available()) {
return;
}
for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) {
auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
auto vulkan = cpu.vulkan();
at::leaky_relu_(cpu, negative_slope);
at::leaky_relu_(vulkan, negative_slope);
const auto check = almostEqual(cpu, vulkan.cpu());
if (!check) {
showRtol(cpu, vulkan.cpu());
}
ASSERT_TRUE(check);
}
}
TEST(VulkanAPITest, hardswish) {
if (!at::is_vulkan_available()) {
return;

View File

@ -6,6 +6,7 @@ add_executable(
bench_batchnorm.cpp
bench_concat.cpp
bench_compile.cpp
bench_signed_log1p.cpp
bench_fuser_overhead.cpp
bench_gemm.cpp
bench_parallel.cpp

View File

@ -0,0 +1,120 @@
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
using namespace torch::jit::tensorexpr;
namespace {
class SignedLog1pBench : public benchmark::Fixture {
public:
void SetUp(const benchmark::State& state) override {
input_size_ = {state.range(0), state.range(1)};
input_size_int_ = {state.range(0), state.range(1)};
input_ = torch::rand(input_size_);
ref_ = signedLog1p(input_);
}
void TearDown(benchmark::State& state) override {
TORCH_CHECK(at::allclose(ref_, output_));
state.counters["GB/s"] = benchmark::Counter(
uint64_t(state.iterations()) * 2 * output_.nbytes(),
benchmark::Counter::kIsRate);
}
at::Tensor signedLog1p(const at::Tensor& inp) {
auto sign = at::sign(inp);
auto log1p = at::log1p(at::abs(inp));
return sign * log1p;
}
void runATen(benchmark::State& state) {
for (auto _ : state) {
output_ = signedLog1p(input_);
}
}
void runNNC(benchmark::State& state) {
Placeholder input_ph(
"input", kFloat, {input_size_int_[0], input_size_int_[1]});
Tensor abs_result = Compute(
"aten_abs",
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
[&](const VarHandle& m, const VarHandle& n) {
return abs(input_ph.load(m, n));
});
Tensor log1p_result = Compute(
"aten_log1p",
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
[&](const VarHandle& m, const VarHandle& n) {
return log1p(abs_result.load(m, n));
});
Tensor sign = Compute(
"aten_sign",
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
[&](const VarHandle& m, const VarHandle& n) {
return CompareSelect::make(
input_ph.load(m, n),
ExprHandle(0.0f),
ExprHandle(-1),
ExprHandle(1),
kLT);
});
Tensor output = Compute(
"aten_mul",
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
[&](const VarHandle& m, const VarHandle& n) {
return sign.load(m, n) * log1p_result.load(m, n);
});
LoopNest nest({output}, {abs_result, log1p_result, sign, output});
GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
nest.inlineIntermediateBufs(true);
nest.prepareForCodegen();
nest.simplify();
nest.vectorizeInnerLoops();
nest.simplify();
GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
// StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
std::vector<CodeGen::BufferArg> buf_args;
buf_args.push_back(input_ph);
buf_args.push_back(output);
LLVMCodeGen cg(nest.root_stmt(), buf_args);
std::vector<CodeGen::CallArg> call_args;
for (auto _ : state) {
output_ = at::empty_like(ref_);
call_args.clear();
call_args.push_back(input_.data_ptr<float>());
call_args.push_back(output_.data_ptr<float>());
cg.call(call_args);
}
}
private:
std::vector<long> input_size_;
std::vector<int> input_size_int_;
at::Tensor input_;
at::Tensor output_;
at::Tensor ref_;
};
} // namespace
BENCHMARK_DEFINE_F(SignedLog1pBench, ATen)(benchmark::State& state) {
runATen(state);
}
BENCHMARK_DEFINE_F(SignedLog1pBench, NNC)(benchmark::State& state) {
runNNC(state);
}
BENCHMARK_REGISTER_F(SignedLog1pBench, ATen)->Args({10, 1467});
BENCHMARK_REGISTER_F(SignedLog1pBench, NNC)->Args({10, 1467});
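The benchmark's reference, signedLog1p, is sign(x) * log1p(|x|), and the NNC lowering rebuilds it from the same pieces (abs, log1p, a CompareSelect-based sign, multiply). A scalar C++ version of the math for reference; note that a -1/+1 CompareSelect at x == 0 still yields 0 after the multiply, since log1p(0) is 0 (a sketch, not the benchmark code):

#include <cassert>
#include <cmath>

float signed_log1p(float x) {
  const float sign = static_cast<float>((x > 0.0f) - (x < 0.0f));  // -1, 0, or +1
  return sign * std::log1p(std::fabs(x));
}

int main() {
  assert(std::fabs(signed_log1p(1.0f) - std::log1p(1.0f)) < 1e-6f);
  assert(std::fabs(signed_log1p(-2.5f) + std::log1p(2.5f)) < 1e-6f);
  assert(signed_log1p(0.0f) == 0.0f);
  return 0;
}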

View File

@ -762,3 +762,42 @@ const std::string quantize_script = R"IR(
%1249: Tensor = aten::dequantize(%1254)
return (%1249)
)IR";
const auto fmod_tensor = R"JIT(
def forward(self, a: Tensor, b: Tensor):
return torch.fmod(a, b).clone()
)JIT";
const auto fmod_scalar = R"JIT(
def forward(self, a: Tensor, b: int):
return torch.fmod(a, b).clone()
)JIT";
const std::string embedding_bag_byte_prepack_script = R"IR(
graph(%input: Tensor):
%none : None = prim::Constant()
%output: Tensor = quantized::embedding_bag_byte_prepack(%input)
%res: Tensor = aten::clone(%output, %none)
return (%res)
)IR";
const auto linalg_norm_ord_scalar = R"JIT(
def forward(self, a: Tensor, ord: int, dim: List[int], keepdim: bool, dtype: int):
return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone()
)JIT";
const auto linalg_norm_ord_str = R"JIT(
def forward(self, a: Tensor, ord: str, dim: List[int], keepdim: bool, dtype: int):
return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone()
)JIT";
const std::string cat_script = R"IR(
graph(%a: Tensor, %b: Tensor, %dim: int):
%ten_list: Tensor[] = prim::ListConstruct(%a, %b)
%1 : int = prim::Constant[value=0]()
%2 : int = prim::Constant[value=1]()
%3 : int = prim::Constant[value=1]()
%ten_list2 : Tensor[] = aten::slice(%ten_list, %1, %2, %3)
%ret: Tensor = aten::cat(%ten_list2, %dim)
return (%ret)
)IR";

View File

@ -1,5 +1,6 @@
#include <gtest/gtest.h>
#include <torch/csrc/jit/ir/alias_analysis.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/runtime/static/fusion.h>
#include <torch/csrc/jit/runtime/static/impl.h>
#include <torch/csrc/jit/runtime/static/passes.h>
@ -209,6 +210,13 @@ TEST(StaticRuntime, EmbeddingBag) {
}
TEST(StaticRuntime, LayerNorm) {
#ifdef FBCODE_CAFFE2
script::Module module("module");
module.define(layer_norm_with_weights);
torch::jit::StaticModule smodule(module);
ASSERT_EQ(getNodeWithKind(smodule, "aten::layer_norm"), nullptr);
ASSERT_NE(getNodeWithKind(smodule, "static_runtime::layer_norm"), nullptr);
#endif
const auto a = torch::rand({1, 2, 2, 2});
const auto b = torch::rand({3, 2, 2, 2});
for (int normalized_size : {2, 3}) {
@ -1223,3 +1231,83 @@ TEST(StaticRuntime, IndividualOps_VarStack) {
testStaticRuntime(var_stack_script, args1, args2);
}
TEST(StaticRuntime, IndividualOps_FmodTensor) {
// fmod tensor version
auto a = at::randn({2, 3});
auto b = at::randn({2, 3});
std::vector<IValue> args0{a, b};
testStaticRuntime(fmod_tensor, args0);
// check for dynamic shapes
auto c = at::randn({4, 3, 2});
auto d = at::randn({4, 3, 2});
std::vector<IValue> args1{c, d};
testStaticRuntime(fmod_tensor, args0, args1);
}
TEST(StaticRuntime, IndividualOps_FmodScalar) {
auto a = at::randn({2, 3});
// fmod scalar version
std::vector<IValue> args2{a, 3};
testStaticRuntime(fmod_scalar, args2);
// check for dynamic shapes
auto c = at::randn({4, 3, 2});
std::vector<IValue> args3{c, 4};
testStaticRuntime(fmod_scalar, args2, args3);
}
TEST(StaticRuntime, QEmbeddingBagByteUnpack) {
auto a = torch::randn({8, 16}, at::ScalarType::Float);
auto b = torch::randn({8*2, 16*2}, at::ScalarType::Float);
testStaticRuntime(embedding_bag_byte_prepack_script, {a});
testStaticRuntime(embedding_bag_byte_prepack_script, {a},{b});
}
TEST(StaticRuntime, IndividualOps_LinalgNorm_ScalarOrd) {
auto a = at::randn({2, 3});
auto dim = std::vector<int64_t>({1});
auto dtype = at::ScalarType::Float;
std::vector<IValue> args0{a, 4, dim, true, dtype};
testStaticRuntime(linalg_norm_ord_scalar, args0);
auto b = at::randn({4, 5});
std::vector<IValue> args1{b, 4, dim, true, dtype};
testStaticRuntime(linalg_norm_ord_scalar, args0, args1);
}
TEST(StaticRuntime, IndividualOps_LinalgNorm_StringOrd) {
auto a = at::randn({2, 3});
auto dim = std::vector<int64_t>({0, 1});
auto dtype = at::ScalarType::Float;
std::vector<IValue> args0{a, "fro", dim, true, dtype};
testStaticRuntime(linalg_norm_ord_str, args0);
auto b = at::randn({4, 5});
std::vector<IValue> args1{b, "fro", dim, true, dtype};
testStaticRuntime(linalg_norm_ord_str, args0, args1);
}
TEST(StaticRuntime, IndividualOps_Cat) {
auto graph = std::make_shared<Graph>();
std::unordered_map<std::string, Value*> vmap;
parseIR(cat_script, graph.get(), vmap);
torch::jit::StaticModule smodule(graph);
ASSERT_TRUE(getNodeWithKind(smodule, "aten::cat"));
auto a = at::randn({2, 4});
auto b = at::randn({3, 4});
std::vector<IValue> args0{a, b, 0};
testStaticRuntime(cat_script, args0);
auto c = at::randn({3, 4});
auto d = at::randn({3, 5});
std::vector<IValue> args1{c, d, 1};
testStaticRuntime(cat_script, args0, args1);
}

View File

@ -4,8 +4,10 @@ namespace c10 {
namespace {
// By default, grad mode is enabled and inference mode is disabled
thread_local AutogradState autograd_state_tls =
AutogradState(/* grad_mode */ true, /* inference_mode */ false);
thread_local AutogradState autograd_state_tls = AutogradState(
/* grad_mode */ true,
/* inference_mode */ false,
/* fw_grad_mode */ true);
} // namespace
AutogradState& AutogradState::get_tls_state() {

View File

@ -12,13 +12,19 @@ struct C10_API AutogradState {
static AutogradState& get_tls_state();
static void set_tls_state(AutogradState state);
AutogradState(bool grad_mode, bool inference_mode)
: grad_mode_(grad_mode), inference_mode_(inference_mode) {}
AutogradState(bool grad_mode, bool inference_mode, bool fw_grad_mode)
: grad_mode_(grad_mode),
inference_mode_(inference_mode),
fw_grad_mode_(fw_grad_mode) {}
void set_grad_mode(bool enabled) {
grad_mode_ = enabled;
}
void set_fw_grad_mode(bool enabled) {
fw_grad_mode_ = enabled;
}
void set_inference_mode(bool enabled) {
inference_mode_ = enabled;
}
@ -27,6 +33,10 @@ struct C10_API AutogradState {
return grad_mode_;
}
bool get_fw_grad_mode() const {
return fw_grad_mode_;
}
bool get_inference_mode() const {
return inference_mode_;
}
@ -34,6 +44,7 @@ struct C10_API AutogradState {
private:
bool grad_mode_ : 1;
bool inference_mode_ : 1;
bool fw_grad_mode_ : 1;
};
} // namespace c10

View File

@ -1,4 +1,3 @@
#include <c10/core/AutogradState.h>
#include <c10/core/GradMode.h>
#include <stdexcept>

View File

@ -1,5 +1,6 @@
#pragma once
#include <c10/core/AutogradState.h>
#include <c10/macros/Macros.h>
namespace c10 {
@ -27,4 +28,17 @@ struct TORCH_API NoGradGuard : public AutoGradMode {
NoGradGuard() : AutoGradMode(/*enabled=*/false) {}
};
// A RAII, thread local (!) guard that enables or disables forward grad mode
// upon construction, and sets it back to the original value upon destruction.
struct TORCH_API AutoFwGradMode {
AutoFwGradMode(bool enabled)
: prev_mode(AutogradState::get_tls_state().get_fw_grad_mode()) {
AutogradState::get_tls_state().set_fw_grad_mode(enabled);
}
~AutoFwGradMode() {
AutogradState::get_tls_state().set_fw_grad_mode(prev_mode);
}
bool prev_mode;
};
} // namespace c10
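AutoFwGradMode follows the same RAII shape as the existing AutoGradMode guard: capture the thread-local forward-grad flag, override it for the scope, and restore it in the destructor (including on exceptions). A hedged usage sketch; the helper function below is hypothetical and only illustrates the pattern:

#include <c10/core/GradMode.h>  // now also pulls in AutogradState

// Hypothetical helper: run fn on this thread with forward-mode AD disabled,
// restoring the previous fw_grad_mode flag when the guard goes out of scope.
template <typename Fn>
auto with_fw_grad_disabled(Fn&& fn) {
  c10::AutoFwGradMode guard(/*enabled=*/false);
  return fn();
}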

View File

@ -53,10 +53,12 @@ struct TORCH_API InferenceMode {
InferenceMode(bool enabled = true)
: prev_mode(AutogradState::get_tls_state()),
prev_keyset(c10::impl::tls_local_dispatch_key_set()) {
// Enabling inference mode means disabling grad mode
// And disabling inference mode means enabling grad mode
AutogradState::set_tls_state(
AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled));
// Enabling inference mode means disabling grad modes
// And disabling inference mode means enabling grad modes
AutogradState::set_tls_state(AutogradState(
/* grad_mode */ !enabled,
/* inference_mode */ enabled,
/* fw_grad_mode */ !enabled));
DispatchKeySet included = enabled
? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView)
: prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView);

View File

@ -68,7 +68,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
StorageImpl() = delete;
StorageImpl(StorageImpl&& other) = default;
StorageImpl(const StorageImpl&) = delete;
~StorageImpl() = default;
~StorageImpl() override = default;
void reset() {
data_ptr_.clear();

View File

@ -6,12 +6,13 @@
#include <initializer_list>
#include <ostream>
#include <set>
#include <vector>
#include <unordered_map>
#include <vector>
#include "c10/util/Registry.h"
#include "caffe2/core/common.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/filler.h"
#include "caffe2/utils/proto_utils.h"
@ -273,8 +274,8 @@ class TORCH_API OpSchema {
OpSchema&
Arg(const char* name, const char* description, bool required = false);
#define DECLARE_STANDARD_ARG(name, str) \
static const char* Arg_##name; \
#define DECLARE_STANDARD_ARG(name, str) \
static const char* Arg_##name; \
OpSchema& Arg##name(const char* description);
DECLARE_STANDARD_ARG(IsTest, is_test)
@ -339,7 +340,9 @@ class TORCH_API OpSchema {
return inplace_enforced_(x, y);
}
TORCH_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema);
TORCH_API friend std::ostream& operator<<(
std::ostream& out,
const OpSchema& schema);
const std::vector<Argument>& args() const {
return args_;
@ -562,8 +565,10 @@ OpSchema::Cost PointwiseCostInference(
}
c.flops = nElemX * OpsPerPoint;
c.bytes_read = nElemRead * sizeof(X.data_type());
c.bytes_written = nElemX * sizeof(X.data_type());
auto const& X_element_size_byte =
DataTypeToTypeMeta(X.data_type()).itemsize();
c.bytes_read = nElemRead * X_element_size_byte;
c.bytes_written = nElemX * X_element_size_byte;
return c;
}
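The recurring change in this and the following cost-inference hunks replaces sizeof(X.data_type()) — which is the size of the serialized dtype enum value, not of a tensor element — with the element width obtained via DataTypeToTypeMeta(...).itemsize(). A stand-alone illustration of the difference, using a made-up dtype tag rather than the caffe2 proto types:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Made-up dtype tag, similar in spirit to TensorProto::DataType. Its sizeof is
// the size of the enum's underlying int, regardless of which dtype it names.
enum class DataType : int32_t { FLOAT, DOUBLE, INT8 };

size_t item_size(DataType t) {
  switch (t) {
    case DataType::FLOAT:  return sizeof(float);    // 4
    case DataType::DOUBLE: return sizeof(double);   // 8
    case DataType::INT8:   return sizeof(int8_t);   // 1
  }
  return 0;
}

int main() {
  // sizeof(tag) is 4 for every dtype, so byte counts computed from it are wrong
  // for anything that is not 4 bytes wide; the lookup gives the real width.
  std::printf("sizeof(tag)=%zu double=%zu int8=%zu\n",
              sizeof(DataType), item_size(DataType::DOUBLE), item_size(DataType::INT8));
  return 0;
}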

View File

@ -1,6 +1,7 @@
#include "caffe2/operators/batch_matmul_op.h"
#include "caffe2/core/operator_schema.h"
#include "caffe2/core/types.h"
namespace caffe2 {
@ -116,9 +117,13 @@ OpSchema::Cost CostInferenceForBatchMatMul(
K = in[0].dims(ndims_A - 1);
}
auto const& A_element_size_byte =
DataTypeToTypeMeta(A.data_type()).itemsize();
auto const& Y_element_size_byte =
DataTypeToTypeMeta(Y.data_type()).itemsize();
c.flops = 2 * nElemY * K;
c.bytes_read = (nElemA + nElemB) * sizeof(A.data_type());
c.bytes_written = nElemY * sizeof(Y.data_type());
c.bytes_read = (nElemA + nElemB) * A_element_size_byte;
c.bytes_written = nElemY * Y_element_size_byte;
c.params_bytes = 0;
return c;
}
@ -180,72 +185,76 @@ class GetBatchMatMulGradient : public GradientMakerBase {
auto no_trans_arg = vector<Argument>();
auto trans_a_arg = vector<Argument>{MakeArgument<int>("trans_a", 1)};
auto trans_b_arg = vector<Argument>{MakeArgument<int>("trans_b", 1)};
auto trans_both_arg = vector<Argument>{MakeArgument<int>("trans_a", 1),
MakeArgument<int>("trans_b", 1)};
auto trans_both_arg = vector<Argument>{
MakeArgument<int>("trans_a", 1), MakeArgument<int>("trans_b", 1)};
if (trans_a) {
if (trans_b) {
// A'B':
// dA = B'G', dB = G'A'
return vector<OperatorDef>{CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(1), GO(0)},
vector<string>{GI(0)},
trans_both_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(0)},
vector<string>{GI(1)},
trans_both_arg)};
return vector<OperatorDef>{
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(1), GO(0)},
vector<string>{GI(0)},
trans_both_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(0)},
vector<string>{GI(1)},
trans_both_arg)};
} else {
// A'B:
// dA = BG', dB = AG
return vector<OperatorDef>{CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(1), GO(0)},
vector<string>{GI(0)},
trans_b_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(1)},
no_trans_arg)};
return vector<OperatorDef>{
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(1), GO(0)},
vector<string>{GI(0)},
trans_b_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(1)},
no_trans_arg)};
}
} else {
if (trans_b) {
// AB':
// dA = GB, dB = G'A
return vector<OperatorDef>{CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)},
no_trans_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(0)},
vector<string>{GI(1)},
trans_a_arg)};
return vector<OperatorDef>{
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)},
no_trans_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(0)},
vector<string>{GI(1)},
trans_a_arg)};
} else {
// AB:
// dA = GB', dB = A'G
return vector<OperatorDef>{CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)},
trans_b_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(1)},
trans_a_arg)};
return vector<OperatorDef>{
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)},
trans_b_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(1)},
trans_a_arg)};
}
}
}

View File

@ -101,9 +101,12 @@ OpSchema::Cost CostInferenceForSplit(
CAFFE_ENFORCE_GT(in.size(), 0);
struct OpSchema::Cost cost;
cost.flops = 0;
auto input_bytes_count = nElemFromDim(in[0]) * sizeof(in[0].data_type());
auto split_bytes_count =
(in.size() == 1) ? 0 : nElemFromDim(in[1]) * sizeof(in[1].data_type());
auto const& input_0_element_size_byte =
DataTypeToTypeMeta(in[0].data_type()).itemsize();
auto const& input_1_element_size_byte =
(in.size() > 1) ? DataTypeToTypeMeta(in[1].data_type()).itemsize() : 0;
auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte;
auto split_bytes_count = (in.size() > 1) ? nElemFromDim(in[1]) * input_1_element_size_byte : 0;
// There can be two input blobs:
// (1) actual tensor to be split
// (2) lengths of outputs along split axis
@ -329,11 +332,13 @@ OpSchema::Cost CostInferenceForConcat(
}
auto split_info_bytes_count = in.size() * sizeof(int);
auto const& input_0_element_size_byte =
DataTypeToTypeMeta(in[0].data_type()).itemsize();
struct OpSchema::Cost cost;
cost.flops = 0;
cost.bytes_read = nElemRead * sizeof(in[0].data_type());
cost.bytes_read = nElemRead * input_0_element_size_byte;
cost.bytes_written =
size * sizeof(in[0].data_type()) + split_info_bytes_count;
size * input_0_element_size_byte + split_info_bytes_count;
cost.params_bytes = 0;
return cost;
}

View File

@ -7,6 +7,7 @@
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2_legacy.pb.h"
#include "caffe2/utils/math.h"
@ -519,14 +520,20 @@ class ConvPoolOpBase : public Operator<Context> {
uint64_t nElemW = nElemFromDim(W);
uint64_t nElemBias = inputs.size() > 2 ? nElemFromDim(inputs[2]) : 0;
auto const& X_element_size_byte =
DataTypeToTypeMeta(X.data_type()).itemsize();
auto const& Y_element_size_byte =
DataTypeToTypeMeta(Y.data_type()).itemsize();
auto const& W_element_size_byte =
DataTypeToTypeMeta(W.data_type()).itemsize();
// grouping is NOT properly handled yet
c.flops = N * Y_t * Y_h * Y_w * kernel_t * kernel_w * kernel_h *
in_channels * out_channels * 2;
c.bytes_read = (nElemX + nElemW + nElemBias) * sizeof(X.data_type());
c.bytes_written =
N * out_channels * Y_t * Y_h * Y_w * sizeof(Y.data_type());
c.bytes_read = (nElemX + nElemW + nElemBias) * X_element_size_byte;
c.bytes_written = N * out_channels * Y_t * Y_h * Y_w * Y_element_size_byte;
c.params_bytes = out_channels * in_channels * kernel_t * kernel_h *
kernel_w * sizeof(W.data_type());
kernel_w * W_element_size_byte;
return c;
}

View File

@ -1,4 +1,5 @@
#include "caffe2/operators/distance_op.h"
#include "caffe2/core/types.h"
#include "caffe2/utils/eigen_utils.h"
#ifdef CAFFE2_USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
@ -7,7 +8,7 @@
namespace caffe2 {
template<>
template <>
bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& Y = Input(1);
@ -257,7 +258,9 @@ OpSchema::Cost CostInferenceForDotProduct(
CAFFE_ENFORCE_EQ(out[0].dims().size(), 1);
struct OpSchema::Cost c = PointwiseCostInference<2>(def, in);
c.bytes_written = out[0].dims(0) * sizeof(out[0].data_type());
auto const& out_0_element_size_byte =
DataTypeToTypeMeta(out[0].data_type()).itemsize();
c.bytes_written = out[0].dims(0) * out_0_element_size_byte;
c.params_bytes = 0;
return c;
}
@ -379,10 +382,12 @@ bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
}
// L2
REGISTER_CPU_OPERATOR(SquaredL2Distance,
SquaredL2DistanceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
SquaredL2DistanceGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
SquaredL2Distance,
SquaredL2DistanceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
SquaredL2DistanceGradient,
SquaredL2DistanceGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(SquaredL2Distance)
.NumInputs(2)
@ -402,7 +407,8 @@ class GetSquaredL2DistanceGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"SquaredL2DistanceGradient", "",
"SquaredL2DistanceGradient",
"",
vector<string>{I(0), I(1), GO(0)},
vector<string>{GI(0), GI(1)});
}
@ -762,9 +768,9 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
replicate = GetArgument(Def(), "replicate").i();
}
const auto dot_arg =
vector<Argument>{MakeArgument<float>("pad_value", pad_value),
MakeArgument<bool>("replicate", replicate)};
const auto dot_arg = vector<Argument>{
MakeArgument<float>("pad_value", pad_value),
MakeArgument<bool>("replicate", replicate)};
return SingleGradientDef(
"DotProductWithPaddingGradient",
@ -775,4 +781,4 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
}
};
REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient);
} // namespace caffe2
} // namespace caffe2

View File

@ -1,4 +1,5 @@
#include "caffe2/operators/fc_inference.h"
#include "caffe2/core/types.h"
namespace caffe2 {
std::vector<TensorShape> FCShapeInference(
@ -51,11 +52,12 @@ OpSchema::Cost CostInferenceForFC(
? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
: size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));
const auto& X = in[0];
auto const& X_element_size_byte =
DataTypeToTypeMeta(in[0].data_type()).itemsize();
c.flops = M * N * (2 * K + 1);
c.bytes_read = (K * (M + N) + N) * sizeof(X.data_type());
c.bytes_written = M * N * sizeof(X.data_type());
c.params_bytes = (K * N + N) * sizeof(X.data_type());
c.bytes_read = (K * (M + N) + N) * X_element_size_byte;
c.bytes_written = M * N * X_element_size_byte;
c.params_bytes = (K * N + N) * X_element_size_byte;
return c;
}
@ -94,7 +96,11 @@ OpSchema::Cost CostInferenceForFCGradient(
CAFFE_ENFORCE_LT(0, out.size());
const TensorShape dW = out[0];
auto const& dW_element_size_byte =
DataTypeToTypeMeta(dW.data_type()).itemsize();
const TensorShape db = out[1];
auto const& db_element_size_byte =
DataTypeToTypeMeta(db.data_type()).itemsize();
auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
@ -111,15 +117,17 @@ OpSchema::Cost CostInferenceForFCGradient(
uint64_t size_db = nElemFromDim(db);
c.flops = M * N * (2 * K + 1);
c.bytes_written = (size_dW + size_db) * sizeof(float);
c.bytes_written =
size_dW * dW_element_size_byte + size_db * db_element_size_byte;
c.params_bytes = (K * N + N) * sizeof(float);
if (out.size() == 3) {
const TensorShape dX = out[2];
uint64_t size_dX = nElemFromDim(dX);
auto const& dX_element_size_byte =
DataTypeToTypeMeta(dX.data_type()).itemsize();
c.flops += 2 * M * N * K;
c.bytes_written += size_dX * sizeof(float);
c.bytes_written += size_dX * dX_element_size_byte;
}
return c;
}

View File

@ -2,6 +2,7 @@
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/types.h"
namespace caffe2 {
@ -78,12 +79,21 @@ OpSchema::Cost CostInferenceForBatchOneHot(
const auto& length = in[1];
const auto& values = in[2];
uint64_t nBytesData = nElemFromDim(data) * sizeof(data.data_type());
uint64_t nBytesLength = nElemFromDim(length) * sizeof(length.data_type());
uint64_t nBytesValues = nElemFromDim(values) * sizeof(values.data_type());
auto const& data_element_size_byte =
DataTypeToTypeMeta(data.data_type()).itemsize();
auto const& length_element_size_byte =
DataTypeToTypeMeta(length.data_type()).itemsize();
auto const& values_element_size_byte =
DataTypeToTypeMeta(values.data_type()).itemsize();
auto const& output_element_size_byte =
DataTypeToTypeMeta(output.data_type()).itemsize();
uint64_t nBytesData = nElemFromDim(data) * data_element_size_byte;
uint64_t nBytesLength = nElemFromDim(length) * length_element_size_byte;
uint64_t nBytesValues = nElemFromDim(values) * values_element_size_byte;
c.flops = 0;
c.bytes_read = nBytesData + nBytesLength + nBytesValues;
c.bytes_written = nElemFromDim(output) * sizeof(output.data_type());
c.bytes_written = nElemFromDim(output) * output_element_size_byte;
c.params_bytes = 0;
return c;
}
@ -145,15 +155,15 @@ bool BatchBucketOneHotOp<CPUContext>::RunOnDevice() {
for (int64_t j = 0; j < D; j++) {
// here we assume the boundary values for each feature are sorted
int64_t lower_bucket_idx = std::lower_bound(
boundaries_offset,
boundaries_offset + lens_data[j],
input_data[pos]) -
boundaries_offset,
boundaries_offset + lens_data[j],
input_data[pos]) -
boundaries_offset;
int64_t upper_bucket_idx = std::upper_bound(
boundaries_offset,
boundaries_offset + lens_data[j],
input_data[pos]) -
boundaries_offset,
boundaries_offset + lens_data[j],
input_data[pos]) -
boundaries_offset;
int64_t bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2;

View File

@ -1,6 +1,7 @@
#include "caffe2/operators/utility_ops.h"
#include <cmath>
#include <iostream>
#include "caffe2/core/types.h"
#include "caffe2/utils/eigen_utils.h"
namespace caffe2 {
@ -34,9 +35,11 @@ OpSchema::Cost CostInferenceForWeightedSum(
const auto& nElem = nElemFromDim(X0);
const auto& nInputs = in.size();
c.flops = (nInputs - 1) * nElem;
c.bytes_read = (nInputs / 2) * (nElem + 1) * sizeof(X0.data_type());
c.bytes_written = nElem * sizeof(X0.data_type());
c.params_bytes = (nInputs / 2) * sizeof(X0.data_type());
auto const& X0_element_size_byte =
DataTypeToTypeMeta(X0.data_type()).itemsize();
c.bytes_read = (nInputs / 2) * (nElem + 1) * X0_element_size_byte;
c.bytes_written = nElem * X0_element_size_byte;
c.params_bytes = (nInputs / 2) * X0_element_size_byte;
return c;
}
@ -48,9 +51,7 @@ REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
REGISTER_CPU_OPERATOR(SumInt, SumOp<CPUContext>);
REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<CPUContext>);
REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp<CPUContext>);
REGISTER_CPU_OPERATOR(
ScatterWeightedSum,
ScatterWeightedSumOp<CPUContext>);
REGISTER_CPU_OPERATOR(ScatterWeightedSum, ScatterWeightedSumOp<CPUContext>);
REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<CPUContext>);
REGISTER_CPU_OPERATOR(Scatter, ScatterOp<CPUContext>);

View File

@ -7,33 +7,39 @@ from caffe2.python.test_util import TestCase
class TestConcatOpCost(TestCase):
def test_columnwise_concat(self):
workspace.ResetWorkspace()
workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=np.int32))
concat_op = core.CreateOperator(
"Concat",
["input_1", "input_2"],
["output", "split_info"],
)
workspace.RunOperatorOnce(concat_op)
def _test_columnwise_concat_for_type(dtype):
workspace.ResetWorkspace()
workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype))
workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=dtype))
concat_op = core.CreateOperator(
"Concat",
["input_1", "input_2"],
["output", "split_info"],
)
workspace.RunOperatorOnce(concat_op)
output = workspace.FetchBlob("output")
self.assertTupleEqual(output.shape, (2, 4))
np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
output = workspace.FetchBlob("output")
self.assertTupleEqual(output.shape, (2, 4))
np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
flops, bytes_written, bytes_read = workspace.GetOperatorCost(
concat_op, concat_op.input
)
flops, bytes_written, bytes_read = workspace.GetOperatorCost(
concat_op, concat_op.input
)
self.assertEqual(flops, 0)
self.assertEqual(
bytes_read,
sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
)
self.assertEqual(
bytes_written,
sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
)
self.assertEqual(flops, 0)
self.assertEqual(
bytes_read,
sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
)
self.assertEqual(
bytes_written,
sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
)
[
_test_columnwise_concat_for_type(t)
for t in [np.int64, np.float, np.half, np.int8]
]
def test_split_then_concat(self):
workspace.ResetWorkspace()

View File

@ -60,7 +60,7 @@ class TestWorkspace(unittest.TestCase):
self.assertTupleEqual(
op_cost,
namedtuple("Cost", ["flops", "bytes_written", "bytes_read"])(
1152, 256, 2084
1152, 256, 4168
),
)

View File

@ -16,7 +16,7 @@ class TORCH_API IStreamAdapter final : public ReadAdapterInterface {
size_t size() const override;
size_t read(uint64_t pos, void* buf, size_t n, const char* what = "")
const override;
~IStreamAdapter();
~IStreamAdapter() override;
private:
std::istream* istream_;

View File

@ -1,4 +1,5 @@
#include "adagrad_op.h"
#include "caffe2/core/types.h"
namespace caffe2 {
@ -23,22 +24,30 @@ static OpSchema::Cost CostInferenceForAdagrad(
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
c.flops = grad_size * 10;
auto const& moment_element_size_byte =
DataTypeToTypeMeta(moment.data_type()).itemsize();
auto const& param_element_size_byte =
DataTypeToTypeMeta(param.data_type()).itemsize();
auto const& grad_element_size_byte =
DataTypeToTypeMeta(grad.data_type()).itemsize();
auto const& lr_element_size_byte =
DataTypeToTypeMeta(lr.data_type()).itemsize();
uint64_t bytes_written =
grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
grad_size * (param_element_size_byte + moment_element_size_byte);
if (output_size == 3) {
// also need to output effective learning rate in this case
// assume it's the same data type as lr
bytes_written += grad_size * sizeof(lr.data_type());
bytes_written += grad_size * lr_element_size_byte;
} else if (output_size == 4) {
// also need to output effective learning rate and updates in this case
// assume update is the same data type as param
bytes_written +=
grad_size * (sizeof(lr.data_type()) + sizeof(param.data_type()));
grad_size * (lr_element_size_byte + param_element_size_byte);
}
c.bytes_written = bytes_written;
c.bytes_read = c.bytes_written +
grad_size * (sizeof(grad.data_type()) + sizeof(lr.data_type()));
grad_size * (grad_element_size_byte + lr_element_size_byte);
return c;
}
@ -102,10 +111,18 @@ static OpSchema::Cost CostInferenceForSparseAdagrad(
// (optimistically count sqrt as one flop).
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
c.flops = grad_size * 7;
auto const& param_element_size_byte =
DataTypeToTypeMeta(param.data_type()).itemsize();
auto const& moment_element_size_byte =
DataTypeToTypeMeta(moment.data_type()).itemsize();
c.bytes_written =
grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
c.bytes_read = c.bytes_written + grad_size * sizeof(grad.data_type()) +
n * sizeof(indices.data_type());
grad_size * (param_element_size_byte + moment_element_size_byte);
auto const& grad_element_size_byte =
DataTypeToTypeMeta(grad.data_type()).itemsize();
auto const& indices_element_size_byte =
DataTypeToTypeMeta(indices.data_type()).itemsize();
c.bytes_read = c.bytes_written + grad_size * grad_element_size_byte +
n * indices_element_size_byte;
return c;
}
@ -153,6 +170,16 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
OpSchema::Cost c;
if (n > 0) {
auto const& param_element_size_byte =
DataTypeToTypeMeta(param.data_type()).itemsize();
auto const& moment_element_size_byte =
DataTypeToTypeMeta(moment.data_type()).itemsize();
auto const& grad_element_size_byte =
DataTypeToTypeMeta(grad.data_type()).itemsize();
auto const& indices_element_size_byte =
DataTypeToTypeMeta(indices.data_type()).itemsize();
auto const& lr_element_size_byte =
DataTypeToTypeMeta(lr.data_type()).itemsize();
auto block_size = grad_size / n;
if (block_size == 1) {
// +2: applying weight decay and add to grads
@ -161,22 +188,22 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
c.flops = n * 9;
c.bytes_written =
n * (sizeof(param.data_type()) + sizeof(moment.data_type()));
n * (param_element_size_byte + moment_element_size_byte);
c.bytes_read = c.bytes_written +
n *
(sizeof(grad.data_type()) + sizeof(indices.data_type()) +
sizeof(lr.data_type()));
(grad_element_size_byte + indices_element_size_byte +
lr_element_size_byte);
} else {
// 5 per block (not counting index transforms)
// 8 for each value of a block
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
c.flops = n * (5 + (block_size * 8));
c.bytes_written =
n * sizeof(moment.data_type()) + n * block_size * (param.data_type());
c.bytes_written = n * moment_element_size_byte +
n * block_size * param_element_size_byte;
c.bytes_read = c.bytes_written + n * (sizeof(lr.data_type())) +
c.bytes_read = c.bytes_written + n * lr_element_size_byte +
2 * n * block_size *
(sizeof(grad.data_type()) + sizeof(param.data_type()));
(grad_element_size_byte + param_element_size_byte);
}
}
return c;

View File

@ -131,6 +131,7 @@ function(caffe2_print_configuration_summary)
endif()
message(STATUS " USE_METAL : ${USE_METAL}")
message(STATUS " USE_PYTORCH_METAL : ${USE_PYTORCH_METAL}")
message(STATUS " USE_PYTORCH_METAL_EXPORT : ${USE_PYTORCH_METAL_EXPORT}")
message(STATUS " USE_FFTW : ${USE_FFTW}")
message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}")
message(STATUS " USE_MKLDNN : ${USE_MKLDNN}")

View File

@ -389,6 +389,7 @@ in :func:`torch.nn.utils.parameterize.register_parametrization`.
:toctree: generated
:nosignatures:
parametrizations.orthogonal
parametrizations.spectral_norm
Utility functions to parametrize Tensors on existing Modules.
@ -396,7 +397,7 @@ Note that these functions can be used to parametrize a given Parameter
or Buffer given a specific function that maps from an input space to the
parametrized space. They are not parameterizations that would transform
an object into a parameter. See the
`Parametrizations <https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html>`__ tutorial
`Parametrizations tutorial <https://pytorch.org/tutorials/intermediate/parametrizations.html>`_
for more information on how to implement your own parametrizations.
.. autosummary::

View File

@ -117,7 +117,7 @@ multiple modules:
Note that :class:`~torch.nn.Sequential` automatically feeds the output of the first ``MyLinear`` module as input
into the :class:`~torch.nn.ReLU`, and the output of that as input into the second ``MyLinear`` module. As
shown, it is limited to in-order chaining of modules.
shown, it is limited to in-order chaining of modules with a single input and output.
In general, it is recommended to define a custom module for anything beyond the simplest use cases, as this gives
full flexibility on how submodules are used for a module's computation.
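As a minimal sketch of that recommendation (the ``TinyNet`` class and its layer sizes below are illustrative, not part of the tutorial's running example), a custom module can route data between its submodules in arbitrary ways that :class:`~torch.nn.Sequential` cannot express:

.. code-block:: python

    import torch
    from torch import nn

    class TinyNet(nn.Module):
        def __init__(self):
            super().__init__()
            self.l0 = nn.Linear(4, 3)
            self.l1 = nn.Linear(3, 1)

        def forward(self, x):
            # Arbitrary control flow and reuse of intermediate values is possible
            # here, unlike with in-order chaining via Sequential.
            h = torch.relu(self.l0(x))
            return self.l1(h) + h.sum()

    net = TinyNet()
    print(net(torch.randn(2, 4)))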
@ -258,16 +258,32 @@ It's also easy to move all parameters to a different device or change their prec
dynamic_net(torch.randn(5, device='cuda', dtype=torch.float64))
: tensor([6.5166], device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
These examples show how elaborate neural networks can be formed through module composition. To allow for
quick and easy construction of neural networks with minimal boilerplate, PyTorch provides a large library of
performant modules within the :mod:`torch.nn` namespace that perform computation commonly found within neural
networks, including pooling, convolutions, loss functions, etc.
More generally, an arbitrary function can be applied to a module and its submodules recursively by
using the :func:`~torch.nn.Module.apply` function. For example, to apply custom initialization to parameters
of a module and its submodules:
.. code-block:: python
# Define a function to initialize Linear weights.
# Note that no_grad() is used here to avoid tracking this computation in the autograd graph.
@torch.no_grad()
def init_weights(m):
if isinstance(m, nn.Linear):
nn.init.xavier_normal_(m.weight)
m.bias.fill_(0.0)
# Apply the function recursively on the module and its submodules.
dynamic_net.apply(init_weights)
These examples show how elaborate neural networks can be formed through module composition and conveniently
manipulated. To allow for quick and easy construction of neural networks with minimal boilerplate, PyTorch
provides a large library of performant modules within the :mod:`torch.nn` namespace that perform common neural
network operations like pooling, convolutions, loss functions, etc.
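As a small illustration (the layer choices and shapes below are arbitrary), these provided modules compose in exactly the same way as custom ones:

.. code-block:: python

    import torch
    from torch import nn

    # Compose PyTorch-provided modules: convolution, nonlinearity, pooling, loss.
    net = nn.Sequential(
        nn.Conv2d(3, 8, kernel_size=3), nn.ReLU(), nn.AdaptiveAvgPool2d(1), nn.Flatten())
    loss_fn = nn.CrossEntropyLoss()

    images = torch.randn(4, 3, 16, 16)
    labels = torch.randint(0, 8, (4,))
    print(loss_fn(net(images), labels))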
In the next section, we give a full example of training a neural network.
For more information, check out:
* Recursively :func:`~torch.nn.Module.apply` a function to a module and its submodules
* Library of PyTorch-provided modules: `torch.nn <https://pytorch.org/docs/stable/nn.html>`_
* Defining neural net modules: https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_module.html
@ -295,6 +311,12 @@ Optimizers from :mod:`torch.optim`:
loss.backward()
optimizer.step()
# After training, switch the module to eval mode to do inference, compute performance metrics, etc.
# (see discussion below for a description of training and evaluation modes)
...
net.eval()
...
In this simplified example, the network learns to simply output zero, as any non-zero output is "penalized" according
to its absolute value by employing :func:`torch.abs` as a loss function. While this is not a very interesting task, the
key parts of training are present:
@ -321,6 +343,38 @@ value of ``l1``\ 's ``weight`` parameter shows that its values are now much clos
[ 0.0030],
[-0.0008]], requires_grad=True)
Note that the above process is done entirely while the network module is in "training mode". Modules default to
training mode and can be switched between training and evaluation modes using :func:`~torch.nn.Module.train` and
:func:`~torch.nn.Module.eval`. They can behave differently depending on which mode they are in. For example, the
:class:`~torch.nn.BatchNorm` module maintains a running mean and variance during training that are not updated
when the module is in evaluation mode. In general, modules should be in training mode during training
and only switched to evaluation mode for inference or evaluation. Below is an example of a custom module
that behaves differently between the two modes:
.. code-block:: python
class ModalModule(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
if self.training:
# Add a constant only in training mode.
return x + 1.
else:
return x
m = ModalModule()
x = torch.randn(4)
print('training mode output: {}'.format(m(x)))
: tensor([1.6614, 1.2669, 1.0617, 1.6213, 0.5481])
m.eval()
print('evaluation mode output: {}'.format(m(x)))
: tensor([ 0.6614, 0.2669, 0.0617, 0.6213, -0.4519])
Training neural networks can often be tricky. For more information, check out:
* Using Optimizers: https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_optim.html.
@ -409,12 +463,127 @@ Both persistent and non-persistent buffers are affected by model-wide device / d
Buffers of a module can be iterated over using :func:`~torch.nn.Module.buffers` or
:func:`~torch.nn.Module.named_buffers`.
.. code-block:: python
for buffer in m.named_buffers():
print(buffer)
The following class demonstrates the various ways of registering parameters and buffers within a module:
.. code-block:: python
class StatefulModule(nn.Module):
def __init__(self):
super().__init__()
# Setting a nn.Parameter as an attribute of the module automatically registers the tensor
# as a parameter of the module.
self.param1 = nn.Parameter(torch.randn(2))
# Alternative string-based way to register a parameter.
self.register_parameter('param2', nn.Parameter(torch.randn(3)))
# Reserves the "param3" attribute as a parameter, preventing it from being set to anything
# except a parameter. "None" entries like this will not be present in the module's state_dict.
self.register_parameter('param3', None)
# Registers a list of parameters.
self.param_list = nn.ParameterList([nn.Parameter(torch.randn(2)) for i in range(3)])
# Registers a dictionary of parameters.
self.param_dict = nn.ParameterDict({
'foo': nn.Parameter(torch.randn(3)),
'bar': nn.Parameter(torch.randn(4))
})
# Registers a persistent buffer (one that appears in the module's state_dict).
self.register_buffer('buffer1', torch.randn(4), persistent=True)
# Registers a non-persistent buffer (one that does not appear in the module's state_dict).
self.register_buffer('buffer2', torch.randn(5), persistent=False)
# Reserves the "buffer3" attribute as a buffer, preventing it from being set to anything
# except a buffer. "None" entries like this will not be present in the module's state_dict.
self.register_buffer('buffer3', None)
# Adding a submodule registers its parameters as parameters of the module.
self.linear = nn.Linear(2, 3)
m = StatefulModule()
# Save and load state_dict.
torch.save(m.state_dict(), 'state.pt')
m_loaded = StatefulModule()
m_loaded.load_state_dict(torch.load('state.pt'))
# Note that non-persistent buffer "buffer2" and reserved attributes "param3" and "buffer3" do
# not appear in the state_dict.
print(m_loaded.state_dict())
: OrderedDict([('param1', tensor([-0.0322, 0.9066])),
('param2', tensor([-0.4472, 0.1409, 0.4852])),
('buffer1', tensor([ 0.6949, -0.1944, 1.2911, -2.1044])),
('param_list.0', tensor([ 0.4202, -0.1953])),
('param_list.1', tensor([ 1.5299, -0.8747])),
('param_list.2', tensor([-1.6289, 1.4898])),
('param_dict.bar', tensor([-0.6434, 1.5187, 0.0346, -0.4077])),
('param_dict.foo', tensor([-0.0845, -1.4324, 0.7022])),
('linear.weight', tensor([[-0.3915, -0.6176],
[ 0.6062, -0.5992],
[ 0.4452, -0.2843]])),
('linear.bias', tensor([-0.3710, -0.0795, -0.3947]))])
For more information, check out:
* Saving and loading: https://pytorch.org/tutorials/beginner/saving_loading_models.html
* Serialization semantics: https://pytorch.org/docs/master/notes/serialization.html
* What is a state dict? https://pytorch.org/tutorials/recipes/recipes/what_is_state_dict.html
Module Initialization
---------------------
By default, parameters and floating-point buffers for modules provided by :mod:`torch.nn` are initialized during
module instantiation as 32-bit floating point values on the CPU using an initialization scheme determined to
perform well historically for the module type. For certain use cases, it may be desired to initialize with a different
dtype, device (e.g. GPU), or initialization technique.
Examples:
.. code-block:: python
# Initialize module directly onto GPU.
m = nn.Linear(5, 3, device='cuda')
# Initialize module with 16-bit floating point parameters.
m = nn.Linear(5, 3, dtype=torch.half)
# Skip default parameter initialization and perform custom (e.g. orthogonal) initialization.
m = torch.nn.utils.skip_init(nn.Linear, 5, 3)
nn.init.orthogonal_(m.weight)
Note that the device and dtype options demonstrated above also apply to any floating-point buffers registered
for the module:
.. code-block:: python
m = nn.BatchNorm2d(3, dtype=torch.half)
print(m.running_mean)
: tensor([0., 0., 0.], dtype=torch.float16)
While module writers can use any device or dtype to initialize parameters in their custom modules, good practice is
to use ``dtype=torch.float`` and ``device='cpu'`` by default as well. Optionally, you can provide full flexibility
in these areas for your custom module by conforming to the convention demonstrated above that all
:mod:`torch.nn` modules follow:
* Provide a ``device`` constructor kwarg that applies to any parameters / buffers registered by the module.
* Provide a ``dtype`` constructor kwarg that applies to any parameters / floating-point buffers registered by
the module.
* Only use initialization functions (i.e. functions from :mod:`torch.nn.init`) on parameters and buffers within the
module's constructor. Note that this is only required to use :func:`~torch.nn.utils.skip_init`; see
`this page <https://pytorch.org/tutorials/prototype/skip_param_init.html#updating-modules-to-support-skipping-initialization>`_ for an explanation.
For more information, check out:
* Skipping module parameter initialization: https://pytorch.org/tutorials/prototype/skip_param_init.html
Module Hooks
------------
@ -443,16 +612,137 @@ All hooks allow the user to return an updated value that will be used throughout
Thus, these hooks can be used to either execute arbitrary code along the regular module forward/backward or
modify some inputs/outputs without having to change the module's ``forward()`` function.
Below is an example demonstrating usage of forward and backward hooks:
.. code-block:: python
torch.manual_seed(1)
def forward_pre_hook(m, inputs):
# Allows for examination and modification of the input before the forward pass.
# Note that inputs are always wrapped in a tuple.
input = inputs[0]
return input + 1.
def forward_hook(m, inputs, output):
# Allows for examination of inputs / outputs and modification of the outputs
# after the forward pass. Note that inputs are always wrapped in a tuple while outputs
# are passed as-is.
# Residual computation a la ResNet.
return output + inputs[0]
def backward_hook(m, grad_inputs, grad_outputs):
# Allows for examination of grad_inputs / grad_outputs and modification of
# grad_inputs used in the rest of the backwards pass. Note that grad_inputs and
# grad_outputs are always wrapped in tuples.
new_grad_inputs = [torch.ones_like(gi) * 42. for gi in grad_inputs]
return new_grad_inputs
# Create sample module & input.
m = nn.Linear(3, 3)
x = torch.randn(2, 3, requires_grad=True)
# ==== Demonstrate forward hooks. ====
# Run input through module before and after adding hooks.
print('output with no forward hooks: {}'.format(m(x)))
: output with no forward hooks: tensor([[-0.5059, -0.8158, 0.2390],
[-0.0043, 0.4724, -0.1714]], grad_fn=<AddmmBackward>)
# Note that the modified input results in a different output.
forward_pre_hook_handle = m.register_forward_pre_hook(forward_pre_hook)
print('output with forward pre hook: {}'.format(m(x)))
: output with forward pre hook: tensor([[-0.5752, -0.7421, 0.4942],
[-0.0736, 0.5461, 0.0838]], grad_fn=<AddmmBackward>)
# Note the modified output.
forward_hook_handle = m.register_forward_hook(forward_hook)
print('output with both forward hooks: {}'.format(m(x)))
: output with both forward hooks: tensor([[-1.0980, 0.6396, 0.4666],
[ 0.3634, 0.6538, 1.0256]], grad_fn=<AddBackward0>)
# Remove hooks; note that the output here matches the output before adding hooks.
forward_pre_hook_handle.remove()
forward_hook_handle.remove()
print('output after removing forward hooks: {}'.format(m(x)))
: output after removing forward hooks: tensor([[-0.5059, -0.8158, 0.2390],
[-0.0043, 0.4724, -0.1714]], grad_fn=<AddmmBackward>)
# ==== Demonstrate backward hooks. ====
m(x).sum().backward()
print('x.grad with no backwards hook: {}'.format(x.grad))
: x.grad with no backwards hook: tensor([[ 0.4497, -0.5046, 0.3146],
[ 0.4497, -0.5046, 0.3146]])
# Clear gradients before running backward pass again.
m.zero_grad()
x.grad.zero_()
m.register_full_backward_hook(backward_hook)
m(x).sum().backward()
print('x.grad with backwards hook: {}'.format(x.grad))
: x.grad with backwards hook: tensor([[42., 42., 42.],
[42., 42., 42.]])
Advanced Features
-----------------
PyTorch also provides several more advanced features that are designed to work with modules. All these functionalities
are "inherited" when writing a new module. In-depth discussion of these features can be found in the links below.
are available for custom-written modules, with the small caveat that certain features may require modules to conform
to particular constraints in order to be supported. In-depth discussion of these features and the corresponding
requirements can be found in the links below.
For more information, check out:
Distributed Training
********************
* Profiling: https://pytorch.org/tutorials/beginner/profiler.html
* Pruning: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html
* Quantization: https://pytorch.org/tutorials/recipes/quantization.html
* Exporting modules to TorchScript (e.g. for usage from C++):
https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html
Various methods for distributed training exist within PyTorch, both for scaling up training using multiple GPUs
as well as training across multiple machines. Check out the
`distributed training overview page <https://pytorch.org/tutorials/beginner/dist_overview.html>`_ for
detailed information on how to utilize these.
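For orientation only, here is a minimal single-process sketch of wrapping a module in ``DistributedDataParallel``; the backend, address, port, and world size are illustrative assumptions, and real jobs typically launch one process per GPU via a launcher such as ``torch.distributed.launch``:

.. code-block:: python

    import torch
    import torch.distributed as dist
    from torch import nn
    from torch.nn.parallel import DistributedDataParallel as DDP

    # A one-process "group" purely to make the example self-contained.
    dist.init_process_group(backend="gloo", init_method="tcp://127.0.0.1:29500",
                            rank=0, world_size=1)

    model = DDP(nn.Linear(5, 3))   # gradients are averaged across processes
    model(torch.randn(2, 5)).sum().backward()

    dist.destroy_process_group()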
Profiling Performance
*********************
The `PyTorch Profiler <https://pytorch.org/tutorials/beginner/profiler.html>`_ can be useful for identifying
performance bottlenecks within your models. It measures and outputs performance characteristics for
both memory usage and time spent.
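A minimal sketch of profiling a module's forward pass (the module and input sizes are arbitrary):

.. code-block:: python

    import torch
    from torch import nn
    from torch.profiler import profile, ProfilerActivity

    m = nn.Linear(128, 64)
    x = torch.randn(32, 128)

    # Record CPU activity for a few forward passes and print a summary table.
    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        for _ in range(10):
            m(x)
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))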
Improving Performance with Quantization
***************************************
Applying quantization techniques to modules can improve performance and memory usage by utilizing lower
bitwidths than floating-point precision. Check out the various PyTorch-provided mechanisms for quantization
`here <https://pytorch.org/docs/stable/quantization.html>`_.
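As one example among those mechanisms, dynamic quantization can be applied to an existing float module in a single call (the model here is an arbitrary stand-in):

.. code-block:: python

    import torch
    from torch import nn

    float_model = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 10))

    # Store Linear weights as int8 and quantize activations on the fly at inference.
    quantized_model = torch.quantization.quantize_dynamic(
        float_model, {nn.Linear}, dtype=torch.qint8)

    print(quantized_model(torch.randn(1, 64)).shape)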
Improving Memory Usage with Pruning
***********************************
Large deep learning models are often over-parametrized, resulting in high memory usage. To combat this, PyTorch
provides mechanisms for model pruning, which can help reduce memory usage while maintaining task accuracy. The
`Pruning tutorial <https://pytorch.org/tutorials/intermediate/pruning_tutorial.html>`_ describes how to utilize
the pruning techniques PyTorch provides or define custom pruning techniques as necessary.
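A minimal sketch of one such built-in technique, L1 unstructured pruning on a single layer (the layer and pruning amount are arbitrary):

.. code-block:: python

    import torch
    from torch import nn
    from torch.nn.utils import prune

    m = nn.Linear(10, 10)

    # Zero out the 30% of weight entries with the smallest L1 magnitude.
    prune.l1_unstructured(m, name="weight", amount=0.3)
    print(float((m.weight == 0).float().mean()))  # roughly 0.3

    # Make the pruning permanent by removing the reparametrization and mask.
    prune.remove(m, "weight")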
Deploying with TorchScript
**************************
When deploying a model for use in production, the overhead of Python can be unacceptable due to its poor
performance characteristics. For cases like this,
`TorchScript <https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html>`_ provides a way to load
and run an optimized model program from outside of Python, such as within a C++ program.
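A minimal sketch of the Python side of that workflow (the file name is arbitrary); the saved archive can then be loaded from C++ with ``torch::jit::load``:

.. code-block:: python

    import torch
    from torch import nn

    m = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 1))

    # Compile the module to TorchScript and serialize it.
    scripted = torch.jit.script(m)
    scripted.save("model_scripted.pt")

    # The archive can also be loaded back in Python for a quick check.
    loaded = torch.jit.load("model_scripted.pt")
    print(loaded(torch.randn(2, 4)))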
Parametrizations
****************
For certain applications, it can be beneficial to constrain the parameter space during model training. For example,
enforcing orthogonality of the learned parameters can improve convergence for RNNs. PyTorch provides a mechanism for
applying `parametrizations <https://pytorch.org/tutorials/intermediate/parametrizations.html>`_ such as this, and
further allows for custom constraints to be defined.
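A minimal sketch of a custom constraint (a symmetric weight matrix, a standard illustrative example rather than anything specific to this page):

.. code-block:: python

    import torch
    from torch import nn
    from torch.nn.utils import parametrize

    class Symmetric(nn.Module):
        def forward(self, X):
            # Constrain a square weight matrix to be symmetric.
            return X.triu() + X.triu(1).transpose(-1, -2)

    m = nn.Linear(3, 3)
    parametrize.register_parametrization(m, "weight", Symmetric())
    print(torch.allclose(m.weight, m.weight.T))  # True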
Transforming Modules with FX
****************************
The `FX <https://pytorch.org/docs/stable/fx.html>`_ component of PyTorch provides a flexible way to transform
modules by operating directly on module computation graphs. This can be used to programmatically generate or
manipulate modules for a broad array of use cases. To explore FX, check out these examples of using FX for
`convolution + batch norm fusion <https://pytorch.org/tutorials/intermediate/fx_conv_bn_fuser.html>`_ and
`CPU performance analysis <https://pytorch.org/tutorials/intermediate/fx_profiling_tutorial.html>`_.
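As a quick taste (the traced module here is arbitrary), symbolic tracing turns a module into a ``GraphModule`` whose graph and generated code can be inspected and rewritten:

.. code-block:: python

    import torch
    from torch import nn

    m = nn.Sequential(nn.Linear(4, 4), nn.ReLU())

    # Trace the module into a GraphModule backed by an editable graph.
    gm = torch.fx.symbolic_trace(m)
    print(gm.graph)   # the captured sequence of operations
    print(gm.code)    # Python generated from that graph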

View File

@ -6,10 +6,6 @@ torch.special
The torch.special module, modeled after SciPy's `special <https://docs.scipy.org/doc/scipy/reference/special.html>`_ module.
This module is in BETA. New functions are still being added, and some
functions may change in future PyTorch releases. See the documentation of each
function for details.
.. automodule:: torch.special
:noindex:
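For example (the particular functions chosen here are just an illustration), the functions mirror their SciPy counterparts:

.. code-block:: python

    import torch

    x = torch.tensor([-1.0, 0.0, 1.0])
    print(torch.special.erf(x))    # Gauss error function
    print(torch.special.expit(x))  # logistic sigmoid, like scipy.special.expit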

View File

@ -9,3 +9,4 @@ torch.testing
.. automodule:: torch.testing
.. autofunction:: assert_close
.. autofunction:: make_tensor

View File

@ -1,6 +1,8 @@
#include <gtest/gtest.h>
#include <ATen/core/boxing/impl/test_helpers.h>
#include <torch/torch.h>
#include <ATen/core/op_registration/op_registration.h>
#include <torch/csrc/autograd/functions/basic_ops.h>
@ -869,6 +871,261 @@ TEST(CustomAutogradTest, BackwardWithCreateGraphWarns) {
}
}
/**
* Tests for AutogradNotImplementedFallback
* - Check that the NotImplemented node is created when inputs require grad,
*   and that no such node is created when no inputs require grad
* - check_inplace logic
* - view ops (TODO: not an official view yet, update this once InplaceOrView kernel is landed)
* - TODO: Tests for NDEBUG checks?
* - tensorlist input and output
* - multiple outputs / non-tensor output
* - rebase_history vs set_history
*/
namespace {
torch::Tensor inplace_op(const torch::Tensor& self, const torch::Tensor& other) {
return self.add_(other);
}
std::tuple<torch::Tensor, torch::Tensor> two_arg_inplace_op(const torch::Tensor& self, const torch::Tensor& other) {
other.add_(self);
self.add_(other);
return std::tuple<torch::Tensor, torch::Tensor>(self, other);
}
std::tuple<torch::Tensor, torch::Tensor> two_pairs_of_view_op(const torch::Tensor& self, const torch::Tensor& other) {
// This is not allowed. We test below that calling this through the boxed kernel raises an error
auto self_view = self.view(-1);
auto other_view = other.view(-1);
return std::tuple<torch::Tensor, torch::Tensor>(self_view, other_view);
}
int64_t ret_single_non_tensor(const torch::Tensor& self, const torch::Tensor& other) {
return 12;
}
torch::Tensor opt_op(const torch::Tensor& self, const c10::optional<at::Tensor>& other) {
if (other.has_value()) {
return self + other.value();
} else {
return self.clone();
}
}
torch::Tensor my_custom_op(const torch::Tensor& self, const torch::Tensor& other) {
return self + other;
}
std::tuple<torch::Tensor, torch::Tensor, int64_t> ret_tuple_non_tensor(const torch::Tensor& self, const torch::Tensor& other) {
auto a = self - other;
auto b = self + other;
return std::tuple<torch::Tensor, torch::Tensor, int64_t>(a, b, 12);
}
torch::Tensor view_op(const torch::Tensor& self, const torch::Tensor& other) {
return self.view(-1);
}
std::vector<at::Tensor> ret_tensor_vector(const torch::Tensor& self, const torch::Tensor& other) {
std::vector<at::Tensor> out;
out.push_back(self + other);
out.push_back(self - other);
return out;
}
torch::Tensor tensorlist_op(const torch::Tensor& self, at::TensorList other) {
const auto& res = self.clone();
for (const auto& t : other) {
res.add_(t);
}
return res;
}
#define REGISTER_TEST_OP(name, schema, fn) \
auto m = MAKE_TORCH_LIBRARY(_test); \
m.def(schema); \
auto m_autograd = MAKE_TORCH_LIBRARY_IMPL(_test, Autograd); \
auto m_cpu = MAKE_TORCH_LIBRARY_IMPL(_test, CPU); \
m_cpu.impl(name, c10::DispatchKey::CPU, TORCH_FN(fn)); \
m_autograd.impl(name, c10::DispatchKey::Autograd, autogradNotImplementedFallback());
template <typename F>
void assertBasicChecks(F op) {
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
auto c = torch::tensor({1.}, {torch::kFloat32});
// If any inputs require grad, the fallback attaches a NotImplemented node, so backward() should throw.
auto out1 = op(a, b);
ASSERT_THROWS_WITH(out1.backward(), "is not implemented");
// Should not have a grad_fn if no inputs require grad
auto out2 = op(b, c);
ASSERT_THROWS_WITH(out2.backward(), "element 0 of tensors does not require grad and does not have a grad_fn");
// TODO: Forward AD Tests?
}
} // namespace
TEST(TestAutogradNotImplementedFallback, RetSingleNonTensor) {
REGISTER_TEST_OP("ret_single_non_tensor", "_test::ret_single_non_tensor(Tensor self, Tensor other) -> int", ret_single_non_tensor);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_single_non_tensor", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<int64_t, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
ASSERT_EQ(op(a, b), ret_single_non_tensor(a, b));
}
TEST(TestAutogradNotImplementedFallback, DoubleViewOP) {
REGISTER_TEST_OP("two_pairs_of_view_op", "_test::two_pairs_of_view_op(Tensor(a) self, Tensor(b) other) -> (Tensor(a), Tensor(b))", two_pairs_of_view_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::two_pairs_of_view_op", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
ASSERT_THROWS_WITH(op(a, b),
"Expected only a single output in the operator schema to have a non-write alias annotation");
}
TEST(TestAutogradNotImplementedFallback, InplaceOp) {
REGISTER_TEST_OP("inplace_op", "_test::inplace_op(Tensor(a!) self, Tensor other) -> Tensor(a!)", inplace_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::inplace_op", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
// Check in-place
ASSERT_THROWS_WITH(op(a, b),
"a leaf Variable that requires grad is being used in an in-place operation");
op(b, a);
a = a.clone();
b = b.clone();
auto c = op(a, b);
ASSERT_TRUE(torch::allclose(c, inplace_op(a, b)));
// Test in-place on view
auto base = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true).clone();
auto view = base.view(-1);
auto t = torch::tensor({1.}, {torch::kFloat32});
torch::Tensor v_nograd;
{
c10::NoGradGuard guard;
v_nograd = base.view(-1);
op(v_nograd, t);
}
ASSERT_THROWS_WITH(op(v_nograd, t), "A view was created in no_grad mode");
ASSERT_EQ(op(view, t).unsafeGetTensorImpl(), view.unsafeGetTensorImpl());
// TODO: once we have InplaceOrView kernel, re-enable this since the version counter would actually
// be incremented
// ASSERT_THAT(op(view, t).grad_fn()->name(), ::testing::HasSubstr("AsStridedBackward"));
}
TEST(TestAutogradNotImplementedFallback, DoubleInplaceOp) {
REGISTER_TEST_OP("two_arg_inplace_op", "_test::two_arg_inplace_op(Tensor(a!) self, Tensor(b!) other) -> (Tensor(a!), Tensor(b!))", two_arg_inplace_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::two_arg_inplace_op", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
// Both are modified in-place!
ASSERT_THROWS_WITH(op(a, b),
"a leaf Variable that requires grad is being used in an in-place operation");
ASSERT_THROWS_WITH(op(b, a),
"a leaf Variable that requires grad is being used in an in-place operation");
}
TEST(TestAutogradNotImplementedFallback, OptOp) {
REGISTER_TEST_OP("opt_op", "_test::opt_op(Tensor self, Tensor? other) -> Tensor", opt_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::opt_op", "");
auto op = [&](const torch::Tensor& _1, const c10::optional<torch::Tensor>& _2) {
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const c10::optional<torch::Tensor>&>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
ASSERT_TRUE(torch::allclose(op(a, b), opt_op(a, b)));
ASSERT_TRUE(torch::allclose(op(a, {}), opt_op(a, {})));
}
TEST(TestAutogradNotImplementedFallback, OutOfPlaceAddition) {
REGISTER_TEST_OP("my_custom_op", "_test::my_custom_op(Tensor self, Tensor other) -> Tensor", my_custom_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::my_custom_op", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
assertBasicChecks(op);
}
TEST(TestAutogradNotImplementedFallback, RetTupleNonTensor) {
REGISTER_TEST_OP("ret_tuple_non_tensor", "_test::ret_tuple_non_tensor(Tensor self, Tensor other) -> (Tensor, Tensor, int)", ret_tuple_non_tensor);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_tuple_non_tensor", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
torch::Tensor out0;
torch::Tensor out1;
int64_t out2;
auto out = callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor, int64_t>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
std::tie(out0, out1, out2) = std::move(out);
return out0;
};
assertBasicChecks(op);
}
TEST(TestAutogradNotImplementedFallback, ViewOp) {
REGISTER_TEST_OP("view_op", "_test::view_op(Tensor(a) self, Tensor other) -> Tensor(a)", view_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::view_op", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
assertBasicChecks(op);
}
TEST(TestAutogradNotImplementedFallback, RetTensorVector) {
REGISTER_TEST_OP("ret_tensor_vector", "_test::ret_tensor_vector(Tensor self, Tensor other) -> Tensor[]", ret_tensor_vector);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_tensor_vector", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<std::vector<at::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2)[0];
};
assertBasicChecks(op);
}
TEST(TestAutogradNotImplementedFallback, TensorlistOp) {
REGISTER_TEST_OP("tensorlist_op", "_test::tensorlist_op(Tensor self, Tensor[] other) -> Tensor", tensorlist_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::tensorlist_op", "");
auto op = [&](torch::Tensor _1, at::TensorList _2) {
return callOpUnboxed<torch::Tensor, const torch::Tensor&, at::TensorList>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32});
auto b = torch::tensor({1.}, {torch::kFloat32});
auto c = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
std::vector<torch::Tensor> vec = {b, c};
auto out = op(a, vec);
ASSERT_THROWS_WITH(torch::autograd::grad({out}, {vec[0]}), "One of the differentiated Tensors does not require grad");
ASSERT_THROWS_WITH(torch::autograd::grad({out}, {vec[1]}), "is not implemented");
ASSERT_TRUE(at::allclose(op(a, vec), tensorlist_op(a, vec)));
}
// TODO add these tests if needed
// test_once_differentiable
// test_sparse_backward

View File

@ -792,6 +792,20 @@ TEST_F(FunctionalTest, CrossEntropy) {
ASSERT_TRUE(output.allclose(expected, 1e-04));
ASSERT_TRUE(F::cross_entropy(input, target).allclose(expected, 1e-04));
// label smoothing with class indices
input = torch::tensor({{3., 1.}, {1., 2.}}, torch::kFloat);
output = F::cross_entropy(
input, target, F::CrossEntropyFuncOptions().label_smoothing(0.15).reduction(torch::kMean));
expected = torch::tensor(0.3326, torch::kFloat);
ASSERT_TRUE(output.allclose(expected, 1e-04));
// label smoothing with target probabilities
target = torch::tensor({{0.8, 0.2}, {0.1, 0.9}}, torch::kFloat);
output = F::cross_entropy(
input, target, F::CrossEntropyFuncOptions().label_smoothing(0.2).reduction(torch::kMean));
expected = torch::tensor(0.5701, torch::kFloat);
ASSERT_TRUE(output.allclose(expected, 1e-04));
}
TEST_F(FunctionalTest, MaxUnpool1d) {

View File

@ -28,6 +28,9 @@ TEST(IMethodTest, CallMethod) {
auto pyModel = package.load_pickle("model", "model.pkl");
torch::deploy::PythonMethodWrapper pyMethod(pyModel, "forward");
EXPECT_EQ(scriptMethod.name(), "forward");
EXPECT_EQ(pyMethod.name(), "forward");
auto input = torch::ones({10, 20});
auto outputPy = pyMethod({input});
auto outputScript = scriptMethod({input});

View File

@ -2315,6 +2315,31 @@ TEST_F(ModulesTest, CrossEntropyLoss) {
ASSERT_TRUE(
CrossEntropyLoss(CrossEntropyLossOptions().ignore_index(-100).reduction(torch::kMean))
->forward(input, target).allclose(expected, 1e-04));
// label smoothing with class indices
loss = CrossEntropyLoss(CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kMean));
input = torch::tensor({{3., 1.}, {1., 2.}}, torch::dtype(torch::kFloat).requires_grad(true));
target = torch::tensor({0, 1}, torch::kLong);
output = loss->forward(input, target);
expected = torch::tensor(0.3326, torch::kFloat);
s = output.sum();
s.backward();
ASSERT_TRUE(output.allclose(expected, 1e-04));
ASSERT_EQ(input.sizes(), input.grad().sizes());
// label smoothing with target probabilities
loss = CrossEntropyLoss(CrossEntropyLossOptions().label_smoothing(0.2).reduction(torch::kMean));
input = torch::tensor({{3., 1.}, {1., 2.}}, torch::dtype(torch::kFloat).requires_grad(true));
target = torch::tensor({{0.8, 0.2}, {0.1, 0.9}}, torch::kFloat);
output = loss->forward(input, target);
expected = torch::tensor(0.5701, torch::kFloat);
s = output.sum();
s.backward();
ASSERT_TRUE(output.allclose(expected, 1e-04));
ASSERT_EQ(input.sizes(), input.grad().sizes());
}
TEST_F(ModulesTest, CosineSimilarity) {

View File

@ -1,11 +1,11 @@
#include <gtest/gtest.h>
#include <torch/csrc/autograd/generated/variable_factories.h>
#include <torch/csrc/jit/frontend/ir_emitter.h>
#include <torch/csrc/jit/ir/alias_analysis.h>
#include <torch/csrc/jit/ir/irparser.h>
#include "torch/csrc/jit/frontend/ir_emitter.h"
#include "torch/csrc/jit/ir/alias_analysis.h"
#include "torch/csrc/jit/runtime/custom_operator.h"
#include "torch/csrc/utils/memory.h"
#include <torch/csrc/jit/runtime/custom_operator.h>
#include <torch/csrc/utils/memory.h>
namespace torch {
namespace jit {
@ -484,7 +484,7 @@ TEST(AliasAnalysisTest, SafeToChangeAliasingRelationship) {
TEST(WriteTrackingTest, Basic) {
RegisterOperators reg({Operator(
"prim::creates_alias(Tensor(a) x) -> Tensor(a)",
[](Stack* s) {},
[](Stack&) {},
aliasAnalysisFromSchema())});
const auto creates_alias = Symbol::fromQualString("prim::creates_alias");
auto graph = std::make_shared<Graph>();
@ -949,11 +949,11 @@ TEST(WildcardsTest, Basic) {
RegisterOperators reg(
{Operator(
"prim::returns_wildcard(Tensor a) -> Tensor(*)",
[](Stack* stack) {},
[](Stack&) {},
aliasAnalysisFromSchema()),
Operator(
"prim::writes(Tensor(z!) a) -> Tensor(a)",
[](Stack* stack) {},
[](Stack&) {},
aliasAnalysisFromSchema())});
const auto returns_wildcard =
Symbol::fromQualString("prim::returns_wildcard");

View File

@ -31,7 +31,7 @@ TEST(CustomOperatorTest, InferredSchema) {
Stack stack;
push(stack, 2.0f, at::ones(5));
op->getOperation()(&stack);
op->getOperation()(stack);
at::Tensor output;
pop(stack, output);
@ -61,7 +61,7 @@ TEST(CustomOperatorTest, ExplicitSchema) {
Stack stack;
push(stack, 2.0f, at::ones(5));
op->getOperation()(&stack);
op->getOperation()(stack);
at::Tensor output;
pop(stack, output);
@ -109,7 +109,7 @@ TEST(CustomOperatorTest, ListParameters) {
c10::List<c10::complex<double>>(
{c10::complex<double>(2.4, -5.5), c10::complex<double>(-1.3, 2)}));
push(stack, c10::List<at::Tensor>({at::ones(5)}));
op->getOperation()(&stack);
op->getOperation()(stack);
c10::List<double> output;
pop(stack, output);
@ -140,7 +140,7 @@ TEST(CustomOperatorTest, ListParameters2) {
Stack stack;
push(stack, c10::List<at::Tensor>({at::ones(5)}));
op->getOperation()(&stack);
op->getOperation()(stack);
c10::List<at::Tensor> output;
pop(stack, output);
@ -204,7 +204,7 @@ TEST(TestCustomOperator, OperatorGeneratorUndeclared) {
torch::jit::RegisterOperators reg({OperatorGenerator(
TORCH_SELECTIVE_NAME_IN_SCHEMA(
op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"),
[](Stack* stack) {
[](Stack& stack) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
double a;
at::Tensor b;
@ -223,7 +223,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
torch::jit::RegisterOperators reg({OperatorGenerator(
TORCH_SELECTIVE_NAME_IN_SCHEMA(
op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"),
[](Stack* stack) {
[](Stack& stack) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
double a;
at::Tensor b;
@ -249,7 +249,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
Stack stack;
push(stack, 2.0f, at::ones(5));
op->getOperation()(&stack);
op->getOperation()(stack);
at::Tensor output;
pop(stack, output);

View File

@ -175,6 +175,15 @@ TEST(InterpreterTest, IgnorableArgsInSchema) {
ASSERT_TRUE(op_to_specified_args_non_const["aten::conv2d"] == 6);
}
TEST(InterpreterTest, IgnorableArgsInSchemaWithOut) {
auto graph = build_mobile_export_with_out();
MobileCode function(graph, "");
auto op_to_specified_args = function.op_to_num_specified_args();
ASSERT_TRUE(op_to_specified_args.size() == 1);
// this should be 3 when the add_out flag is set to True
ASSERT_TRUE(op_to_specified_args["aten::add.out"] == 4);
}
TEST(InterpreterTest, runAsyncBasicTest) {
/*
TODO: there are some problem with C++ parsing script program involving

View File

@ -520,6 +520,28 @@ TEST(SchemaParserTest, NestedArrays) {
.getElementType()));
}
TEST(SchemaParserTest, OutVariant) {
auto schema_with_out = parseSchema(
"at::foo(Tensor self, *, Tensor(a!) f, Tensor(b!) l) -> (Tensor(a!) f, Tensor(b!) l)");
ASSERT_TRUE(schema_with_out.arguments().at(1).is_out());
ASSERT_TRUE(schema_with_out.arguments().at(2).is_out());
auto schema_without_out =
parseSchema("at::foo(Tensor self, *, int scalar) -> (int)");
for (const auto& arg : schema_without_out.arguments()) {
ASSERT_TRUE(!arg.is_out());
}
auto schema_with_is_write = parseSchema(
"aten::ne_.Scalar(Tensor(a!) self, Scalar other) -> (Tensor(a!))");
for (const auto& arg : schema_with_is_write.arguments()) {
ASSERT_TRUE(!arg.is_out());
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(SchemaParserTest, NamedReturns) {
// named returns
parseSchema("at::what(Tensor! i_will_be_written_to) -> ()");
@ -1471,11 +1493,11 @@ TEST(NoneSchemaMatchTest, Basic) {
RegisterOperators reg({
Operator(
"prim::test_none() -> int?",
[](Stack* stack) { push(stack, IValue()); },
[](Stack& stack) { push(stack, IValue()); },
aliasAnalysisFromSchema()),
Operator(
"prim::is_none(int? a) -> bool",
[](Stack* stack) {
[](Stack& stack) {
IValue a = pop(stack);
if (a.isNone()) {
push(stack, true);

View File

@ -15,7 +15,7 @@ TEST(SchemaMatchingTest, VarType) {
RegisterOperators reg({
Operator(
"aten::test_vartype(t[] a, t b) -> (t)",
[](Stack* stack) {
[](Stack& stack) {
c10::List<double> list;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
double a;
@ -54,7 +54,7 @@ TEST(SchemaMatchingTest, VarType2) {
RegisterOperators reg({
Operator(
"aten::test_vartype2(t a, t[] b) -> (t[])",
[](Stack* stack) {
[](Stack& stack) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
double a;
c10::List<double> list;

View File

@ -123,6 +123,21 @@ std::shared_ptr<Graph> build_mobile_export_analysis_graph() {
return g;
}
std::shared_ptr<Graph> build_mobile_export_with_out() {
const auto graph_string = R"IR(
graph(%x.1 : Tensor,
%y.1 : Tensor):
%8 : NoneType = prim::Constant()
%6 : int = prim::Constant[value=1]()
%7 : Tensor = aten::add(%x.1, %y.1, %6, %y.1)
return (%8))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
return g;
}
std::shared_ptr<Graph> build_mobile_export_analysis_graph_nested() {
// this is pretty much the same test as build_mobile_export_analysis_graph(),
// but some aten::slice operators are hidden under block statement to check
@ -258,7 +273,7 @@ RegisterOperators reg({
// because it always produces empty Tensors.
Operator(
"prim::MakeTestTensor() -> Tensor",
[](Stack* stack) { push(stack, at::Tensor()); },
[](Stack& stack) { push(stack, at::Tensor()); },
aliasAnalysisFromSchema()),
});
} // namespace

View File

@ -74,6 +74,7 @@ std::pair<tensor_list, tensor_list> runGradient(
std::shared_ptr<Graph> build_lstm();
std::shared_ptr<Graph> build_mobile_export_analysis_graph();
std::shared_ptr<Graph> build_mobile_export_with_out();
std::shared_ptr<Graph> build_mobile_export_analysis_graph_with_vararg();
std::shared_ptr<Graph> build_mobile_export_analysis_graph_nested();
std::shared_ptr<Graph> build_mobile_export_analysis_graph_non_const();

View File

@ -198,6 +198,22 @@ TEST_F(Kernel, _3) {
}
}
TEST_F(Kernel, Huge) {
const auto graph_string = R"IR(
graph(%x.1 : Float(4000000000, strides=[1], requires_grad=0, device=cpu)):
%1 : int = prim::Constant[value=0]()
%2 : Float(1, 4000000000, strides=[4000000000, 1], requires_grad=0, device=cpu) = aten::unsqueeze(%x.1, %1)
%3 : Float(1, 4000000000, strides=[4000000000, 1], requires_grad=0, device=cpu) = aten::relu(%2)
return (%3))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
TensorExprKernel k(graph);
std::ostringstream oss;
oss << *k.getCodeGenStmt();
const std::string& verification_pattern = "# CHECK: 4000000000";
torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
}
TEST_F(Kernel, ParallelStrided) {
const auto graph_string = R"IR(
graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu),
@ -786,9 +802,9 @@ TEST_F(Kernel, SumOneAxis) {
// Check the IR we produced
const std::string& verification_pattern =
R"IR(
# CHECK: for (int v = 0; v <
# CHECK: for (int64_t v = 0ll; v <
# CHECK-NEXT: sum
# CHECK-NEXT: for (int v_1 = 0; v_1 <
# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 <
# CHECK-NEXT: sum)IR";
torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
@ -847,10 +863,10 @@ TEST_F(Kernel, SumMultipleAxes) {
// Check the IR we produced
const std::string& verification_pattern =
R"IR(
# CHECK: int v = 0
# CHECK: int v_1 = 0
# CHECK: int v_2 = 0
# CHECK: int v_3 = 0
# CHECK: int64_t v = 0
# CHECK: int64_t v_1 = 0
# CHECK: int64_t v_2 = 0
# CHECK: int64_t v_3 = 0
# CHECK: sum)IR";
torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
@ -1115,8 +1131,8 @@ TEST_F(Kernel, InlineProducerIntoReduction) {
// We should have only one loop in the end.
const std::string& verification_pattern =
R"IR(
# CHECK: for (int v = 0; v < 5;
# CHECK-NEXT: for (int v_1 = 0; v_1 < 3;
# CHECK: for (int64_t v = 0ll; v < 5
# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < 3
# CHECK-NEXT: sum
# CHECK-NOT: for)IR";
torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
@ -1154,11 +1170,11 @@ TEST_F(Kernel, InlineReductionIntoConsumer) {
// We should have two loops in the end.
const std::string& verification_pattern =
R"IR(
# CHECK: for (int v = 0; v < 5;
# CHECK-NEXT: for (int v_1 = 0; v_1 < 3;
# CHECK: for (int64_t v = 0ll; v < 5
# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < 3
# CHECK-NEXT: sum
# CHECK: for (int v_2 = 0; v_2 < 5;
# CHECK-NEXT: for (int v_3 = 0; v_3 < 3;
# CHECK: for (int64_t v_2 = 0ll; v_2 < 5
# CHECK-NEXT: for (int64_t v_3 = 0ll; v_3 < 3
# CHECK-NEXT: aten_mul
# CHECK-NOT: for)IR";
torch::jit::testing::FileCheck().run(verification_pattern, oss.str());

View File

@ -1501,42 +1501,54 @@ TEST(LLVM, RFactorVectorizedReduction) {
ExpectAllNear(b_v, b_ref, 1e-5);
}
TEST(LLVM, SimpleParallel) {
for (int test_cfg = 0; test_cfg < 4; test_cfg++) {
// Compute a simple operation, and try all loop-axis combination to be
// parallel or sequential.
const int M = 4;
const int N = 6;
Tensor f = Compute(
"f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) {
return cast<float>(m + n);
});
LoopNest loop_nest({f});
auto const& loops = loop_nest.getLoopStmtsFor(f);
ForPtr m = loops[0];
ForPtr n = loops[1];
if (test_cfg & 0x1) {
m->set_parallel();
}
if (test_cfg & 0x2) {
n->set_parallel();
}
loop_nest.prepareForCodegen();
StmtPtr stmt = loop_nest.root_stmt();
LLVMCodeGen cg(stmt, {f});
PaddedBuffer<float> f_v(M, N, "f_v");
std::vector<void*> args({f_v.data()});
int value = cg.value<int>(args);
ASSERT_EQ(value, 0);
PaddedBuffer<float> f_ref(M, N, "f_ref");
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
f_ref(m, n) = m + n;
}
}
ExpectAllNear(f_v, f_ref, 1e-5);
template <bool outer, bool inner>
static void testSimpleParallel() {
// Compute a simple operation, and try all loop-axis combinations to be
// parallel or sequential.
const int M = 4;
const int N = 6;
Tensor f = Compute(
"f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) {
return cast<float>(m + n);
});
LoopNest loop_nest({f});
auto const& loops = loop_nest.getLoopStmtsFor(f);
ForPtr m = loops[0];
ForPtr n = loops[1];
if (outer) {
m->set_parallel();
}
if (inner) {
n->set_parallel();
}
loop_nest.prepareForCodegen();
StmtPtr stmt = loop_nest.root_stmt();
LLVMCodeGen cg(stmt, {f});
PaddedBuffer<float> f_v(M, N, "f_v");
std::vector<void*> args({f_v.data()});
int value = cg.value<int>(args);
ASSERT_EQ(value, 0);
PaddedBuffer<float> f_ref(M, N, "f_ref");
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
f_ref(m, n) = m + n;
}
}
ExpectAllNear(f_v, f_ref, 1e-5);
}
TEST(LLVM, SimpleParallelSS) {
testSimpleParallel<false, false>();
}
TEST(LLVM, SimpleParallelSP) {
testSimpleParallel<false, true>();
}
TEST(LLVM, SimpleParallelPS) {
testSimpleParallel<true, false>();
}
TEST(LLVM, SimpleParallelPP) {
testSimpleParallel<true, true>();
}
TEST(LLVM, CompositeParallel) {

View File

@ -4734,8 +4734,8 @@ TEST(LoopNest, VectorizeUse) {
}
const char* int64Loop = R"IR(
# CHECK: for (int64_t n = 0; n < 12; n++) {
# CHECK: b[n] = (a[n]) + 1;
# CHECK: for (int64_t n = 0ll; n < 12ll; n++) {
# CHECK: b[n] = (a[n]) + 1ll;
# CHECK: }
)IR";
@ -4744,7 +4744,8 @@ TEST(LoopNest, Int64Direct) {
Placeholder a("a", kLong, {N});
Placeholder b("b", kLong, {N});
VarHandle n("n", kLong);
StmtPtr s = For::make(n, 0, N, b.store({n}, a.load({n}) + LongImm::make(1l)));
StmtPtr s = For::make(
n, LongImm::make(0l), N, b.store({n}, a.load({n}) + LongImm::make(1l)));
s = IRSimplifier::simplify(s);
std::ostringstream oss;
oss << *s;

Some files were not shown because too many files have changed in this diff.