diff --git a/.circleci/scripts/cpp_doc_push_script.sh b/.circleci/scripts/cpp_doc_push_script.sh
index e9b86e211e6..618b64c7f12 100755
--- a/.circleci/scripts/cpp_doc_push_script.sh
+++ b/.circleci/scripts/cpp_doc_push_script.sh
@@ -47,16 +47,11 @@ sudo apt-get -y install doxygen
 # Generate ATen files
 pushd "${pt_checkout}"
 pip install -r requirements.txt
-time python aten/src/ATen/gen.py \
+time python -m tools.codegen.gen \
   -s aten/src/ATen \
-  -d build/aten/src/ATen \
-  aten/src/ATen/Declarations.cwrap \
-  aten/src/THCUNN/generic/THCUNN.h \
-  aten/src/ATen/nn.yaml \
-  aten/src/ATen/native/native_functions.yaml
+  -d build/aten/src/ATen

 # Copy some required files
-cp aten/src/ATen/common_with_cwrap.py tools/shared/cwrap_common.py
 cp torch/_utils_internal.py tools/shared

 # Generate PyTorch files
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index f765f7614a1..2086d64e61a 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -131,13 +131,9 @@ jobs:
           time python setup.py --cmake-only build

           # Generate ATen files.
-          time python aten/src/ATen/gen.py \
+          time python -m tools.codegen.gen \
             -s aten/src/ATen \
-            -d build/aten/src/ATen \
-            aten/src/ATen/Declarations.cwrap \
-            aten/src/THCUNN/generic/THCUNN.h \
-            aten/src/ATen/nn.yaml \
-            aten/src/ATen/native/native_functions.yaml
+            -d build/aten/src/ATen

           # Generate PyTorch files.
           time python tools/setup_helpers/generate_code.py \
diff --git a/.gitignore b/.gitignore
index f1c870be40f..99180410987 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,9 +108,6 @@ env
 # macOS dir files
 .DS_Store

-# Symbolic files
-tools/shared/cwrap_common.py
-
 # Ninja files
 .ninja_deps
 .ninja_log
diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh
index 0b9c9209a80..bba8aa0e036 100755
--- a/.jenkins/caffe2/build.sh
+++ b/.jenkins/caffe2/build.sh
@@ -248,6 +248,8 @@ else
     export MAX_JOBS=`expr $(nproc) - 1`
   fi

+  pip install --user dataclasses
+
   $PYTHON setup.py install --user

   report_compile_cache_stats
diff --git a/.jenkins/pytorch/macos-common.sh b/.jenkins/pytorch/macos-common.sh
index f0b28bf20f6..27c9d4ccb35 100755
--- a/.jenkins/pytorch/macos-common.sh
+++ b/.jenkins/pytorch/macos-common.sh
@@ -20,7 +20,7 @@ if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
 fi
 export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
 source ${WORKSPACE_DIR}/miniconda3/bin/activate
-retry conda install -y mkl mkl-include numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja typing_extensions
+retry conda install -y mkl mkl-include numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja typing_extensions dataclasses

 # The torch.hub tests make requests to GitHub.
 #
diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
index 0212c553703..0ddf3b4b462 100644
--- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
@@ -21,8 +21,8 @@
 call %INSTALLER_DIR%\install_sccache.bat
 call %INSTALLER_DIR%\install_miniconda3.bat

-:: Install ninja
-if "%REBUILD%"=="" ( pip install -q "ninja==1.9.0" )
+:: Install ninja and other deps
+if "%REBUILD%"=="" ( pip install -q "ninja==1.9.0" dataclasses )

 git submodule sync --recursive
 git submodule update --init --recursive
diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
index ac0f018259f..17a3d39d076 100644
--- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
@@ -22,7 +22,7 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic
 if NOT "%BUILD_ENVIRONMENT%"=="" (
   :: We have to pin Python version to 3.6.7, until mkl supports Python 3.7
   :: Numba is pinned to 0.44.0 to avoid https://github.com/numba/numba/issues/4352
-  call conda install -y -q python=3.6.7 numpy mkl cffi pyyaml boto3 protobuf numba==0.44.0 scipy==1.5.0 typing_extensions
+  call conda install -y -q python=3.6.7 numpy mkl cffi pyyaml boto3 protobuf numba==0.44.0 scipy==1.5.0 typing_extensions dataclasses
   if %errorlevel% neq 0 ( exit /b %errorlevel% )
   call conda install -y -q -c conda-forge cmake
   if %errorlevel% neq 0 ( exit /b %errorlevel% )
diff --git a/BUILD.bazel b/BUILD.bazel
index f7be71ec624..9bedaef1676 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -106,17 +106,19 @@ cc_test(
     ],
 )

+# TODO: refactor this into its own library (but how to make
+# a binary based off of a module in a library?)
 py_binary(
     name = "gen",
-    srcs = ["aten/src/ATen/gen.py"],
+    srcs = ["tools/setup_helpers/gen.py"],
+    deps = [
+        ":tools_codegen"
+    ],
 )

 genrule(
     name = "generated_cpp",
     srcs = [
-        "aten/src/ATen/Declarations.cwrap",
-        "aten/src/THCUNN/generic/THCUNN.h",
-        "aten/src/ATen/nn.yaml",
         "aten/src/ATen/native/native_functions.yaml",
     ] + glob(["aten/src/ATen/templates/**"]),
     outs = [
@@ -126,8 +128,6 @@ genrule(
         "aten/src/ATen/CPUType.cpp",
         "aten/src/ATen/Functions.h",
         "aten/src/ATen/Functions.cpp",
-        "aten/src/ATen/LegacyTHFunctionsCPU.h",
-        "aten/src/ATen/LegacyTHFunctionsCPU.cpp",
         "aten/src/ATen/NativeFunctions.h",
         "aten/src/ATen/MkldnnCPUType.h",
         "aten/src/ATen/MkldnnCPUType.cpp",
@@ -141,14 +141,13 @@ genrule(
         "aten/src/ATen/core/TensorMethods.cpp",
         "aten/src/ATen/core/ATenOpList.cpp",
     ],
-    cmd = "$(location :gen) --source-path aten/src/ATen --install_dir `dirname $(location aten/src/ATen/Declarations.yaml)` aten/src/ATen/Declarations.cwrap aten/src/THCUNN/generic/THCUNN.h aten/src/ATen/nn.yaml aten/src/ATen/native/native_functions.yaml",
+    cmd = "$(location :gen) --source-path aten/src/ATen --install_dir `dirname $(location aten/src/ATen/Declarations.yaml)`",
     tools = [":gen"],
 )

 py_library(
-    name = "code_template",
-    srcs = ["aten/src/ATen/code_template.py"],
-    imports = ["aten"],
+    name = "tools_codegen",
+    srcs = glob(["tools/codegen/**/*.py"]),
 )

 py_library(
@@ -158,7 +157,7 @@ py_library(
         "tools/autograd/*.yaml",
         "tools/autograd/templates/*",
     ]),
-    deps = [":code_template"],
+    deps = [":tools_codegen"],
 )

 py_library(
diff --git a/README.md b/README.md
index 6e1fcfdb828..d2fbecdb3dd 100644
--- a/README.md
+++ b/README.md
@@ -169,7 +169,7 @@ If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xa
 Common

 ```bash
-conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests
+conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses
 ```

 On Linux
diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap
deleted file mode 100644
index 7325b8eb88f..00000000000
--- a/aten/src/ATen/Declarations.cwrap
+++ /dev/null
@@ -1,561 +0,0 @@
-[[
-  name: _th_masked_fill_
-  cuda_bool: True
-  cuda_bfloat16: True
-  cname: maskedFill
-  variants: function
-  backends:
-    - CUDA
-  return: self
-  options:
-    - arguments:
-      - THTensor* self
-      - THByteTensor* mask
-      - real value
-]]
-[[
-  name: _th_masked_fill_bool_
-  cuda_bool: True
-  cuda_bfloat16: True
-  cname: maskedFillBool
-  variants: function
-  backends:
-    - CUDA
-  return: self
-  options:
-    - arguments:
-      - THTensor* self
-      - THBoolTensor* mask
-      - real value
-]]
-[[
-  name: _th_masked_scatter_
-  cpu_bool: True
-  cuda_bool: True
-  cpu_bfloat16: True
-  cuda_bfloat16: True
-  cname: maskedCopy
-  variants: function
-  return: self
-  arguments:
-    - THTensor* self
-    - THByteTensor* mask
-    - THTensor* source
-]]
-[[
-  name: _th_masked_scatter_bool_
-  cpu_bool: True
-  cuda_bool: True
-  cpu_bfloat16: True
-  cuda_bfloat16: True
-  cname: maskedCopyBool
-  variants: function
-  return: self
-  arguments:
-    - THTensor* self
-    - THBoolTensor* mask
-    - THTensor* source
-]]
-[[
-  name: _th_nonzero
-  cname: nonzero
-  cpu_half: True
-  cpu_bool: True
-  cuda_bool: True
-  cpu_bfloat16: True
-  cuda_bfloat16: True
-  variants:
-    - function
-  return: argument 0
-  arguments:
-    - arg: THIndexTensor* result
-      output: True
-    - THTensor* self
-]]
-[[
-  name: _th_index_copy_
-  cname: indexCopy
-  cpu_bool: True
-  cuda_bool: True
-  variants:
function - return: argument 0 - arguments: - - THTensor* self - - long dim - - THIndexTensor* index - - THTensor* source -]] -[[ - name: _th_take - cpu_bool: True - cuda_bool: True - cname: take - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THIndexTensor* index -]] -[[ - name: _th_put_ - cpu_bool: True - cuda_bool: True - cname: put - variants: function - backends: - - CPU - - CUDA - return: argument 0 - arguments: - - THTensor* self - - THIndexTensor* index - - THTensor* source - - bool accumulate -]] -[[ - name: _th_index_fill_ - cpu_bool: True - cuda_bool: True - cname: indexFill - variants: function - return: argument 0 - options: - - arguments: - - THTensor* self - - long dim - - THIndexTensor* index - - real value -]] -[[ - name: _th_mode - variants: function - cname: mode - return: argument 0,1 - arguments: - - arg: THTensor* values - output: True - - arg: THIndexTensor* indices - output: True - - THTensor* self - - long dim - - bool keepdim -]] -[[ - name: _th_sort - cname: sort - cpu_half: True - variants: - - function - return: argument 0,1 - arguments: - - arg: THTensor* values - output: True - - arg: THIndexTensor* indices - output: True - - THTensor* self - - long dim - - bool descending -]] -[[ - name: _th_topk - cname: topk - cuda_bfloat16: True - backends: - - CUDA - variants: - - function - return: argument 0,1 - arguments: - - arg: THTensor* values - output: True - - arg: THIndexTensor* indices - output: True - - THTensor* self - - long k - - long dim - - bool largest - - bool sorted -]] -[[ - name: _th_var - types: - - floating_point - backends: - - CPU - - CUDA - variants: function - options: - - cname: var_all - return: accreal - arguments: - - THTensor* self - - bool unbiased -]] -[[ - name: _th_std - types: - - floating_point - backends: - - CPU - - CUDA - variants: function - options: - - cname: std_all - return: accreal - arguments: - - THTensor* self - - bool unbiased -]] -[[ - name: _th_renorm - cname: renorm - types: - - floating_point - backends: - - CPU - - CUDA - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - real p - - long dim - - real maxnorm -]] -[[ - name: _th_renorm_ - types: - - floating_point - backends: - - CPU - - CUDA - cname: renorm - variants: function - return: self - arguments: - - THTensor* self - - THTensor* self - - real p - - long dim - - real maxnorm -]] -[[ - name: _th_histc - cname: histc - types: - - Float - - Double - backends: - - CPU - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - long bins - - real min - - real max -]] -[[ - name: _th_trace - cname: trace - variants: - - function - return: accreal - arguments: - - THTensor* self - backends: - - CPU -]] -[[ - name: _th_fmod - return: argument 0 - variants: - - function - backends: - - CUDA - options: - - cname: fmod - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - real other - - cname: cfmod - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THTensor* other -]] -[[ - name: _th_fmod_ - return: argument 0 - variants: function - backends: - - CUDA - options: - - cname: fmod - arguments: - - THTensor* self - - THTensor* self - - real other - - cname: cfmod - arguments: - - THTensor* self - - THTensor* self - - THTensor* other -]] -[[ - name: _th_cross_kernel - cname: crossKernel - variants: - - function - 
backends: - - CUDA - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THTensor* other - - arg: int64_t dim -]] -[[ - name: _th_addr - cname: addr - cpu_bfloat16: True - variants: function - return: argument 0 - backends: [CPU] - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THTensor* vec1 - - THTensor* vec2 - - real beta - - real alpha -]] -[[ - name: _th_addr_ - cpu_bfloat16: True - cname: addr - return: self - variants: function - backends: [CPU] - arguments: - - THTensor* self - - THTensor* self - - THTensor* vec1 - - THTensor* vec2 - - real beta - - real alpha -]] -[[ -[[ - name: _th_bmm - cuda_bfloat16: True - cname: baddbmm - variants: - - function - backends: - - CUDA - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - argument 0 - - THTensor* self - - THTensor* mat2 - - CONSTANT AS_REAL(0) - - CONSTANT AS_REAL(1) -]] -[[ - name: _th_baddbmm - cuda_bfloat16: True - cname: baddbmm - variants: - - function - backends: - - CUDA - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - arg: THTensor* self - - THTensor* batch1 - - THTensor* batch2 - - real beta - - real alpha -]] -[[ - name: _th_gels - cname: gels - types: - - Float - - Double - backends: - - CPU - - CUDA - variants: - - function - return: argument 0,1 - arguments: - - arg: THTensor* res1 - output: True - - arg: THTensor* res2 - output: True - - THTensor* self - - THTensor* A -]] -[[ - name: _th_eig - cname: geev - types: - - Float - - Double - backends: - - CPU - - CUDA - variants: - - function - return: argument 0,1 - arguments: - - arg: THTensor* res1 - output: True - - arg: THTensor* res2 - output: True - - THTensor* self - - bool eigenvectors -]] -[[ - name: _th_potri - cname: potri - types: - - Float - - Double - backends: - - CPU - - CUDA - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* output - output: True - - THTensor* self - - bool upper -]] -[[ - name: _th_geqrf - cname: geqrf - types: - - Float - - Double - backends: - - CPU - - CUDA - variants: - - function - return: argument 0,1 - arguments: - - arg: THTensor* res1 - output: True - - arg: THTensor* res2 - output: True - - THTensor* self -]] -[[ - name: _th_orgqr - cname: orgqr - types: - - Float - - Double - backends: - - CPU - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THTensor* input2 -]] -[[ - name: _th_ormqr - cname: ormqr - types: - - Float - - Double - backends: - - CPU - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THTensor* input2 - - THTensor* input3 - - bool left - - bool transpose -]] -[[ - name: _th_multinomial_alias_setup - cname: multinomialAliasSetup - variants: - - function - types: - - floating_point - backends: - - CPU - - CUDA - return: argument 1,2 - arguments: - - arg: THTensor* probs - - arg: THIndexTensor* J - output: True - - arg: THTensor* q - output: True -]] -[[ - name: _th_multinomial_alias_draw - cname: multinomialAliasDraw - types: - - floating_point - backends: - - CPU - - CUDA - variants: - - function - return: argument 0 - arguments: - - arg: THIndexTensor* result - output: True - - THTensor* q - - THIndexTensor* J - - long num_samples - - c10::optional generator -]] -[[ - name: _th_copy_ignoring_overlaps_ - cname: copyIgnoringOverlaps - return: self - variants: function - backends: - - CUDA - arguments: - - THTensor* self 
- - THTensor* src -]] diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.cpp b/aten/src/ATen/LegacyTHFunctionsCPU.cpp new file mode 100644 index 00000000000..a7413033c5c --- /dev/null +++ b/aten/src/ATen/LegacyTHFunctionsCPU.cpp @@ -0,0 +1,1712 @@ +#include + +// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.cpp + +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +namespace legacy { +namespace cpu { + +namespace { + ScalarType infer_scalar_type(const Tensor & t) { + return t.scalar_type(); + } + ScalarType infer_scalar_type(const TensorList & tl) { + TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + return tl[0].scalar_type(); + } + + TensorOptions options(ScalarType s) { + return TensorOptions().dtype(s) + .device(DeviceType::CPU) + .layout(kStrided); + } + + Allocator* allocator() { + return getCPUAllocator(); + } +} + +Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_maskedCopy(self_, mask_, source_); + break; + } + case 
ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THBFloat16Tensor_maskedCopy(self_, mask_, source_); + break; + } + default: + AT_ERROR("_th_masked_scatter_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, 
ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THBFloat16Tensor_maskedCopyBool(self_, mask_, source_); + break; + } + default: + AT_ERROR("_th_masked_scatter_bool_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_nonzero_out(Tensor & result, const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { 
+ case ScalarType::Bool: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_nonzero(result_, self_); + break; + } + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_nonzero(result_, self_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_nonzero(result_, self_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_nonzero(result_, self_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_nonzero(result_, self_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_nonzero(result_, self_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_nonzero(result_, self_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_nonzero(result_, self_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THHalfTensor_nonzero(result_, self_); + break; + } + case ScalarType::BFloat16: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THBFloat16Tensor_nonzero(result_, self_); + break; + } + default: + AT_ERROR("_th_nonzero_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_nonzero(const Tensor & self) { + // DeviceGuard omitted + auto 
dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_nonzero(result_, self_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_nonzero(result_, self_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_nonzero(result_, self_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_nonzero(result_, self_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_nonzero(result_, self_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_nonzero(result_, self_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_nonzero(result_, self_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_nonzero(result_, self_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THHalfTensor_nonzero(result_, self_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THBFloat16Tensor_nonzero(result_, self_); + break; + } + default: + AT_ERROR("_th_nonzero not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, 
dispatch_scalar_type); + THByteTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_indexCopy(self_, dim, index_, source_); + break; + } + default: + AT_ERROR("_th_index_copy_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, 
dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THBoolTensor_take(result_, self_, index_); + break; + } + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THByteTensor_take(result_, self_, index_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THCharTensor_take(result_, self_, index_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THDoubleTensor_take(result_, self_, index_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THFloatTensor_take(result_, self_, index_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THIntTensor_take(result_, self_, index_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THLongTensor_take(result_, self_, index_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THShortTensor_take(result_, self_, index_); + break; + } + default: + AT_ERROR("_th_take_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor 
_th_take(const Tensor & self, const Tensor & index) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THBoolTensor_take(result_, self_, index_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THByteTensor_take(result_, self_, index_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THCharTensor_take(result_, self_, index_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THDoubleTensor_take(result_, self_, index_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THFloatTensor_take(result_, self_, index_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THIntTensor_take(result_, self_, index_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THLongTensor_take(result_, self_, index_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THShortTensor_take(result_, self_, index_); + break; + } + default: + AT_ERROR("_th_take not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, 
DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_put(self_, index_, source_, accumulate); + break; + } + default: + AT_ERROR("_th_put_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} 
+Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toBool(); + THBoolTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toByte(); + THByteTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toChar(); + THCharTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toDouble(); + THDoubleTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toFloat(); + THFloatTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toInt(); + THIntTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toLong(); + THLongTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toShort(); + THShortTensor_indexFill(self_, dim, index_, value_); + break; + } + default: + AT_ERROR("_th_index_fill_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +std::tuple _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch 
(dispatch_scalar_type) { + case ScalarType::Byte: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Char: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Double: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Float: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Int: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Long: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Short: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + default: + AT_ERROR("_th_mode_out not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(values, indices); +} +std::tuple 
_th_mode(const Tensor & self, int64_t dim, bool keepdim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto values_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto values = Tensor(c10::intrusive_ptr::reclaim(values_)); + auto indices_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto indices = Tensor(c10::intrusive_ptr::reclaim(indices_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + default: + AT_ERROR("_th_mode not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(values, indices); +} +std::tuple _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Char: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Double: 
{ + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Float: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Int: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Long: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Short: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Half: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THHalfTensor_sort(values_, indices_, self_, dim, descending); + break; + } + default: + AT_ERROR("_th_sort_out not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(values, indices); +} +std::tuple _th_sort(const Tensor & self, int64_t dim, bool descending) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto values_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto values = Tensor(c10::intrusive_ptr::reclaim(values_)); + auto indices_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, 
scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto indices = Tensor(c10::intrusive_ptr::reclaim(indices_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THHalfTensor_sort(values_, indices_, self_, dim, descending); + break; + } + default: + AT_ERROR("_th_sort not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(values, indices); +} +Tensor _th_var(const Tensor & self, bool unbiased) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THDoubleTensor_var_all(self_, unbiased)), options(ScalarType::Double)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THFloatTensor_var_all(self_, unbiased)), options(ScalarType::Float)); + break; + } + default: + AT_ERROR("_th_var not supported on CPUType for ", dispatch_scalar_type); + } +} +Tensor _th_std(const Tensor & self, bool unbiased) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THDoubleTensor_std_all(self_, unbiased)), options(ScalarType::Double)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THFloatTensor_std_all(self_, unbiased)), options(ScalarType::Float)); + 
break; + } + default: + AT_ERROR("_th_std not supported on CPUType for ", dispatch_scalar_type); + } +} +Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_out", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + THDoubleTensor_renorm(result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_out", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THFloatTensor_renorm(result_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + THDoubleTensor_renorm(result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THFloatTensor_renorm(result_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + THDoubleTensor_renorm(self_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THFloatTensor_renorm(self_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = 
checked_dense_tensor_unwrap(result, "result", 0, "_th_histc_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_histc_out", false, DeviceType::CPU, dispatch_scalar_type); + auto min_ = min.toDouble(); + auto max_ = max.toDouble(); + THDoubleTensor_histc(result_, self_, bins, min_, max_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_histc_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_histc_out", false, DeviceType::CPU, dispatch_scalar_type); + auto min_ = min.toFloat(); + auto max_ = max.toFloat(); + THFloatTensor_histc(result_, self_, bins, min_, max_); + break; + } + default: + AT_ERROR("_th_histc_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_histc", false, DeviceType::CPU, dispatch_scalar_type); + auto min_ = min.toDouble(); + auto max_ = max.toDouble(); + THDoubleTensor_histc(result_, self_, bins, min_, max_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_histc", false, DeviceType::CPU, dispatch_scalar_type); + auto min_ = min.toFloat(); + auto max_ = max.toFloat(); + THFloatTensor_histc(result_, self_, bins, min_, max_); + break; + } + default: + AT_ERROR("_th_histc not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_trace(const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THByteTensor_trace(self_)), options(ScalarType::Byte)); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THCharTensor_trace(self_)), options(ScalarType::Char)); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THDoubleTensor_trace(self_)), options(ScalarType::Double)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THFloatTensor_trace(self_)), options(ScalarType::Float)); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THIntTensor_trace(self_)), options(ScalarType::Int)); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, 
dispatch_scalar_type); + return at::scalar_tensor(convert(THLongTensor_trace(self_)), options(ScalarType::Long)); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THShortTensor_trace(self_)), options(ScalarType::Short)); + break; + } + default: + AT_ERROR("_th_trace not supported on CPUType for ", dispatch_scalar_type); + } +} +Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toByte(); + auto alpha_ = alpha.toByte(); + THByteTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toChar(); + auto alpha_ = alpha.toChar(); + THCharTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toDouble(); + auto alpha_ = alpha.toDouble(); + THDoubleTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toFloat(); + auto alpha_ = alpha.toFloat(); + THFloatTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); 
+ auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toInt(); + auto alpha_ = alpha.toInt(); + THIntTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toLong(); + auto alpha_ = alpha.toLong(); + THLongTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toShort(); + auto alpha_ = alpha.toShort(); + THShortTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::BFloat16: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toBFloat16(); + auto alpha_ = alpha.toBFloat16(); + THBFloat16Tensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + default: + AT_ERROR("_th_addr_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toByte(); + auto alpha_ = alpha.toByte(); + 
THByteTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toChar(); + auto alpha_ = alpha.toChar(); + THCharTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toDouble(); + auto alpha_ = alpha.toDouble(); + THDoubleTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toFloat(); + auto alpha_ = alpha.toFloat(); + THFloatTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toInt(); + auto alpha_ = alpha.toInt(); + THIntTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toLong(); + auto alpha_ = alpha.toLong(); + THLongTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toShort(); + auto alpha_ = alpha.toShort(); + THShortTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = 
checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toBFloat16(); + auto alpha_ = alpha.toBFloat16(); + THBFloat16Tensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + default: + AT_ERROR("_th_addr not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toByte(); + auto alpha_ = alpha.toByte(); + THByteTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toChar(); + auto alpha_ = alpha.toChar(); + THCharTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toDouble(); + auto alpha_ = alpha.toDouble(); + THDoubleTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toFloat(); + auto alpha_ = alpha.toFloat(); + THFloatTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toInt(); + auto alpha_ = alpha.toInt(); + THIntTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, 
"_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toLong(); + auto alpha_ = alpha.toLong(); + THLongTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toShort(); + auto alpha_ = alpha.toShort(); + THShortTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toBFloat16(); + auto alpha_ = alpha.toBFloat16(); + THBFloat16Tensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + default: + AT_ERROR("_th_addr_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +std::tuple _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_gels(res1_, res2_, self_, A_); + break; + } + case ScalarType::Float: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_gels(res1_, res2_, self_, A_); + break; + } + default: + AT_ERROR("_th_gels_out not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_gels(const Tensor & self, const Tensor & A) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto res1_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res1 = Tensor(c10::intrusive_ptr::reclaim(res1_)); + auto res2_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res2 = Tensor(c10::intrusive_ptr::reclaim(res2_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ 
= checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CPU, dispatch_scalar_type); + auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_gels(res1_, res2_, self_, A_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CPU, dispatch_scalar_type); + auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_gels(res1_, res2_, self_, A_); + break; + } + default: + AT_ERROR("_th_gels not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_geev(res1_, res2_, self_, eigenvectors); + break; + } + case ScalarType::Float: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_geev(res1_, res2_, self_, eigenvectors); + break; + } + default: + AT_ERROR("_th_eig_out not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_eig(const Tensor & self, bool eigenvectors) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto res1_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res1 = Tensor(c10::intrusive_ptr::reclaim(res1_)); + auto res2_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res2 = Tensor(c10::intrusive_ptr::reclaim(res2_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_geev(res1_, res2_, self_, eigenvectors); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_geev(res1_, res2_, self_, eigenvectors); + break; + } + default: + AT_ERROR("_th_eig not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto output_ = checked_dense_tensor_unwrap(output, "output", 0, "_th_potri_out", false, DeviceType::CPU, dispatch_scalar_type); + auto 
self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_potri(output_, self_, upper); + break; + } + case ScalarType::Float: { + auto output_ = checked_dense_tensor_unwrap(output, "output", 0, "_th_potri_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_potri(output_, self_, upper); + break; + } + default: + AT_ERROR("_th_potri_out not supported on CPUType for ", dispatch_scalar_type); + } + return output; +} +Tensor _th_potri(const Tensor & self, bool upper) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_potri(output_, self_, upper); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_potri(output_, self_, upper); + break; + } + default: + AT_ERROR("_th_potri not supported on CPUType for ", dispatch_scalar_type); + } + return output; +} +std::tuple _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_geqrf(res1_, res2_, self_); + break; + } + case ScalarType::Float: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_geqrf(res1_, res2_, self_); + break; + } + default: + AT_ERROR("_th_geqrf_out not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_geqrf(const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto res1_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res1 = Tensor(c10::intrusive_ptr::reclaim(res1_)); + auto res2_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res2 = Tensor(c10::intrusive_ptr::reclaim(res2_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf", false, 
DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_geqrf(res1_, res2_, self_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_geqrf(res1_, res2_, self_); + break; + } + default: + AT_ERROR("_th_geqrf not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +Tensor & _th_orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_orgqr(result_, self_, input2_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_orgqr(result_, self_, input2_); + break; + } + default: + AT_ERROR("_th_orgqr_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_orgqr(const Tensor & self, const Tensor & input2) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_orgqr(result_, self_, input2_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_orgqr(result_, self_, input2_); + break; + } + default: + AT_ERROR("_th_orgqr not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_ormqr_out", false, DeviceType::CPU, 
dispatch_scalar_type); + auto input3_ = checked_dense_tensor_unwrap(input3, "input3", 3, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_ormqr(result_, self_, input2_, input3_, left, transpose); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto input3_ = checked_dense_tensor_unwrap(input3, "input3", 3, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_ormqr(result_, self_, input2_, input3_, left, transpose); + break; + } + default: + AT_ERROR("_th_ormqr_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input3_ = checked_dense_tensor_unwrap(input3, "input3", 3, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_ormqr(result_, self_, input2_, input3_, left, transpose); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input3_ = checked_dense_tensor_unwrap(input3, "input3", 3, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_ormqr(result_, self_, input2_, input3_, left, transpose); + break; + } + default: + AT_ERROR("_th_ormqr not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +std::tuple _th_multinomial_alias_setup_out(Tensor & J, Tensor & q, const Tensor & probs) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(J); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_multinomialAliasSetup(probs_, J_, q_); + break; + } + case ScalarType::Float: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, ScalarType::Long); 
+            auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, dispatch_scalar_type);
+            THFloatTensor_multinomialAliasSetup(probs_, J_, q_);
+            break;
+        }
+        default:
+            AT_ERROR("_th_multinomial_alias_setup_out not supported on CPUType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor &, Tensor &>(J, q);
+}
+std::tuple<Tensor,Tensor> _th_multinomial_alias_setup(const Tensor & probs) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(probs);
+    auto J_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(ScalarType::Long)).release();
+    auto J = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(J_));
+    auto q_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto q = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(q_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup", false, DeviceType::CPU, dispatch_scalar_type);
+            THDoubleTensor_multinomialAliasSetup(probs_, J_, q_);
+            break;
+        }
+        case ScalarType::Float: {
+            auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup", false, DeviceType::CPU, dispatch_scalar_type);
+            THFloatTensor_multinomialAliasSetup(probs_, J_, q_);
+            break;
+        }
+        default:
+            AT_ERROR("_th_multinomial_alias_setup not supported on CPUType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor, Tensor>(J, q);
+}
+Tensor & _th_multinomial_alias_draw_out(Tensor & result, const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(result);
+
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, ScalarType::Long);
+            auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, dispatch_scalar_type);
+            auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, ScalarType::Long);
+            THDoubleTensor_multinomialAliasDraw(result_, q_, J_, num_samples, generator);
+            break;
+        }
+        case ScalarType::Float: {
+            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, ScalarType::Long);
+            auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, dispatch_scalar_type);
+            auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, ScalarType::Long);
+            THFloatTensor_multinomialAliasDraw(result_, q_, J_, num_samples, generator);
+            break;
+        }
+        default:
+            AT_ERROR("_th_multinomial_alias_draw_out not supported on CPUType for ", dispatch_scalar_type);
+    }
+    return result;
+}
+Tensor _th_multinomial_alias_draw(const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(q);
+    auto result_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(ScalarType::Long)).release();
+    auto result = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(result_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw", false, DeviceType::CPU, dispatch_scalar_type);
+            auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw", false, DeviceType::CPU, ScalarType::Long);
+            THDoubleTensor_multinomialAliasDraw(result_, q_, J_, num_samples, generator);
+            break;
+        }
+        case ScalarType::Float: {
+            auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw", false, DeviceType::CPU, dispatch_scalar_type);
+            auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw", false, DeviceType::CPU, ScalarType::Long);
+            THFloatTensor_multinomialAliasDraw(result_, q_, J_, num_samples, generator);
+            break;
+        }
+        default:
+            AT_ERROR("_th_multinomial_alias_draw not supported on CPUType for ", dispatch_scalar_type);
+    }
+    return result;
+}
+
+} // namespace th
+} // namespace legacy
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.h b/aten/src/ATen/LegacyTHFunctionsCPU.h
new file mode 100644
index 00000000000..1abca1b1f91
--- /dev/null
+++ b/aten/src/ATen/LegacyTHFunctionsCPU.h
@@ -0,0 +1,67 @@
+#pragma once
+
+// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.h
+
+#include
+#include
+#include
+
+namespace c10 {
+class Scalar;
+}
+namespace at {
+struct Generator;
+class Tensor;
+struct Type;
+} // namespace at
+
+namespace at {
+namespace native {
+namespace legacy {
+namespace cpu {
+
+Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source);
+Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source);
+Tensor & _th_nonzero_out(Tensor & result, const Tensor & self);
+Tensor _th_nonzero(const Tensor & self);
+Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source);
+Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index);
+Tensor _th_take(const Tensor & self, const Tensor & index);
+Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate);
+Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value);
+std::tuple<Tensor &,Tensor &> _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim);
+std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdim);
+std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending);
+std::tuple<Tensor,Tensor> _th_sort(const Tensor & self, int64_t dim, bool descending);
+Tensor _th_var(const Tensor & self, bool unbiased);
+Tensor _th_std(const Tensor & self, bool unbiased);
+Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max);
+Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max);
+Tensor _th_trace(const Tensor & self);
+Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
+Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
+Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
+std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
+std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
+std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors);
+std::tuple<Tensor,Tensor> _th_eig(const Tensor & self, bool eigenvectors);
+Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper);
+Tensor _th_potri(const Tensor & self, bool upper);
+std::tuple<Tensor &,Tensor &> _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self);
+std::tuple<Tensor,Tensor> _th_geqrf(const Tensor & self);
+Tensor & _th_orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2);
+Tensor _th_orgqr(const Tensor & self, const Tensor & input2);
+Tensor & _th_ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose);
+Tensor _th_ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose);
+std::tuple<Tensor &,Tensor &> _th_multinomial_alias_setup_out(Tensor & J, Tensor & q, const Tensor & probs);
+std::tuple<Tensor,Tensor> _th_multinomial_alias_setup(const Tensor & probs);
+Tensor & _th_multinomial_alias_draw_out(Tensor & result, const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator);
+Tensor _th_multinomial_alias_draw(const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator);
+
+} // namespace th
+} // namespace legacy
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h
new file mode 100644
index 00000000000..8e2410cc87e
--- /dev/null
+++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h
@@ -0,0 +1,111 @@
+#pragma once
+
+// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.h
+
+#include
+#include
+#include
+
+namespace c10 {
+class Scalar;
+}
+namespace at {
+struct Generator;
+class Tensor;
+struct Type;
+} // namespace at
+
+namespace at {
+namespace native {
+namespace legacy {
+namespace cuda {
+
+Tensor & _th_masked_fill_(Tensor & self, const Tensor & mask, Scalar value);
+Tensor & _th_masked_fill_bool_(Tensor & self, const Tensor & mask, Scalar value);
+Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source);
+Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source);
+Tensor & _th_nonzero_out(Tensor & result, const Tensor & self);
+Tensor _th_nonzero(const Tensor & self);
+Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source);
+Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index);
+Tensor _th_take(const Tensor & self, const Tensor & index);
+Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate);
+Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value);
+std::tuple<Tensor &,Tensor &> _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim);
+std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdim);
+std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending);
+std::tuple<Tensor,Tensor> _th_sort(const Tensor & self, int64_t dim, bool descending);
+std::tuple<Tensor &,Tensor &> _th_topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted);
+std::tuple<Tensor,Tensor> _th_topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted);
+Tensor _th_var(const Tensor & self, bool unbiased);
+Tensor _th_std(const Tensor & self, bool unbiased);
+Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor & _th_fmod_out(Tensor & result, const Tensor & self, Scalar other);
+Tensor _th_fmod(const Tensor & self, Scalar other);
+Tensor & _th_fmod_out(Tensor & result, const Tensor & self, const Tensor & other);
+Tensor _th_fmod(const Tensor & self, const Tensor & other);
+Tensor & _th_fmod_(Tensor & self, Scalar other);
+Tensor & _th_fmod_(Tensor & self, const Tensor & other);
+Tensor & _th_cross_kernel_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim);
+Tensor _th_cross_kernel(const Tensor & self, const Tensor & other, int64_t dim);
+Tensor & _th_bmm_out(Tensor & result, const Tensor & self, const Tensor & mat2);
+Tensor _th_bmm(const Tensor & self, const Tensor & mat2);
+Tensor & _th_baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha);
+Tensor _th_baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha);
+std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
+std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
+std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors);
+std::tuple<Tensor,Tensor> _th_eig(const Tensor & self, bool eigenvectors);
+Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper);
+Tensor _th_potri(const Tensor & self, bool upper);
+std::tuple<Tensor &,Tensor &> _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self);
+std::tuple<Tensor,Tensor> _th_geqrf(const Tensor & self);
+std::tuple<Tensor &,Tensor &> _th_multinomial_alias_setup_out(Tensor & J, Tensor & q, const Tensor & probs);
+std::tuple<Tensor,Tensor> _th_multinomial_alias_setup(const Tensor & probs);
+Tensor & _th_multinomial_alias_draw_out(Tensor & result, const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator);
+Tensor _th_multinomial_alias_draw(const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator);
+Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src);
+Tensor & _thnn_multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
+Tensor _thnn_multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
+Tensor & _thnn_multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
+Tensor _thnn_multi_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
+std::tuple<Tensor &,Tensor &> _thnn_multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, int64_t reduction);
+std::tuple<Tensor,Tensor> _thnn_multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, int64_t reduction);
+Tensor & _thnn_multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target);
+Tensor _thnn_multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target);
+std::tuple<Tensor &,Tensor &> _thnn_nll_loss_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index);
+std::tuple<Tensor,Tensor> _thnn_nll_loss_forward(const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index);
+Tensor & _thnn_nll_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight);
+Tensor _thnn_nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight);
+std::tuple<Tensor &,Tensor &> _thnn_nll_loss2d_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index);
+std::tuple<Tensor,Tensor> _thnn_nll_loss2d_forward(const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index);
+Tensor & _thnn_nll_loss2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight);
+Tensor _thnn_nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight);
+Tensor & _thnn_glu_forward_out(Tensor & output, const Tensor & self, int64_t dim);
+Tensor _thnn_glu_forward(const Tensor & self, int64_t dim);
+Tensor & _thnn_glu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim);
+Tensor _thnn_glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim);
+std::tuple<Tensor &,Tensor &> _thnn_log_sigmoid_forward_out(Tensor & output, Tensor & buffer, const Tensor & self);
+std::tuple<Tensor,Tensor> _thnn_log_sigmoid_forward(const Tensor & self);
+Tensor & _thnn_log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer);
+Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer);
+Tensor & _thnn_rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
+Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
+Tensor & _thnn_rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training);
+Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training);
+Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
+std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding);
+std::tuple<Tensor,Tensor,Tensor> _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding);
+std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias,
const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones); +std::tuple _thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones, std::array output_mask); +Tensor & _thnn_conv_depthwise2d_forward_out(Tensor & output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation); +Tensor _thnn_conv_depthwise2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation); +std::tuple _thnn_conv_depthwise2d_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation); +std::tuple _thnn_conv_depthwise2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, std::array output_mask); + +} // namespace th +} // namespace legacy +} // namespace native +} // namespace at diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp new file mode 100644 index 00000000000..2b07a19de0e --- /dev/null +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -0,0 +1,4176 @@ +#include + +// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.cpp + +#include +#include +#include +#include +#include +#include +#include +#include +#undef THNN_ +#undef THCIndexTensor_ +#include +#include +#include +#include + +namespace at { +namespace native { +namespace legacy { +namespace cuda { + +namespace { + ScalarType infer_scalar_type(const Tensor & t) { + return t.scalar_type(); + } + ScalarType infer_scalar_type(const TensorList & tl) { + TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + return tl[0].scalar_type(); + } + + TensorOptions options(ScalarType s) { + return TensorOptions().dtype(s) + .device(DeviceType::CUDA) + .layout(kStrided); + } + + Allocator* allocator() { + return at::cuda::getCUDADeviceAllocator(); + } +} + +Tensor & _th_masked_fill_(Tensor & self, const Tensor & mask, Scalar value) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toBool(); + THCudaBoolTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toByte(); + THCudaByteTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, 
DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toChar(); + THCudaCharTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toDouble(); + THCudaDoubleTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toFloat(); + THCudaTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toInt(); + THCudaIntTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toLong(); + THCudaLongTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toShort(); + THCudaShortTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toHalf(); + THCudaHalfTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toBFloat16(); + THCudaBFloat16Tensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + default: + AT_ERROR("_th_masked_fill_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_masked_fill_bool_(Tensor & self, const Tensor & mask, Scalar value) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = 
checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toBool(); + THCudaBoolTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toByte(); + THCudaByteTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toChar(); + THCudaCharTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toDouble(); + THCudaDoubleTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toFloat(); + THCudaTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toInt(); + THCudaIntTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toLong(); + THCudaLongTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toShort(); + THCudaShortTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 
2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toHalf(); + THCudaHalfTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toBFloat16(); + THCudaBFloat16Tensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + default: + AT_ERROR("_th_masked_fill_bool_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, 
source_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + default: + AT_ERROR("_th_masked_scatter_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = 
checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, 
"_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + default: + AT_ERROR("_th_masked_scatter_bool_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_nonzero_out(Tensor & result, const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, 
DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::BFloat16: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + default: + AT_ERROR("_th_nonzero_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_nonzero(const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", 
false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + default: + AT_ERROR("_th_nonzero not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Float: { + auto self_ = 
checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + default: + AT_ERROR("_th_index_copy_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaBoolTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, 
"self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaByteTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaCharTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaDoubleTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaIntTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaLongTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaShortTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); 
+ auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaHalfTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + default: + AT_ERROR("_th_take_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_take(const Tensor & self, const Tensor & index) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaBoolTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaByteTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaCharTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaDoubleTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaIntTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaLongTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + 
THCudaShortTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaHalfTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + default: + AT_ERROR("_th_take not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = 
checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + default: + AT_ERROR("_th_put_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toBool(); + THCudaBoolTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toByte(); + THCudaByteTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toChar(); + THCudaCharTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto 
value_ = value.toDouble(); + THCudaDoubleTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toFloat(); + THCudaTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toInt(); + THCudaIntTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toLong(); + THCudaLongTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toShort(); + THCudaShortTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toHalf(); + THCudaHalfTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + default: + AT_ERROR("_th_index_fill_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +std::tuple _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Char: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_mode(globalContext().getTHCState(), values_, indices_, self_, 
dim, keepdim); + break; + } + case ScalarType::Double: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Float: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Int: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Long: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Short: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Half: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + default: + AT_ERROR("_th_mode_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple<Tensor &,Tensor &>(values, indices); +} +std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto values_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto values = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(values_)); + auto indices_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto indices = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(indices_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + default: + AT_ERROR("_th_mode not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple<Tensor,Tensor>(values, indices); +} +std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Char: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, 
"_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Double: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Float: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Int: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Long: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Short: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Half: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + default: + AT_ERROR("_th_sort_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(values, 
indices); +} +std::tuple _th_sort(const Tensor & self, int64_t dim, bool descending) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto values_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto values = Tensor(c10::intrusive_ptr::reclaim(values_)); + auto indices_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto indices = Tensor(c10::intrusive_ptr::reclaim(indices_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + default: + AT_ERROR("_th_sort not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(values, indices); +} +std::tuple _th_topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + 
THCudaByteTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Char: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Double: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Float: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Int: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Long: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Short: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Half: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, 
DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::BFloat16: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + default: + AT_ERROR("_th_topk_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple<Tensor &,Tensor &>(values, indices); +} +std::tuple<Tensor,Tensor> _th_topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto values_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto values = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(values_)); + auto indices_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto indices = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(indices_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + default: + AT_ERROR("_th_topk not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple<Tensor,Tensor>(values, indices); +} +Tensor _th_var(const Tensor & self, bool unbiased) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<double>(THCudaDoubleTensor_var_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Double)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<float>(THCudaTensor_var_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Float)); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<Half>(THCudaHalfTensor_var_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Half)); + break; + } + default: + AT_ERROR("_th_var not supported on CUDAType for ", dispatch_scalar_type); + } +} +Tensor _th_std(const Tensor & self, bool unbiased) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<double>(THCudaDoubleTensor_std_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Double)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<float>(THCudaTensor_std_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Float)); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<Half>(THCudaHalfTensor_std_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Half)); + break; + } + default: + AT_ERROR("_th_std not supported on CUDAType for ", dispatch_scalar_type); + } +} +Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + 
THCudaDoubleTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THCudaTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toHalf(); + auto maxnorm_ = maxnorm.toHalf(); + THCudaHalfTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + THCudaDoubleTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THCudaTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toHalf(); + auto maxnorm_ = maxnorm.toHalf(); + THCudaHalfTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + THCudaDoubleTensor_renorm(globalContext().getTHCState(), self_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THCudaTensor_renorm(globalContext().getTHCState(), self_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, 
"_th_renorm_", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toHalf(); + auto maxnorm_ = maxnorm.toHalf(); + THCudaHalfTensor_renorm(globalContext().getTHCState(), self_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_fmod_out(Tensor & result, const Tensor & self, Scalar other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toByte(); + THCudaByteTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toChar(); + THCudaCharTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toDouble(); + THCudaDoubleTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toFloat(); + THCudaTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toInt(); + THCudaIntTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toLong(); + THCudaLongTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toShort(); + THCudaShortTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = 
checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toHalf(); + THCudaHalfTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_fmod(const Tensor & self, Scalar other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toByte(); + THCudaByteTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toChar(); + THCudaCharTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toDouble(); + THCudaDoubleTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toFloat(); + THCudaTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toInt(); + THCudaIntTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toLong(); + THCudaLongTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toShort(); + THCudaShortTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toHalf(); + THCudaHalfTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_fmod_out(Tensor & result, const Tensor & self, const Tensor & other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = 
checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, 
"other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_fmod(const Tensor & self, const Tensor & other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + 
THCudaHalfTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_fmod_(Tensor & self, Scalar other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toByte(); + THCudaByteTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toChar(); + THCudaCharTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toDouble(); + THCudaDoubleTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toFloat(); + THCudaTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toInt(); + THCudaIntTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toLong(); + THCudaLongTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toShort(); + THCudaShortTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toHalf(); + THCudaHalfTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_fmod_(Tensor & self, const Tensor & other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + 
} + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_cross_kernel_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_crossKernel(globalContext().getTHCState(), 
result_, self_, other_, dim); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + default: + AT_ERROR("_th_cross_kernel_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_cross_kernel(const Tensor & self, const Tensor & other, int64_t dim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + 
auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + default: + AT_ERROR("_th_cross_kernel not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_bmm_out(Tensor & 
result, const Tensor & self, const Tensor & mat2) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, uint8_t(0), uint8_t(1)); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int8_t(0), int8_t(1)); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, double(0), double(1)); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, float(0), float(1)); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int(0), int(1)); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int64_t(0), int64_t(1)); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, 
"_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int16_t(0), int16_t(1)); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, Half(0), Half(1)); + break; + } + case ScalarType::BFloat16: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, BFloat16(0), BFloat16(1)); + break; + } + default: + AT_ERROR("_th_bmm_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_bmm(const Tensor & self, const Tensor & mat2) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, uint8_t(0), uint8_t(1)); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int8_t(0), int8_t(1)); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, double(0), double(1)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, float(0), float(1)); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, 
"self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int(0), int(1)); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int64_t(0), int64_t(1)); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int16_t(0), int16_t(1)); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, Half(0), Half(1)); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, BFloat16(0), BFloat16(1)); + break; + } + default: + AT_ERROR("_th_bmm not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toByte(); + auto alpha_ = alpha.toByte(); + THCudaByteTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ 
= beta.toChar(); + auto alpha_ = alpha.toChar(); + THCudaCharTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toDouble(); + auto alpha_ = alpha.toDouble(); + THCudaDoubleTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toFloat(); + auto alpha_ = alpha.toFloat(); + THCudaTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toInt(); + auto alpha_ = alpha.toInt(); + THCudaIntTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toLong(); + auto alpha_ = alpha.toLong(); + THCudaLongTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = 
checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toShort(); + auto alpha_ = alpha.toShort(); + THCudaShortTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toHalf(); + auto alpha_ = alpha.toHalf(); + THCudaHalfTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::BFloat16: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toBFloat16(); + auto alpha_ = alpha.toBFloat16(); + THCudaBFloat16Tensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + default: + AT_ERROR("_th_baddbmm_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toByte(); + auto alpha_ = alpha.toByte(); + THCudaByteTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toChar(); + auto alpha_ = alpha.toChar(); + THCudaCharTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + 
break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toDouble(); + auto alpha_ = alpha.toDouble(); + THCudaDoubleTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toFloat(); + auto alpha_ = alpha.toFloat(); + THCudaTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toInt(); + auto alpha_ = alpha.toInt(); + THCudaIntTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toLong(); + auto alpha_ = alpha.toLong(); + THCudaLongTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toShort(); + auto alpha_ = alpha.toShort(); + THCudaShortTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toHalf(); + auto alpha_ = alpha.toHalf(); + THCudaHalfTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + 
            break;
+        }
+        case ScalarType::BFloat16: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto beta_ = beta.toBFloat16();
+            auto alpha_ = alpha.toBFloat16();
+            THCudaBFloat16Tensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_);
+            break;
+        }
+        default:
+            AT_ERROR("_th_baddbmm not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return result;
+}
+std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(self);
+
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaDoubleTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
+            break;
+        }
+        case ScalarType::Float: {
+            auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
+            break;
+        }
+        default:
+            AT_ERROR("_th_gels_out not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor &,Tensor &>(res1, res2);
+}
+std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(self);
+    auto res1_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto res1 = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(res1_));
+    auto res2_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto res2 = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(res2_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaDoubleTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
+            break;
+        }
+        case ScalarType::Float: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
+            break;
+        }
+        default:
+            AT_ERROR("_th_gels not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor,Tensor>(res1, res2);
+}
+std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(self);
+
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaDoubleTensor_geev(globalContext().getTHCState(), res1_, res2_, self_, eigenvectors);
+            break;
+        }
+        case ScalarType::Float: {
+            auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaTensor_geev(globalContext().getTHCState(), res1_, res2_, self_, eigenvectors);
+            break;
+        }
+        default:
+            AT_ERROR("_th_eig_out not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor &,Tensor &>(res1, res2);
+}
+std::tuple<Tensor,Tensor> _th_eig(const Tensor & self, bool eigenvectors) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(self);
+    auto res1_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto res1 = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(res1_));
+    auto res2_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto res2 = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(res2_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaDoubleTensor_geev(globalContext().getTHCState(), res1_, res2_, self_, eigenvectors);
+            break;
+        }
+        case ScalarType::Float: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaTensor_geev(globalContext().getTHCState(), res1_, res2_, self_, eigenvectors);
+            break;
+        }
+        default:
+            AT_ERROR("_th_eig not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor,Tensor>(res1, res2);
+}
+Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(self);
+
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto output_ = checked_dense_tensor_unwrap(output, "output", 0, "_th_potri_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaDoubleTensor_potri(globalContext().getTHCState(), output_, self_, upper);
+            break;
+        }
+        case ScalarType::Float: {
+            auto output_ = checked_dense_tensor_unwrap(output, "output", 0, "_th_potri_out", false, DeviceType::CUDA,
dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_potri(globalContext().getTHCState(), output_, self_, upper); + break; + } + default: + AT_ERROR("_th_potri_out not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor _th_potri(const Tensor & self, bool upper) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_potri(globalContext().getTHCState(), output_, self_, upper); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_potri(globalContext().getTHCState(), output_, self_, upper); + break; + } + default: + AT_ERROR("_th_potri not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +std::tuple _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_geqrf(globalContext().getTHCState(), res1_, res2_, self_); + break; + } + case ScalarType::Float: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_geqrf(globalContext().getTHCState(), res1_, res2_, self_); + break; + } + default: + AT_ERROR("_th_geqrf_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_geqrf(const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto res1_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res1 = Tensor(c10::intrusive_ptr::reclaim(res1_)); + auto res2_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res2 = Tensor(c10::intrusive_ptr::reclaim(res2_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_geqrf(globalContext().getTHCState(), res1_, res2_, self_); + break; + } + case 
ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_geqrf(globalContext().getTHCState(), res1_, res2_, self_); + break; + } + default: + AT_ERROR("_th_geqrf not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_multinomial_alias_setup_out(Tensor & J, Tensor & q, const Tensor & probs) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(J); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + case ScalarType::Float: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + case ScalarType::Half: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + default: + AT_ERROR("_th_multinomial_alias_setup_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(J, q); +} +std::tuple _th_multinomial_alias_setup(const Tensor & probs) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(probs); + auto J_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto J = Tensor(c10::intrusive_ptr::reclaim(J_)); + auto q_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto q = Tensor(c10::intrusive_ptr::reclaim(q_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + case ScalarType::Float: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + case ScalarType::Half: { + auto probs_ = 
checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + default: + AT_ERROR("_th_multinomial_alias_setup not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(J, q); +} +Tensor & _th_multinomial_alias_draw_out(Tensor & result, const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional generator) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(result); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaDoubleTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaHalfTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + default: + AT_ERROR("_th_multinomial_alias_draw_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_multinomial_alias_draw(const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional generator) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(q); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw", false, DeviceType::CUDA, ScalarType::Long); + THCudaDoubleTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + case ScalarType::Float: { + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, 
"_th_multinomial_alias_draw", false, DeviceType::CUDA, ScalarType::Long); + THCudaTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + case ScalarType::Half: { + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw", false, DeviceType::CUDA, ScalarType::Long); + THCudaHalfTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + default: + AT_ERROR("_th_multinomial_alias_draw not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, 
"_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + default: + AT_ERROR("_th_copy_ignoring_overlaps_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _thnn_multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + default: + AT_ERROR("_thnn_multi_margin_loss_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor _thnn_multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + default: + AT_ERROR("_thnn_multi_margin_loss_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor & _thnn_multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + default: + AT_ERROR("_thnn_multi_margin_loss_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_multi_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + default: + AT_ERROR("_thnn_multi_margin_loss_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +std::tuple _thnn_multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto output_ = checked_dense_tensor_unwrap(output, "output", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto output_ = checked_dense_tensor_unwrap(output, "output", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto 
is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto output_ = checked_dense_tensor_unwrap(output, "output", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto output_ = checked_dense_tensor_unwrap(output, "output", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16MultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + default: + AT_ERROR("_thnn_multilabel_margin_loss_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, is_target); +} +std::tuple _thnn_multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + auto is_target_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto is_target = Tensor(c10::intrusive_ptr::reclaim(is_target_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + THNN_CudaDoubleMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, 
"target", 2, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + THNN_CudaMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + THNN_CudaHalfMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + THNN_CudaBFloat16MultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + default: + AT_ERROR("_thnn_multilabel_margin_loss_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, is_target); +} +Tensor & _thnn_multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, 
dispatch_scalar_type); + THNN_CudaMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16MultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + default: + AT_ERROR("_thnn_multilabel_margin_loss_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, 
"_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16MultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + default: + AT_ERROR("_thnn_multilabel_margin_loss_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +std::tuple _thnn_nll_loss_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto 
weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16ClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, total_weight); +} +std::tuple _thnn_nll_loss_forward(const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + auto total_weight_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto total_weight = Tensor(c10::intrusive_ptr::reclaim(total_weight_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16ClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, total_weight); +} +Tensor & _thnn_nll_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16ClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16ClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +std::tuple _thnn_nll_loss2d_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss2d_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, total_weight); +} +std::tuple _thnn_nll_loss2d_forward(const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + auto total_weight_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto total_weight = Tensor(c10::intrusive_ptr::reclaim(total_weight_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss2d_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, total_weight); +} +Tensor & _thnn_nll_loss2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss2d_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss2d_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor & _thnn_glu_forward_out(Tensor & output, const Tensor & self, int64_t dim) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 2, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 2, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 2, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + default: + AT_ERROR("_thnn_glu_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor _thnn_glu_forward(const Tensor & self, int64_t dim) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + default: + AT_ERROR("_thnn_glu_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor & _thnn_glu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward_out", false, 
DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, dim); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, dim); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, dim); + break; + } + default: + AT_ERROR("_thnn_glu_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, dim); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, dim); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, 
dim); + break; + } + default: + AT_ERROR("_thnn_glu_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +std::tuple _thnn_log_sigmoid_forward_out(Tensor & output, Tensor & buffer, const Tensor & self) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + default: + AT_ERROR("_thnn_log_sigmoid_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, buffer); +} +std::tuple _thnn_log_sigmoid_forward(const Tensor & self) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + auto buffer_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto buffer = Tensor(c10::intrusive_ptr::reclaim(buffer_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + case ScalarType::Half: { + auto self_ = 
checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + default: + AT_ERROR("_thnn_log_sigmoid_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, buffer); +} +Tensor & _thnn_log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_); + break; + } + default: + AT_ERROR("_thnn_log_sigmoid_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = 
Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(grad_input_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaDoubleLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_);
+            break;
+        }
+        case ScalarType::Float: {
+            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_);
+            break;
+        }
+        case ScalarType::Half: {
+            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaHalfLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_);
+            break;
+        }
+        default:
+            AT_ERROR("_thnn_log_sigmoid_backward not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return grad_input;
+}
+Tensor & _thnn_rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<Generator> generator) {
+    const OptionalDeviceGuard device_guard(device_of(self));
+    auto dispatch_scalar_type = infer_scalar_type(self);
+
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        case ScalarType::Float: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        case ScalarType::Half: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        default:
+            AT_ERROR("_thnn_rrelu_with_noise_forward_out not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return output;
+}
+Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<Generator> generator) {
+    const OptionalDeviceGuard device_guard(device_of(self));
+    auto dispatch_scalar_type = infer_scalar_type(self);
+    auto output_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto output = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(output_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        case ScalarType::Float: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        case ScalarType::Half: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        default:
+            AT_ERROR("_thnn_rrelu_with_noise_forward not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return output;
+}
+Tensor & _thnn_rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) {
+    const OptionalDeviceGuard device_guard(device_of(self));
+    auto dispatch_scalar_type = infer_scalar_type(self);
+
+ switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 6, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 6, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 6, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + default: + AT_ERROR("_thnn_rrelu_with_noise_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, 
dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaDoubleRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaHalfRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + default: + AT_ERROR("_thnn_rrelu_with_noise_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional generator) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, 
"_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator); + break; + } + default: + AT_ERROR("_thnn_rrelu_with_noise_forward_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +std::tuple _thnn_conv2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + default: + AT_ERROR("_thnn_conv2d_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, columns, ones); +} +std::tuple _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + auto columns_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto columns = Tensor(c10::intrusive_ptr::reclaim(columns_)); + auto ones_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto ones = Tensor(c10::intrusive_ptr::reclaim(ones_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + THNN_CudaDoubleSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + THNN_CudaSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + THNN_CudaHalfSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + THNN_CudaBFloat16SpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + default: + AT_ERROR("_thnn_conv2d_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, columns, ones); +} +std::tuple _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = 
checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaDoubleSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaDoubleSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? 
grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaHalfSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaHalfSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? 
grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaBFloat16SpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaBFloat16SpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + default: + AT_ERROR("_thnn_conv2d_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(grad_input, grad_weight, grad_bias); +} +std::tuple _thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones, std::array output_mask) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = output_mask[0] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_input_)); + auto grad_weight_ = output_mask[1] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; + auto grad_weight = Tensor(c10::intrusive_ptr::reclaim(grad_weight_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_weight_)); + auto grad_bias_ = output_mask[2] ? 
c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; + auto grad_bias = Tensor(c10::intrusive_ptr::reclaim(grad_bias_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_bias_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaDoubleSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaDoubleSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? 
grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaHalfSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaHalfSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaBFloat16SpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaBFloat16SpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? 
grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + default: + AT_ERROR("_thnn_conv2d_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(grad_input, grad_weight, grad_bias); +} +Tensor & _thnn_conv_depthwise2d_forward_out(Tensor & output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto output_ = checked_dense_tensor_unwrap(output, "output", 7, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto output_ = checked_dense_tensor_unwrap(output, "output", 7, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto output_ = checked_dense_tensor_unwrap(output, "output", 7, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto output_ = checked_dense_tensor_unwrap(output, "output", 7, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + default: + AT_ERROR("_thnn_conv_depthwise2d_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor _thnn_conv_depthwise2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + THNN_CudaDoubleSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + THNN_CudaSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + THNN_CudaHalfSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + THNN_CudaBFloat16SpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + default: + AT_ERROR("_thnn_conv_depthwise2d_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +std::tuple _thnn_conv_depthwise2d_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaDoubleSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaDoubleSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? 
grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaHalfSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaHalfSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaBFloat16SpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? 
grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaBFloat16SpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + default: + AT_ERROR("_thnn_conv_depthwise2d_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(grad_input, grad_weight); +} +std::tuple _thnn_conv_depthwise2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, std::array output_mask) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = output_mask[0] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_input_)); + auto grad_weight_ = output_mask[1] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; + auto grad_weight = Tensor(c10::intrusive_ptr::reclaim(grad_weight_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_weight_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + if (grad_input_) THNN_CudaDoubleSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaDoubleSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? 
grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + if (grad_input_) THNN_CudaSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + if (grad_input_) THNN_CudaHalfSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaHalfSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? 
grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + if (grad_input_) THNN_CudaBFloat16SpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaBFloat16SpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + default: + AT_ERROR("_thnn_conv_depthwise2d_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(grad_input, grad_weight); +} + +} // namespace th +} // namespace legacy +} // namespace native +} // namespace at diff --git a/aten/src/ATen/cwrap_parser.py b/aten/src/ATen/cwrap_parser.py deleted file mode 100644 index 27bbbd7140f..00000000000 --- a/aten/src/ATen/cwrap_parser.py +++ /dev/null @@ -1,38 +0,0 @@ -import yaml -import copy - -try: - # use faster C loader if available - from yaml import CLoader as Loader -except ImportError: - from yaml import Loader - -# follows similar logic to cwrap, ignores !inc, and just looks for [[]] - - -def parse(filename): - with open(filename, 'r') as file: - declaration_lines = [] - declarations = [] - in_declaration = False - for line in file.readlines(): - line = line.rstrip() - if line == '[[': - declaration_lines = [] - in_declaration = True - elif line == ']]': - in_declaration = False - declaration = yaml.load('\n'.join(declaration_lines), Loader=Loader) - declarations.append(declaration) - elif in_declaration: - declaration_lines.append(line) - declarations = [process_declaration(declaration) for declaration in declarations] - return declarations - -def process_declaration(declaration): - declaration = copy.deepcopy(declaration) - if "arguments" in declaration: - declaration["schema_order_arguments"] = copy.deepcopy(declaration["arguments"]) - if "options" in declaration: - declaration["options"] = [process_declaration(option) for option in declaration["options"]] - return declaration diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py deleted file mode 100644 index f996e73e5d9..00000000000 --- a/aten/src/ATen/function_wrapper.py +++ /dev/null @@ -1,1544 +0,0 @@ -# HEY! Trying to understand what this file does? Read -# "what has to be done to add a Operation ..." first! 
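For context on the parser deleted above: cwrap_parser.parse() just collects the lines between a bare `[[` and `]]`, YAML-loads each block, and process_declaration() copies `arguments` into `schema_order_arguments`. A minimal sketch of running it on a toy block (the field values below are invented for illustration, not taken from the real Declarations.cwrap, and it assumes aten/src/ATen is importable):

import tempfile

import cwrap_parser  # the module deleted above

toy = """\
[[
  name: _th_example
  cname: example
  arguments:
    - THTensor* self
]]
"""

with tempfile.NamedTemporaryFile('w', suffix='.cwrap', delete=False) as f:
    f.write(toy)
    path = f.name

decls = cwrap_parser.parse(path)
print(decls[0]['name'])                    # _th_example
print(decls[0]['schema_order_arguments'])  # ['THTensor* self']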
- -import re -import copy -from code_template import CodeTemplate - - -from typing import Any, Dict, List, Optional, Set, Tuple, NamedTuple - -try: - from mypy_extensions import TypedDict -except ImportError: - # Avoid the dependency on the mypy_extensions package. - # It is required, however, for type checking. - def TypedDict(name, attrs, total=True): # type: ignore - return Dict[Any, Any] - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# what has to be done to add a Operation ... -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -# TH functions are generated into at::legacy::cpu and at::legacy::cuda, -# where they can be called directly by a native function, they can be wrapped -# by a native function that handles dispatch - -LEGACY_TH_DECLARATION = CodeTemplate("""\ -${return_type} ${api_name}(${formals}); -""") - -LEGACY_TH_DEFINITION = CodeTemplate("""\ -${return_type} ${api_name}(${formals}) { - ${device_guard_declaration} - ${type_definition_body} -} -""") - -LEGACY_TH_DEFINITION_SWITCH_STATEMENT = CodeTemplate("""\ -${dispatch_scalar_type_declaration} -${switch_prologue} -switch (dispatch_scalar_type) { - ${cases} - default: - AT_ERROR("${api_name} not supported on ${Type} for ", dispatch_scalar_type); -} -${switch_epilogue} -""") - -LEGACY_TH_DEFINITION_CASE = CodeTemplate("""\ -case ScalarType::${ScalarName}: { - ${case_body} - break; -} -""") - -# Native functions are generated and registered on the dispatcher. We register the -# function on Backend::Undefined if it does not have backend dependent dispatch. -# In this case, it will be called for all backends, but can be overwritten on a -# per backend basis. -NATIVE_DISPATCH_DECLARATION = CodeTemplate("""\ -${return_type} ${type_wrapper_name}(${native_formals}); -""") - -NATIVE_DISPATCH_DEFINITION_DEFAULT = CodeTemplate("""\ -${return_type} ${type_wrapper_name}(${native_formals}) { - ${device_guard_declaration} - ${return_call} at::native::${native_type_method_dispatch}(${actuals}); -} -""") - -NATIVE_DISPATCH_DEFINITION_CPU_BACKEND = CodeTemplate("""\ -${return_type} ${type_wrapper_name}(${native_formals}) { - ${return_call} at::native::${native_type_method_dispatch}(${actuals}); -} -""") - -NATIVE_DISPATCH_DEFINITION_GENERIC_BACKEND = CodeTemplate("""\ -${return_type} ${type_wrapper_name}(${native_formals}) { - ${device_init} - ${device_guard_declaration} - ${return_call} at::native::${native_type_method_dispatch}(${actuals}); -} -""") - -# A schema registration specifies alias analysis for an operator, but doesn't -# actually provide an implementation. Although our registration API allows you -# to specify all of this information at a function registration site, it's -# better to do it once at a schema registration so that we don't have to -# repeat ourselves everywhere else. -SCHEMA_REGISTRATION = CodeTemplate("""\ -m.def("${unqual_schema_string}"); -""") - -# NOTE[UnboxedOnly] Many of our codegen templates currently exist twice, once -# in an _UNBOXEDONLY_ variant and once without _UNBOXEDONLY_. This is because -# ops that are `use_c10_dispatcher: full` need different c++ code than ops -# that aren't `use_c10_dispatcher: full` yet. The _UNBOXEDONLY_ variants -# are for ops that aren't `use_c10_dispatcher: full` yet and those code templates -# can be deleted once all ops are `use_c10_dispatcher: full`. -# If you update one of the templates, you likely also have to update the other. 
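A rough illustration of how these ${...} templates are filled in: CodeTemplate (from code_template.py) substitutes an environment dict plus keyword overrides into the placeholders, and additionally knows how to join and indent list values. The sketch below uses the stdlib string.Template as a stand-in, with made-up values shaped like what the generator passes:

from string import Template

# Same shape as NATIVE_DISPATCH_DEFINITION_DEFAULT above, minus CodeTemplate's
# list/indentation handling.
demo = Template("""\
${return_type} ${type_wrapper_name}(${native_formals}) {
  ${device_guard_declaration}
  ${return_call} at::native::${native_type_method_dispatch}(${actuals});
}
""")

print(demo.substitute(
    return_type='Tensor',
    type_wrapper_name='add_Tensor',   # hypothetical wrapper name
    native_formals='const Tensor & self, const Tensor & other, Scalar alpha',
    device_guard_declaration='// DeviceGuard omitted',
    return_call='return',
    native_type_method_dispatch='add',
    actuals='self, other, alpha',
))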
- -# NB: Specifiction of the namespace is handled by the enclosing -# TORCH_LIBRARY macro invocation -# See NOTE[UnboxedOnly] -DEFAULT_UNBOXEDONLY_FUNCTION_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", - torch::CppFunction::makeUnboxedOnly(&TypeDefault::${type_wrapper_name})); -""") - -DEFAULT_FUNCTION_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", - c10::impl::hacky_wrapper_for_legacy_signatures<${schema_order_cpp_signature}>(TORCH_FN(TypeDefault::${type_wrapper_name}))); -""") - -# NB: In the ordinary, TypeDerived code generation work flow, specification -# of the backend is handled by the enclosing block, so the torch::dispatch -# invocation here is strictly unnecessary. However, in the fbcode mobile -# only workflow using per-op registration, these registrations will get dumped -# in a TORCH_LIBRARY_FRAGMENT that does not have an ambient backend. So -# the torch::dispatch specification here is important! See -# Note [Redundancy in registration code is OK] for how we handle redundant info. -BACKEND_UNBOXEDONLY_FUNCTION_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", - torch::dispatch(DispatchKey::${Backend}, - torch::CppFunction::makeUnboxedOnly(&${Type}::${type_wrapper_name})) -); -""") - -BACKEND_FUNCTION_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", - torch::dispatch(DispatchKey::${Backend}, - c10::impl::hacky_wrapper_for_legacy_signatures<${schema_order_cpp_signature}>( - TORCH_FN(${Type}::${type_wrapper_name}))) -); -""") - -# add non-virtual declaration to TensorBody.h -TENSOR_METHOD_DECLARATION = CodeTemplate("""\ -${return_type} ${api_name}(${method_formals_with_defaults}) const; -""") - -# add non-virtual declaration to Tensor.cpp -TENSOR_METHOD_DEFINITION = CodeTemplate("""\ - -// ${schema_string} -${return_type} Tensor::${api_name}(${method_formals}) const { - static auto op = c10::Dispatcher::singleton() - .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") - .typed<${tensor_method_cpp_signature}>(); - return op.call(${tensor_method_actuals}); -} -""") - -# add a method declaration in Functions.h -FUNCTION_DECLARATION = CodeTemplate("""\ -CAFFE2_API ${return_type} ${api_name}(${formals_with_defaults}); -""") - -# add a method declaration in Functions.h -DEPRECATED_FUNCTION_DECLARATION = CodeTemplate("""\ -C10_DEPRECATED CAFFE2_API ${return_type} ${api_name}(${formals_with_defaults}); -""") - -# add method definition in Functions.h -FUNCTION_DEFINITION = CodeTemplate("""\ - -// ${schema_string} -${return_type} ${api_name}(${formals}) { - static auto op = c10::Dispatcher::singleton() - .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") - .typed<${function_cpp_signature}>(); - return op.call(${function_actuals}); -} -""") - -IFDEF_BLOCK = CodeTemplate("""\ -#ifdef ${ifdef_guard} -${content} -#endif -""") - -# add a native declaration for a native function -NATIVE_DECLARATION = CodeTemplate("""\ -CAFFE2_API ${return_type} ${native_type_method_dispatch}(${native_formals_with_defaults}); -""") - -CALL_TEMPLATE = CodeTemplate("${cname}(${actuals})") - -OPERATOR_NAME = CodeTemplate("aten::${operator_name}") - -OPERATOR_NAME_FULL = CodeTemplate("""\ - {"aten::${operator_name}", "${overload_name}"}, -""") - -# scalar_name, c_type, accreal, is_floating_type -scalar_types = [ - ('Bool', 'bool', 'BoolAccrealNotDefined', False), - ('Byte', 'uint8_t', 'Long', False), - ('Char', 'int8_t', 'Long', False), - ('Double', 
'double', 'Double', True), - ('Float', 'float', 'Double', True), - ('Int', 'int', 'Long', False), - ('Long', 'int64_t', 'Long', False), - ('Short', 'int16_t', 'Long', False), - ('Half', 'Half', 'Double', True), - ('BFloat16', 'BFloat16', 'BFloat16AccrealNotDefined', True), - ('ComplexFloat', 'ComplexFloat', 'ComplexDouble', False), - ('ComplexDouble', 'ComplexDouble', 'ComplexDouble', False), -] - -class NYIError(Exception): - """Indicates we don't support this declaration yet""" - - __slots__ = ['reason'] - - def __init__(self, reason): - self.reason = reason - - -TYPE_FORMAL_GENERIC = { - 'THTensor*': 'Tensor &', - 'THByteTensor*': 'Tensor &', - 'THIndexTensor*': 'Tensor &', - 'THBoolTensor*': 'Tensor &', - 'IntArrayRefSize': 'IntArrayRef', - 'accreal': 'Scalar', - 'real': 'Scalar', - 'long': 'int64_t', -} - -DYNAMIC_TYPE = { - 'THTensor*': 'Tensor', - 'THByteTensor*': 'ByteTensor', - 'THBoolTensor*': 'BoolTensor', - 'THIndexTensor*': 'IndexTensor', - 'IntArrayRefSize': 'IntArrayRef', - 'accreal': 'accreal', - 'real': 'real', - 'long': 'int64_t', -} - -NATIVE_DYNAMIC_TYPE = { - 'Tensor &': 'Tensor', - 'const Tensor &': 'Tensor', -} - -TYPE_RETURN = { - 'THTensor*': 'Tensor', - 'THIndexTensor*': 'Tensor', - 'THByteTensor*': 'Tensor', - 'THBoolTensor*': 'Tensor', - 'real': 'Tensor', - 'accreal': 'Tensor', - 'long': 'int64_t', -} - -CHECKED_CAST = { - 'THTensor*': - CodeTemplate( - 'checked_dense_tensor_unwrap(' - '${arg_name}, "${arg_name}", ${arg_pos}, "${api_name}", ${null_okay}, ' - 'DeviceType::${DeviceType}, ${scalar_type})'), - 'THByteTensor*': - CodeTemplate( - 'checked_dense_tensor_unwrap(' - '${arg_name}, "${arg_name}", ${arg_pos}, "${api_name}", ${null_okay}, ' - 'DeviceType::${DeviceType}, ScalarType::Byte)'), - 'THBoolTensor*': - CodeTemplate( - 'checked_dense_tensor_unwrap(' - '${arg_name}, "${arg_name}", ${arg_pos}, "${api_name}", ${null_okay}, ' - 'DeviceType::${DeviceType}, ScalarType::Bool)'), - 'THIndexTensor*': - CodeTemplate( - 'checked_dense_tensor_unwrap(' - '${arg_name}, "${arg_name}", ${arg_pos}, "${api_name}", ${null_okay}, ' - 'DeviceType::${DeviceType}, ScalarType::Long)'), - 'real': CodeTemplate('${arg_name}.to${ScalarName}()'), - 'accreal': CodeTemplate('${arg_name}.to${AccScalarName}()'), - 'TensorList': CodeTemplate( - 'checked_dense_tensor_list_unwrap(${arg_name},"${arg_name}",${arg_pos}, ' - 'DeviceType::${DeviceType}, ${scalar_type})'), - 'IntArrayRef': CodeTemplate('check_intlist<${size}>(${arg_name}, "${arg_name}", ${arg_pos})') -} - -CHECKED_USE = { - 'THTensor*': '{}_', - 'THIndexTensor*': '{}_', - 'THByteTensor*': '{}_', - 'THBoolTensor*': '{}_', - 'TensorList': "{0}_.data(), {0}_.size()", -} - -CHECKED_USE_NULLABLE = CodeTemplate('${arg_name}_ ? 
${usage} : NULL') - -ALLOC_NOARGS_WRAP = { - 'THTensor*': 'c10::make_intrusive' - '(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),' - 'DispatchKey::${Backend}, scalarTypeToTypeMeta(${ScalarName})).release()', - 'THByteTensor*': 'c10::make_intrusive' - '(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),' - 'DispatchKey::${Backend}, scalarTypeToTypeMeta(ScalarType::Byte)).release()', - 'THBoolTensor*': 'c10::make_intrusive' - '(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),' - 'DispatchKey::${Backend}, scalarTypeToTypeMeta(ScalarType::Bool)).release()', - 'THIndexTensor*': 'c10::make_intrusive' - '(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),' - 'DispatchKey::${Backend}, scalarTypeToTypeMeta(ScalarType::Long)).release()', -} - -# Replacements for constants when calling into TH -CONSTANT_REPLACEMENTS = [ - ('AS_REAL', '${ScalarType}'), -] - -# Replacements for constants in header file function definitions -HEADER_CONSTANT_REPLACEMENTS = [ - (r'AS_REAL\((.*)\)', r'\1'), -] - - -class nested_dict(object): - def __init__(self, base, parent): - self.base, self.parent = base, parent - - def __getitem__(self, x): - r = self.base.get(x) - if r is not None: - return r - return self.parent[x] - - -Environment = TypedDict('Environment', { - 'state': str, - 'ScalarType': str, - 'ScalarName': str, - 'THTensor': str, - 'THType': str, - 'Backend': str, - 'DeviceType': str, - 'AccScalarName': str, -}) - -TopEnvironment = TypedDict('TopEnvironment', { - 'type_registrations': List[str], - 'type_headers': List[str], - 'function_registrations': List[str], - 'aten_ops': List[str], - 'type_method_declarations': List[str], - 'type_method_definitions': List[str], - 'tensor_method_declarations': List[str], - 'tensor_method_definitions': List[str], - 'function_declarations': List[str], - 'function_definitions': List[str], - 'type_ids': List[str], - 'native_function_declarations': List[str], -}) - -# A Declarations.cwrap formal argument -# type can contain THTensor* types -# NOTE: this must contain all 'AtFormal' attributes, because FunctionOption -# doesn't differentiate between whether we have AtFormals or THFormals -THFormal = TypedDict('THFormal', { - 'name': str, - 'type': str, - 'dynamic_type': str, - 'kwarg_only': bool, - 'is_nullable': bool, - 'default': str, - 'output': bool, - 'size': int, - 'annotation': str, - 'allocate': bool, - 'mask': bool, -}, total=False) - -# Generic ATen formal or native_functions.yaml formal argument. -# type can contain Tensor& reference types. -AtFormal = TypedDict('AtFormal', { - 'name': str, - 'type': str, - 'dynamic_type': str, - 'kwarg_only': bool, - 'is_nullable': bool, - 'default': str, - 'output': bool, - 'size': int, - 'annotation': str, -}, total=False) - -# Note [field_name versus name] -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# What is the difference between "field_name" and "name"? -# -# Return values of ATen operators always have a name: if it is not -# explicitly assigned a name inside native_functions.yaml like func: -# myop() -> (Tensor indices, Tensor value), then the codegen will -# automatically assign it a name like result0, or name might be -# specified inside Declarations.cwrap. We don't want these assigned -# names to become part of the public API when we return a namedtuple for -# any such multiple-return function. -# -# Thus field_name is like name, but it is defined only when there is a -# name specified in native_functions.yaml. 
If field_name is defined, -# then the codegen would generate code to return namedtuple. Otherwise, -# it would just return tuple. - -ReturnType = TypedDict('ReturnType', { - 'name': str, - # See Note [field_name versus name] - 'field_name': str, - 'type': str, - 'dynamic_type': str, -}, total=False) - -ReturnDecl = TypedDict('ReturnDecl', { - 'kind': str, - 'type': str, - 'arguments': List[int], -}, total=False) - -# Represents a buffer in nn.yaml -NNBuffer = TypedDict('NNBuffer', { - 'name': str, -}) - -FunctionOption = TypedDict('FunctionOption', { - 'actuals': List[str], - 'schema_order_actuals': List[str], - 'api_name': str, - # Like api_name, but it is the name of the internal - # CPUType/CUDAType/TypeDefault function that wraps - # the actual native call. This name is NOT user - # visible and is mangled with the overload name - 'type_wrapper_name': str, - 'arguments': List[THFormal], - # 'schema_order_arguments' is like 'arguments' but keeps them in the - # order they are defined in the JIT function schema while - # 'arguments' does some modifications (e.g. reorders out arguments - # and packs TensorOptions) - 'schema_order_arguments': List[THFormal], - 'backend_types': Dict[str, List[str]], - 'backends': List[str], - 'buffers': List[NNBuffer], - # cimpls is really a List[FunctionOption] - 'cimpls': List[Any], - 'cname': str, - # explicitly specify whether the function is a factory function or other special category - 'category_override': str, - 'condition': str, - 'device_guard': bool, - 'device_guard_declaration': str, - 'dispatch_scalar_type_declaration': str, - 'use_c10_dispatcher': str, - 'manual_kernel_registration': bool, - 'with_gil': bool, - 'cpu_half': bool, - 'cpu_bfloat16': bool, - 'cuda_bfloat16': bool, - 'deprecated': bool, - 'cpu_bool': bool, - 'cuda_bool': bool, - # See Note [field_name versus name] - 'field_name': str, - 'formals_list': List[AtFormal], - 'formals_with_defaults': List[str], - 'native_formals_with_defaults': List[str], - 'formals': List[str], - 'native_formals': List[str], - 'formals_types': List[str], - 'cpp_signature': str, - # 'schema_order_cpp_signature' is like 'cpp_signature' but keeps them in the - # order they are defined in the JIT function schema while - # 'cpp_signature' does some modifications (e.g. 
reorders out arguments - # and packs TensorOptions) - 'schema_order_cpp_signature': str, - 'inplace': bool, - 'matches_jit_signature': bool, - # This controls whether or not we generate the interface in Type or - # TypeExtendedInterface - 'extended_method': bool, - 'method_actuals': List[str], - 'schema_order_method_actuals': List[str], - 'method_formals_with_defaults': List[str], - 'method_formals': List[str], - 'mode': str, - 'python_module': str, - 'name': str, - 'operator_name': str, - 'overload_name': str, - 'native_type_method_dispatch': str, - # options should be List[FunctionOption] - 'options': Any, - 'schema_string': str, - 'return_call': str, - 'return_type': str, - 'return': ReturnDecl, - 'returns': List[ReturnType], - 'sparse': bool, - 'type_definition_body': List[str], - 'type_method_definition_dispatch': str, - 'variants': str, -}) - -OutputDeclaration = NamedTuple('OutputDeclaration', [ - ('name', str), - ('operator_name', str), - ('overload_name', str), - ('use_c10_dispatcher', str), - ('manual_kernel_registration', bool), - ('category_override', str), - ('matches_jit_signature', bool), - ('schema_string', str), - ('arguments', List[AtFormal]), - ('schema_order_cpp_signature', str), - # 'schema_order_arguments' is like 'arguments' but keeps them in the - # order they are defined in the JIT function schema while - # 'arguments' does some modifications (e.g. reorders out arguments - # and packs TensorOptions) - ('schema_order_arguments', List[AtFormal]), - ('method_of', List[str]), - ('mode', str), - ('python_module', str), - ('buffers', Optional[List[str]]), - ('returns', List[ReturnType]), - ('inplace', bool), - ('is_factory_method', bool), - ('abstract', bool), - ('device_guard', bool), - ('with_gil', bool), - ('deprecated', bool), -]) - -FunctionCode = NamedTuple('FunctionCode', [ - ('definition', str), - ('declaration', str), -]) - -OpRegistration = NamedTuple('OpRegistration', [ - ('operator_name', str), - ('registration_code', str), - ('schema_registration_code', str), -]) - - -def device_guard(option, dispatch_options, dispatch_tensor): - # For factory methods the `DeviceGuard` is already in the template. 
- if option.get('device_guard', True): - if dispatch_options: - return 'const DeviceGuard device_guard({}.device());'.format(dispatch_options['name']) - if dispatch_tensor: - return 'const OptionalDeviceGuard device_guard(device_of({}));'.format(dispatch_tensor) - return '// DeviceGuard omitted' - - -def dispatch_scalar_type(option, dispatch_options, dispatch_tensor): - if dispatch_options: - return 'auto dispatch_scalar_type = typeMetaToScalarType({}.dtype());'.format(dispatch_options['name']) - if dispatch_tensor: - return 'auto dispatch_scalar_type = infer_scalar_type({});'.format(dispatch_tensor) - return '// dispatch_scalar_type omitted' - - -def is_real_argument_to_wrapper(argument): - # type: (THFormal) -> bool - return not argument.get('output', False) and\ - argument['type'] != 'CONSTANT' and\ - argument['type'] != 'argument' - - -def is_mutable_formal_argument(argument, option): - # type: (THFormal, FunctionOption) -> bool - return argument.get('output') or option['inplace'] and argument['name'] == 'self' - - -def check_methods_do_not_start_with_underscore(name, is_method): - if name in {'_values', '_indices', '_nnz', '_dimI', '_dimV', '_coalesced_', - '_version'}: - return - if is_method and name.startswith('_') and not name.startswith('__') and not name.startswith('_th_'): - message = "Function '{}' starts with a single underscore and is ".format(name) - message += "configured to have a method on Tensor. Functions that start with " - message += " a single underscore should only be functions in the at:: " - message += "namespace and not methods on Tensor!" - raise RuntimeError(message) - - -def to_return_type(arg, option): - # type: (THFormal, FunctionOption) -> ReturnType - t = arg['type'] - rt = TYPE_RETURN.get(t, t) - if rt == 'Tensor' and not arg.get('allocate'): - rt = rt + ' &' - if not is_mutable_formal_argument(arg, option): - rt = 'const ' + rt - return { - 'name': arg['name'], - 'type': rt, - 'dynamic_type': DYNAMIC_TYPE.get(arg['type'], arg['type']), - } - - -def is_any_tensor_type(formal): - return (formal['dynamic_type'] == 'Tensor' or formal['dynamic_type'] == 'ByteTensor' - or formal['dynamic_type'] == 'IndexTensor' or formal['dynamic_type'] == 'BoolTensor') - - -def find_tensors(formals): - # type: (List[AtFormal]) -> List[str] - return [formal['name'] for formal in formals if is_any_tensor_type(formal)] - - -def find_tensorlists(formals): - # type: (List[AtFormal]) -> List[str] - return [formal['name'] for formal in formals if formal['dynamic_type'] == 'TensorList'] - - -def find_dispatch_tensor(formals): - # type: (List[AtFormal]) -> Optional[str] - # Determine legacy TH-style single dispatch tensor. - # - # Also used to determine what tensor should be used to provide a default - # DeviceGuard. Unlike dispatch, we don't guard on ALL tensor arguments - # (because this is not actually a thing you can do.) Guarding on the - # first argument is best effort to help people avoid doing this - # themselves. 
- - for formal in formals: - if formal['name'] == 'self' and is_any_tensor_type(formal) and not formal.get('is_nullable', False): - return formal['name'] - # otherwise dispatch to the first Tensor or TensorList - for formal in formals: - if 'TensorList' == formal['dynamic_type'] or is_any_tensor_type(formal) and \ - not formal.get('is_nullable', False): - return formal['name'] - - return None - - -def is_multidispatch_formal(formal): - # type: (AtFormal) -> bool - return formal['dynamic_type'] in ['TensorOptions', 'TensorList'] or is_any_tensor_type(formal) - - -def find_multidispatch_formals(formals): - # type: (List[AtFormal]) -> List[AtFormal] - # Compute the list of all arguments which should be considered - # for multiple dispatch. Note that this doesn't completely replace - # find_dispatch_tensor because we use the "dispatch tensor" to determine - # device guards. TensorOptions is included as part of this calculation. - # - # The interaction of multiple dispatch with TensorOptions - # is quite interesting. In particular, suppose I have: - # - # cuda_tensor.new_like(1, device='cpu') - # - # Multiple dispatch will attempt a dispatch to CUDA, even though - # the end tensor that should be produced here is a CPU one. The - # upshot is that if you have an operator with mixed TensorOptions - # and Tensor arguments, you MUST only ever register it generically. - return [f for f in formals if is_multidispatch_formal(f)] - - -def find_formal_by_type(formal_name, formals): - # type: (str,List[AtFormal]) -> Optional[AtFormal] - for formal in formals: - if formal_name == formal['dynamic_type']: - return formal - return None - - -def format_formal(f): - # type: (AtFormal) -> str - return '{} {}'.format(f['type'], f['name']) - - -def formal_with_default(f): - # type: (AtFormal) -> str - s = format_formal(f) - v = f.get('default') - if v is None: - return s - if isinstance(v, bool): - v = str(v).lower() - return '{}={}'.format(s, v) - - -def gen_dispatch_key_init(var_name, formals): - # type: (str, List[AtFormal]) -> List[str] - topt_formals = [] - non_topt_formals = [] - for f in find_multidispatch_formals(formals): - if f['dynamic_type'] == 'TensorOptions': - topt_formals.append(f) - else: - non_topt_formals.append(f) - - if len(topt_formals) == 1 and non_topt_formals == []: - topt = topt_formals[0] - return ['DispatchKey {} = {}.computeDispatchKey();'.format(var_name, topt['name'])] - - subexprs = [] - for f in topt_formals: - subexprs.append('DispatchKeySet({}.computeDispatchKey())'.format(f['name'])) - if non_topt_formals != []: - args = ', '.join([f['name'] for f in non_topt_formals]) - subexprs.append('c10::detail::multi_dispatch_key_set({})'.format(args)) - return [ - 'DispatchKeySet _dk_set = {};'.format(' | '.join(subexprs)), - 'DispatchKeySet _dk_mask = c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, DispatchKey::BackendSelect);', - 'DispatchKey {} = c10::impl::dispatchTypeId(_dk_set, _dk_mask);'.format(var_name), - ] - - -def is_factory(option): - # type: (FunctionOption) -> bool - formals = option['formals_list'] - return find_formal_by_type('TensorOptions', formals) is not None and 'method' not in option['variants'] - - -def gen_device_init(option, backend_type_env): - # type: (FunctionOption, Environment) -> List[str] - # generate a device init statement, if the passed function option is a Tensor factory. 
- # - if is_factory(option): - name = option['name'] - device_type = backend_type_env['DeviceType'] - if device_type == 'CUDA' or device_type == 'HIP': - return ['globalContext().lazyInit{}();'.format(device_type)] - return [] - -# TODO The maybe_unwrap_optional_tensors is only needed because our at::native::xxx functions -# still take "Tensor" instead of "optional", so we need CPUType, TypeDefault, ... -# to do the same. Once at::native::xxx are converted, we can remove use_optional_tensor -# and use the use_optional_tensor=True behavior always. -def maybe_unwrap_optional_tensors(option, formals, args): - assert len(formals) == len(args), \ - "Assert we didn't screw up with method_args removing self but forgetting to remove it from formals" - if option['use_c10_dispatcher'] == 'full': - def maybe_unwrap_optional_tensor(formal, arg): - if formal['dynamic_type'] == 'Tensor' and formal['is_nullable']: - return "{}.has_value() ? *{} : at::Tensor()".format(arg, arg) - else: - return arg - return [maybe_unwrap_optional_tensor(formal, arg) for (formal, arg) in zip(formals, args)] - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - return args - -def create_generic(top_env, declarations): - # type: (TopEnvironment, List[FunctionOption]) -> Tuple[List[OutputDeclaration], List[OpRegistration]] - # translates defaults from cwrap types to C++ values - def translate_default(argument, type_str, default): - # type: (THFormal, str, Any) -> Any - if default is None: - # cause the default constructor for the object to run - return '{}' - for pattern, replacement in HEADER_CONSTANT_REPLACEMENTS: - default = re.sub(pattern, replacement, str(default)) - if type_str in {'Scalar', 'int64_t', 'double'}: - try: - return int(default) - except Exception: - try: - return float(default) - except Exception: - return default - elif type_str == 'bool': - assert default.lower() in ['true', 'false'] - return default.lower() == 'true' - else: - return default - - # change from THTensor* to Tensor & so we get how it will appear - # in the aten argument list... 
- def translate_formal(argument, option): - # type: (THFormal, FunctionOption) -> AtFormal - type_str = TYPE_FORMAL_GENERIC.get(argument['type'], argument['type']) - if type_str == 'Tensor &' and not is_mutable_formal_argument(argument, option): - type_str = 'const ' + type_str - translated = { - 'name': argument['name'], - 'type': type_str, - 'dynamic_type': DYNAMIC_TYPE.get(argument['type'], argument['type']), - } # type: AtFormal - if 'default' in argument: - default = translate_default(argument, type_str, argument['default']) - translated['default'] = default - if argument.get('output'): - translated['output'] = True - if argument.get('size'): - translated['size'] = argument['size'] - if argument.get('is_nullable') is not None: - translated['is_nullable'] = argument['is_nullable'] - return translated - - def get_formals(option, schema_order, include_constants=False): - # type: (FunctionOption, bool, bool) -> List[AtFormal] - seen = set() # type: Set[str] - pos_args = [] # type: List[THFormal] - kwd_args = [] # type: List[THFormal] - - def insert(argument): - # type: (THFormal) -> None - if argument['name'] not in seen: - seen.add(argument['name']) - # there are no kwarg_only THFormals - pos_args.append(argument) - - def has_output_mask(argument): - # type: (THFormal) -> bool - return argument.get('allocate', False) and argument.get('mask', False) - - if schema_order: - arguments = copy.deepcopy(option['schema_order_arguments']) - else: - arguments = copy.deepcopy(option['arguments']) - for argument in arguments: - if argument.get('output') and not argument.get('allocate', False): - insert(argument) - for argument in arguments: - if include_constants and argument['type'] == 'CONSTANT': - insert(argument) - elif is_real_argument_to_wrapper(argument): - insert(argument) - if any(has_output_mask(arg) for arg in arguments): - mask_size = sum(has_output_mask(arg) for arg in arguments) - insert({ - 'name': 'output_mask', - # NB: Lack of space in comma works around parsing - # problem in gen_variable_type.py - 'type': 'std::array'.format(mask_size), - 'default': '{{' + ', '.join(['true'] * mask_size) + '}}', - }) - - result = pos_args + kwd_args - return [translate_formal(argument, option) for argument in result] - - def get_return_types(option): - # type: (FunctionOption) -> List[ReturnType] - ret = option['return'] - if ret['kind'] == 'arguments': - argument_indices = ret['arguments'] - if len(argument_indices) == 1: - the_arg = option['arguments'][argument_indices[0]] - return [to_return_type(the_arg, option)] - else: - return [to_return_type(option['arguments'][idx], option) - for idx in argument_indices] - elif ret['kind'] == 'type': - return [{ - 'type': TYPE_RETURN.get(ret['type'], ret['type']), - 'dynamic_type': DYNAMIC_TYPE.get(ret['type'], ret['type']), - }] - else: - raise Exception("format_return_type") - - def format_return_type(return_types): - # type: (List[ReturnType]) -> str - if len(return_types) == 0: - return "void" - elif len(return_types) == 1: - return return_types[0]['type'] - return "std::tuple<{}>".format(','.join(r['type'] for r in return_types)) - - def process_schema_order_actual(schema_order_actual): - if schema_order_actual == 'dtype': - return 'optTypeMetaToScalarType(options.dtype_opt())' - elif schema_order_actual == 'layout': - return 'options.layout_opt()' - elif schema_order_actual == 'device': - return 'options.device_opt()' - elif schema_order_actual == 'pin_memory': - return 'options.pinned_memory_opt()' - elif schema_order_actual == 'memory_format': - 
return 'c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)' - else: - return schema_order_actual - - def process_legacy_th_option(option): - # type: (FunctionOption) -> None - # Mutably populate option with derived values computed from values - # passed in to option. - option['inplace'] = re.search( - '(^__i|[^_]_$)', option['api_name']) is not None - - # print(yaml.dump(option)) - formals = get_formals(option, False) - schema_order_formals = get_formals(option, True) - option['formals_list'] = formals - option['formals'] = [format_formal(f) for f in formals] - option['formals_with_defaults'] = [formal_with_default(f) for f in formals] - option['returns'] = get_return_types(option) - option['return_type'] = format_return_type(option['returns']) - option['return_call'] = 'return ' if option['return_type'] != 'void' else '' - option['actuals'] = [f['name'] for f in formals] - - option['method_formals'] = [format_formal(f) for f in formals - if f['name'] != 'self'] - option['method_formals_with_defaults'] = ( - [formal_with_default(f) for f in formals if f['name'] != 'self']) - # *this is 'const Tensor&' since all Tensor methods are const and must - # be const_casted to be accepted as native function's non-const argument - option['method_actuals'] = [ - f['name'] if f['name'] != 'self' else 'const_cast(*this)' for f in formals] - - assert 'method' not in option['variants'], 'TH functions cannot be methods' - is_function = 'function' in option['variants'] - # NB: TH functions don't support multiple dispatch - dispatch_tensor = find_dispatch_tensor(formals) - is_namespace_function = is_function and dispatch_tensor is not None - - if option['mode'] == 'TH': - option['device_guard'] = False - option['device_guard_declaration'] = device_guard(option, False, dispatch_tensor) - option['dispatch_scalar_type_declaration'] = dispatch_scalar_type(option, False, dispatch_tensor) - - assert option['extended_method'], 'Expected legacy operator to be an extended method' - - def native_get_formals(option, schema_order, use_optional_tensor, include_constants=False): - # type: (FunctionOption, bool, bool, bool) -> List[AtFormal] - - # TODO The use_optional_tensor argument is only needed because our at::native::xxx functions - # still take "Tensor" instead of "optional", so we need CPUType, TypeDefault, ... - # to do the same. Once at::native::xxx are converted, we can remove use_optional_tensor - # and use the use_optional_tensor=True behavior always. 
- - seen = set() # type: Set[str] - pos_args = [] - kwd_args = [] - - def insert(argument): - # type: (AtFormal) -> None - if argument['name'] not in seen: - seen.add(argument['name']) - if argument.get('kwarg_only', False): - kwd_args.append(argument) - else: - pos_args.append(argument) - - if schema_order: - arguments = option['schema_order_arguments'] - else: - arguments = option['arguments'] - for argument in arguments: - insert(argument) - - # not clear we need dynamic_type translation as we can specify the correct type - # directly in native functions - def add_dynamic_type(argument, option): - # type: (AtFormal, FunctionOption) -> AtFormal - argument['dynamic_type'] = NATIVE_DYNAMIC_TYPE.get(argument['type'], argument['type']) - return argument - - result = pos_args + kwd_args - result = [add_dynamic_type(argument, option) for argument in result] - - # ensure we get reference-type formals when appropriate - def native_translate_formals(argument, option): - # type: (AtFormal, FunctionOption) -> AtFormal - argument = copy.deepcopy(argument) - - def translate_map(const): - # type: (bool) -> Dict[str, str] - return { - 'Tensor': 'const Tensor &' if const else 'Tensor &', - 'Type': 'const Type &' if const else 'Type &', - 'TensorOptions': 'const TensorOptions &' if const else 'TensorOptions &', - 'TensorList': 'TensorList', - } - - if argument.get('is_nullable') and argument['type'] not in translate_map(False).keys(): - argument['type'] = "c10::optional<{}>".format(argument['type']) - elif use_optional_tensor and argument.get('is_nullable') and argument['type'] == 'Tensor': - argument['type'] = "const c10::optional&" - - - # Note: the 'self' trap is here only to preserve the const arg 0 for set_data. - # I.e., the signature of the cpp implementation currently fits the code - # generated from a misread schema, but the alias annotation is the truth. - # TODO fix the signature of set_data's cpp impl to match correct codegen from - # the current schema. - # then remove this - if argument['name'] == 'self': - is_mutable = option['inplace'] - else: - is_mutable = '!' in (argument['annotation'] or '') - - if is_mutable: - argument['type'] = translate_map(False).get(argument['type'], argument['type']) - else: - argument['type'] = translate_map(True).get(argument['type'], argument['type']) - - return argument - - result = [native_translate_formals(argument, option) for argument in result] - return result - - # this can return multiple return types in a list, e.g. ['Tensor', 'Tensor'] - def native_get_return_types(option): - # type: (FunctionOption) -> List[ReturnType] - ret = option['return'] - - return_types = [] # List[ReturnType] - for t_raw in ret: - # See Note [field_name versus name] - field_name = None - if isinstance(t_raw, str): - t = t_raw - name = None - else: - t = t_raw['type'] - name = t_raw['name'] - if 'field_name' in t_raw: - field_name = t_raw['field_name'] - - # can't actually return a TensorList (since it's a reference object) - actual_return_type = {'TensorList': 'std::vector'}.get(t, t) - - if actual_return_type == 'Tensor' and (option['inplace'] or option['api_name'].endswith('_out')): - # follow normal ATen convention of returning Tensor & for inplace functions. 
- actual_return_type = 'Tensor &' - - rtype = { - 'type': actual_return_type, - 'dynamic_type': NATIVE_DYNAMIC_TYPE.get(t, t), - } # type: ReturnType - if name is not None: - rtype['name'] = name - if field_name is not None: - rtype['field_name'] = field_name - return_types.append(rtype) - - return return_types - - def process_native(option): - # type: (FunctionOption) -> Optional[OutputDeclaration] - valid_modules = {'nn', 'fft', 'linalg'} - assert (option['python_module'] == '' or - option['python_module'] in valid_modules), \ - "Found python_module of {} for decl {}, but only \'\' string, \'nn\' and \'fft\' are supported".format( - option['python_module'], option['name']) - use_optional_tensors_in_cpp_frontend = option['use_c10_dispatcher'] == 'full' - formals = native_get_formals(option, False, use_optional_tensors_in_cpp_frontend) - native_formals = native_get_formals(option, False, False) - schema_order_formals = native_get_formals(option, True, use_optional_tensors_in_cpp_frontend) - option['formals_list'] = formals - option['formals'] = [format_formal(f) for f in formals] - option['native_formals'] = [format_formal(f) for f in native_formals] - option['formals_with_defaults'] = [formal_with_default(f) for f in formals] - option['native_formals_with_defaults'] = [formal_with_default(f) for f in native_formals] - option['returns'] = native_get_return_types(option) - option['return_type'] = format_return_type(option['returns']) - option['return_call'] = 'return ' if option['return_type'] != 'void' else '' - option['actuals'] = [f['name'] for f in formals] - option['schema_order_actuals'] = [f['name'] for f in schema_order_formals] - - option['formals_types'] = [f['type'] for f in option['formals_list']] - - option['cpp_signature'] = "{} ({})".format(option['return_type'], ", ".join(option['formals_types'])) - option['schema_order_cpp_signature'] = "{} ({})".format( - option['return_type'], - ", ".join([f['type'] for f in schema_order_formals])) - - option['method_formals'] = [format_formal(f) for f in formals - if f['name'] != 'self'] - option['method_formals_with_defaults'] = ( - [formal_with_default(f) for f in formals if f['name'] != 'self']) - # *this is 'const Tensor&' since all Tensor methods are const and must - # be const_casted to be accepted as native function's non-const argument - option['method_actuals'] = [ - f['name'] if f['name'] != 'self' else 'const_cast(*this)' for f in formals] - option['schema_order_method_actuals'] = [ - f['name'] if f['name'] != 'self' else 'const_cast(*this)' for f in schema_order_formals] - - if find_formal_by_type('TensorOptions', formals) is not None: - option['schema_order_actuals'] = [ - process_schema_order_actual(actual) for actual in option['schema_order_actuals']] - option['schema_order_method_actuals'] = [ - process_schema_order_actual(actual) for actual in option['schema_order_method_actuals']] - - def gen_tensor_method(option, formals): - # type: (Any, List[AtFormal]) -> FunctionCode - def swizzle_self(f): # blegh - if f['name'] == 'self': - fc = f.copy() - fc['name'] = '*this' - return fc - else: - return f - - dispatch_key_var_name = '_dk' - dispatch_key_init = gen_dispatch_key_init(dispatch_key_var_name, [swizzle_self(f) for f in formals]) - - method_actuals = maybe_unwrap_optional_tensors(option, formals, option['method_actuals']) - - # See NOTE[UnboxedOnly] - if option['use_c10_dispatcher'] == 'full': - tensor_method_actuals = option['schema_order_method_actuals'] - tensor_method_cpp_signature = 
option['schema_order_cpp_signature'] - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - tensor_method_actuals = option['method_actuals'] - tensor_method_cpp_signature = option['cpp_signature'] - - method_definition = TENSOR_METHOD_DEFINITION.substitute( - option, - tensor_method_actuals=tensor_method_actuals, - tensor_method_cpp_signature=tensor_method_cpp_signature - ) - return FunctionCode( - declaration=TENSOR_METHOD_DECLARATION.substitute(option), - definition=method_definition) - - def gen_namespace_function(option, multidispatch_formals): - # type: (Any, List[AtFormal]) -> FunctionCode - - dispatch_key_var_name = '_dk' - dispatch_key_init = gen_dispatch_key_init(dispatch_key_var_name, formals) - - declaration = DEPRECATED_FUNCTION_DECLARATION if option['deprecated'] else FUNCTION_DECLARATION - fn_declaration = declaration.substitute(option) - - actuals = maybe_unwrap_optional_tensors(option, formals, option['actuals']) - - # See NOTE[UnboxedOnly] - if option['use_c10_dispatcher'] == 'full': - function_actuals = option['schema_order_actuals'] - function_cpp_signature = option['schema_order_cpp_signature'] - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - function_actuals = option['actuals'] - function_cpp_signature = option['cpp_signature'] - - fn_definition = FUNCTION_DEFINITION.substitute( - option, - function_actuals=function_actuals, - function_cpp_signature=function_cpp_signature) - - return FunctionCode(definition=fn_definition, declaration=fn_declaration) - - assert find_formal_by_type('Type', formals) is None, \ - "Found Type argument in {}({}). Use TensorOptions instead.".format( - option['name'], ", ".join(option['method_formals_with_defaults'])) - - type_method_dispatch = option['type_method_definition_dispatch'] - - is_method = 'method' in option['variants'] - is_namespace_function = 'function' in option['variants'] - # For method-only entries, the first argument should be self - if is_method and not is_namespace_function: - assert formals[0]['name'] == 'self' - is_factory_method = is_factory(option) - - check_methods_do_not_start_with_underscore(option['name'], is_method) - - # NB: Device guard and scalar type generated code is still based on the - # first argument. Scalar type test will be removed once TH is removed. - # If you need more complex device guard behavior, you should disable - # device guard and then manually add the guards you need. - dispatch_options = find_formal_by_type('TensorOptions', formals) - guard_tensor = None if dispatch_options else find_dispatch_tensor(formals) - option['device_guard_declaration'] = device_guard(option, dispatch_options, guard_tensor) - option['dispatch_scalar_type_declaration'] = dispatch_scalar_type(option, dispatch_options, guard_tensor) - - top_env['aten_ops'].append(OPERATOR_NAME_FULL.substitute(option)) - - option['native_type_method_dispatch'] = type_method_dispatch - - # Note [Abstract ATen methods] - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # An abstract ATen method is one whose dispatch differs between - # types. These are implemented in derived types (with a - # standard (throwing) definition in Type). A concrete ATen - # method is one which has the same dispatch for all types; - # we just implement it in the base Type. This is exposed - # in Declarations.yaml via a field named 'abstract'. 
- abstract = False - op_registrations.append(OpRegistration( - operator_name=OPERATOR_NAME.substitute(option), - registration_code=SCHEMA_REGISTRATION.substitute(option), - schema_registration_code=SCHEMA_REGISTRATION.substitute(option))) - if isinstance(type_method_dispatch, dict): - abstract = True - # Having manual_kernel_registration for an abstract method doesn't make sense. - assert not option['manual_kernel_registration'] - else: - top_env['type_method_declarations'].append(NATIVE_DISPATCH_DECLARATION.substitute(option)) - top_env['type_method_definitions'].append(NATIVE_DISPATCH_DEFINITION_DEFAULT.substitute(option)) - if not option['manual_kernel_registration']: - # See NOTE[UnboxedOnly] - if option['use_c10_dispatcher'] == 'full': - op_registrations.append(OpRegistration( - operator_name=OPERATOR_NAME.substitute(option), - registration_code=DEFAULT_FUNCTION_REGISTRATION.substitute(option), - schema_registration_code=SCHEMA_REGISTRATION.substitute(option))) - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - op_registrations.append(OpRegistration( - operator_name=OPERATOR_NAME.substitute(option), - registration_code=DEFAULT_UNBOXEDONLY_FUNCTION_REGISTRATION.substitute(option), - schema_registration_code=SCHEMA_REGISTRATION.substitute(option))) - - # generate the at::native function declarations (i.e. what the user will implement) - if isinstance(type_method_dispatch, dict): - generated_native_functions = [] # type: List[str] - for key in sorted(type_method_dispatch.keys()): - value = type_method_dispatch[key] - # skip functions in different namespace, e.g. legacy::cpu - if "::" in value: - continue - if value not in generated_native_functions: - option['native_type_method_dispatch'] = value - top_env['native_function_declarations'].append(NATIVE_DECLARATION.substitute(option)) - generated_native_functions.append(value) - else: - top_env['native_function_declarations'].append(NATIVE_DECLARATION.substitute(option)) - - method_of = ['Type'] - if is_method: - code = gen_tensor_method(option, formals) - top_env['tensor_method_declarations'].append(code.declaration) - top_env['tensor_method_definitions'].append(code.definition) - method_of.append('Tensor') - - if is_namespace_function: - code = gen_namespace_function(option, formals) - top_env['function_definitions'].append(code.definition) - top_env['function_declarations'].append(code.declaration) - method_of.append('namespace') - - return OutputDeclaration( - name=option['api_name'], - operator_name=option['operator_name'], - overload_name=option['overload_name'], - use_c10_dispatcher=option['use_c10_dispatcher'], - manual_kernel_registration=option['manual_kernel_registration'], - schema_order_cpp_signature=option['schema_order_cpp_signature'], - category_override=option['category_override'], - matches_jit_signature=option["matches_jit_signature"], - schema_string=option["schema_string"], - arguments=formals, - schema_order_arguments=schema_order_formals, - method_of=method_of, - mode=option['mode'], - python_module=option['python_module'], - buffers=None, - returns=option['returns'], - inplace=option['inplace'], - is_factory_method=is_factory_method, - # See Note [Abstract ATen methods] - abstract=abstract, - device_guard=option.get('device_guard', True), - with_gil=option.get('with_gil', False), - deprecated=option['deprecated'], - ) - - output_declarations = [] # type: List[OutputDeclaration] - op_registrations = [] # type: List[OpRegistration] - for declaration in declarations: - 
output_options = [] # type: List[OutputDeclaration] - for option in declaration['options']: - option["matches_jit_signature"] = declaration["matches_jit_signature"] - option["schema_string"] = declaration["schema_string"] - try: - if option['mode'] != 'native': - # Mutably populate option with values - process_legacy_th_option(option) - else: - output_option = process_native(option) - if output_option: - output_options.append(output_option) - except NYIError: - option['skip'] = True - output_declarations.extend(output_options) - - return output_declarations, op_registrations - - -def create_derived(backend_type_env, declarations): - # type: (Environment, List[FunctionOption]) -> Tuple[List[str], List[str], List[OpRegistration], List[str], List[str]] - type_object_declarations = [] # type: List[str] - type_object_definitions = [] # type: List[str] - op_registrations = [] # type: List[OpRegistration] - legacy_th_declarations = [] # type: List[str] - legacy_th_definitions = [] # type: List[str] - is_cuda = 'CUDA' in backend_type_env['Backend'] - - def requires_checked_cast(argument): - # type: (THFormal) -> bool - if argument['type'] == 'IntArrayRef': - return 'size' in argument - return argument['type'] in CHECKED_CAST - - def nullable_argument(argument): - # type: (THFormal) -> bool - return argument.get('is_nullable', False) - - def get_argument(env, argument, option): - # type: (Environment, THFormal, FunctionOption) -> str - if requires_checked_cast(argument): - checked_use = CHECKED_USE.get( - argument['type'], '{}_').format(argument['name']) - if nullable_argument(argument): - checked_use = CHECKED_USE_NULLABLE.substitute( - env={}, arg_name=argument['name'], usage=checked_use) - return checked_use - elif argument['type'] == 'CONSTANT': - v = str(argument.get('default', argument['name'])) - for pattern, replacement in CONSTANT_REPLACEMENTS: - v = re.sub(pattern, replacement, v) - return CodeTemplate(v).substitute(env) - # e.g. argument 0, i.e. repeat the 0th argument in this position... - elif argument['type'] == 'argument': - index = int(argument['name']) - return get_argument(env, option['arguments'][index], option) - else: - return argument['name'] - - def get_arguments(env, arguments, option): - # type: (Environment, List[THFormal], FunctionOption) -> List[str] - return [get_argument(env, argument, option) - for argument in arguments] - - def allocate_arg(arg, output_count, backend, scalar_name): - # type: (THFormal, int, str, str) -> List[str] - name = arg['name'] - allocation = CodeTemplate(ALLOC_NOARGS_WRAP[arg['type']]).substitute(Backend=backend, ScalarName=scalar_name) - tensor_arg = '{}_'.format(name) - if arg.get('mask', False): - allocation = 'output_mask[{}] ? {} : nullptr'.format(output_count, allocation) - tensor_arg = ('{}_ == nullptr ? 
(TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*){}_' - .format(name, name)) - intrusive_ptr_type = 'c10::intrusive_ptr' - return [ - 'auto {}_ = {};'.format(name, allocation), - 'auto {} = Tensor({}::reclaim({}));'.format(name, intrusive_ptr_type, tensor_arg), - ] - - def handle_call(env, option, cimpl): - # type: (Environment, FunctionOption, FunctionOption) -> str - is_nn = option['mode'] == 'NN' - actuals = get_arguments(env, cimpl['arguments'], option) - if is_cuda or is_nn: - actuals = ['globalContext().getTHCState()'] + actuals - - cname = cimpl['cname'] - if option.get('sparse', False): - if is_cuda: - cname = 'THCS' + env['ScalarName'] + "Tensor_" + cname - else: - cname = env['THTensor'].replace('TH', 'THS') + '_' + cname - elif is_nn: - cname = 'THNN_{}'.format(env['THType']) + cname - else: - cname = env['THTensor'] + '_' + cname - - call = CALL_TEMPLATE.substitute(actuals=actuals, cname=cname) - if cimpl.get('condition') is not None: - call = 'if ({}) {}'.format(cimpl['condition'], call) - return call - - def emit_body(env, option, scalar_type_cases): - # type: (Environment, FunctionOption, List[str]) -> List[str] - body = [] # type: List[str] - - switch_prologue = [] # type: List[str] - output_count = 0 - cases = [] - - for arg in option['arguments']: - # make a new allocation of TensorImpl, then wrap a Tensor around it. - if arg.get('allocate', False): - switch_prologue += allocate_arg(arg, output_count, env['Backend'], 'dispatch_scalar_type') - output_count += 1 - - for scalar_name, c_type, accreal, _ in scalar_types: - if scalar_name in scalar_type_cases: - case_body = [] # type: List[str] - # arguments are potentially duplicated because of one argument - # referencing another - seen_names = set() # type: Set[str] - count = 0 - - case_env = { - 'Backend': env['Backend'], - 'DeviceType': env['DeviceType'], - 'state': env['state'], - 'ScalarType': c_type, - 'ScalarName': scalar_name, - 'AccScalarName': accreal, - 'THType': scalar_name, - 'THTensor': 'TH{}Tensor'.format(scalar_name) - } # type: Environment - if case_env['Backend'] == 'CUDA': - sname = '' if scalar_name == "Float" else scalar_name - case_env['THType'] = 'Cuda{}'.format(sname) - case_env['THTensor'] = 'THCuda{}Tensor'.format(sname) - - for arg in option['arguments']: - if is_real_argument_to_wrapper(arg): - count += 1 - - # only generated checked casts the first time we see it - if arg['name'] not in seen_names and requires_checked_cast(arg): - seen_names.add(arg['name']) - - # make a new allocation of TensorImpl, then wrap a Tensor around it. - if not arg.get('allocate', False): - # special case where we allow undefined Tensors, and thus - # the checked cast succeeds even if the Tensor is not - # defined - null_okay = 'true' if nullable_argument(arg) else 'false' - - # extract the TensorImpl from an existing tensor - check_cast = CHECKED_CAST[arg['type']].substitute( - case_env, arg_name=arg['name'], arg_pos=count, - api_name=option['api_name'], null_okay=null_okay, - size=arg.get('size'), scalar_type='dispatch_scalar_type') - case_body.append("auto {}_ = {};".format( - arg['name'], check_cast)) - - # cimpls, if it exists, contains the underlying C function names and - # arguments. 
Otherwise use option - cimpls = option.get('cimpls', [option]) - calls = [handle_call(case_env, option, cimpl) for cimpl in cimpls] - - ret = option['return'] - - if ret['kind'] == 'arguments': - case_body.extend([call + ';' for call in calls]) - # return handled later - elif ret['kind'] == 'type': - assert len(calls) == 1 - call = calls[0] - - # return the same underlying Tensor type for both real and accreal; this ensures - # e.g. x.sum(0) and x.sum() return the same type. We explicitly cast to the - # ScalarType before constructing the scalar_tensor to avoid overflow checking. - if ret['type'] == 'accreal' or ret['type'] == 'real': - return_scalar = ('return at::scalar_tensor(convert<${ScalarType}>(${call}), ' - 'options(ScalarType::${ScalarName}));') - case_body.append(CodeTemplate(return_scalar).substitute(case_env, call=call)) - else: - case_body.append("return {};".format(call)) - else: - raise Exception("NYI - return handling") - - cases.append(LEGACY_TH_DEFINITION_CASE.substitute(case_env, case_body=case_body)) - switch_epilogue = '' - if ret['kind'] == 'arguments': - arguments_indices = ret['arguments'] - arguments = [option['arguments'][argi] - for argi in arguments_indices] - if len(arguments_indices) == 1: - arg = arguments[0] - switch_epilogue = "return {};".format(arg['name']) - else: - types = [to_return_type(arg, option)['type'] - for arg in arguments] - # TODO: check for move semantics... - names = [arg['name'] for arg in arguments] - switch_epilogue = CodeTemplate("return std::tuple<${types}>(${names});").substitute( - types=types, names=names) - body.append(LEGACY_TH_DEFINITION_SWITCH_STATEMENT.substitute(env, cases=cases, - switch_prologue=switch_prologue, - switch_epilogue=switch_epilogue)) - return body - - def process_legacy_th_option(option): - # type: (FunctionOption) -> None - backend = backend_type_env['Backend'] - if backend in option['backend_types']: - env = nested_dict(option, backend_type_env) - body = emit_body(env, option, option['backend_types'][backend]) # type: ignore - option['type_definition_body'] = body - # These type ignores arise from the fact that a nested_dict - # technically isn't a Mapping, as it doesn't implement - # enough methods. I could fix this with a Protocol but - # then I need typing_extensions which isn't currently - # a build dep. - legacy_th_declarations.append( - LEGACY_TH_DECLARATION.substitute(env)) # type: ignore - legacy_th_definitions.append( - LEGACY_TH_DEFINITION.substitute(env)) # type: ignore - - def process_native(option): - # type: (FunctionOption) -> None - dispatch = option['type_method_definition_dispatch'] - env = nested_dict(option, backend_type_env) - - if isinstance(dispatch, dict): - # If we're here, then our native_functions.yaml entry has dispatch configuration. - # Having manual kernel registration doesn't make sense. 
- assert not option['manual_kernel_registration'] - backend = backend_type_env['Backend'] - if backend in option['backend_types']: - - native_dispatch = dispatch.get(backend) - - type_object_declarations.append( - NATIVE_DISPATCH_DECLARATION.substitute(env)) - - option['native_type_method_dispatch'] = native_dispatch - option['device_init'] = gen_device_init(option, backend_type_env) - - if backend in ['CPU', 'SparseCPU', 'QuantizedCPU', 'MkldnnCPU']: - # Omit the device guard entirely in these cases - def_backend = NATIVE_DISPATCH_DEFINITION_CPU_BACKEND - else: - def_backend = NATIVE_DISPATCH_DEFINITION_GENERIC_BACKEND - - type_object_definitions.append(def_backend.substitute(env)) - - if native_dispatch: - # See NOTE[UnboxedOnly] - if option['use_c10_dispatcher'] == 'full': - op_registrations.append(OpRegistration( - operator_name=OPERATOR_NAME.substitute(option), - registration_code=BACKEND_FUNCTION_REGISTRATION.substitute(env), - schema_registration_code=SCHEMA_REGISTRATION.substitute(option))) - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - op_registrations.append(OpRegistration( - operator_name=OPERATOR_NAME.substitute(option), - registration_code=BACKEND_UNBOXEDONLY_FUNCTION_REGISTRATION.substitute(env), - schema_registration_code=SCHEMA_REGISTRATION.substitute(option))) - - for declaration in declarations: - for option in declaration['options']: - if not option.get('skip', False): - try: - if option['mode'] == 'NN' and option.get('cimpls') is None: - continue - if option['mode'] != 'native': - process_legacy_th_option(option) - else: - process_native(option) - except NYIError: - pass - return (type_object_declarations, type_object_definitions, op_registrations, - legacy_th_declarations, legacy_th_definitions) diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py deleted file mode 100644 index d3605bf385a..00000000000 --- a/aten/src/ATen/gen.py +++ /dev/null @@ -1,545 +0,0 @@ - -import argparse -import os - -import yaml -from collections import defaultdict -from collections import OrderedDict - -import sys -from os import path -sys.path.append(path.dirname(path.abspath(__file__))) - -import cwrap_parser -import nn_parse -import native_parse -import preprocess_declarations -import function_wrapper -import gen_backend_select_register - -from code_template import CodeTemplate - - -# This file is the top-level entry point for code generation in ATen. -# It takes an arbitrary number of arguments specifying metadata files to -# process (.cwrap, .yaml and .h) and outputs a number generated header -# and cpp files in ATen/ (see invocations of 'write' for each file that -# is written.) It is invoked from cmake; look for the 'cwrap_files' -# variable for an up-to-date list of files which are passed. 
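# [Editor's illustrative sketch -- not part of the original patch.] As described above,
# the generator routes its positional inputs to different parsers purely by filename
# suffix (see filter_by_extension further down). A minimal standalone version of that
# routing, using hypothetical file names:
def split_inputs(files):
    cwrap = [f for f in files if f.endswith('.cwrap')]
    nn = [f for f in files if f.endswith('nn.yaml') or f.endswith('.h')]
    native = [f for f in files if f.endswith('native_functions.yaml')]
    return cwrap, nn, native

# split_inputs(['Declarations.cwrap', 'nn.yaml', 'THCUNN.h', 'native_functions.yaml'])
# -> (['Declarations.cwrap'], ['nn.yaml', 'THCUNN.h'], ['native_functions.yaml'])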
- -parser = argparse.ArgumentParser(description='Generate ATen source files') -parser.add_argument('files', help='cwrap files', nargs='+') - -parser.add_argument( - '-s', - '--source-path', - help='path to source directory for ATen', - default='.') -parser.add_argument( - '-o', - '--output-dependencies', - help='output a list of dependencies into the given file and exit') -parser.add_argument( - '-d', '--install_dir', help='output directory', default='ATen') -parser.add_argument( - '--rocm', - action='store_true', - help='reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly') -parser.add_argument( - '--vulkan', - action='store_true', - help='Generate Vulkan backend functions') -parser.add_argument( - '--op_registration_whitelist', - nargs='*', - help='filter op registrations by the whitelist (if set); ' - 'each item is `namespace`::`operator name` without overload name; ' - 'e.g.: aten::empty aten::conv2d ...') -parser.add_argument( - '--backend_whitelist', - nargs='*', - help='filter dispatch backend by the whitelist (if set), ' - 'e.g.: CPU CUDA QuantizedCPU ...') -parser.add_argument( - '--per_op_registration', - action='store_true', - help='group function registrations by op name and write to separate files; ' - 'must also set --op_registration_whitelist param') -parser.add_argument( - '--force_schema_registration', - action='store_true', - help='force it to generate schema-only registrations for all ops, including' - 'those that are not listed on --op_registration_whitelist') -options = parser.parse_args() - -# NB: It is mandatory to NOT use os.path.join here, as the install directory -# will eventually be ingested by cmake, which does not respect Windows style -# path slashes. If you switch this to use os.path.join, you'll get an error -# like: -# -# Syntax error in cmake code when parsing string -# -# C:/Jenkins/workspace/pytorch-builds/pytorch-win-ws2016-cuda9-cudnn7-py3-build/build/aten/src/ATen\core/TensorMethods.h -# -# Invalid character escape '\c'. 
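# [Editor's illustrative note -- not part of the original patch.] The point of the NB
# above: cmake chokes on backslash escapes, so the install path is joined with a literal
# '/' rather than os.path.join. With a hypothetical Windows-style prefix:
import ntpath
install_dir = 'C:/w/build/aten/src/ATen'         # hypothetical prefix
cmake_safe = install_dir + '/core'               # forward slashes only -> safe for cmake
windows_join = ntpath.join(install_dir, 'core')  # 'C:/w/build/aten/src/ATen\\core' -> breaks cmake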
-core_install_dir = options.install_dir + '/core' if options.install_dir is not None else None -if options.install_dir is not None and not os.path.exists(options.install_dir): - os.makedirs(options.install_dir) -if core_install_dir is not None and not os.path.exists(core_install_dir): - os.makedirs(core_install_dir) - - -class FileManager(object): - def __init__(self, install_dir=None): - self.install_dir = install_dir if install_dir else options.install_dir - self.filenames = set() - self.outputs_written = False - self.undeclared_files = [] - - def will_write(self, filename): - filename = '{}/{}'.format(self.install_dir, filename) - if self.outputs_written: - raise Exception("'will_write' can only be called before " + - "the call to write_outputs, refactor so outputs are registered " + - "before running the generators") - self.filenames.add(filename) - - def _write_if_changed(self, filename, contents): - try: - with open(filename, 'r') as f: - old_contents = f.read() - except IOError: - old_contents = None - if contents != old_contents: - with open(filename, 'w') as f: - f.write(contents) - - def write_outputs(self, filename): - """Write a file containing the list of all outputs which are - generated by this script.""" - self._write_if_changed( - filename, - ''.join(name + ";" for name in sorted(self.filenames))) - self.outputs_written = True - - def write(self, filename, s, env=None): - filename = '{}/{}'.format(self.install_dir, filename) - if isinstance(s, CodeTemplate): - assert env is not None - comment = "@" + "generated by aten/src/ATen/gen.py" - if s.filename: - comment += " from {}".format(os.path.basename(s.filename)) - env['generated_comment'] = comment - s = s.substitute(env) - self._write_if_changed(filename, s) - if filename not in self.filenames: - self.undeclared_files.append(filename) - else: - self.filenames.remove(filename) - - def check_all_files_written(self): - if len(self.undeclared_files) > 0: - raise Exception( - "trying to write files {} which are not ".format(self.undeclared_files) + - "in the list of outputs this script produces. 
" + - "use will_write to add them.") - if len(self.filenames) > 0: - raise Exception("Outputs declared with 'will_write' were " + - "never written: {}".format(self.filenames)) - - -TEMPLATE_PATH = options.source_path + "/templates" -TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.cpp") -SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") -TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") -TYPE_DEFAULT_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.h") -TYPE_DEFAULT_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.cpp") -OPS_ALREADY_MOVED_TO_C10_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/ATenOpList.cpp") -BACKEND_SELECT_REGISTER_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/BackendSelectRegister.cpp") -SCHEMA_REGISTER_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SchemaRegister.cpp") -TENSOR_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TensorBody.h") -TENSOR_METHODS_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TensorMethods.cpp") - -FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Functions.h") -FUNCTIONS_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/Functions.cpp") - -LEGACY_TH_FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/LegacyTHFunctions.h") -LEGACY_TH_FUNCTIONS_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/LegacyTHFunctions.cpp") - -NATIVE_FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/NativeFunctions.h") - -PER_OP_REGISTRATION_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/PerOpRegistration.cpp") - -core_file_manager = FileManager(core_install_dir) -file_manager = FileManager() -cuda_file_manager = FileManager() - -def backend_to_devicetype(backend): - if backend == 'QuantizedCPU': - return 'CPU' - elif backend == 'QuantizedCUDA': - return 'CUDA' - return backend - -backends = ['CPU', 'CUDA'] -densities = ['Dense', 'Sparse', 'Mkldnn'] # TODO: layout instead of densities? 
-
-quantized_backends = ['QuantizedCPU', 'QuantizedCUDA']
-
-# scalar_name, c_type, accreal, is_floating_type
-quantized_scalar_types = [
-    ('QInt8', 'qint8', 'QInt8AccrealNotDefined', 'QInt8IsFloatingTypeNotDefined'),
-    ('QUInt8', 'quint8', 'QUInt8AccrealNotDefined', 'QUInt8IsFloatingTypeNotDefined'),
-    ('QInt32', 'qint32', 'QInt32AccrealNotDefined', 'Qint32IsFloatingTypeNotDefined'),
-]
-
-# whitelist used to filter op registrations for custom build
-if options.op_registration_whitelist is not None:
-    op_registration_whitelist = set(options.op_registration_whitelist)
-else:
-    op_registration_whitelist = None
-
-# shared environment for non-derived base classes TensorBody.h Storage.h
-top_env = {
-    'cpu_type_headers': [],
-    'cuda_type_headers': [],
-    'function_registrations': [],
-    'aten_ops': [],
-    'type_method_declarations': [],
-    'type_method_definitions': [],
-    'tensor_method_declarations': [],
-    'tensor_method_definitions': [],
-    'function_declarations': [],
-    'function_definitions': [],
-    'type_ids': [],
-    'native_function_declarations': [],
-}
-
-
-def is_whitelisted_backend(backend):
-    return options.backend_whitelist is None or backend in options.backend_whitelist
-
-def is_cuda_backend(backend):
-    return backend in ("QuantizedCUDA", "CUDA")
-
-def dict_representer(dumper, data):
-    return dumper.represent_dict(data.items())
-
-
-def postprocess_output_declarations(output_declarations):
-    # ensure each return has a name associated with it
-    for decl in output_declarations:
-        has_named_ret = False
-        for n, ret in enumerate(decl.returns):
-            if 'name' not in ret:
-                assert not has_named_ret
-                if decl.inplace:
-                    ret['name'] = 'self'
-                elif len(decl.returns) == 1:
-                    ret['name'] = 'out'
-                else:
-                    ret['name'] = 'out' + str(n)
-            else:
-                has_named_ret = True
-
-    def remove_key_if_none(dictionary, key):
-        if key in dictionary.keys() and dictionary[key] is None:
-            del dictionary[key]
-        return dictionary
-
-    return [remove_key_if_none(decl._asdict(), 'buffers')
-            for decl in output_declarations]
-
-
-def format_yaml(data):
-    if options.output_dependencies:
-        # yaml formatting is slow so don't do it if we will ditch it.
-        return ""
-    noalias_dumper = yaml.dumper.SafeDumper
-    noalias_dumper.ignore_aliases = lambda self, data: True
-    # Support serializing OrderedDict
-    noalias_dumper.add_representer(OrderedDict, dict_representer)
-    # Some yaml parsers (e.g. Haskell's) don't understand line breaks.
-    # width=float('Inf') turns off optional line breaks and improves
-    # the portability of the outputted yaml.
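# [Editor's illustrative sketch -- not part of the original patch.] A self-contained
# version of the dumper configuration described above, for a toy document, assuming
# PyYAML is installed:
import yaml
from collections import OrderedDict

class NoAliasDumper(yaml.SafeDumper):
    def ignore_aliases(self, data):
        return True    # never emit YAML anchors/aliases

NoAliasDumper.add_representer(
    OrderedDict, lambda dumper, data: dumper.represent_dict(data.items()))

toy = [OrderedDict([('name', 'add'), ('variants', ['function', 'method'])])]
print(yaml.dump(toy, default_flow_style=False, Dumper=NoAliasDumper, width=float('Inf')))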
- return yaml.dump(data, default_flow_style=False, Dumper=noalias_dumper, width=float('Inf')) - - -def add_op_registrations(per_type_registrations, per_op_registrations, schema_registrations, op_registrations): - for op_registration in op_registrations: - opname = op_registration.operator_name - registration = op_registration.registration_code - - # collect schema registration for all ops (whitelisted or not) - if schema_registrations is not None: - schema_registrations.append(op_registration.schema_registration_code) - - # apply whitelist - if op_registration_whitelist is not None and opname not in op_registration_whitelist: - continue - if options.per_op_registration: - # per op registration - per_op_registrations[opname].append(registration) - else: - # per type registration - per_type_registrations.append(registration) - - -def generate_storage_type_and_tensor(backend, density, declarations, per_op_registrations, schema_registrations): - env = {} - density_tag = density if density != 'Dense' else '' - env['Density'] = density - env['Type'] = "{}{}Type".format(density_tag, backend) - env['DeviceType'] = backend_to_devicetype(backend) - env['Backend'] = density_tag + backend - if not is_whitelisted_backend(env['Backend']): - return - env['storage_tensor_headers'] = [] - if density != 'Sparse': - env['storage_tensor_headers'] = ['#include '] - - # used for generating switch logic for external functions - tag = density_tag + backend - env['TypeID'] = 'TypeID::' + tag - top_env['type_ids'].append(tag + ',') - - env['legacy_th_headers'] = [] - if is_cuda_backend(backend): - env['extra_cuda_headers'] = [] - env['extra_cuda_headers'].append('#include ') - if options.rocm: - env['th_headers'] = [ - '#include ', - '#include ', - '#include ', - '#undef THNN_', - '#undef THCIndexTensor_', - ] - env['extra_cuda_headers'].append('#include ') - env['extra_cuda_headers'].append('#include ') - env['extra_cuda_headers'].append('#include ') - else: - env['th_headers'] = [ - '#include ', - '#include ', - '#include ', - '#undef THNN_', - '#undef THCIndexTensor_', - ] - env['extra_cuda_headers'].append('#include ') - env['extra_cuda_headers'].append('#include ') - env['extra_cuda_headers'].append('#include ') - env['state'] = ['globalContext().getTHCState()'] - env['isCUDA'] = 'true' - env['storage_device'] = 'return storage->device;' - env['Generator'] = 'CUDAGeneratorImpl' - env['allocator'] = 'at::cuda::getCUDADeviceAllocator()' - else: - env['th_headers'] = [ - '#include ', - '#include ', - ] - env['extra_cuda_headers'] = [] - env['state'] = [] - env['isCUDA'] = 'false' - env['storage_device'] = 'throw std::runtime_error("CPU storage has no device");' - env['Generator'] = 'CPUGeneratorImpl' - env['allocator'] = 'getCPUAllocator()' - - declarations, definitions, op_registrations, th_declarations, th_definitions = function_wrapper.create_derived( - env, declarations) - env['type_derived_method_declarations'] = declarations - env['type_derived_method_definitions'] = definitions - env['legacy_th_declarations'] = th_declarations - env['legacy_th_definitions'] = th_definitions - env['function_registrations'] = [] - add_op_registrations(env['function_registrations'], per_op_registrations, schema_registrations, op_registrations) - - fm = file_manager - if env['DeviceType'] == 'CUDA': - fm = cuda_file_manager - - if env['Backend'] == 'CPU' or env['Backend'] == 'CUDA': - env['namespace'] = env['Backend'].lower() - env['legacy_th_headers'].append('#include ") - fm.write('LegacyTHFunctions' + env['Backend'] + ".h", 
LEGACY_TH_FUNCTIONS_H, env) - fm.write('LegacyTHFunctions' + env['Backend'] + ".cpp", LEGACY_TH_FUNCTIONS_CPP, env) - - if density != 'Sparse': - fm.write(env['Type'] + ".cpp", TYPE_DERIVED_CPP, env) - else: - fm.write(env['Type'] + ".cpp", SPARSE_TYPE_DERIVED_CPP, env) - fm.write(env['Type'] + ".h", TYPE_DERIVED_H, env) - - if env['DeviceType'] == 'CPU' or env['DeviceType'] == 'Vulkan': - top_env['cpu_type_headers'].append( - '#include '.format(env['Type'])) - else: - assert env['DeviceType'] == 'CUDA' - top_env['cuda_type_headers'].append( - '#include '.format(env['Type'])) - - -# yields (backend, density) tuples -def iterate_types(): - for backend in backends: - for density in densities: - if density == 'Mkldnn' and backend != 'CPU': - continue - else: - yield (backend, density) - for backend in quantized_backends: - yield (backend, 'Dense') - if options.vulkan: - yield('Vulkan', 'Dense') - - -def gen_per_op_registration_filename(opname): - return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-')) - - -################### -# declare what files will be output _before_ we do any work -# so that the script runs quickly when we are just querying the -# outputs -def declare_outputs(): - core_files = ['TensorBody.h', 'TensorMethods.cpp', 'ATenOpList.cpp'] - for f in core_files: - core_file_manager.will_write(f) - files = ['Declarations.yaml', 'TypeDefault.cpp', 'TypeDefault.h', - 'Functions.h', 'Functions.cpp', 'NativeFunctions.h', 'BackendSelectRegister.cpp'] - for f in files: - file_manager.will_write(f) - for backend, density in iterate_types(): - full_backend = backend if density == "Dense" else density + backend - if not is_whitelisted_backend(full_backend): - continue - fm = file_manager - if is_cuda_backend(backend): - fm = cuda_file_manager - for kind in ["Type"]: - if kind != 'Type' and density == "Sparse": - # No Storage or Tensor for sparse - continue - fm.will_write("{}{}.h".format(full_backend, kind)) - fm.will_write("{}{}.cpp".format(full_backend, kind)) - if backend == 'CPU' or backend == 'CUDA': - fm.will_write("LegacyTHFunctions{}.h".format(backend)) - fm.will_write("LegacyTHFunctions{}.cpp".format(backend)) - - if options.per_op_registration: - if op_registration_whitelist is None: - raise Exception("Must set --op_registration_whitelist for per-op registration.") - for whitelisted_op in op_registration_whitelist: - fname = gen_per_op_registration_filename(whitelisted_op) - file_manager.will_write(fname) - - if options.force_schema_registration: - file_manager.will_write('SchemaRegister.cpp') - - -def filter_by_extension(files, *extensions): - filtered_files = [] - for file in files: - for extension in extensions: - if file.endswith(extension): - filtered_files.append(file) - return filtered_files - - -def generate_per_op_registration(per_op_registrations): - if not options.per_op_registration: - return - - # Ensure all whitelisted operators have a corresponding registration file. - # Generate an empty placeholder file for nonexistent operators, which might - # be registered manually instead of via codegen. - # This can simplify the custom BUCK build which consumes the output of this - # script, since it can uniformly create per-op build targets and dependencies - # without having to know the subtle difference about op registration. - # Manually registered operators might call codegen registered operators thus - # we cannot simply ignore them when calculating transitive dependencies for - # custom build. 
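# [Editor's illustrative sketch -- not part of the original patch.] The per-op file
# naming used above (gen_per_op_registration_filename) simply swaps ':' for '-', and the
# loop just below backfills an empty registration list for any whitelisted op that
# produced no codegen registrations:
def per_op_filename(opname):
    return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-'))

assert per_op_filename('aten::add') == 'pt_op_register_aten--add.cpp'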
- for whitelisted_op in op_registration_whitelist: - if whitelisted_op not in per_op_registrations: - per_op_registrations[whitelisted_op] = [] - - for opname, function_registrations in per_op_registrations.items(): - fname = gen_per_op_registration_filename(opname) - file_manager.write(fname, PER_OP_REGISTRATION_CPP, { - 'extra_headers': top_env['cpu_type_headers'] + top_env['cuda_type_headers'], - 'function_registrations': function_registrations, - }) - - -def generate_schema_registration(schema_registrations): - if not options.force_schema_registration: - return - file_manager.write('SchemaRegister.cpp', SCHEMA_REGISTER_CPP, { - 'schema_registrations': sorted(set(schema_registrations)), - }) - - -def generate_outputs(): - cwrap_files = filter_by_extension(options.files, '.cwrap') - nn_files = filter_by_extension(options.files, 'nn.yaml', '.h') - native_files = filter_by_extension(options.files, 'native_functions.yaml') - - declarations = [d - for file in cwrap_files - for d in cwrap_parser.parse(file)] - - declarations += nn_parse.run(nn_files) - declarations += native_parse.run(native_files) - declarations = preprocess_declarations.run(declarations) - - per_op_registrations = defaultdict(list) if options.per_op_registration else None - schema_registrations = [] if options.force_schema_registration else None - - # note: this will fill in top_env['type/tensor_method_declarations/definitions'] - # and modify the declarations to include any information that will all_backends - # be used by function_wrapper.create_derived - output_declarations, op_registrations = function_wrapper.create_generic( - top_env, declarations) - output_declarations = postprocess_output_declarations(output_declarations) - file_manager.write("Declarations.yaml", format_yaml(output_declarations)) - - gen_backend_select_register.register_backend_select_methods(declarations, BACKEND_SELECT_REGISTER_CPP, file_manager) - - add_op_registrations( - top_env['function_registrations'], per_op_registrations, schema_registrations, op_registrations) - - for backend, density in iterate_types(): - generate_storage_type_and_tensor( - backend, density, declarations, per_op_registrations, schema_registrations) - - core_files = { - 'TensorBody.h': TENSOR_H, - 'TensorMethods.cpp': TENSOR_METHODS_CPP, - 'ATenOpList.cpp': OPS_ALREADY_MOVED_TO_C10_CPP, - } - - for core_file, core_template_file in core_files.items(): - core_file_manager.write(core_file, core_template_file, top_env) - - file_manager.write('TypeDefault.h', TYPE_DEFAULT_H, top_env) - file_manager.write('TypeDefault.cpp', TYPE_DEFAULT_CPP, top_env) - - file_manager.write('Functions.h', FUNCTIONS_H, top_env) - file_manager.write('Functions.cpp', FUNCTIONS_CPP, top_env) - - file_manager.write('NativeFunctions.h', NATIVE_FUNCTIONS_H, top_env) - - generate_per_op_registration(per_op_registrations) - generate_schema_registration(schema_registrations) - - file_manager.check_all_files_written() - cuda_file_manager.check_all_files_written() - -declare_outputs() -if options.output_dependencies is not None: - file_manager.write_outputs(options.output_dependencies) - core_file_manager.write_outputs(options.output_dependencies + "-core") - cuda_file_manager.write_outputs(options.output_dependencies + "-cuda") -else: - generate_outputs() diff --git a/aten/src/ATen/gen_backend_select_register.py b/aten/src/ATen/gen_backend_select_register.py deleted file mode 100644 index 3ffb1d412f9..00000000000 --- a/aten/src/ATen/gen_backend_select_register.py +++ /dev/null @@ -1,111 +0,0 @@ -# This 
script generates BackendSelectRegister.cpp which is being used for dispatching purposes. -# -# TLDR: most operators take one or more Tensors as arguments, and dispatch keys extracted from -# these Tensors determine which kernel (operator implementation) the dispatcher actually invokes. -# E.g., calling add() on two CUDA Tensors will dispatch to the CUDA implementation of add(), -# and so on. -# -# But factory functions don't take Tensors, so we need to get dispatch keys from other arguments. -# Rather than teaching the dispatcher how to extract dispatch keys from types besides Tensor, we -# register an extra kernel for each factory op, under the `BackendSelect` dispatch key. This key -# has higher precedence than dispatch keys for actual backends, so a BackendSelect kernel will -# front-run other kernels registered for the same op. -# -# It's the responsibility of the BackendSelect factory kernels to extract the "real" dispatch -# key from non-Tensor arguments, and redispatch using this key. Here, we generate implementations -# that obtain the key from the TensorOptions argument that's passed to all Tensor factory ops. -# -# BackendSelectRegister.cpp will contain both the BackendSelect kernels and registrations for -# all factory functions that have 'backend_select' flag in its native_functions.yaml definition. - -from code_template import CodeTemplate -from function_wrapper import gen_dispatch_key_init - -GENERATED_COMMENT = CodeTemplate( - "@" + "generated from ${filename}") - -# See NOTE[UnboxedOnly] in function_wrapper.py -UNBOXEDONLY_FUNCTION_REGISTRATION = CodeTemplate("""\ - m.impl_UNBOXED("aten::${op_name_with_overload_name}", ${function_name}); -""") - -FUNCTION_REGISTRATION = CodeTemplate("""\ - m.impl("aten::${op_name_with_overload_name}", - c10::impl::hacky_wrapper_for_legacy_signatures<${schema_order_cpp_signature}>( - TORCH_FN(${function_name}))); -""") - -FUNCTION_DEFINITION = CodeTemplate("""\ -// ${schema_string} -Tensor ${function_name}(${method_formals}) { - static auto op = c10::Dispatcher::singleton() - .findSchemaOrThrow("aten::${name}", "${overload_name}") - .typed<${function_cpp_signature}>(); - ${dispatch_key_init} - return op.callWithDispatchKey(_dk, ${function_actuals}); -} -""") - - -def needs_backend_select(declaration_option): - # We register an op under the BackendSelect dispatch key - # if a TensorOptions argument has been gathered from its declared args - # We skip all the 'new_*' and '*_like' ops as they are special cased and avoid dispatching. 
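# [Editor's illustrative sketch -- not part of the original patch.] Reduced to a
# standalone predicate with hypothetical declaration dicts, the check described above
# behaves like this:
def toy_needs_backend_select(option):
    if option['name'].endswith('_like') or option['name'].startswith('new_'):
        return False
    return any(a.get('dynamic_type') == 'TensorOptions' for a in option['arguments'])

assert toy_needs_backend_select(
    {'name': 'empty', 'arguments': [{'dynamic_type': 'TensorOptions'}]})
assert not toy_needs_backend_select(
    {'name': 'empty_like', 'arguments': [{'dynamic_type': 'TensorOptions'}]})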
- # See TypeDefault.cpp - if declaration_option['name'].endswith('_like') or declaration_option['name'].startswith('new_'): - return False - - return any(a.get('dynamic_type') == 'TensorOptions' for a in declaration_option['arguments']) - -def register_backend_select_methods(declarations, template_path, file_manager): - backend_select_method_definitions = [] - backend_select_function_registrations = [] - - for decl in declarations: - for option in decl["options"]: - if needs_backend_select(option): - name = option['name'] - op_name_with_overload_name = option['name'] - if option.get('overload_name', '') != '': - name = "{0}_{1}".format(name, option['overload_name']) - op_name_with_overload_name = "{0}.{1}".format(op_name_with_overload_name, option['overload_name']) - - if option['use_c10_dispatcher'] == 'full': - func_reg = FUNCTION_REGISTRATION.substitute(schema_string=option['schema_string'], - op_name_with_overload_name=op_name_with_overload_name, - function_name=name, - schema_order_cpp_signature=option['schema_order_cpp_signature']) - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - func_reg = UNBOXEDONLY_FUNCTION_REGISTRATION.substitute(schema_string=option['schema_string'], - op_name_with_overload_name=op_name_with_overload_name, - function_name=name) - - dispatch_key_init = gen_dispatch_key_init('_dk', option['formals_list']) - - # See NOTE[UnboxedOnly] in function_wrapper.py - if option['use_c10_dispatcher'] == 'full': - function_cpp_signature = option['schema_order_cpp_signature'] - function_actuals = option['schema_order_actuals'] - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - function_cpp_signature = option['cpp_signature'] - function_actuals = option['actuals'] - method_def = FUNCTION_DEFINITION.substitute(function_name=name, - schema_string=option['schema_string'], - method_formals=option['formals_with_defaults'], - name=option['name'], - overload_name=option['overload_name'], - dispatch_key_init=dispatch_key_init, - function_cpp_signature=function_cpp_signature, - function_actuals=function_actuals) - - backend_select_function_registrations.append(func_reg) - backend_select_method_definitions.append(method_def) - - env = {} - env['backend_select_method_definitions'] = backend_select_method_definitions - env['backend_select_function_registrations'] = backend_select_function_registrations - - env['generated_comment'] = GENERATED_COMMENT.substitute(filename=template_path) - file_manager.write('BackendSelectRegister.cpp', template_path, env) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 79575d46357..436619da4b4 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3166,7 +3166,7 @@ CPU: roll_cpu CUDA: roll_cuda -# default int[] value [0,1] should not add space after comma, since native_parse.py uses ', ' to split args +# default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args - func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor use_c10_dispatcher: full @@ -5773,7 +5773,7 @@ CPU: foreach_tensor_add_scalar_kernel_slow CUDA: foreach_tensor_add_scalar_kernel_cuda -- func: _foreach_add_.Scalar(Tensor[](a!) 
self, Scalar scalar) -> () +- func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_guard: False variants: function dispatch: diff --git a/aten/src/ATen/native_parse.py b/aten/src/ATen/native_parse.py deleted file mode 100644 index e8206e72141..00000000000 --- a/aten/src/ATen/native_parse.py +++ /dev/null @@ -1,482 +0,0 @@ -from __future__ import print_function -import re -import yaml -import pprint -import sys -import copy - -try: - # use faster C loader if available - from yaml import CLoader as Loader -except ImportError: - from yaml import Loader - -# [temp translations] -# We're currently incrementally moving from the custom func schema to the -# JIT signature schema incrementally. This will reduce overall complexity -# and increase compliance between these components. So for now we do simple -# type translations to continue to emit the legacy func schema for further -# processing by downstream tools. This will helps us avoid having to prematurely -# change all downstream tools to detect these new types. -def type_argument_translations(arg): - type_and_name = [a.strip() for a in arg.rsplit(' ', 1)] - name = '' - if len(type_and_name) > 1: - name = type_and_name[1] - t = type_and_name[0] - name = name.split('=') - default = None - nullable = False - size = None # Only applies to int[\d+] and Tensor[\d+] arguments - if len(name) > 1: - default = name[1] - name = name[0] - - match = re.match(r'(Tensor.*)\((.+)\)(.*)', t) - annotation = None - if match: - t = match.group(1) + match.group(3) - annotation = match.group(2) - - # XXX: is_nullable flag can only annotate entire type as optional type, - # need to special case Generator? logic to make ? only available in jit - # TODO: deprecate is_nullable global flag, and parse the type - # to support annotating complicated types with optional annotation - nullable = '?' in t - - # This enables "Generator? x = None and translates to legacy - # "Generator x = nullptr". See [temp translations]. - if t == 'Generator?' and default == 'None': - t = 'Generator' - default = 'c10::nullopt' - # Enables Tensor[] by translating to legacy TensorList. - elif t == 'Tensor[]' or t == 'Tensor?[]': - t = 'TensorList' - # Enables int[] by translating to legacy IntArrayRef. - elif t == 'int[]': - t = 'IntArrayRef' - elif t == 'int[]?': - t = 'IntArrayRef?' - # Enables int by translating to legacy int64_t. - elif t == 'int': - t = 'int64_t' - elif t == 'int?': - t = 'int64_t?' - elif t == 'int64_t': - raise RuntimeError("Please use int and not int64_t. " - "See [temp translations] for details.") - elif t == 'int64_t?': - raise RuntimeError("Please use int? and not int64_t?. " - "See [temp translations] for details.") - # Enables Dimname[] by translating to legacy DimnameList. - elif t == 'Dimname[]': - t = 'DimnameList' - elif t == 'Dimname[]?': - t = 'DimnameList?' - # Enables float by translating to legacy double. - elif t == 'float': - t = 'double' - elif t == 'float?': - t = 'double?' - elif t == 'float[]': - t = 'ArrayRef' - elif t == 'float[]?': - t = 'ArrayRef?' - # Enables str by translating to legacy std::string. - elif t == 'str': - t = 'std::string' - elif t == 'double': - raise RuntimeError("Please use float and not double. " - "See [temp translations] for details.") - # Enables int[x] by translating to legacy IntArrayRef[x]. 
See [temp translations] - elif re.match(r'int\[(\d+)\]\?', t): - match = re.match(r'int\[(\d+)\]\?', t) - t = 'IntArrayRef' - size = int(match.group(1)) - elif re.match(r'int\[(\d+)\]', t): - match = re.match(r'int\[(\d+)\]', t) - t = 'IntArrayRef' - size = int(match.group(1)) - # Enables bool[x] by translating to legacy std::array. See [temp translations] - elif re.match(r'bool\[(\d+)\]', t): - match = re.match(r'bool\[(\d+)\]', t) - t = 'std::array'.format(match.group(1)) - elif re.match(r'std::array', t): - raise RuntimeError("Please use array notation, e.g. bool[3] and not std::array." - "See [temp translations] for details.") - # Enables Dimname[x] by translating to DimnameList[x]. See [temp translations] - elif re.match(r'Dimname\[(\d+)\]', t): - match = re.match(r'Dimname\[(\d+)\]', t) - t = 'DimnameList' - size = int(match.group(1)) - - if not default: - pass - # This enables Tensor? x=None and translates to legacy - # "Tensor? x={}". See [temp translations]. - elif t.startswith('Tensor?') and default == 'None': - default = "{}" - elif default == 'True': - default = True - elif default == 'False': - default = False - elif default == 'true': - raise RuntimeError("Please use True and not true. " - "See [temp translations] for details.") - elif default == 'false': - raise RuntimeError("Please use False and not false. " - "See [temp translations] for details.") - # Enables default argument [] by translating to legacy {}. - # See [temp translations] - elif default == '[]': - default = '{}' - # Enables lists by translating to legacy {.*}. - # See [temp translations] - elif re.match(r'\[.*\]', default): - default = "{" + default[1:-1] + "}" - elif default == 'None': - default = 'c10::nullopt' - # The JIT signature schema uses Mean, but in particular C++ needs - # the legacy at::Reduction::Mean. So we'll continue emiting that until - # we change this at either a JIT schema or C++ level. 
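# [Editor's illustrative note -- not part of the original patch.] Two of the rewrites
# above, exercised directly: the fixed size is pulled out of "int[2]" notation, and a
# schema list default such as "[1,1]" becomes the C++ brace form "{1,1}".
import re
m = re.match(r'int\[(\d+)\]', 'int[2]')
assert m is not None and int(m.group(1)) == 2   # -> t = 'IntArrayRef', size = 2
default = '[1,1]'
assert '{' + default[1:-1] + '}' == '{1,1}'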
- elif default == 'Mean': - default = 'at::Reduction::Mean' - elif default == 'contiguous_format': - default = 'MemoryFormat::Contiguous' - elif default == 'per_tensor_affine': - default = 'QScheme::PER_TENSOR_AFFINE' - else: - try: - default = int(default) - except ValueError: - try: - default = float(default) - except ValueError: - pass - - return t, name, default, nullable, size, annotation - - -def parse_arguments(args): - arguments = [] - kwarg_only = False - - if len(args.strip()) == 0: - return arguments - - # TODO: Use a real parser here; this will get bamboozled - # by signatures that contain things like std::array (note the space) - for arg_idx, arg in enumerate(args.split(', ')): - type_and_name = [a.strip() for a in arg.rsplit(' ', 1)] - if type_and_name == ['*']: - assert not kwarg_only - kwarg_only = True - continue - - t, name, default, nullable, size, annotation = type_argument_translations(arg) - - argument_dict = {'type': t.rstrip('?'), 'name': name, 'is_nullable': nullable, 'annotation': annotation} - if size: - argument_dict['size'] = size - if default is not None: - argument_dict['default'] = default - if kwarg_only: - argument_dict['kwarg_only'] = True - arguments.append(argument_dict) - - return arguments - -def process_arguments(arguments, func_variants, declaration, func_return): - is_out_fn = False - arguments_out = [] - arguments_other = [] - for argument in arguments: - if argument['type'] == "Tensor" and \ - argument['annotation'] and \ - re.match(r'^(.*!)$', argument['annotation']) and \ - argument.get('kwarg_only'): - argument['output'] = True - argument['kwarg_only'] = False - arguments_out.append(argument) - is_out_fn = True - else: - arguments_other.append(argument) - - arguments = arguments_out + arguments_other - - name = declaration['name'] - if is_out_fn: - declaration['name'] += "_out" - - # Reverse splat of TensorOptions - # As we move towards the JIT function schema for native_functions.yaml we need to support - # the expanded version of TensorOptions. For now we discover whether there are three - # types and names of keyword arguments: "ScalarType dtype", "Layout layout" and "Device device" - # Each, if set, must have default arguments set to long or float, strided and "cpu" respectively. - # They must appear in this order and in this order only in order for us to be able to process them. - # In the future we will get rid of this specific processing as downstream consumers start relying - # less on the content of Declarations.yaml. If you want to support more than this you'll - # potentially have to extend the JIT. 
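# [Editor's illustrative sketch -- not part of the original patch.] The "reverse splat"
# described above recognizes the scattered dtype/layout/device/pin_memory keyword
# arguments and folds them back into a single TensorOptions argument. A much-simplified,
# name-only version of that idea:
TOPT_NAMES = ('dtype', 'layout', 'device', 'pin_memory')

def collapse_tensor_options(arg_names):
    if tuple(arg_names[-4:]) == TOPT_NAMES:
        return arg_names[:-4] + ['options']
    return arg_names

assert collapse_tensor_options(['size', 'dtype', 'layout', 'device', 'pin_memory']) == ['size', 'options']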
- - supported_topt_arguments = [ - [ - {'name': 'dtype', 'type': 'ScalarType', 'is_nullable': False, 'annotation': None}, - {'name': 'layout', 'type': 'Layout', 'is_nullable': False, 'annotation': None}, - {'name': 'device', 'type': 'Device', 'is_nullable': False, 'annotation': None}, - {'name': 'pin_memory', 'type': 'bool', 'is_nullable': False, 'annotation': None, 'default': False}, - ] - ] - supported_topt_arguments.append(copy.deepcopy(supported_topt_arguments[0])) - for arg in supported_topt_arguments[1]: - arg.update({'kwarg_only': True}) - supported_topt_arguments.append(copy.deepcopy(supported_topt_arguments[1])) - for arg in supported_topt_arguments[2]: - arg.update({'default': 'c10::nullopt', 'is_nullable': True}) - # add explicit support for what is needed for tril_indices / triu_indices - supported_topt_arguments.append( - [ - {'name': 'dtype', 'type': 'ScalarType', 'annotation': None, 'kwarg_only': True, - 'default': 'long', 'is_nullable': True}, - {'name': 'layout', 'type': 'Layout', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - {'name': 'device', 'type': 'Device', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - {'name': 'pin_memory', 'type': 'bool', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - ] - ) - supported_topt_arguments.append( - [ - {'name': 'dtype', 'type': 'ScalarType', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - {'name': 'layout', 'type': 'Layout', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - {'name': 'device', 'type': 'Device', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - {'name': 'pin_memory', 'type': 'bool', 'annotation': None, 'kwarg_only': True, - 'default': False, 'is_nullable': True}, - ] - ) - - corresponding_topts = [ - {'type': 'TensorOptions', 'name': 'options', 'is_nullable': False, 'annotation': None}, - ] - corresponding_topts.append(corresponding_topts[0].copy()) - corresponding_topts[1]['kwarg_only'] = True - corresponding_topts.append(corresponding_topts[1].copy()) - corresponding_topts[2]['default'] = '{}' - corresponding_topts.append( - {'type': 'TensorOptions', 'name': 'options', 'is_nullable': False, 'annotation': None, - 'kwarg_only': True, 'default': 'at::kLong'}) - corresponding_topts.append( - {'type': 'TensorOptions', 'name': 'options', 'is_nullable': False, 'annotation': None, - 'kwarg_only': True}) - - def check_topt_representation(topt_representation): - for idx, supported_topt in enumerate(supported_topt_arguments): - matches = all(topt_representation[i] == topt for i, topt in enumerate(supported_topt)) - if matches: - return corresponding_topts[idx] - return None - - def is_tensor_option(argument): - return argument['name'] in ['dtype', 'layout', 'device', 'pin_memory'] - - new_arguments = [] - idx = 0 - while idx < len(arguments): - argument = arguments[idx] - number_of_arguments = len(supported_topt_arguments[0]) - if is_tensor_option(argument) and len(arguments) - idx >= number_of_arguments: - topt_representation = [] - for i in range(number_of_arguments): - argument = arguments[idx] - if not is_tensor_option(argument): - break - topt_representation.append(argument) - idx += 1 - if len(topt_representation) == number_of_arguments: - merged_argument = check_topt_representation(topt_representation) - assert merged_argument, \ - "Unsupported combination of 
TensorOptions {}, the only currently supported combinations are {}"\ - .format(str(topt_representation), str(supported_topt_arguments)) - new_arguments.append(merged_argument) - else: - new_arguments += topt_representation - else: - new_arguments.append(argument) - idx += 1 - - arguments = new_arguments - - # Sanity checks - - # TODO: convention is that the ith-argument correspond to the i-th return, but it would - # be better if we just named everything and matched by name. - for arg_idx, argument in enumerate(arguments_out): - assert argument['annotation'] == func_return[arg_idx]['annotation'], \ - "For func {} writeable keyword Tensor arguments need to have a matching return Tensor. Further, " \ - "the ith-argument needs to correspond to the i-th return.".format(name) - - assert len(arguments_out) <= len(func_return), "func {} must return at least as many Tensors " \ - "as can be passed as output.".format(name) - - if name.endswith('_out'): - raise RuntimeError("Native function {} may not be suffixed with _out as we transition to a unified schema. " - "Otherwise you will cause confusion amongst consumers of native functions.".format(name)) - - if is_out_fn and func_variants not in [[], 'function', ['function']]: - raise RuntimeError("Native functions with output MUST be declared with only the function variant; " - "e.g., variants: function; otherwise you will tickle a Python argument binding bug " - "(which usually manifests itself as the result variable being undefined.) " - "The culprit was: {}".format(name)) - if not is_out_fn: - assert len(arguments_out) == 0, "func {} is not marked as output yet contains output " \ - "keyword arguments".format(name) - - # TODO: Explicit checking for void is a hack and should disappear after a more - # functionally complete implementation of Tensor aliases. - if declaration['inplace'] and len(func_return) > 0: - found_self = False - for arg_idx, argument in enumerate(arguments): - if argument['name'] == "self": - assert argument['annotation'] and argument['annotation'].endswith("!"), \ - "Inplace function \"{}\" needs to annotate Tensor argument named self " \ - "as mutable.".format(name) - found_self = True - assert argument['annotation'] == func_return[arg_idx]['annotation'], \ - "Inplace function annotations of function {} need to match between " \ - "input and correponding output.".format(name) - assert argument['name'] == func_return[arg_idx]['name'] or \ - argument['name'] == func_return[arg_idx]['name'] + "_return" - assert argument['type'] == func_return[arg_idx]['type'] - assert found_self, "Inplace function \"{}\" needs Tensor argument named self.".format(name) - - return arguments - - -def parse_return_arguments(return_decl, inplace, func_decl): - arguments = [] - if return_decl == '()': - return arguments - - # TODO: Use a real parser here; this will get bamboozled - # by signatures that contain things like std::array (note the space) - if return_decl[0] == '(' and return_decl[-1] == ')': - return_decl = return_decl[1:-1] - - multiple_args = len(return_decl.split(', ')) > 1 - for arg_idx, arg in enumerate(return_decl.split(', ')): - t, name, default, nullable, size, annotation = type_argument_translations(arg) - # name of arguments and name of return sometimes have collision - # in this case, we rename the return name to _return. 
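# [Editor's illustrative sketch -- not part of the original patch.] The collision rule
# described above, pulled out into a tiny helper and driven with hypothetical schema
# strings:
def disambiguate_return_name(name, func_schema):
    inputs = func_schema.split('->')[0]
    return name + '_return' if name in inputs else name

assert disambiguate_return_name(
    'indices', 'topk(Tensor self, int k) -> (Tensor values, Tensor indices)') == 'indices'
assert disambiguate_return_name(
    'self', 'abs_(Tensor(a!) self) -> Tensor(a!)') == 'self_return'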
- return_name = name - if name in func_decl['func'].split('->')[0]: - return_name = name + "_return" - argument_dict = {'type': t, 'name': return_name, 'annotation': annotation} - if name: - # See Note [field_name versus name] - argument_dict['field_name'] = name - else: - if t == "Tensor" and inplace: - assert annotation and annotation.endswith("!"), \ - "Return Tensor of function \"{}\" flagged as inplace needs to be " \ - "annotated as mutable".format(func_decl['func']) - argument_dict['name'] = 'self' - elif t == "TensorList" and inplace: - assert annotation and annotation.endswith("!"), \ - "Return TensorList of function \"{}\" flagged as inplace needs to be " \ - "annotated as mutable".format(func_decl['func']) - argument_dict['name'] = 'self' - else: - argument_dict['name'] = 'result' if not multiple_args else 'result' + str(arg_idx) - argument_dict['output'] = True - arguments.append(argument_dict) - return arguments - - -def parse_dispatch(name, dispatch): - """ - Parse a dictionary like {"CPU, CUDA": "blah"} - into {"CPU": "blah", "CUDA": "blah"} - """ - if not isinstance(dispatch, dict): - return dispatch - r = {} - for old_k, v in dispatch.items(): - ks = old_k.split(',') - for k in ks: - k = k.strip() - assert k not in r, "{}, {}".format(name, k) - r[k] = v - return r - - -def parse_native_yaml(path): - with open(path, 'r') as f: - return yaml.load(f, Loader=Loader) - - -def propagate_field_names(output_arguments, return_arguments): - if output_arguments: - for i, r in enumerate(return_arguments): - if 'field_name' in r: - output_arguments[i]['field_name'] = r['field_name'] - - -def run(paths): - declarations = [] - for path in paths: - for func in parse_native_yaml(path): - declaration = {'mode': 'native'} - try: - declaration['schema_string'] = "aten::" + func['func'] - if '->' in func['func']: - func_decl, return_decl = [x.strip() for x in func['func'].split('->')] - else: - raise Exception('Expected return declaration') - fn_name, arguments = func_decl.split('(', 1) - if '.' 
in fn_name: - fn_name, overload_name = fn_name.split('.', 1) - else: - overload_name = '' - assert arguments[-1] == ")", "Expecting closing ) for {}".format(func['func']) - arguments = arguments[:-1] # Expect closing ) - declaration['name'] = func.get('name', fn_name) - declaration['operator_name'] = func.get('name', fn_name) - declaration['overload_name'] = func.get('overload_name', overload_name) - declaration['inplace'] = re.search('(^__i|[^_]_$)', fn_name) is not None - return_arguments = parse_return_arguments(return_decl, declaration['inplace'], func) - schema_order_arguments = parse_arguments(arguments) - arguments = process_arguments(schema_order_arguments, func.get('variants', []), declaration, return_arguments) - output_arguments = [x for x in arguments if x.get('output')] - propagate_field_names(output_arguments, return_arguments) - declaration['return'] = return_arguments if len(output_arguments) == 0 else output_arguments - declaration['variants'] = func.get('variants', ['function']) - declaration['matches_jit_signature'] = func.get('matches_jit_signature', True) - declaration['cpu_half'] = func.get('cpu_half', False) - declaration['cpu_bfloat16'] = func.get('cpu_bfloat16', False) - declaration['cuda_bfloat16'] = func.get('cuda_bfloat16', False) - declaration['cpu_bool'] = func.get('cpu_bool', False) - declaration['cuda_bool'] = func.get('cuda_bool', False) - declaration['deprecated'] = func.get('deprecated', False) - declaration['device_guard'] = func.get('device_guard', True) - declaration['use_c10_dispatcher'] = func.get('use_c10_dispatcher', 'with_codegenerated_unboxing_wrapper') - assert declaration['use_c10_dispatcher'] in ['with_codegenerated_unboxing_wrapper', 'full'] - declaration['manual_kernel_registration'] = func.get('manual_kernel_registration', False) - declaration['category_override'] = func.get('category_override', '') - declaration['arguments'] = func.get('arguments', arguments) - declaration['schema_order_arguments'] = func.get('schema_order_arguments', schema_order_arguments) - declaration['type_method_definition_dispatch'] = \ - parse_dispatch(fn_name, func.get('dispatch', declaration['name'])) - declaration['python_module'] = func.get('python_module', '') - declarations.append(declaration) - except Exception as e: - msg = '''Exception raised in processing function: -{func} -Generated partial declaration: -{decl}'''.format(func=pprint.pformat(func), decl=pprint.pformat(declaration)) - print(msg, file=sys.stderr) - raise e - - return declarations diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml deleted file mode 100644 index a95de7be719..00000000000 --- a/aten/src/ATen/nn.yaml +++ /dev/null @@ -1,62 +0,0 @@ -# Loss functions - -- name: _thnn_multi_margin_loss(Tensor self, LongTensor target, Scalar p, Scalar margin, Tensor? weight, int64_t reduction) - cname: MultiMarginCriterion - -- name: _thnn_multilabel_margin_loss(Tensor self, LongTensor target, int64_t reduction=at::Reduction::Mean) - cname: MultiLabelMarginCriterion - buffers: [is_target] - CUDA: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - -- name: _thnn_nll_loss(Tensor self, LongTensor target, Tensor? 
weight, int64_t reduction, int64_t ignore_index) - cname: ClassNLLCriterion - buffers: [total_weight] - CPU: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - CUDA: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - -- name: _thnn_nll_loss2d(Tensor self, LongTensor target, Tensor? weight, int64_t reduction, int64_t ignore_index) - cname: SpatialClassNLLCriterion - buffers: [total_weight] - CUDA: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - -# Activation functions - -- name: _thnn_glu(Tensor self, int64_t dim) - cname: GatedLinear - -- name: _thnn_log_sigmoid(Tensor self) - cname: LogSigmoid - buffers: [buffer] - -# NOTE: we treat noise as an input (it's really a buffer) because the codegen -# can't handle in-place functions that have buffers -- name: _thnn_rrelu_with_noise(Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, Generator? generator=None) - cname: RReLU - has_inplace: True - -# Convolutions - -- name: _thnn_conv2d(Tensor self, Tensor weight, IntArrayRef[2] kernel_size, Tensor? bias, IntArrayRef[2] stride, IntArrayRef[2] padding) - cname: SpatialConvolutionMM - buffers: [columns, ones] - CPU: - forward_scalar_types: ['Float', 'Double', 'Long', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'BFloat16'] - CUDA: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - -- name: _thnn_conv_depthwise2d(Tensor self, Tensor weight, IntArrayRef[2] kernel_size, Tensor? bias, IntArrayRef[2] stride, IntArrayRef[2] padding, IntArrayRef[2] dilation) - cname: SpatialDepthwiseConvolution - buffers: [] - CUDA: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] diff --git a/aten/src/ATen/nn_parse.py b/aten/src/ATen/nn_parse.py deleted file mode 100644 index 33d78abf61d..00000000000 --- a/aten/src/ATen/nn_parse.py +++ /dev/null @@ -1,388 +0,0 @@ -import copy -import re -import common_with_cwrap -import yaml -from collections import OrderedDict, defaultdict - -try: - # use faster C loader if available - from yaml import CLoader as Loader -except ImportError: - from yaml import Loader - - -# matches `name`, `params` in `name(params)` -NAME_PARAM_REGEX = r'(\w+)\((.*)\)' - - -def argument_to_declaration(param, func=None): - arg = {} - arg['type'], name = param.split(' ') - if (arg['type'].endswith('?')): - arg['is_nullable'] = True - arg['type'] = arg['type'].rstrip('?') - if arg['type'] == 'Tensor': - arg['type'] = 'THTensor*' - elif arg['type'] == 'LongTensor': - arg['type'] = 'THIndexTensor*' - elif arg['type'] == 'Scalar': - arg['type'] = 'accreal' - elif arg['type'] == 'Generator': - arg['type'] = 'c10::optional' - - match = re.match(r'IntArrayRef\[(\d+)\]', arg['type']) - if match: - arg['type'] = 'IntArrayRef' - arg['size'] = int(match.group(1)) - - if '=' in name: - name, default = name.split('=') - arg['optional'] = True - arg['default'] = default - arg['name'] = name - - return arg - - -def output_arguments(thnn_function): - cname = thnn_function.name - output_args = [] - - # function_wrapper expects everything in a declaration to be in - # the base type (i.e. THTensor*), but if we pull a THCUNN only - # implementation, it will have THCTensor* as the arg type. 
So we - # strip the THC here before returning - def map_to_th_type(t): - if t.startswith('THC'): - t = t.replace('THC', 'TH') - return t - - def is_output_arg(arg_name, func_name): - if arg_name == 'output' and 'updateOutput' in cname: - return True - if name in {'gradInput', 'gradWeight', 'gradBias', 'gradGrid'}: - return True - if arg_name == 'indices' and 'updateOutput' in cname and 'Unpool' not in cname: - # indices is an output argument in pooling and an input in unpooling - return True - return False - - for arg in thnn_function.arguments: - name = arg.name - if is_output_arg(name, cname): - desc = { - 'type': map_to_th_type(arg.type), - 'name': camel_to_snake(name), - 'output': True, - } - if name.startswith('grad_'): - desc['is_nullable'] = True - output_args.append(desc) - return output_args - - -def get_return(args): - indices = [str(idx) for idx, arg in enumerate(args) if arg.get('output')] - return 'argument {}'.format(','.join(indices)) - - -ARGUMENT_MAPPINGS = { - 'k': 'kernel_size', - 'd': 'stride', - 'pad': 'padding', - 'p': 'padding', - 'o': 'output_size', - 'osize': 'output_size', - 'output': 'output_size', # as a prefix e.g. outputW - 'isize': 'input_size', - 'dilation': 'dilation', - 'adj': 'output_padding', - 'a': 'output_padding', -} - -DIMENSION_OFFSET = { - 'width': -1, - 'height': -2, - 'B': 0, - 'C': 1, - 'W': -1, - 'H': -2, - 'T': -3, - 'left': 0, - 'right': 1, - 'top': 2, - 'bottom': 3, - 'front': 4, - 'back': 5, -} - -SUBSTITUTIONS = { - 'input': 'self', - 'weights': 'weight', - 'train': 'training', - 'val': 'value', - 'lambda': 'lambd', - 'negval': 'negative_slope', -} - - -def camel_to_snake(name): - # from https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() - - -def get_thnn_args(thnn_function, params, inplace): - params_by_name = {p['name']: p for p in params} - - def arg_expr(prefix, suffix): - # e.g kW, kH - name = ARGUMENT_MAPPINGS[prefix] - if name not in params_by_name: - raise RuntimeError('missing arg "{}" in {}'.format(name, thnn_function.name)) - param = params_by_name[name] - if param['type'] == 'IntArrayRef' and 'size' in param: - name = name + '_' - # NB: We calculate the dimension based on the name of - # the argument, not its positional order. This means - # that we may reorder arguments to get them in - # the right place; e.g., if a THNN implementation - # has arguments in the order kernelW, kernelH, we - # will generate a caller that is kernel[1], kernel[0] - # to order them in the correct way. - index = DIMENSION_OFFSET[suffix] - if index < 0: - index += param['size'] - expr = '{}[{}]'.format(name, index) - return {'type': 'EXPRESSION', 'name': expr} - - thnn_args = [] - for arg in thnn_function.arguments: - name = arg.name - if name == 'state': - continue - if inplace and name == 'output': - name = 'self' - aten_name = camel_to_snake(SUBSTITUTIONS.get(name, name)) - parts = aten_name.split('_') - if aten_name in params_by_name: - param = params_by_name[aten_name] - if arg.is_optional: - param['is_nullable'] = True - thnn_args.append(copy.deepcopy(param)) - elif len(parts) == 2 and parts[0] in ARGUMENT_MAPPINGS and parts[1] in DIMENSION_OFFSET: - # e.g. 
pad_left - thnn_args.append(arg_expr(parts[0], parts[1])) - elif name[-1] in DIMENSION_OFFSET and name[:-1] in ARGUMENT_MAPPINGS: - # e.g kW, kH - thnn_args.append(arg_expr(name[:-1], name[-1])) - elif name == 'owidth' or name == 'oheight': - thnn_args.append(arg_expr(name[0], name[1:])) - elif name == 'scale': - thnn_args.append({'type': 'EXPRESSION', 'name': '1'}) - elif name == 'inplace': - thnn_args.append({'type': 'EXPRESSION', 'name': str(inplace).lower()}) - else: - raise RuntimeError("{}: can't find binding for '{}'" - .format(thnn_function.name, name)) - return thnn_args - - -def remove_unused_args(args, thnn_args): - """Returns the subset of args whose name appears in thnn_args""" - def clean_name(name): - name = name[:name.index('[')] if '[' in name else name - if name.endswith('_'): - name = name[:-1] - return name - uses = set([clean_name(arg['name']) for arg in thnn_args]) - uses.add('output_mask') - args = [arg for arg in args if arg['name'] in uses] - for arg in args: - if 'default' in arg: - del arg['default'] - return args - - -def unique_args(argslist): - result = [] - seen = set() - for args in argslist: - for arg in args: - if arg['name'] in seen: - continue - seen.add(arg['name']) - result.append(arg) - return result - - -def function_info(name, arguments, cimpls, buffers, backends, inplace, backend_types): - """ - cimpls contains information use to call into THNN: - cname: THNN function name - arguments: arguments to functional call - condition: [optional] guard around call - """ - return { - 'mode': 'NN', - 'name': name, - 'cpu_bfloat16': True if backend_types is not None and 'CPU' in backend_types and - 'BFloat16' in backend_types['CPU'] else False, - 'cuda_bfloat16': True if backend_types is not None and 'CUDA' in backend_types and - 'BFloat16' in backend_types['CUDA'] else False, - 'backend_types': backend_types, - 'arguments': arguments, - 'schema_order_arguments': copy.deepcopy(arguments), - 'return': 'argument 0' if inplace else get_return(arguments), - 'buffers': buffers, - 'backends': backends, - 'cimpls': cimpls, - 'variants': ['function'], - } - -def base_declaration(func, thnn_function, backends, backend_types, inplace=False): - """Creates the NN function without any buffers in it's signature""" - name, params = re.match(NAME_PARAM_REGEX, func['name']).groups() - if inplace: - name += '_' - params = params.split(', ') - arguments = [argument_to_declaration(a, func) for a in params] - if not inplace: - arguments += output_arguments(thnn_function) - buffers = [argument_to_declaration('Tensor ' + buf) - for buf in func.get('buffers', [])] - - return function_info(name, arguments, None, buffers, backends, inplace, backend_types) - -def forward_declaration(base, thnn_function, backend_types, inplace=False): - name = '{}_forward'.format(base['name']) - if inplace: - name += '_' - - arguments = [copy.deepcopy(arg) for arg in base['arguments'] - if not arg.get('output')] - - arguments += output_arguments(thnn_function) - for buffer in base['buffers']: - buffer = copy.deepcopy(buffer) - buffer['output'] = True - arguments.append(buffer) - - thnn_args = get_thnn_args(thnn_function, arguments, inplace) - arguments = remove_unused_args(arguments, thnn_args) - cimpl = {'cname': thnn_function.name, 'arguments': thnn_args} - - return function_info(name, arguments, [cimpl], [], base['backends'], inplace, backend_types) - -def backward_declaration(base, thnn_functions, backend_types): - name = '{}_backward'.format(base['name']) - - arguments = [] - 
arguments.append({'type': 'THTensor*', 'name': 'grad_output'}) - arguments += [copy.deepcopy(arg) for arg in base['arguments'] - if arg['name'] != 'inplace'] - arguments += base['buffers'] - - # outputs from the forward may be inputs to the backwards - for arg in arguments: - if 'output' in arg: - del arg['output'] - - arguments += unique_args([output_arguments(f) for f in thnn_functions]) - - def initialize_output_arg(arg): - # the mask array specifies which return values to compute - arg['mask'] = True - arg['is_nullable'] = True - - is_batch_norm_backward = '_backward' in thnn_functions[0].name - grad_params = [] - if len(thnn_functions) > 1 or is_batch_norm_backward: - for arg in arguments: - if arg.get('output', False): - initialize_output_arg(arg) - if 'Tensor' in arg['type'] and arg['name'].startswith('grad_') and \ - 'input' not in arg['name'] and 'output' not in arg['name']: - grad_params.append(arg['name']) - - thnn_args = [get_thnn_args(f, arguments, False) for f in thnn_functions] - arguments = remove_unused_args(arguments, unique_args(thnn_args)) - cimpls = [] - - def get_condition(func): - # only call into the THNN functions if the output args are not null - if '_updateGradInput' in func.name: - return 'grad_input_' - if '_accGradParameters' in func.name: - return ' || '.join(p + '_' for p in grad_params) - return None - - for func, args in zip(thnn_functions, thnn_args): - cimpl = {'cname': func.name, 'arguments': args} - if len(thnn_functions) > 1: - cimpl['condition'] = get_condition(func) - cimpls.append(cimpl) - - output_args = [arg for arg in arguments if arg.get('output', False)] - - return function_info(name, arguments, cimpls, [], base['backends'], False, backend_types) - - -def parse_nn_yaml(filename): - with open(filename, 'r') as f: - return yaml.load(f, Loader=Loader) - - -include_only = '(updateOutput|updateGradInput|accGradParameters|backward)$' -exclude = 'LookupTable' - - -def run(paths): - function_backends = defaultdict(list) - header_functions = OrderedDict() - - headers = [p for p in paths if p.endswith('.h')] - yamls = [p for p in paths if p.endswith('.yaml')] - - for path in headers: - backend = 'CUDA' if re.search('THCU', path) else 'CPU' - for func in common_with_cwrap.parse_header(path): - if re.search(include_only, func.name) is None or re.search(exclude, func.name) is not None: - continue - function_backends[func.name].append(backend) - if func.name not in header_functions: - header_functions[func.name] = func - - bwd_suffixes = ['_updateGradInput', '_accGradParameters', '_backward'] - - declarations = [] - for path in yamls: - for func in parse_nn_yaml(path): - cname = func['cname'] - backends = function_backends[cname + '_updateOutput'] - - fwd_function = header_functions[cname + '_updateOutput'] - bwd_functions = [] - for suffix in bwd_suffixes: - if cname + suffix in header_functions: - bwd_functions.append(header_functions[cname + suffix]) - - default_scalar_types = ['Float', 'Double', 'Half'] # Half will be stripped for CPU backend - forward_backend_types = {} - backward_backend_types = {} - for backend in backends: - backend_props = func.get(backend, {}) - forward_backend_types[backend] = backend_props.get('forward_scalar_types', default_scalar_types) - backward_backend_types[backend] = backend_props.get('backward_scalar_types', default_scalar_types) - - base = base_declaration(func, fwd_function, backends, None) - declarations.append(forward_declaration(base, fwd_function, forward_backend_types)) - if bwd_functions: - 
declarations.append(backward_declaration(base, bwd_functions, backward_backend_types)) - - - if func.get('has_inplace', False): - declarations.append(base_declaration(func, fwd_function, backends, forward_backend_types, True)) - declarations.append(forward_declaration(base, fwd_function, forward_backend_types, True)) - - return declarations diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py deleted file mode 100644 index 1c18144ba9f..00000000000 --- a/aten/src/ATen/preprocess_declarations.py +++ /dev/null @@ -1,213 +0,0 @@ -import re -from copy import deepcopy -from function_wrapper import TYPE_FORMAL_GENERIC -import common_with_cwrap - -type_map = { - 'floating_point': [ - 'Float', - 'Double', - 'Half', - 'BFloat16', - ], - 'integral': [ - 'Byte', - 'Char', - 'Short', - 'Int', - 'Long', - 'Bool', - ], - 'quantized': [ - 'QInt8', - 'QUInt8', - 'QInt32', - ] -} - -all_types = type_map['floating_point'] + type_map['integral'] + type_map['quantized'] -type_map['all'] = all_types - -all_backends = ['CPU', 'CUDA', 'SparseCPU', 'SparseCUDA', 'MkldnnCPU', 'QuantizedCPU', 'QuantizedCUDA', 'Vulkan'] -default_backends = ['CPU', 'CUDA'] - - -def process_types_and_backends(option): - # if specific pairs were not listed, then enumerate them - # based on the backend and type attributes - # if backend or type is not defined, it is assumed to be all of them - if 'backend_types' not in option: - backends = option.get('backends', default_backends) - if isinstance(option.get('type_method_definition_dispatch'), dict): - backends = option.get('type_method_definition_dispatch').keys() - backends = set(backends) - - backend_types = {} - for backend in backends: - if backend in ('QuantizedCPU', 'QuantizedCUDA'): - backend_types[backend] = type_map['quantized'] - else: - backend_types[backend] = option.get('types', all_types) - else: - backend_types = option['backend_types'] - - # expand type alias (integral, floating_point, all) - def expand(types): - ret = [] - for t in types: - if t in type_map: - ret.extend(type_map[t]) - else: - assert(t in all_types) - ret.append(t) - return ret - - for backend in backend_types.keys(): - assert backend in all_backends, "{} {}".format(backend, option['name']) - backend_types[backend] = set(expand(backend_types[backend])) - - # special case remove Half for cpu unless it is explicitly enabled - if not option.get('cpu_half', False): - if 'CPU' in backend_types: - backend_types['CPU'].discard('Half') - - # special case remove BFloat16 for cpu and cuda unless it is explicitly enabled - if not option.get('cpu_bfloat16', False): - if 'CPU' in backend_types: - backend_types['CPU'].discard('BFloat16') - - if not option.get('cuda_bfloat16', False): - if 'CUDA' in backend_types: - backend_types['CUDA'].discard('BFloat16') - - # special cases remove bool for cpu and cuda unless it is explicitly enabled - if not option.get('cpu_bool', False): - if 'CPU' in backend_types: - backend_types['CPU'].discard('Bool') - - if not option.get('cuda_bool', False): - if 'CUDA' in backend_types: - backend_types['CUDA'].discard('Bool') - - # sort the result for easy reading - for backend in backend_types.keys(): - backend_types[backend] = sorted(backend_types[backend]) - option['backend_types'] = backend_types - - -def exclude(declaration): - return 'only_register' in declaration or declaration.get('name') == 'ndimension' - - -def add_variants(option): - option.setdefault('variants', ['method']) - -# if we have 'output' arguments, generate a variant where -# 
we mark oututs as allocate = True, and where the method variant -# is disabled... - - -def handle_outputs_taken_as_arguments(options): - new_options = [] - - def is_nullable(arg): - return (arg['type'] in {'THIntegerTensor*', 'THTensor*'} and - arg.get('default', '') in {None, 'NULL', 'nullptr'}) - - def should_generate_out_variant(option): - if 'function' in option['variants'] and option['mode'] != 'native': - # don't generate _out variants for in-place functions - return re.search('(^__i|[^_]_$)', option['api_name']) is None - return False - - for option in options: - for arg in option['arguments']: - # mark arguments which can be null - if is_nullable(arg): - arg['is_nullable'] = True - - if any('output' in arg for arg in option['arguments']): - allocate_option = deepcopy(option) - # the allocating option needs to be marked - for arg in allocate_option['arguments']: - if 'output' in arg: - arg['allocate'] = True - - # the original option, which takes arguments for the results, - # is no longer a method, and has _out added to indicte it takes - # output arguments - if should_generate_out_variant(option): - if 'method' in option['variants']: - option['variants'].remove('method') - option['api_name'] += '_out' - new_options.append(option) - - new_options.append(allocate_option) - else: - new_options.append(option) - return new_options - - -def sanitize_return(option): - ret = option['return'] - m = re.match(r'argument (\d+(,\d+)*)', ret) - if m is not None: - arguments = [int(x) for x in m.group(1).split(',')] - option['return'] = {'kind': 'arguments', 'arguments': arguments} - elif ret == 'self': - option['return'] = {'kind': 'arguments', 'arguments': []} - for i, x in enumerate(option['arguments']): - if x['name'] == 'self': - option['return']['arguments'].append(i) - break - else: - option['return'] = {'kind': 'type', 'type': option['return']} - - -def set_mode(option): - option['mode'] = option.get('mode', 'TH') - - -def is_extended_method(option): - if 'method' in option['variants']: - return False - else: - return True - - -def run(declarations): - declarations = [d for d in declarations if not exclude(d)] - non_extended_methods = set() - for declaration in declarations: - common_with_cwrap.set_declaration_defaults(declaration) - declaration['options'] = [deepcopy(o) for o in declaration['options']] - declaration['options'] = common_with_cwrap.filter_unique_options( - declaration['options'], - allow_kwarg=False, - type_to_signature=TYPE_FORMAL_GENERIC, - remove_self=True) - - common_with_cwrap.sort_by_number_of_args(declaration) - - for option in declaration['options']: - set_mode(option) - if option['mode'] != 'native': - sanitize_return(option) - process_types_and_backends(option) - add_variants(option) - if not is_extended_method(option): - non_extended_methods.add(option['api_name']) - declaration['options'] = handle_outputs_taken_as_arguments( - declaration['options']) - # We (very unfortunately) have overloaded virtual methods. Because - # of C++'s rules, we cannot move one overload without doing some - # extra work to make sure that overload in a superclass and an - # overload in a subclass resolve together. I've chosen to resolve - # this problem simply by moving ALL overloads of a method which - # occurs in Tensor to Type. This is why we have to first compute - # which methods *names* go on type, and then move ALL overloads - # of this name to Type. 
- for declaration in declarations: - for option in declaration['options']: - option['extended_method'] = option['api_name'] not in non_extended_methods - return declarations diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index c00fcc789aa..bdda67afb1f 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -304,10 +304,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) # Generate files set(TOOLS_PATH "${TORCH_ROOT}/tools") - configure_file("${TORCH_ROOT}/aten/src/ATen/common_with_cwrap.py" - "${TOOLS_PATH}/shared/cwrap_common.py" - COPYONLY) - configure_file("${TORCH_SRC_DIR}/_utils_internal.py" "${TOOLS_PATH}/shared/_utils_internal.py" COPYONLY) diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index aa2d0f7ddbc..791df87e3a0 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -36,10 +36,10 @@ if args.aten_root: if not os.path.exists(args.aten_root): raise ValueError('aten_root ({}) does not exist'.format( args.aten_root)) - sys.path.append(os.path.join(args.aten_root, 'src', 'ATen')) - from code_template import CodeTemplate as CT + sys.path.append(os.path.join(args.aten_root, '..')) # TODO: fix this + from tools.codegen.code_template import CodeTemplate as CT else: - from src.ATen.code_template import CodeTemplate as CT # type: ignore[import,no-redef] + from tools.codegen.code_template import CodeTemplate as CT # type: ignore[import,no-redef] OP_TEMPLATE = CT.from_file( os.path.join(args.template_dir, 'aten_op_template.h')) diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index 9116dd2e317..61501a1105d 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -144,13 +144,7 @@ if(INTERN_BUILD_ATEN_OPS) endforeach() list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp}) - set(cwrap_files - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/Declarations.cwrap - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/THCUNN/generic/THCUNN.h - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/nn.yaml - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml) - - file(GLOB all_python "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/*.py") + file(GLOB all_python "${CMAKE_CURRENT_LIST_DIR}/../tools/codegen/*.py") set(GEN_ROCM_FLAG) if(USE_ROCM) @@ -189,11 +183,10 @@ if(INTERN_BUILD_ATEN_OPS) endif() set(GEN_COMMAND - "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/gen.py + "${PYTHON_EXECUTABLE}" -m tools.codegen.gen --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen ${GEN_ROCM_FLAG} - ${cwrap_files} ${CUSTOM_BUILD_FLAGS} ${GEN_VULKAN_FLAGS} ) @@ -202,6 +195,7 @@ if(INTERN_BUILD_ATEN_OPS) COMMAND ${GEN_COMMAND} --output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt RESULT_VARIABLE RETURN_VALUE + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. ) if(NOT RETURN_VALUE EQUAL 0) message(STATUS ${generated_cpp}) @@ -219,7 +213,10 @@ if(INTERN_BUILD_ATEN_OPS) add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} ${core_generated_cpp} COMMAND ${GEN_COMMAND} - DEPENDS ${all_python} ${all_templates} ${cwrap_files}) + DEPENDS ${all_python} ${all_templates} + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. + ) # Generated headers used from a CUDA (.cu) file are # not tracked correctly in CMake. 
We make the libATen.so depend explicitly diff --git a/docs/cpp/source/check-doxygen.sh b/docs/cpp/source/check-doxygen.sh index 454ea228dd5..b258a412141 100755 --- a/docs/cpp/source/check-doxygen.sh +++ b/docs/cpp/source/check-doxygen.sh @@ -14,16 +14,9 @@ command -v doxygen >/dev/null 2>&1 || { echo >&2 "doxygen is not supported. Abor pushd "$(dirname "$0")/../../.." -cp aten/src/ATen/common_with_cwrap.py tools/shared/cwrap_common.py cp torch/_utils_internal.py tools/shared -python aten/src/ATen/gen.py \ - -s aten/src/ATen \ - -d build/aten/src/ATen \ - aten/src/ATen/Declarations.cwrap \ - aten/src/THCUNN/generic/THCUNN.h \ - aten/src/ATen/nn.yaml \ - aten/src/ATen/native/native_functions.yaml +python -m tools.codegen.gen python tools/setup_helpers/generate_code.py \ --declarations-path build/aten/src/ATen/Declarations.yaml \ diff --git a/mypy-strict.ini b/mypy-strict.ini index 21563d4be91..95a8d599606 100644 --- a/mypy-strict.ini +++ b/mypy-strict.ini @@ -29,5 +29,4 @@ warn_return_any = True implicit_reexport = False strict_equality = True -files = - aten/src/ATen/code_template.py +files = tools/codegen/gen.py diff --git a/mypy.ini b/mypy.ini index d2765089197..9b73e839d29 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,7 +17,6 @@ check_untyped_defs = True files = torch, caffe2, - aten/src/ATen/function_wrapper.py, test/test_complex.py, test/test_futures.py, test/test_torch.py, diff --git a/requirements.txt b/requirements.txt index 29fb620ec62..07127f738ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ requests setuptools six typing_extensions +dataclasses diff --git a/setup.py b/setup.py index 508dcdd94e9..8c060a1c5e3 100644 --- a/setup.py +++ b/setup.py @@ -351,8 +351,8 @@ def build_deps(): # Use copies instead of symbolic files. # Windows has very poor support for them. - sym_files = ['tools/shared/cwrap_common.py', 'tools/shared/_utils_internal.py'] - orig_files = ['aten/src/ATen/common_with_cwrap.py', 'torch/_utils_internal.py'] + sym_files = ['tools/shared/_utils_internal.py'] + orig_files = ['torch/_utils_internal.py'] for sym_file, orig_file in zip(sym_files, orig_files): same = False if os.path.exists(sym_file): @@ -368,7 +368,7 @@ def build_deps(): ################################################################################ # the list of runtime dependencies required by this built package -install_requires = ['future', 'typing_extensions'] +install_requires = ['future', 'typing_extensions', 'dataclasses'] missing_pydep = ''' Missing build dependency: Unable to `import {importname}`. diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index bccaa21b811..c566e6f9026 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -25,6 +25,8 @@ from torch._C import parse_schema # 1: date until which the allowlist entry is valid # 2: (optional) function argument regex # ] +# +# NB: function name DOES NOT include overload name! 
allow_list = [ ("c10_experimental", datetime.date(2222, 1, 1)), # We export some functions and classes for test_jit.py directly from libtorch.so, @@ -69,9 +71,11 @@ allow_list = [ ("aten::gcd", datetime.date(2020, 7, 30)), ("aten::unflatten", datetime.date(2020, 8, 14)), ("aten::linalg_outer", datetime.date(2020, 8, 30)), + # WARNING: overload name here doesn't do anything ("aten::linalg_outer.out", datetime.date(2020, 8, 30)), ("aten::_compute_linear_combination", datetime.date(2020, 9, 1)), ("__getstate__", datetime.date(2020, 9, 1), "Conv[23]dPackedParams"), + ("aten::_foreach_add_", datetime.date(2020, 10, 1)), ] diff --git a/test/test_type_hints.py b/test/test_type_hints.py index 3f6e1215a10..55c080e2cc5 100644 --- a/test/test_type_hints.py +++ b/test/test_type_hints.py @@ -215,7 +215,7 @@ class TestTypeHints(TestCase): finally: os.chdir(cwd) if result != 0: - self.fail("mypy failed: {}".format(stdout)) + self.fail("mypy failed: {} {}".format(stdout, stderr)) @unittest.skipIf(not HAVE_MYPY, "need mypy") def test_run_mypy_strict(self): @@ -237,7 +237,7 @@ class TestTypeHints(TestCase): finally: os.chdir(cwd) if result != 0: - self.fail("mypy failed: {}".format(stdout)) + self.fail("mypy failed: {} {}".format(stdout, stderr)) if __name__ == '__main__': run_tests() diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 98d9c463581..82d908de618 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -154,7 +154,6 @@ def load_aten_declarations(path): if has_tensoroptions_argument(declaration): declaration['schema_order_args'] = [process_schema_order_arg(arg) for arg in declaration['schema_order_args']] declaration['api_name'] = declaration['name'] - # NB: keep this in sync with common_with_cwrap.py if declaration.get('overload_name'): declaration['type_wrapper_name'] = "{}_{}".format( declaration['name'], declaration['overload_name']) diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 4ea27f01cc4..834916a72b5 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -35,11 +35,7 @@ import re from .gen_variable_type import should_trace from .utils import write, is_tensor_method -try: - from src.ATen.code_template import CodeTemplate -except ImportError: - from tools.shared.module_loader import import_module - CodeTemplate = import_module('code_template', 'aten/src/ATen/code_template.py').CodeTemplate +from tools.codegen.code_template import CodeTemplate # # declarations blocklist diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index c2b8688de91..7329495ac99 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -216,7 +216,15 @@ ${return_type} ${type_wrapper_name}(${formals}) { } """) -# See NOTE[UnboxedOnly] in function_wrapper.py +# NOTE[UnboxedOnly] Many of our codegen templates currently exist twice, once +# in an _UNBOXEDONLY_ variant and once without _UNBOXEDONLY_. This is because +# ops that are `use_c10_dispatcher: full` need different c++ code than ops +# that aren't `use_c10_dispatcher: full` yet. The _UNBOXEDONLY_ variants +# are for ops that aren't `use_c10_dispatcher: full` yet and those code templates +# can be deleted once all ops are `use_c10_dispatcher: full`. +# If you update one of the templates, you likely also have to update the other. 
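To make NOTE[UnboxedOnly] above concrete, here is a small editorial sketch (not part of the patch) of how generation code can choose between such a template pair based on an op's `use_c10_dispatcher` setting. `UNBOXEDONLY_WRAPPER_REGISTRATION` is copied from this file; the body of the plain `WRAPPER_REGISTRATION` counterpart and the `wrapper_registration_template` helper are assumptions for illustration only.

from tools.codegen.code_template import CodeTemplate

# Legacy variant, verbatim from this file: registers an unboxed-only kernel.
UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\
m.impl_UNBOXED("${unqual_operator_name_with_overload}", &${class_type}::${type_wrapper_name});
""")

# Assumed shape of the non-_UNBOXEDONLY_ counterpart used for
# `use_c10_dispatcher: full` ops (illustrative body, not the real template).
WRAPPER_REGISTRATION = CodeTemplate("""\
m.impl("${unqual_operator_name_with_overload}",
       TORCH_FN(${class_type}::${type_wrapper_name}));
""")

def wrapper_registration_template(use_c10_dispatcher_full: bool) -> CodeTemplate:
    # Ops that are `use_c10_dispatcher: full` take the boxed-compatible path;
    # everything else still needs the _UNBOXEDONLY_ variant until migrated.
    return WRAPPER_REGISTRATION if use_c10_dispatcher_full else UNBOXEDONLY_WRAPPER_REGISTRATION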
+ +# See NOTE[UnboxedOnly] UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\ m.impl_UNBOXED("${unqual_operator_name_with_overload}", &${class_type}::${type_wrapper_name}); """) @@ -366,7 +374,7 @@ ${return_type} ${api_name}(${declaration_formals}); // {"schema": "${schema_stri # TraceType templates # TODO: change `redispatch` to `NoTracerDispatchMode` + regular `call`. -# See NOTE[UnboxedOnly] in function_wrapper.py +# See NOTE[UnboxedOnly] UNBOXED_TRACE_DISPATCH = CodeTemplate("""\ static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") diff --git a/tools/autograd/utils.py b/tools/autograd/utils.py index 96add9ad385..92f8fe89f56 100644 --- a/tools/autograd/utils.py +++ b/tools/autograd/utils.py @@ -9,11 +9,7 @@ __all__ = [ 'split_name_params', 'write', ] -try: - from src.ATen.code_template import CodeTemplate -except ImportError: - from tools.shared.module_loader import import_module - CodeTemplate = import_module('code_template', 'aten/src/ATen/code_template.py').CodeTemplate +from tools.codegen.code_template import CodeTemplate # You should use these lines, rather than doing it manually. # Especially if you see this error! diff --git a/tools/codegen/__init__.py b/tools/codegen/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tools/codegen/api/__init__.py b/tools/codegen/api/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py new file mode 100644 index 00000000000..452c3721ab9 --- /dev/null +++ b/tools/codegen/api/cpp.py @@ -0,0 +1,241 @@ +from tools.codegen.model import * +from tools.codegen.api.types import TensorOptionsArguments, CppArgument, ThisArgument +import tools.codegen.local as local +from typing import Optional, Sequence, Union, Callable, List + +# This file describes the translation of JIT schema to the public C++ +# API, which is what people use when they call functions like at::add. +# +# Prominent characteristics of the C++ API: +# +# - dtype, layout, device and pin_memory are collected into +# a single C++ type TensorOptions (the legacy dispatcher API +# also has this, but tensor options is really most relevant +# for the C++ API; it makes calling kwarg factory functions +# pleasant) +# +# - for 'use_c10_dispatcher: full' functions, optional tensors are +# represented explicitly using c10::optional +# +# - defaulting lives here (in fact, the dispatcher is completely +# oblivious of defaults!) +# +# BTW: policy on name collisions: we try not to have types with +# collisions, but functions are fair game to collide + +def name(func: FunctionSchema) -> str: + name = str(func.name.name) + if func.is_out_fn(): + name += '_out' + return name + +# Translation of "value types" in JIT schema to C++ API type. Value +# types look the same no matter if they are argument types are return +# types. Returns None if the type in question is not a value type. 
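Before the implementation of `valuetype_type` just below, a quick editorial sanity sketch of the mapping it is meant to produce. It assumes `Type.parse` (used later in this file for `ScalarType`, `Layout`, and friends) also accepts these schema type strings; the expected results are read directly off the cases in the function body.

from tools.codegen.model import Type
import tools.codegen.api.cpp as cpp

assert cpp.valuetype_type(Type.parse('int')) == 'int64_t'
assert cpp.valuetype_type(Type.parse('float')) == 'double'
assert cpp.valuetype_type(Type.parse('int?')) == 'c10::optional<int64_t>'
assert cpp.valuetype_type(Type.parse('Tensor')) is None  # Tensor is not a value type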
+def valuetype_type(t: Type) -> Optional[str]: + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + return None + elif t.name == BaseTy.int: + return 'int64_t' + elif t.name == BaseTy.float: + return 'double' + elif t.name == BaseTy.str: + return 'std::string' + elif t.name in [BaseTy.bool, BaseTy.QScheme, BaseTy.Scalar, + BaseTy.ScalarType, BaseTy.Generator, BaseTy.Storage, + BaseTy.Layout, BaseTy.Device, BaseTy.MemoryFormat, + BaseTy.Dimname, BaseTy.ConstQuantizerPtr]: + # These C++ names line up with their schema names + return t.name.name + else: + raise AssertionError(f"unsupported type: {t}") + elif isinstance(t, OptionalType): + elem = valuetype_type(t.elem) + if elem is None: + return None + return f"c10::optional<{elem}>" + elif isinstance(t, ListType): + if str(t.elem) == 'bool': + assert t.size is not None + return f"std::array" + else: + return None + else: + raise AssertionError(f"unrecognized type {repr(t)}") + +# Translation of types occuring in JIT arguments to a C++ argument type. +def argumenttype_type(t: Type, *, mutable: bool) -> str: + # If it's a value type, do the value type translation + r = valuetype_type(t) + if r is not None: + return r + + if str(t) == 'Tensor' and mutable and local.hack_const_mutable_self(): + return 'const Tensor &' + + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + if mutable: + return 'Tensor &' + else: + return 'const Tensor &' + else: + raise AssertionError(f"base type should have been value type {t}") + elif isinstance(t, OptionalType): + if str(t.elem) == 'Tensor': + if mutable: + return 'Tensor &' # TODO: fix this discrepancy + else: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + return 'const c10::optional&' + else: + return 'const Tensor &' + elem = argumenttype_type(t.elem, mutable=mutable) + return f"c10::optional<{elem}>" + elif isinstance(t, ListType): + # TODO: remove these special cases, ArrayRef fallthrough works fine + if str(t.elem) == 'int': + return "IntArrayRef" + elif str(t.elem) == 'Tensor': + return "TensorList" + elif str(t.elem) == 'Dimname': + return "DimnameList" + # TODO: do something reasonable about lists of optional tensors + elif not local.use_c10_dispatcher() is UseC10Dispatcher.full and str(t.elem) == 'Tensor?': + return "TensorList" + elem = argumenttype_type(t.elem, mutable=mutable) + # TODO: explicitly qualify namespace here + return f"ArrayRef<{elem}>" + else: + raise AssertionError(f"unrecognized type {repr(t)}") + +# Translate a JIT argument into its C++ type +def argument_type(a: Argument) -> str: + return argumenttype_type(a.type, mutable=a.is_write) + +# Translation of a (non-multi) return type from JIT to C++ +def returntype_type(t: Type, *, mutable: bool) -> str: + r = valuetype_type(t) + if r is not None: + return r + + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + if mutable: + return 'Tensor &' + else: + return 'Tensor' + elif isinstance(t, ListType): + elem = returntype_type(t.elem, mutable=mutable) + assert t.size is None, f"fixed size list returns not supported: {t}" + return f"std::vector<{elem}>" + + raise AssertionError(f"unrecognized return type {t}") + +# Translation of a single return to its C++ type +def return_type(r: Return) -> str: + return returntype_type(r.type, mutable=r.is_write) + +# Translation of a full (possibly multi) return from JIT to its C++ type +def returns_type(rs: Sequence[Return]) -> str: + if len(rs) == 0: + return 'void' + elif len(rs) == 1: + return return_type(rs[0]) + else: + args = ','.join(map(return_type, rs)) 
+ return f'std::tuple<{args}>' + +JIT_TO_CPP_DEFAULT = { + 'False': 'false', + 'True': 'true', + 'None': 'c10::nullopt', # UGH this one is type directed + 'Mean': 'at::Reduction::Mean', + '[]': '{}', + '[0,1]': '{0,1}', # TODO: stop special casing + 'contiguous_format': 'MemoryFormat::Contiguous', +} + +# Convert a JIT default into C++ expression representing the default +def default_expr(d: str, t: Type) -> str: + if d == 'None' and str(t) == 'Tensor?': + return '{}' + return JIT_TO_CPP_DEFAULT.get(d, d) + +# Convert an argument into its C++ API form +def argument(a: Union[Argument, TensorOptionsArguments, ThisArgument]) -> CppArgument: + if isinstance(a, Argument): + return CppArgument( + type=argument_type(a), + name=a.name, + default=default_expr(a.default, a.type) if a.default is not None else None, + argument=a, + ) + elif isinstance(a, ThisArgument): + return CppArgument( + type=argument_type(a.argument), + name="const_cast(*this)", # this is an abuse but it's convenient + default=None, + argument=a, + ) + elif isinstance(a, TensorOptionsArguments): + default = None + if all(x.default == "None" for x in a.all()): + default = '{}' + elif a.dtype.default == "long": + default = 'at::kLong' # TODO: this is wrong + return CppArgument( + type='const TensorOptions &', + name='options', + default=default, + argument=a, + ) + else: + assert_never(a) + +def group_arguments( + func: FunctionSchema, *, method: bool = False +) -> Sequence[Union[Argument, TensorOptionsArguments, ThisArgument]]: + args: List[Union[Argument, ThisArgument, TensorOptionsArguments]] = [] + args.extend(func.out_arguments) + + if method: + args.extend(ThisArgument(a) if a.name == "self" else a for a in func.arguments) + else: + args.extend(func.arguments) + + # group up arguments for tensor options + + def pred(name: str, ty: Type) -> Callable[[Argument], bool]: + return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] + predicates = [ # order matters + pred('dtype', Type.parse('ScalarType')), + pred('layout', Type.parse('Layout')), + pred('device', Type.parse('Device')), + pred('pin_memory', Type.parse('bool')), + ] + + i = 0 + while i < len(func.kwarg_only_arguments): + # If there is enough space... 
+ if i <= len(func.kwarg_only_arguments) - len(predicates): + # And the next len(predicates) arguments look like TensorOptions arguments + if all(p(a) for p, a in zip(predicates, func.kwarg_only_arguments[i : i + len(predicates)])): + # Group them together as one argument + args.append(TensorOptionsArguments( + dtype=func.kwarg_only_arguments[i], + layout=func.kwarg_only_arguments[i + 1], + device=func.kwarg_only_arguments[i + 2], + pin_memory=func.kwarg_only_arguments[i + 3], + )) + i += len(predicates) + continue + args.append(func.kwarg_only_arguments[i]) + i += 1 + + return args + +# Convert arguments to C++ API form +def arguments(func: FunctionSchema, *, method: bool = False) -> Sequence[CppArgument]: + return list(map(argument, group_arguments(func, method=method))) diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py new file mode 100644 index 00000000000..34960534275 --- /dev/null +++ b/tools/codegen/api/dispatcher.py @@ -0,0 +1,109 @@ +from tools.codegen.model import * + +from tools.codegen.api.types import CppArgument, DispatcherExpr, TensorOptionsArguments, \ + DispatcherArgument, ThisArgument, LegacyDispatcherArgument +import tools.codegen.api.cpp as cpp +import tools.codegen.api.legacy_dispatcher as legacy_dispatcher +import tools.codegen.local as local + +import itertools +from typing import Sequence, Optional + +# This file describes the translation of JIT schema to the dispatcher +# API, the *unboxed* calling convention by which invocations through +# the dispatcher are made. Historically, the dispatcher API matched +# the C++ API, but with the establishment of the boxed API, we've +# made changes to the dispatcher API to so that the unboxed API +# better aligns with the boxed API. The dispatcher API hooks heavily +# into our template based boxing/unboxing machinery, so changes +# to this convention will usually need template updates too. +# +# Prominent characteristics of the dispatcher API: +# +# - 'use_c10_dispatcher: full' controls whether or not we actually +# use the modern calling convention or not. When use_c10_dispatcher +# is not enabled, we don't use the template machinery. +# +# - dtype, layout, device and pin_memory are represented as separate +# arguments. +# + +def argumenttype_type(t: Type, *, mutable: bool) -> str: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + # This is a faux amis. If it makes sense in the future to add + # more special cases here, or invert things so cpp.argument_type + # calls this, or just completely inline the function, please do + # it. + return cpp.argumenttype_type(t, mutable=mutable) + else: + # This is real sharing. If you're modifying this path, ask + # yourself why you are changing the legacy dispatcher protocol + # here and not in legacy_dispatcher. + return legacy_dispatcher.argumenttype_type(t, mutable=mutable) + +def argument_type(a: Argument) -> str: + return argumenttype_type(a.type, mutable=a.is_write) + +def returns_type(rs: Sequence[Return]) -> str: + # At present, there is no difference. But there could be! 
+ return cpp.returns_type(rs) + +def argument(a: Argument) -> DispatcherArgument: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + return DispatcherArgument( + type=argument_type(a), + name=a.name, + argument=a, + ) + else: + la = legacy_dispatcher.argument(a) + return DispatcherArgument( + type=la.type, + name=la.name, + argument=la.argument, + ) + +def arguments(func: FunctionSchema) -> Sequence[DispatcherArgument]: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + return list(map(argument, itertools.chain(func.out_arguments, func.arguments, func.kwarg_only_arguments))) + else: + return [ + DispatcherArgument(type=la.type, name=la.name, argument=la.argument) + for la in legacy_dispatcher.arguments(func) + ] + +# Given a set of CppArguments in scope, return a sequence of dispatcher +# expressions that translate the cpp API into dispatcher API +def cppargument_exprs(a: CppArgument, *, tensor_options: Optional[CppArgument]) -> Sequence[DispatcherExpr]: + if isinstance(a.argument, TensorOptionsArguments): + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + ta = a.argument + return [ + DispatcherExpr(type=argument_type(ta.dtype), expr=f'optTypeMetaToScalarType({a.name}.dtype_opt())'), + DispatcherExpr(type=argument_type(ta.layout), expr=f'{a.name}.layout_opt()'), + DispatcherExpr(type=argument_type(ta.device), expr=f'{a.name}.device_opt()'), + DispatcherExpr(type=argument_type(ta.pin_memory), expr=f'{a.name}.pinned_memory_opt()'), # weird discrep + ] + else: + return [DispatcherExpr(type='const TensorOptions &', expr=a.name)] + elif isinstance(a.argument, Argument): + if a.name == 'memory_format' and tensor_options is not None and local.use_c10_dispatcher() is UseC10Dispatcher.full: + return [DispatcherExpr( + type=argument_type(a.argument), + expr=f'c10::impl::check_tensor_options_and_extract_memory_format({tensor_options.name}, {a.name})') + ] + else: + return [DispatcherExpr(type=argument_type(a.argument), expr=a.name)] + elif isinstance(a.argument, ThisArgument): + return [DispatcherExpr(type=argument_type(a.argument.argument), expr=a.name)] + else: + assert_never(a.argument) + +def cpparguments_exprs(args: Sequence[CppArgument]) -> Sequence[DispatcherExpr]: + tensor_options = next((a for a in args if isinstance(a.argument, TensorOptionsArguments)), None) + return [r for a in args for r in cppargument_exprs(a, tensor_options=tensor_options)] + +# I don't think this is entirely sound, but it should be reasonably +# close +def legacydispatcherarguments_exprs(args: Sequence[LegacyDispatcherArgument]) -> Sequence[DispatcherExpr]: + return cpparguments_exprs([CppArgument(type=a.type, name=a.name, default=None, argument=a.argument) for a in args]) diff --git a/tools/codegen/api/legacy_dispatcher.py b/tools/codegen/api/legacy_dispatcher.py new file mode 100644 index 00000000000..db3d26c84fd --- /dev/null +++ b/tools/codegen/api/legacy_dispatcher.py @@ -0,0 +1,74 @@ +from tools.codegen.model import * + +from tools.codegen.api.types import TensorOptionsArguments, LegacyDispatcherArgument, ThisArgument +import tools.codegen.api.cpp as cpp + +from typing import Union, Sequence + +# This file describes the translation of JIT schema to the legacy +# dispatcher API. This looks a lot like the C++ API (which +# makes historical sense, because historically the dispatcher API +# and the C++ API exactly matched), but over time we have +# evolved the C++ API without actually changing our native:: +# kernels. To be deleted eventually. 
Dispatcher calls use +# this when you are not use_c10_dispatcher: full. + +def name(func: FunctionSchema) -> str: + name = str(func.name.name) + # TODO: delete this! + if func.is_out_fn(): + name += '_out' + if func.name.overload_name: + name += f'_{func.name.overload_name}' + return name + +def argumenttype_type(t: Type, *, mutable: bool) -> str: + if str(t) == 'Tensor?': + if mutable: + return 'Tensor &' + else: + return 'const Tensor &' + elif str(t) == 'Tensor?[]': + return 'TensorList' + return cpp.argumenttype_type(t, mutable=mutable) + +def returns_type(rs: Sequence[Return]) -> str: + return cpp.returns_type(rs) + +def argument_type(a: Argument) -> str: + return argumenttype_type(a.type, mutable=a.is_write) + +def argument(a: Union[Argument, ThisArgument, TensorOptionsArguments]) -> LegacyDispatcherArgument: + if isinstance(a, Argument): + return LegacyDispatcherArgument( + type=argument_type(a), + name=a.name, + default=cpp.default_expr(a.default, a.type) if a.default is not None else None, + argument=a, + ) + elif isinstance(a, ThisArgument): + # Erase ThisArgument from the distinction + return LegacyDispatcherArgument( + type=argument_type(a.argument), + name=a.argument.name, + default=None, + argument=a.argument, + ) + elif isinstance(a, TensorOptionsArguments): + # TODO: expunge this logic entirely + default = None + if all(x.default == "None" for x in a.all()): + default = '{}' + elif a.dtype.default == "long": + default = 'at::kLong' # TODO: this is wrong + return LegacyDispatcherArgument( + type='const TensorOptions &', + name='options', + default=default, + argument=a, + ) + else: + assert_never(a) + +def arguments(func: FunctionSchema) -> Sequence[LegacyDispatcherArgument]: + return list(map(argument, cpp.group_arguments(func))) diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py new file mode 100644 index 00000000000..cb315cfc752 --- /dev/null +++ b/tools/codegen/api/types.py @@ -0,0 +1,95 @@ +from tools.codegen.model import * +from dataclasses import dataclass +from typing import Optional, Union, Sequence + +# Represents the implicit *this argument for method calls in C++ API +@dataclass(frozen=True) +class ThisArgument: + argument: Argument + +# Bundle of arguments that represent a TensorOptions in the C++ API. +@dataclass(frozen=True) +class TensorOptionsArguments: + dtype: Argument + layout: Argument + device: Argument + pin_memory: Argument + + def all(self) -> Sequence[Argument]: + return [self.dtype, self.layout, self.device, self.pin_memory] + +# Describe a argument (e.g., the x in "f(int x)") in the C++ API +@dataclass(frozen=True) +class CppArgument: + # C++ type, e.g., int + type: str + # C++ name, e.g., x + name: str + # Only used by the header, but we work it out in all cases anyway + default: Optional[str] + # The JIT argument(s) this formal was derived from. May + # correspond to multiple arguments if this is TensorOptions! + # May also correspond to the implicit *this argument! 
+ argument: Union[Argument, TensorOptionsArguments, ThisArgument] + + # Default string representation prints the most elaborated form + # of the formal + def __str__(self) -> str: + mb_default = "" + if self.default is not None: + mb_default = f"={self.default}" + return f"{self.type} {self.name}{mb_default}" + + # However, you might also find the version with no default useful + def str_no_default(self) -> str: + return f"{self.type} {self.name}" + +@dataclass(frozen=True) +class CppExpr: + type: str + expr: str + +@dataclass(frozen=True) +class DispatcherExpr: + type: str + expr: str + +@dataclass(frozen=True) +class LegacyDispatcherExpr: + type: str + expr: str + +@dataclass(frozen=True) +class DispatcherArgument: + type: str + name: str + # dispatcher NEVER has defaults + argument: Union[Argument, TensorOptionsArguments] + # TensorOptionsArguments can occur when not using full c10 dispatch + + def __str__(self) -> str: + return f"{self.type} {self.name}" + +@dataclass(frozen=True) +class LegacyDispatcherArgument: + type: str + name: str + # Legacy dispatcher arguments have defaults for some reasons (e.g., + # the function prototypes in CPUType.h are defaulted). There isn't + # really any good reason to do this, as these functions are only + # ever called from a context where all defaulted arguments are + # guaranteed to be given explicitly. + # TODO: Remove this + default: Optional[str] + argument: Union[Argument, TensorOptionsArguments] + + # Convention here is swapped because arguably legacy + # dispatcher shouldn't have defaults... + def __str__(self) -> str: + return f"{self.type} {self.name}" + + def str_with_default(self) -> str: + mb_default = "" + if self.default is not None: + mb_default = f"={self.default}" + return f"{self.type} {self.name}{mb_default}" diff --git a/aten/src/ATen/code_template.py b/tools/codegen/code_template.py similarity index 100% rename from aten/src/ATen/code_template.py rename to tools/codegen/code_template.py diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py new file mode 100644 index 00000000000..a67901ea594 --- /dev/null +++ b/tools/codegen/gen.py @@ -0,0 +1,1111 @@ +import os +import contextlib +import textwrap +import itertools +from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, DefaultDict, Union, Sequence +import yaml +from enum import Enum +from collections import OrderedDict +import argparse +import pathlib +import functools + +from tools.codegen.code_template import CodeTemplate +from tools.codegen.model import * +from tools.codegen.api.types import * +import tools.codegen.api.cpp as cpp +import tools.codegen.api.dispatcher as dispatcher +import tools.codegen.api.legacy_dispatcher as legacy_dispatcher +import tools.codegen.local as local + +try: + # use faster C loader if available + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader # type: ignore + +# Welcome to the ATen code generator v2! The ATen code generator is +# responsible for parsing native_functions.yaml and then generating +# various generated files (e.g., TypeDefault.cpp) based on the operators +# defined in this file. This means that the code generator knows how to +# parse function schema, and then translate this into various C++ types +# and boilerplate code. +# +# Some things to know about this file when you modify it: +# +# - This file has STRICT mypy typechecking. 
Typecheck it with +# `mypy --config mypy-strict.ini` in the root source directory +# +# - Most of the heavy lifting lives in external modules: +# - 'model' has the data model for native_functions.yaml. The classes +# in those file represent what you see when you look at +# a native_functions.yaml +# - 'api' has conversions for how to translate JIT schema into +# the various C++ APIs that the codegen interacts with. There +# are in fact THREE different C++ APIs: the public C++ API, +# the dispatcher API, and the legacy disaptcher API. See each +# of these respective files for more information + + +# Note [Byte-for-byte compatibility] +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Some special cases we have made in this codegen have been strictly +# to make sure that git diff -w reports no changes, but we believe +# they are not semantically meaningful. After landing the new codegen, +# we should remove these special cases + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# HELPER FUNCTIONS +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +# Conveniently add error context to exceptions raised. Lets us +# easily say that an error occurred while processing a specific +# context. +@contextlib.contextmanager +def context(msg: str) -> Iterator[None]: + try: + yield + except Exception as e: + # TODO: this does the wrong thing with KeyError + msg = textwrap.indent(msg, ' ') + msg = f'{e.args[0]}\n{msg}' if e.args else msg + e.args = (msg,) + e.args[1:] + raise + +# A custom loader for YAML to let us also keep track of line numbers +# of each entry in the YAML file +class LineLoader(Loader): + def construct_mapping(self, node, deep=False): # type: ignore + mapping = super().construct_mapping(node, deep=deep) # type: ignore + # Add 1 so line numbering starts at 1 + mapping['__line__'] = node.start_mark.line + 1 + return mapping + +# Parse native_functions.yaml into a sequence of NativeFunctions +def parse_native_yaml(path: str) -> List[NativeFunction]: + with open(path, 'r') as f: + es = yaml.load(f, Loader=LineLoader) + assert isinstance(es, list) + rs: List[NativeFunction] = [] + for e in es: + assert isinstance(e.get('__line__'), int), e + loc = Location(path, e['__line__']) + funcs = e.get('func') + with context(f'in {loc}:\n {funcs}'): + rs.append(NativeFunction.from_yaml(e, loc)) + return rs + +T = TypeVar('T') +S = TypeVar('S') + +# Given a function that operates on NativeFunction, wrap it into a new function +# that sets some appropriate context managers for that native function. +# YOU MUST WRAP FUNCTIONS IN THIS for calls to api modules to be sound +# (you will get an error if we try to access the local variables without having +# set them). 
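A minimal editorial usage sketch for the decorator defined just below, written as if it lived in this module (so `NativeFunction`, `local`, and `parse_native_yaml` are in scope); `emit_comment` is a made-up callback, but the pattern mirrors the compute_* functions later in this file.

@with_native_function
def emit_comment(f: NativeFunction) -> str:
    # Inside the wrapper it is sound to consult the per-function context,
    # e.g. local.use_c10_dispatcher(), and any exception raised here gets
    # the "in {f.loc}" context added to its message.
    return f"// aten::{f.func}"

# e.g.: list(map(emit_comment, parse_native_yaml('aten/src/ATen/native/native_functions.yaml')))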
+def with_native_function(func: Callable[[NativeFunction], T]) -> Callable[[NativeFunction], T]: + @functools.wraps(func) + def wrapper(f: NativeFunction) -> T: + with context(f'in {f.loc}:\n {f.func}'): + with local.parametrize( + use_c10_dispatcher=f.use_c10_dispatcher, + # See Note [Byte-for-byte compatibility] + hack_const_mutable_self=str(f.func.name) in ["set_data", "retain_grad"], + ): + return func(f) + return wrapper + +# These two functions purposely return generators in analogy to map() +# so that you don't mix up when you need to list() them + +# Map over function that may return None; omit Nones from output sequence +def mapMaybe(func: Callable[[T], Optional[S]], xs: Sequence[T]) -> Iterator[S]: + for x in xs: + r = func(x) + if r is not None: + yield r + +# Map over function that returns sequences and cat them all together +def concatMap(func: Callable[[T], Sequence[S]], xs: Sequence[T]) -> Iterator[S]: + for x in xs: + for r in func(x): + yield r + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# C++ CODE GENERATION +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +# Most functions in this section are curried: they consist of a function +# that takes some parameters (e.g., what is to be generated) which itself +# returns a function that actually maps NativeFunction to the code +# to be generated. This pattern makes it convenient to use map, concatMap +# and similar functional combinators. + +# Many of these functions share logic for defining both the definition +# and declaration (for example, the function signature is the same), so +# we organize them into one function that takes a Target to say which +# code we want. +Target = Enum('Target', ('DEFINITION', 'DECLARATION', 'REGISTRATION')) + +# Generates {dispatch}Type.cpp and {dispatch}Type.h (e.g., CPUType.cpp +# and CPUType.h). This function is also reused to implement per-operator +# registration. It also generates TypeDefault.cpp and TypeDefault.h when +# dispatch is None. +# +# {dispatch}Type.cpp +# - The primary function of this file is to register all of the +# implementations for the given dispatch key to the dispatcher, +# so they are available for use in PyTorch. If dispatch is +# None, we generate schema (def) registrations and catchall +# registrations. +# - The secondary function of this file is to generate a wrapper +# around functions. In CPUType these wrappers do nothing +# (and should be removed), but in other cases they handle +# DeviceGuard. A small extra benefit of wrappers is they +# are not overloaded, so they can be used in the registration +# API without having to disambiguate which overload you want +# (as would be the case if you directly registered native:: +# functions). +# +# {dispatch}Type.h +# - In principle, this file shouldn't exist at all; historically, +# it existed so that we could directly access these functions +# outside of the registration API for the implementation of +# static dispatch. Should be deleted now! +# +# This function is also used for a secondary purpose: the registration +# logic is also reused to implement per-operator registration. +def compute_type_method( + dispatch: Optional[str], *, + target: Target, + # Which operators to actually generate code for. If None, generate + # code for all operators + op_registration_whitelist: Optional[Set[str]], + # Only valid for generating registrations. 
If True, only generate + # def() invocations (for schema registration); do not generate + # any impl() invocations for, e.g., catch-all kernels + def_only: bool = False +) -> Callable[[NativeFunction], Optional[str]]: + + if def_only: + assert target is Target.REGISTRATION and dispatch is None + + @with_native_function + def func(f: NativeFunction) -> Optional[str]: + if dispatch is not None: + if f.dispatch is None or dispatch not in f.dispatch: + return None + else: + if f.dispatch is not None and target is not Target.REGISTRATION: + return None + + if op_registration_whitelist is not None and \ + f"aten::{f.func.name.name}" not in op_registration_whitelist and target is Target.REGISTRATION: + return None + + name = legacy_dispatcher.name(f.func) + returns_type = legacy_dispatcher.returns_type(f.func.returns) + args = legacy_dispatcher.arguments(f.func) + args_str = ', '.join(map(str, args)) + + if target is Target.DECLARATION: + return f"{returns_type} {name}({args_str});" + elif target is Target.DEFINITION: + if f.dispatch is None: + cpp_name = cpp.name(f.func) + impl_name = f"at::native::{cpp_name}" + else: + assert dispatch is not None + impl_name = f"at::native::{f.dispatch[dispatch]}" + + args_exprs_str = ', '.join(map(lambda a: a.name, args)) + + # See Note [Byte-for-byte compatibility] + # (return void_func() is valid C++) + return_kw = " return " + if returns_type == "void": + return_kw = " " + + cuda_guard = "" + if dispatch is None or 'CUDA' in dispatch or 'Vulkan' == dispatch: + self_args = (a for a in f.func.arguments if a.name == "self") + + # There is precedence for which argument we use to do + # device guard. This describes the precedence order. + candidate_args = itertools.chain(self_args, f.func.out_arguments, f.func.arguments) + + # Only tensor like arguments are eligible + device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) + + # See Note [Byte-for-byte compatibility] + # I wasn't able to figure out the internal logic for + # these device guards + if str(f.func.name) == "_thnn_fused_lstm_cell_backward": + device_of = "cx" + elif str(f.func.name) == "_thnn_differentiable_lstm_cell_backward": + device_of = "input_gates" + + has_tensor_options = any(isinstance(a.argument, TensorOptionsArguments) for a in args) + + # TODO: There is probably a simpler version of this that + # works just as well. 
+ if f.device_guard and (dispatch is None or 'Vulkan' == dispatch) and has_tensor_options: + cuda_guard = """\ + const DeviceGuard device_guard(options.device()); +""" + # See Note [Byte-for-byte compatibility] + if dispatch is not None: + cuda_guard = f"\n{cuda_guard}" + elif f.device_guard and dispatch is not None and 'CUDA' in dispatch and has_tensor_options: + cuda_guard = """\ + globalContext().lazyInitCUDA(); + const DeviceGuard device_guard(options.device()); +""" + elif f.device_guard and device_of is not None: + cuda_guard = f"""\ + const OptionalDeviceGuard device_guard(device_of({device_of})); +""" + # See Note [Byte-for-byte compatibility] + if dispatch is not None: + cuda_guard = f"\n{cuda_guard}" + else: + cuda_guard = """\ + // DeviceGuard omitted +""" + # See Note [Byte-for-byte compatibility] + if dispatch is not None: + cuda_guard = f"\n{cuda_guard}" + + return f"""\ +{returns_type} {name}({args_str}) {{ +{cuda_guard}{return_kw}{impl_name}({args_exprs_str}); +}} +""" + + elif target is Target.REGISTRATION: + assert returns_type == dispatcher.returns_type(f.func.returns) + dispatcher_args = dispatcher.arguments(f.func) + dispatcher_args_types_str = ', '.join(map(lambda a: a.type, dispatcher_args)) + if dispatch is None: + type_name = f'TypeDefault::{name}' + else: + type_name = f'{dispatch}Type::{name}' + + # def registration only happens in TypeDefault + def_registration = "" + if dispatch is None: + def_registration = f'm.def("{f.func}");\n' + + impl_registration = "" + if not def_only and not f.manual_kernel_registration and (dispatch is not None or f.dispatch is None): + # Figure out which signature the function is + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + # See Note [Byte-for-byte compatibility] + if dispatch is not None: + nl = "\n" + else: + nl = "" + + payload = "c10::impl::hacky_wrapper_for_legacy_signatures<" \ + f"{returns_type} ({dispatcher_args_types_str})>({nl}TORCH_FN({type_name}))" + + else: + payload = f"torch::CppFunction::makeUnboxedOnly(&{type_name})" + + # Annotate it with dispatch information if necessary + # + # NB: In the ordinary, TypeDerived code generation work flow, specification + # of the backend is handled by the enclosing block, so the torch::dispatch + # invocation here is strictly unnecessary. However, in the fbcode mobile + # only workflow using per-op registration, these registrations will get dumped + # in a TORCH_LIBRARY_FRAGMENT that does not have an ambient backend. So + # the torch::dispatch specification here is important! See + # Note [Redundancy in registration code is OK] for how we handle redundant info. + if dispatch is not None: + payload = f"torch::dispatch(DispatchKey::{dispatch},\n{payload})\n" + + impl_registration = f'm.impl("{f.func.name}",\n{payload});\n' + + return f"{def_registration}{impl_registration}" + else: + assert_never(target) + + return func + +# Generates Function.cpp and Function.h. These files provide the +# functional public C++ API, and the scaffolding to call into +# the dispatcher from these functions. See also compute_tensor_method. 
+def compute_function(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: + @with_native_function + def go(f: NativeFunction) -> Optional[str]: + if f.manual_kernel_registration: + return None + if Variant.function not in f.variants: + return None + + name = cpp.name(f.func) + + cpp_returns_type = cpp.returns_type(f.func.returns) + cpp_args = cpp.arguments(f.func) + cpp_args_str = ', '.join(map(str, cpp_args)) + + if target is Target.DECLARATION: + return f"CAFFE2_API {cpp_returns_type} {name}({cpp_args_str});" + + assert target is Target.DEFINITION + + dispatcher_exprs = dispatcher.cpparguments_exprs(cpp_args) + cpp_args_str_no_default = ', '.join(map(lambda a: a.str_no_default(), cpp_args)) + dispatcher_returns_type = dispatcher.returns_type(f.func.returns) + dispatcher_types_str = ', '.join(map(lambda a: a.type, dispatcher_exprs)) + dispatcher_exprs_str = ', '.join(map(lambda a: a.expr, dispatcher_exprs)) + + return f""" +// aten::{f.func} +{cpp_returns_type} {name}({cpp_args_str_no_default}) {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({dispatcher_types_str})>(); + return op.call({dispatcher_exprs_str}); +}} +""" + return go + +# Generates TensorBody.h (sic) and TensorMethods.cpp. These files provide the +# object-oriented (method-based) public C++ API, and the scaffolding to call into +# the dispatcher from these functions. See also compute_function. +def compute_tensor_method(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: + @with_native_function + def go(f: NativeFunction) -> Optional[str]: + if Variant.method not in f.variants: + return None + + assert not f.func.is_out_fn() + assert len(f.func.arguments) > 0 + assert sum(a.name == 'self' for a in f.func.arguments) == 1 + + name = cpp.name(f.func) + cpp_returns_type = cpp.returns_type(f.func.returns) + cpp_args = cpp.arguments(f.func, method=True) + cpp_args_exclude_this = [a for a in cpp_args if not isinstance(a.argument, ThisArgument)] + cpp_args_exclude_this_str = ', '.join(str(a) for a in cpp_args_exclude_this) + + if target is Target.DECLARATION: + return f"{cpp_returns_type} {name}({cpp_args_exclude_this_str}) const;" + + assert target is Target.DEFINITION + + dispatcher_exprs = dispatcher.cpparguments_exprs(cpp_args) + cpp_args_exclude_this_str_no_default = ', '.join(a.str_no_default() for a in cpp_args_exclude_this) + dispatcher_returns_type = dispatcher.returns_type(f.func.returns) + dispatcher_types_str = ', '.join(map(lambda a: a.type, dispatcher_exprs)) + dispatcher_exprs_str = ', '.join(map(lambda a: a.expr, dispatcher_exprs)) + + return f""" +// aten::{f.func} +{cpp_returns_type} Tensor::{name}({cpp_args_exclude_this_str_no_default}) const {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({dispatcher_types_str})>(); + return op.call({dispatcher_exprs_str}); +}} +""" + + return go + +# Generates ATenOpList.cpp, a runtime accessible list of all aten +# operators. +# TODO: This was historically used to help some JIT interop code +# figure out whether or not to treat aten namespace'd operators +# one way or another, we should reevaluate if this is actually needed. 
+@with_native_function +def compute_aten_op(f: NativeFunction) -> str: + return f'{{"aten::{f.func.name.name}", "{f.func.name.overload_name}"}},' + +# Generates NativeFunctions.h, a list of forward declarations of all +# actual kernel definitions we keep in aten/src/ATen/native/ +@with_native_function +def compute_native_function_declaration(f: NativeFunction) -> List[str]: + if f.dispatch is None: + ns = [cpp.name(f.func)] + else: + ns = list(f.dispatch.values()) + + rs = [] + # Sometimes a function name shows up multiple times; only generate + # it once! + seen = set() + for n in ns: + if n in seen: + continue + if "legacy::" in n: + continue + seen.add(n) + returns_type = legacy_dispatcher.returns_type(f.func.returns) + args = legacy_dispatcher.arguments(f.func) + rs.append(f"CAFFE2_API {returns_type} {n}({', '.join(map(lambda a: a.str_with_default(), args))});") + + return rs + +# Generates BackendSelectRegister.cpp, a series of kernels which provide +# specialized computation of dispatch key for operator signatures which cannot +# be easily done automatically using templating. +def compute_backend_select(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: + @with_native_function + def go(f: NativeFunction) -> Optional[str]: + if str(f.func.name.name).endswith('_like') or str(f.func.name.name).startswith('new_'): + return None + + name = legacy_dispatcher.name(f.func) + legacy_dispatcher_returns_type = legacy_dispatcher.returns_type(f.func.returns) + legacy_dispatcher_args = legacy_dispatcher.arguments(f.func) + + if not any(isinstance(a.argument, TensorOptionsArguments) for a in legacy_dispatcher_args): + return None + + legacy_dispatcher_tensor_args = [ + a for a in legacy_dispatcher_args + if isinstance(a.argument, Argument) and a.argument.type.is_tensor_like() + ] + + dispatcher_returns_type = dispatcher.returns_type(f.func.returns) + dispatcher_args = dispatcher.arguments(f.func) + dispatcher_exprs = dispatcher.legacydispatcherarguments_exprs(legacy_dispatcher_args) + + if target is Target.DEFINITION: + # See Note [Byte-for-byte compatibility] + # I don't think there's actually a good reason to generate + # these two cases differently + if legacy_dispatcher_tensor_args: + tensor_args = ', '.join(a.name for a in legacy_dispatcher_tensor_args) + compute_dk = f"""\ +DispatchKeySet _dk_set = DispatchKeySet(options.computeDispatchKey()) | c10::detail::multi_dispatch_key_set({tensor_args}); + DispatchKeySet _dk_mask = c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, DispatchKey::BackendSelect); + DispatchKey _dk = c10::impl::dispatchTypeId(_dk_set, _dk_mask);""" + else: + compute_dk = "DispatchKey _dk = options.computeDispatchKey();" + return f"""\ +// aten::{f.func} +{legacy_dispatcher_returns_type} {name}({', '.join(a.str_with_default() for a in legacy_dispatcher_args)}) {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({', '.join(a.type for a in dispatcher_args)})>(); + {compute_dk} + return op.callWithDispatchKey(_dk, {', '.join(a.expr for a in dispatcher_exprs)}); +}} +""" + elif target is Target.REGISTRATION: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + return f"""m.impl("aten::{f.func.name}", + c10::impl::hacky_wrapper_for_legacy_signatures<{dispatcher_returns_type} ({', '.join(a.type for a in dispatcher_args)})>( + TORCH_FN({name})));""" + else: + return f"""m.impl_UNBOXED("aten::{f.func.name}", {name});""" + elif target is 
Target.DECLARATION: + raise AssertionError() + else: + assert_never(target) + return go + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# YAML CODE GENERATION +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +def dict_representer(dumper: Any, data: Any) -> Any: + return dumper.represent_dict(data.items()) + +def format_yaml(data: object) -> str: + noalias_dumper = yaml.dumper.SafeDumper + noalias_dumper.ignore_aliases = lambda self, data: True # type: ignore + # Support serializing OrderedDict + noalias_dumper.add_representer(OrderedDict, dict_representer) # type: ignore + # Some yaml parsers (e.g. Haskell's) don't understand line breaks. + # width=float('Inf') turns off optional line breaks and improves + # the portability of the outputted yaml. + return yaml.dump(data, default_flow_style=False, Dumper=noalias_dumper, width=float('Inf')) # type: ignore + +# For some reason, some defaults we write to YAML are written as native +# YAML objects, rather than doing them uniformly as strings. This +# function detects those cases and converts them into native Python +# objects. +def pythonify_default(s: str) -> object: + if s == 'true': + return True + elif s == 'false': + return False + + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + return s + +# What is a dynamic type? Over time, the semantic meaning of +# dynamic type has degraded to meaninglessness (in the old days, +# it captured dtype-ness of types, but that has gone away with +# the removal of TH). These days, it's mostly the same thing as +# the C++ API argument type, except that Tensor and Tensor? +# arguments simply present as Tensor. +# +# TODO: Get rid of dynamic_type, after getting tools/autograd +# to use the new codegen framework +def dynamic_type(t: Type) -> str: + if isinstance(t, OptionalType): + return dynamic_type(t.elem) + # Note we don't use t.is_tensor_like() here because it would + # also include Tensor[] + if str(t) == 'Tensor': + return 'Tensor' + return cpp.argumenttype_type(t, mutable=False) + +def compute_method_of_yaml(variants: Set[Variant]) -> List[str]: + # This is written out explicitly to ensure that Tensor and + # namespace are put into the list in the right order + method_of = ['Type'] + if Variant.method in variants: + method_of.append('Tensor') + if Variant.function in variants: + method_of.append('namespace') + return method_of + +def compute_returns_yaml(f: NativeFunction) -> Tuple[List[Dict[str, str]], Dict[str, str]]: + # Note [name and field_name] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~ + # To understand name_to_field_name, we must first talk about this + # schema: + # + # lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) + # + # There is something very odd about this schema: it is an out + # variant of the function (that is to say, it will convert into + # at::lstsq_out() in the C++ API), but the names of the output + # return arguments don't match the keyword argument names of + # the inputs. It TURNS OUT that in this situation, the historical + # Declarations.yaml we want to output is this (abbreviated to + # only show relevant fields): + # + # arguments: + # ... + # - field_name: solution + # name: X + # - field_name: QR + # name: qr + # ... + # + # returns: + # - field_name: solution + # name: X + # - field_name: QR + # name: qr + # + # The name of the return fields is stored in 'field_name', and the + # name of the arguments is stored in 'name'. 
So when we process + # arguments, we need a way to get at the corresponding return. At + # the moment, this is most conveniently done by constructing a + # mapping from name (the argument concept) to field_name (the + # return concept) while processing return arguments, since we don't + # directly maintain this correspondence in the modeling of function + # schema itself. + # + # See also https://github.com/pytorch/pytorch/issues/43114 + name_to_field_name: Dict[str, str] = {} + + # Compute the returns field of the YAML entry + returns = [] + for i, r in enumerate(f.func.returns): + # If we have an inplace function, the return argument is + # implicitly named self. + # TODO: Consider incorporating this into the data model + if f.func.name.name.inplace: + assert i == 0, "illegal inplace function with multiple returns" + name = 'self' + # If this is an out function, the name is the name of the + # corresponding out argument (r.name will get recorded + # in field_name later.) + elif f.func.is_out_fn(): + name = f.func.out_arguments[i].name + # If the return argument is explicitly named... + elif r.name: + # See Note [Byte-for-byte compatibility] + # + # Check if it would conflict with an existing argument. + # Downstream codegen assumes that return names and argument + # names don't conflict with each other, so we disambiguate + # this case (by adding a trailing _return). Notice that + # historically, the collision check was buggy: it just did a + # straight string-contains test on the entirety of the + # inputs part of the format string, meaning that it also + # picked up occurrences of the argument name in the NAME of + # the function, as well as substring occurrences of the name + # in arguments. We have simulated the old logic here...
but a more correct version is simply + # name_conflict = any(r.name == a.name for a in f.func.schema_order_arguments()) + if buggy_name_conflict and not f.func.is_out_fn(): + name = f'{r.name}_return' + else: + name = r.name + # If there is no explicit name, we just name the output result, + # unless it's a multi-return, in which case it's result0, + # result1, etc (zero-indexed) + else: + name = 'result' if len(f.func.returns) == 1 else f'result{i}' + + ret = { + 'dynamic_type': dynamic_type(r.type), + 'name': name, + 'type': cpp.return_type(r), + } + + if r.name: + # See Note [name and field_name] + ret['field_name'] = r.name + if f.func.is_out_fn(): + name_to_field_name[f.func.out_arguments[i].name] = r.name + + returns.append(ret) + + return returns, name_to_field_name + +# arguments in yaml roughly corresponds to the public C++ API +def compute_cpp_argument_yaml(cpp_a: CppArgument, *, schema_order: bool, kwarg_only_set: Set[str], + out_arg_set: Set[str], name_to_field_name: Dict[str, str]) -> object: + if isinstance(cpp_a.argument, TensorOptionsArguments): + arg: Dict[str, object] = { + 'annotation': None, + 'dynamic_type': 'TensorOptions', + 'is_nullable': False, + 'name': cpp_a.name, + 'type': cpp_a.type, + 'kwarg_only': True, + } + if cpp_a.default is not None: + arg['default'] = cpp_a.default + return arg + elif isinstance(cpp_a.argument, ThisArgument): + raise AssertionError() + elif isinstance(cpp_a.argument, Argument): + return compute_argument_yaml( + cpp_a.argument, schema_order=schema_order, + kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) + +def compute_argument_yaml(a: Argument, *, schema_order: bool, kwarg_only_set: Set[str], + out_arg_set: Set[str], name_to_field_name: Dict[str, str]) -> object: + arg: Dict[str, object] = { + 'annotation': str(a.annotation) if a.annotation else None, + 'dynamic_type': dynamic_type(a.type), + 'is_nullable': a.type.is_nullable(), + 'name': a.name, + 'type': cpp.argument_type(a), + } + if a.default is not None: + arg['default'] = pythonify_default(cpp.default_expr(a.default, a.type)) + if a.name in kwarg_only_set: + arg['kwarg_only'] = True + # See Note [Byte-for-byte compatibility] + # The default value of kwarg_only is False; this case exists for + # byte-for-byte compatibility + elif a.name in out_arg_set: + arg['kwarg_only'] = False + if a.name in out_arg_set: + arg['output'] = True + # See Note [Byte-for-byte compatibility] + # This is probably a bug in the original implementation, where + # the specification of allocate was not properly propagated to + # the schema-order arguments. 
In any case, this field + # is redundant with the output field + if not schema_order: + arg['allocate'] = True + # See Note [name and field_name] + if a.name in name_to_field_name: + arg['field_name'] = name_to_field_name[a.name] + # Historically, booleans don't get their size recorded, because it + # is already built into the cpp type (e.g., std::array) + l = a.type.is_list_like() + if l is not None and l.size is not None and str(l.elem) != 'bool': + arg['size'] = l.size + return arg + +@with_native_function +def compute_declaration_yaml(f: NativeFunction) -> object: + returns, name_to_field_name = compute_returns_yaml(f) + + # These sets are used to conveniently test if an argument is a + # kwarg-only or out argument + kwarg_only_set = set(a.name for a in f.func.kwarg_only_arguments) + out_arg_set = set(a.name for a in f.func.out_arguments) + + cpp_args = cpp.arguments(f.func) + arguments = [ + compute_cpp_argument_yaml( + cpp_a, schema_order=False, + kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) + for cpp_a in cpp_args + ] + + # See Note [Byte-for-byte compatibility] + # NB: NOT actually schema order. This is almost certainly a BUG. + schema_order_jit_arguments = list(itertools.chain(f.func.arguments, f.func.out_arguments, f.func.kwarg_only_arguments)) + + schema_order_arguments = [ + compute_argument_yaml( + a, schema_order=True, + kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) + for a in schema_order_jit_arguments + ] + + cpp_schema_order_types = [cpp.argument(a).type for a in schema_order_jit_arguments] + cpp_returns = cpp.returns_type(f.func.returns) + schema_order_cpp_signature = f"{cpp_returns} ({', '.join(cpp_schema_order_types)})" + + is_factory_method = any(isinstance(a.argument, TensorOptionsArguments) for a in cpp_args) \ + and Variant.method not in f.variants + + return OrderedDict([ + ('name', cpp.name(f.func)), + ('operator_name', str(f.func.name.name)), + ('overload_name', str(f.func.name.overload_name)), + ('use_c10_dispatcher', f.use_c10_dispatcher.name), + ('manual_kernel_registration', f.manual_kernel_registration), + ('category_override', f.category_override if f.category_override is not None else ''), + ('matches_jit_signature', True), + ('schema_string', f'aten::{f.func}'), + ('arguments', arguments), + ('schema_order_cpp_signature', schema_order_cpp_signature), + ('schema_order_arguments', schema_order_arguments), + ('method_of', compute_method_of_yaml(f.variants)), + ('mode', 'native'), + ('python_module', '' if f.python_module is None else f.python_module), + ('returns', returns), + ('inplace', f.func.name.name.inplace), + ('is_factory_method', is_factory_method), + # Note [Abstract ATen methods] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # An abstract ATen method is one whose dispatch differs between + # types. These are implemented in derived types (with a + # standard (throwing) definition in Type). A concrete ATen + # method is one which has the same dispatch for all types; + # we just implement it in the base Type. This is exposed + # in Declarations.yaml via a field named 'abstract'. + # + # Although this is what we have historically exposed, it is + # actually not all that useful for end users, who are also interested + # whether or not there is an explicit entry in derivatives.yaml + # for the entry or not (as this affects whether or not the operation is + # overrideable or not.) Once this all gets cleaned up, this + # property will be obsolete. 
+ ('abstract', f.dispatch is not None), + ('device_guard', f.device_guard), + ('with_gil', False), + ('deprecated', False), + ]) + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# RUN IT ALL +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +@functools.lru_cache(maxsize=None) +def _read_template(template_fn: str) -> CodeTemplate: + return CodeTemplate.from_file(template_fn) + +# A small abstraction for writing out generated files and keeping track +# of what files have been written (so you can write out a list of output +# files) +class FileManager: + install_dir: str + template_dir: str + dry_run: bool + filenames: Set[str] + + def __init__(self, install_dir: str, template_dir: str, dry_run: bool) -> None: + self.install_dir = install_dir + self.template_dir = template_dir + self.filenames = set() + self.dry_run = dry_run + + def _write_if_changed(self, filename: str, contents: str) -> None: + old_contents: Optional[str] + try: + with open(filename, 'r') as f: + old_contents = f.read() + except IOError: + old_contents = None + if contents != old_contents: + with open(filename, 'w') as f: + f.write(contents) + + def write_with_template(self, filename: str, template_fn: str, + env_callable: Callable[[], Union[str, Dict[str, object]]]) -> None: + filename = '{}/{}'.format(self.install_dir, filename) + assert filename not in self.filenames, "duplicate file write {filename}" + self.filenames.add(filename) + if not self.dry_run: + env = env_callable() + if isinstance(env, dict): + # TODO: Update the comment reference to the correct location + comment = "@" + "generated by aten/src/ATen/gen.py" + comment += " from {}".format(os.path.basename(template_fn)) + env['generated_comment'] = comment + template = _read_template(os.path.join(self.template_dir, template_fn)) + self._write_if_changed(filename, template.substitute(env)) + elif isinstance(env, str): + self._write_if_changed(filename, env) + else: + assert_never(env) + + + def write(self, filename: str, env_callable: Callable[[], Union[str, Union[str, Dict[str, object]]]]) -> None: + self.write_with_template(filename, filename, env_callable) + + def write_outputs(self, filename: str) -> None: + """Write a file containing the list of all outputs which are + generated by this script.""" + self._write_if_changed( + filename, + ''.join(name + ";" for name in sorted(self.filenames))) + +def main() -> None: + parser = argparse.ArgumentParser(description='Generate ATen source files') + parser.add_argument( + '-s', + '--source-path', + help='path to source directory for ATen', + default='aten/src/ATen') + parser.add_argument( + '-o', + '--output-dependencies', + help='output a list of dependencies into the given file and exit') + parser.add_argument( + '-d', '--install_dir', help='output directory', + default='build/aten/src/ATen') + parser.add_argument( + '--rocm', + action='store_true', + help='reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly') + # TODO: remove this, we should just unconditionally generate Vulkan + parser.add_argument( + '--vulkan', + action='store_true', + help='Generate Vulkan backend functions') + parser.add_argument( + '--op_registration_whitelist', + nargs='*', + help='filter op registrations by the whitelist (if set); ' + 'each item is `namespace`::`operator name` without overload name; ' + 'e.g.: aten::empty aten::conv2d ...') + parser.add_argument( + '--backend_whitelist', + nargs='*', + help='filter dispatch backend by the whitelist (if set), ' + 'e.g.: 
CPU CUDA QuantizedCPU ...') + parser.add_argument( + '--per_op_registration', + action='store_true', + help='group function registrations by op name and write to separate files; ' + 'must also set --op_registration_whitelist param') + parser.add_argument( + '--force_schema_registration', + action='store_true', + help='force it to generate schema-only registrations for all ops, including' + 'those that are not listed on --op_registration_whitelist') + options = parser.parse_args() + + op_registration_whitelist: Optional[Set[str]] + if options.op_registration_whitelist is not None: + op_registration_whitelist = set(options.op_registration_whitelist) + else: + op_registration_whitelist = None + + native_functions = parse_native_yaml(os.path.join(options.source_path, 'native/native_functions.yaml')) + + template_dir = os.path.join(options.source_path, "templates") + + # NB: It is mandatory to NOT use os.path.join here, as the install directory + # will eventually be ingested by cmake, which does not respect Windows style + # path slashes. If you switch this to use os.path.join, you'll get an error + # like: + # + # Syntax error in cmake code when parsing string + # + # C:/Jenkins/workspace/pytorch-builds/pytorch-win-ws2016-cuda9-cudnn7-py3-build/build/aten/src/ATen\core/TensorMethods.h + # + # Invalid character escape '\c'. + core_install_dir = f'{options.install_dir}/core' + pathlib.Path(core_install_dir).mkdir(parents=True, exist_ok=True) + + def make_file_manager(install_dir: str) -> FileManager: + return FileManager(install_dir=install_dir, template_dir=template_dir, dry_run=options.output_dependencies) + + core_fm = make_file_manager(core_install_dir) + cpu_fm = make_file_manager(options.install_dir) + cuda_fm = make_file_manager(options.install_dir) + + extra_cuda_headers = '''\ +#include +#include +#include +#include ''' + if options.rocm: + extra_cuda_headers = '''\ +#include +#include +#include +#include ''' + + backends = ["CPU", "SparseCPU", "MkldnnCPU", "CUDA", "SparseCUDA", "QuantizedCPU", "QuantizedCUDA"] + if options.vulkan: + backends.append("Vulkan") + if options.backend_whitelist: + backends = [b for b in backends if b in options.backend_whitelist] + + for dispatch in backends: + h_template = 'TypeDerived.h' + cpp_template = 'TypeDerived.cpp' + # TODO: delete this special case + if 'Sparse' in dispatch: + cpp_template = 'SparseTypeDerived.cpp' + + fm = cuda_fm if 'CUDA' in dispatch else cpu_fm + + fm.write_with_template(f'{dispatch}Type.h', h_template, lambda: { + 'Type': f'{dispatch}Type', + 'extra_cuda_headers': extra_cuda_headers if 'CUDA' in dispatch else '', # TODO: remove this + 'type_derived_method_declarations': list(mapMaybe( + compute_type_method(dispatch, target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), + native_functions + )), + }) + fm.write_with_template(f'{dispatch}Type.cpp', cpp_template, lambda: { + 'Type': f'{dispatch}Type', + # TODO: remove this + 'extra_cuda_headers': extra_cuda_headers if 'CUDA' in dispatch else '', + # TODO: remove this + 'storage_tensor_headers': '#include ', + # TODO: remove this + 'Generator': 'CUDAGeneratorImpl' if 'CUDA' in dispatch else 'CPUGeneratorImpl', + 'legacy_th_headers': + '#include ' if dispatch == "CPU" else + '#include ' if dispatch == "CUDA" else + '', + 'Backend': dispatch, + 'type_derived_method_definitions': list(mapMaybe( + compute_type_method(dispatch, target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), + native_functions + )), + 'function_registrations': 
list(mapMaybe( + compute_type_method( + dispatch, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), + native_functions + )) if not options.per_op_registration else [], + }) + del fm + + cpu_fm.write('TypeDefault.h', lambda: { + 'type_method_declarations': list(mapMaybe( + compute_type_method(None, target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), + native_functions)), + }) + cpu_fm.write('TypeDefault.cpp', lambda: { + 'type_method_definitions': list(mapMaybe( + compute_type_method(None, target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), + native_functions)), + 'function_registrations': list(mapMaybe( + compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), + native_functions)) if not options.per_op_registration else [], + }) + cpu_fm.write('Functions.h', lambda: { + 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), native_functions)), + }) + cpu_fm.write('Functions.cpp', lambda: { + 'function_definitions': list(mapMaybe(compute_function(target=Target.DEFINITION), native_functions)), + }) + core_fm.write('TensorBody.h', lambda: { + 'tensor_method_declarations': list(mapMaybe(compute_tensor_method(target=Target.DECLARATION), native_functions)), + }) + core_fm.write('TensorMethods.cpp', lambda: { + 'tensor_method_definitions': list(mapMaybe(compute_tensor_method(target=Target.DEFINITION), native_functions)), + }) + core_fm.write('ATenOpList.cpp', lambda: { + 'aten_ops': list(mapMaybe(compute_aten_op, native_functions)), + }) + cpu_fm.write('NativeFunctions.h', lambda: { + 'native_function_declarations': list(concatMap(compute_native_function_declaration, native_functions)), + }) + cpu_fm.write('BackendSelectRegister.cpp', lambda: { + 'backend_select_method_definitions': + list(mapMaybe(compute_backend_select(target=Target.DEFINITION), native_functions)), + 'backend_select_function_registrations': + list(mapMaybe(compute_backend_select(target=Target.REGISTRATION), native_functions)), + }) + + if options.force_schema_registration: + def computeSchemaRegister() -> Dict[str, object]: + schema_registrations = list(mapMaybe( + compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=None, def_only=True), + native_functions)) + # See Note [Byte-for-byte compatibility] + schema_registrations.sort() + return { + 'schema_registrations': schema_registrations, + } + cpu_fm.write('SchemaRegister.cpp', computeSchemaRegister) + + if options.per_op_registration: + def gen_per_op_registration_filename(opname: str) -> str: + return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-')) + + if op_registration_whitelist is None: + raise Exception("Must set --op_registration_whitelist for per-op registration.") + + # First, group all native functions by unoverloaded operator name + grouped_functions : DefaultDict[str, List[NativeFunction]] = DefaultDict(list) + for f in native_functions: + grouped_functions[f"aten::{f.func.name.name}"].append(f) + extra_headers = [] + for b in backends: + extra_headers.append(f'#include ') + + # Next, generate registration for each one + for name in op_registration_whitelist: + def computePerOpRegistration() -> Dict[str, object]: + fs = grouped_functions[name] + registrations: List[str] = [] + for mb_dispatch in itertools.chain([None], backends): + # or you could pass in op_registration_whitelist, it doesn't + # matter! 
+ # NB: Use of compute_type_method here is kind of an abuse; + # this is why we have to unconditionally write in + # torch::dispatch in the registration when it should be + # contextually clear + registrations.extend( + mapMaybe( + compute_type_method(mb_dispatch, target=Target.REGISTRATION, op_registration_whitelist=None), + fs)) + return { + 'extra_headers': extra_headers, + 'function_registrations': registrations, + } + + cpu_fm.write_with_template( + gen_per_op_registration_filename(name), 'PerOpRegistration.cpp', computePerOpRegistration) + + cpu_fm.write('Declarations.yaml', lambda: format_yaml(list(map(compute_declaration_yaml, native_functions)))) + + if options.output_dependencies: + cpu_fm.write_outputs(options.output_dependencies) + core_fm.write_outputs(f"{options.output_dependencies}-core") + cuda_fm.write_outputs(f"{options.output_dependencies}-cuda") + +if __name__ == '__main__': + main() diff --git a/tools/codegen/local.py b/tools/codegen/local.py new file mode 100644 index 00000000000..9244cb181ae --- /dev/null +++ b/tools/codegen/local.py @@ -0,0 +1,49 @@ +import threading +from contextlib import contextmanager +from typing import Optional, Iterator + +from tools.codegen.model import UseC10Dispatcher + +# Simple dynamic scoping implementation. The name "parametrize" comes +# from Racket. +# +# WARNING WARNING: LOOKING TO EDIT THIS FILE? Think carefully about +# why you need to add a toggle to the global behavior of code +# generation. The parameters here should really only be used +# for "temporary" situations, where we need to temporarily change +# the codegen in some cases because we cannot conveniently update +# all call sites, and are slated to be eliminated once all call +# sites are eliminated. If you don't have a plan for how to get there, +# DON'T add a new entry here. + +class Locals(threading.local): + use_c10_dispatcher: Optional[UseC10Dispatcher] = None + hack_const_mutable_self: bool = False +_locals = Locals() + +# The use_c10_dispatcher field in native_functions.yaml is used to +# control codegen behavior, so that we can handle cases that the +# Dispatcher templating logic can't handle. In the terminal +# state, use_c10_dispatcher should always be UseC10Dispatcher.full +# and this flag can be eliminated. +def use_c10_dispatcher() -> UseC10Dispatcher: + assert _locals.use_c10_dispatcher is not None, \ + "need to initialize local.use_c10_dispatcher with local.parametrize" + return _locals.use_c10_dispatcher + +# This is used to maintain compat, see Note [Byte-for-byte compatibility] +# It can be removed when we drop compat.
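# Like use_c10_dispatcher() above, the accessor below is only meaningful
# inside a local.parametrize(...) block. An illustrative call site (an
# editor's sketch, not part of the patch), assuming `f` is a NativeFunction:
#
#   with local.parametrize(use_c10_dispatcher=f.use_c10_dispatcher,
#                          hack_const_mutable_self=False):
#       ...  # queries like local.use_c10_dispatcher() now reflect f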
+def hack_const_mutable_self() -> bool: + return _locals.hack_const_mutable_self + +@contextmanager +def parametrize(*, use_c10_dispatcher: UseC10Dispatcher, hack_const_mutable_self: bool) -> Iterator[None]: + old_use_c10_dispatcher = _locals.use_c10_dispatcher + old_hack_const_mutable_self = _locals.hack_const_mutable_self + try: + _locals.use_c10_dispatcher = use_c10_dispatcher + _locals.hack_const_mutable_self = hack_const_mutable_self + yield + finally: + _locals.use_c10_dispatcher = old_use_c10_dispatcher + _locals.hack_const_mutable_self = old_hack_const_mutable_self diff --git a/tools/codegen/model.py b/tools/codegen/model.py new file mode 100644 index 00000000000..de553704d5d --- /dev/null +++ b/tools/codegen/model.py @@ -0,0 +1,766 @@ +import re + +from dataclasses import dataclass +from typing import List, Sequence, Dict, Optional, Iterator, Tuple, Set, NoReturn +from enum import Enum +import itertools + +# A little trick from https://github.com/python/mypy/issues/6366 +# for getting mypy to do exhaustiveness checking +# TODO: put this somewhere else, maybe +def assert_never(x: NoReturn) -> NoReturn: + raise AssertionError("Unhandled type: {}".format(type(x).__name__)) + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# DATA MODEL +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# Some general principles for our data model. +# +# - Stop using C++ data types as the internal data representation +# format. Instead, the internal data structures are centered +# around JIT schema representation. This avoid a big problem +# with the old codegen where we read in all the types from +# native_functions.yaml and then immediately had to retranslate +# them into C++ types. +# +# - More semantic data representation. Instead of representing +# everything as dicts and strings, we define dataclasses for +# every interesting entity the code generation has to deal with. +# These dataclasses have strong semantic invariants: for example, +# we generally require them to roundtrip losslessly into the +# form they were parsed from. These structures are immutable +# and you're expected to populate information once during +# construction. + +# Represent a source location; used for better error reporting +@dataclass(frozen=True) +class Location: + file: str + line: int + + def __str__(self) -> str: + return "{}:{}".format(self.file, self.line) + +# Valid values of the 'variants' field in native_functions.yaml +Variant = Enum('Variant', ('function', 'method')) + +UseC10Dispatcher = Enum('UseC10Dispatcher', ( + 'full', + 'with_codegenerated_unboxing_wrapper' +)) + +# The basic input to the code generation is native_functions.yaml. +# The name "native", BTW, comes from the distinction between native +# functions and legacy TH functions. The legacy TH functions are gone, +# but the "native" descriptor has stuck. +# +# NativeFunction models a single entry in native_functions.yaml. Its +# fields roughly correspond to what you would see in the YAML itself, +# but after canonicalization and parsing has occurred. +# +# You can see some of the overall design patterns for how we setup +# dataclasses in this class, but we will defer a complete discussion +# of this at FunctionSchema. +@dataclass(frozen=True) +class NativeFunction: + # The function schema of the operator in question. This schema + # has been parsed; see FunctionSchema for more about its structure. + # (This type is quoted as we are forward referencing a type + # defined later in the file. 
I opted for this ordering of the + # classes for expository clarity.) + func: 'FunctionSchema' + + # Corresponds to the 'use_c10_dispatcher' field. The default + # is 'with_codegenerated_unboxing_wrapper' + use_c10_dispatcher: UseC10Dispatcher + + # Whether or not to omit automatic generation of a DeviceGuard + device_guard: bool + + # What python module to put the function in + python_module: Optional[str] + + # TODO: figure out what this does + category_override: Optional[str] + + # If no variants are specified in native_functions.yaml, this is + # assumed to be {'function'}. + variants: Set[Variant] + + # Whether or not we should skip generating registrations for + # this kernel. This is a bit of a double-edged sword, as manual + # registrations don't participate in codegen-based selective build! + manual_kernel_registration: bool + + # Distinguish between a missing dispatch dict (historically, this + # means to register a catch-all kernel) and a present but empty + # dispatch dict (this means register nothing; arguably, this should + # subsume manual_kernel_registration). + # + # TODO: str key could be replaced with more explicit enum + dispatch: Optional[Dict[str, str]] + + # The location in the YAML file were this native function entry was + # defined. This is for conveniently reporting error messages! + loc: 'Location' + + # NB: The benefit of defining a dataclass is that we automatically get + # a constructor defined for all the fields we specify. No need + # to explicitly write it out. + + @staticmethod + def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': + """ + Parse a NativeFunction from a dictionary as directly parsed + from native_functions.yaml + """ + e = ei.copy() + + funcs = e.pop('func') + assert isinstance(funcs, str), f'not a str: {funcs}' + func = FunctionSchema.parse(funcs) + + use_c10_dispatcher_s = e.pop('use_c10_dispatcher', None) + if use_c10_dispatcher_s is None: + use_c10_dispatcher = UseC10Dispatcher.with_codegenerated_unboxing_wrapper + elif use_c10_dispatcher_s == 'full': + use_c10_dispatcher = UseC10Dispatcher.full + else: + raise AssertionError( + f'use_c10_dispatcher must be unset or set to full, got {use_c10_dispatcher}') + + variants_s = e.pop('variants', 'function') + assert isinstance(variants_s, str) + variants: Set[Variant] = set() + for v in variants_s.split(', '): + if v == 'function': + variants.add(Variant.function) + elif v == 'method': + variants.add(Variant.method) + else: + raise AssertionError(f'illegal variant {v}') + + manual_kernel_registration = e.pop('manual_kernel_registration', False) + assert isinstance(manual_kernel_registration, bool), f'not a bool: {manual_kernel_registration}' + + device_guard = e.pop('device_guard', True) + assert isinstance(device_guard, bool), f'not a bool: {device_guard}' + + python_module = e.pop('python_module', None) + assert python_module is None or isinstance(python_module, str), f'not a str: {python_module}' + + category_override = e.pop('category_override', None) + assert category_override is None or isinstance(category_override, str), f'not a str: {category_override}' + + raw_dispatch = e.pop('dispatch', None) + assert raw_dispatch is None or isinstance(raw_dispatch, dict), e + dispatch: Optional[Dict[str, str]] = None + if raw_dispatch is not None: + dispatch = {} + for ks, v in raw_dispatch.items(): + if ks == '__line__': + continue # not worth tracking line numbers for dispatch entries + assert isinstance(ks, str), e + assert isinstance(v, str), e + for k in ks.split(","): 
+ dispatch[k.strip()] = v + + e.pop('__line__') + assert not e, f"leftover entries: {e}" + + return NativeFunction( + func=func, + use_c10_dispatcher=use_c10_dispatcher, + variants=variants, + manual_kernel_registration=manual_kernel_registration, + python_module=python_module, + category_override=category_override, + dispatch=dispatch, + device_guard=device_guard, + loc=loc, + ) + + # __post_init__ functions in dataclasses can be used to do extra + # validation after construction. + # + # Notice that we don't do any type validation here. In fact, we + # rely exclusively on mypy to check if you've done types correctly! + # Validation is for nontrivial invariants that cannot be (conveniently) + # encoded in the type system. + def __post_init__(self) -> None: + if self.func.out_arguments: + assert self.variants == {Variant.function}, "Native functions with out arguments MUST " \ + "be declared with only function variant; e.g., variants: function; " \ + "otherwise you will tickle a Python argument binding bug " \ + "(which usually manifests itself as the result variable being undefined.)" + +# The function schema is undoubtedly the most important data structure +# in all of the codegen, as it defines the type signature for operators, +# and most of the code generation we do is type directed (e.g., look at +# the types, decide what to do. Think about how we code generate +# C++ function stubs!) +# +# We will also see in this class the general structure for how we model +# data in this code generation. A few notable properties to point out +# ahead of time: +# +# - These dataclasses are a *lossless* representation of the strings +# they are parsed from. In fact, we assert that given the +# information stored in the dataclass, we can exactly reconstruct +# the string we parsed from (and assert this inside the parse +# definition). There are a few reasons for this: +# +# - If you find that it is difficult to reconstruct the string +# given a dataclass, that is a clue that you are data +# representation is wrong. +# +# - It helps ensure that all relevant information is present +# in the dataclass, so that downstream users aren't tempted +# to reparse the original string to get some information +# that was omitted. +# +# - It forces you to represent the data in-memory in the same way +# it is recorded textually, which makes the dataclasses easier +# to understand for someone who is familiar with the +# textual format. (As a tradeoff, it means you have to model +# the syntax, even when it is inconvenient. But maybe that means +# the syntax is bad!) If you don't understand the internal +# representation, go look at the printing code to see how +# it maps onto the surface syntax! +# +# - It makes it easy to test the parsing code, as parsing code +# that is inconsistent with the string code will fail early +# and loudly. (As a tradeoff, it makes the parsing code a bit +# brittle (in particular, with trivial whitespace changes you +# are likely to trigger an assert error). +# +# In general, try to make the __str__ code as simple as possible +# (even at the cost of more complex parsing logic.) Additionally, +# try to minimize redundancy in data representation. (Precomputed +# fields are OK though: they are defined as a simple function on +# the canonical representation in question.) +# +# - These dataclasses are all frozen; once constructed their +# values never change. This makes it easy to tell where any +# given data came from: just look to the constructor. 
As a +# tradeoff, you can't easily "decorate" a schema with extra +# information from a post-facto analysis. We impose this +# restriction to make these structures more understandable. +# +@dataclass(frozen=True) +class FunctionSchema: + # The name of the operator this function schema describes. + name: 'OperatorName' + + # NB: Sequence here is intentional, to make it read only + arguments: Sequence['Argument'] + kwarg_only_arguments: Sequence['Argument'] # but not including out args + # Unlike in the previous codegen, we have factored out 'out' arguments + # in the canonical representation, removing them from kwarg + # arguments. This choice is justified by numerous downstream + # transformations which treat out arguments specially; additionally, + # you can see that canonicity is not violated! + out_arguments: Sequence['Argument'] # these are also kwarg-only + + # TODO: Need to handle collisions with argument names at some point + returns: Sequence['Return'] + + def schema_order_arguments(self) -> Iterator['Argument']: + return itertools.chain(self.arguments, self.kwarg_only_arguments, self.out_arguments) + + @staticmethod + def parse(func: str) -> 'FunctionSchema': + # We should probably get a proper parser here + assert ' -> ' in func, "function schema missing return type (spaces are mandatory)" + func_decl, return_decl = [x.strip() for x in func.split(' -> ')] + ops, args = func_decl.split('(', 1) + assert args[-1] == ")", "Expecting closing )" + args = args[:-1] + name = OperatorName.parse(ops) + arguments, kwarg_only_arguments, out_arguments = parse_arguments(args) + returns = parse_returns(return_decl) + r = FunctionSchema( + name=name, + arguments=arguments, + kwarg_only_arguments=kwarg_only_arguments, + out_arguments=out_arguments, + returns=returns + ) + assert str(r) == func, f'{str(r)} != {func}' + return r + + def __post_init__(self) -> None: + for arg, ret in zip(self.out_arguments, self.returns): + assert arg.annotation == ret.annotation, \ + "Out arguments must have matching return Tensor; furthermore, " \ + "the ith-argument needs to correspond to the ith return" + if self.out_arguments: + assert len(self.out_arguments) == len(self.returns), \ + "Must return as many arguments as there are out arguments" + if self.name.name.inplace: + # TODO: fixme + if str(self.name) not in [ + '_amp_non_finite_check_and_unscale_', + '_foreach_add_.Scalar']: + assert len(self.returns) == 1 + + def is_out_fn(self) -> bool: + # Note [is_out_fn] + # + # out functions are the variants which take an explicit out= argument + # to populate into. We need to know if a schema corresponds to an + # out function for several reasons: + # + # - They codegen differently in C++ API + # - codegen to at::add_out rather than at::add + # - out argument is moved to front of C++ argument list + # + # out functions are DEFINED to be any function with a keyword-only + # argument that is mutable. In principle, this could lead to a + # false positive if you define a function that mutates a + # kwarg only argument, but this isn't the "true" output of this + # function. A more robust definition that would work in this + # case would also look at: + # + # - The output types. Out functions take in the arguments + # they mutate and then return them again; this is sort + # of "definitionally" what makes something an out function. + # Historically, we DO check this for consistency. + # - Correspondence with pure variant. 
An out function + # should have a signature equivalent to its pure variant, + # but just with extra kwargs for the output elements. This + # is difficult to actually check for and historically + # we only do this check in tools/ + return bool(self.out_arguments) + + def __str__(self) -> str: + all_arguments: List[str] = [] + all_arguments.extend(map(str, self.arguments)) + if self.kwarg_only_arguments or self.out_arguments: + all_arguments.append('*') + all_arguments.extend(map(str, self.kwarg_only_arguments)) + all_arguments.extend(map(str, self.out_arguments)) + all_arguments_str = ', '.join(all_arguments) + if len(self.returns) == 1: + returns = str(self.returns[0]) # omit parentheses + else: + returns = '(' + ', '.join(map(str, self.returns)) + ')' + return f'{self.name}({all_arguments_str}) -> {returns}' + +# Here is the rest of the data model, described more briefly. + +# Simplified version for what actually shows up in built-ins. +# Look at alias_info.h for expanded syntax. If you need the structure, +# you also need to make this structure recursive so it can be lined +# up with the type components too. For primitives this isn't really +# necessary +@dataclass(frozen=True) +class Annotation: + # Typically only has one element. Not actually a set so + # we can conveniently assume it is canonically ordered + alias_set: Sequence[str] + is_write: bool + + @staticmethod + def parse(ann: str) -> 'Annotation': + m = re.match(r'^([a-z])(!?)$', ann) + assert m is not None, f'unrecognized alias annotation {ann}' + alias_set = [m.group(1)] + is_write = m.group(2) == '!' + r = Annotation(alias_set=alias_set, is_write=is_write) + assert str(r) == ann, f'{r} != {ann}' + return r + + def __str__(self) -> str: + alias_set = '|'.join(self.alias_set) + is_write = '!' if self.is_write else '' + return f'{alias_set}{is_write}' + +# The base class for the type system. This is also loosely modeled +# off of jit_type.h, but we've simplified the hierarchy to focus +# in on the aspects of the type system that matter for code generation +# (for example, there's no SingleElementType subclass anymore). +# You never actually construct a Type; usually it's going to be one +# of the subclasses. If Python had ADTs this would be one! +@dataclass(frozen=True) +class Type: + @staticmethod + def parse(t: str) -> 'Type': + r = Type._parse(t) + assert str(r) == t, f'{r} != {t}' + return r + + @staticmethod + def _parse(t: str) -> 'Type': + m = re.match(r'^(.+)\?$', t) + if m is not None: + return OptionalType(Type.parse(m.group(1))) + m = re.match(r'^(.+)\[([0-9]+)?\]$', t) + if m is not None: + size = int(m.group(2)) if m.group(2) is not None else None + return ListType(elem=Type.parse(m.group(1)), size=size) + try: + return BaseType(BaseTy[t]) + except KeyError: + raise RuntimeError(f"unrecognized type {t}") + + def __str__(self) -> str: + raise NotImplementedError + + # WARNING: These concepts are not very well-defined. For example, + # is "int?" nullable? How about "int?[]". 
They are defined + # so we can conveniently generate legacy Declarations.yaml but + # really we should probably just remove these at some point + + def is_tensor_like(self) -> bool: + raise NotImplementedError + + def is_nullable(self) -> bool: + raise NotImplementedError + + def is_list_like(self) -> Optional['ListType']: + raise NotImplementedError + +# Base types are simple, atomic types with no further structure +BaseTy = Enum('BaseTy', ( + 'Generator', + 'ScalarType', + 'Tensor', + 'int', + 'Dimname', + 'float', + 'str', + 'bool', + 'Layout', + 'Device', + 'Scalar', + 'MemoryFormat', + 'QScheme', + 'Storage', + 'ConstQuantizerPtr', # TODO: rename +)) + +@dataclass(frozen=True) +class BaseType(Type): + name: BaseTy + + def __str__(self) -> str: + return f'{self.name.name}' + + def is_tensor_like(self) -> bool: + return self.name == BaseTy.Tensor + + def is_nullable(self) -> bool: + return False + + def is_list_like(self) -> Optional['ListType']: + return None + +# Optional types may be specified, or may also be validly given None +@dataclass(frozen=True) +class OptionalType(Type): + elem: Type + + def __str__(self) -> str: + return f'{self.elem}?' + + def is_tensor_like(self) -> bool: + return self.elem.is_tensor_like() + + def is_nullable(self) -> bool: + return True + + def is_list_like(self) -> Optional['ListType']: + return self.elem.is_list_like() + +# List types specify that we may have multiples of an element. We +# also support explicit sizes on list types, but these have +# some nontrivial semantics! (However, for C++ API purposes, explicit +# sizes are mostly erased from the type system.) +# +# DANGER WILL ROBINSON: C++ elaboration depends on elem type; e.g., +# int[] elaborates differently than bool[3]! +@dataclass(frozen=True) +class ListType(Type): + elem: Type + size: Optional[int] + + def __str__(self) -> str: + size = f'{self.size}' if self.size else '' + return f'{self.elem}[{size}]' + + def is_tensor_like(self) -> bool: + return self.elem.is_tensor_like() + + def is_nullable(self) -> bool: + return self.elem.is_nullable() + + def is_list_like(self) -> Optional['ListType']: + return self + +@dataclass(frozen=True) +class Argument: + # NB: I didn't put kwarg_only as a boolean field here, unlike + # c10::Argument, so that printing works correctly + + name: str + type: Type + default: Optional[str] + + # The semantics of the annotation field are a little strange. + # + # Alias annotations parametrize Tensors (since Tensors are the only things + # that can alias.) This motivates why I write Tensor(a!)? (and not, for + # example, Tensor?(a!)), because the (a!) describes aliasing on the tensor, + # which may be optional (i.e., the alias annotation should bind first to + # Tensor, before the optional postfix annotation). + # + # However, despite being a property of Tensor, we (and c10::Argument) + # store the annotation at the top level of the Argument, rather than + # inside the embedded Tensor type. In the C++ version of this + # class, we then go through great lengths to mimic the type + # structure in the annotation structure so we can correlate + # annotations with types. + # + # Now, it turns out, in all applications in code generation, the + # structure of annotated types is very simple. So we just hard + # code it here. But if we ever do get anything more complex, this + # model will have to change! 
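#
# For instance (illustrative): in a schema fragment like "Tensor(a!) self",
# the "(a!)" parses to Annotation(alias_set=['a'], is_write=True) and is
# stored in this field, not inside the Tensor type it annotates.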
+ annotation: Optional[Annotation] + + @staticmethod + def parse(arg: str) -> 'Argument': + name: str + default: Optional[str] + type_and_annot, name_and_default = arg.rsplit(' ', 1) + if '=' in name_and_default: + name, default = name_and_default.split('=') + else: + name = name_and_default + default = None + # TODO: deduplicate annotation matching with Return + match = re.match(r'Tensor\((.+)\)(.*)', type_and_annot) + annotation: Optional[Annotation] + if match: + # If you update this, make sure the __str__ still works too + assert match.group(2) in ['', '?', '[]'], 'unrecognized alias analysis form with Tensor' + type_s = 'Tensor' + match.group(2) + annotation = Annotation.parse(match.group(1)) + else: + type_s = type_and_annot + annotation = None + type = Type.parse(type_s) + r = Argument( + name=name, + type=type, + default=default, + annotation=annotation, + ) + assert str(r) == arg, f'{str(r)} != {arg}' + return r + + @property + def is_write(self) -> bool: + return self.annotation is not None and self.annotation.is_write + + def __str__(self) -> str: + type = f'{self.type}' + if self.annotation: + assert type in ['Tensor', 'Tensor?', 'Tensor[]'] + type = type.replace('Tensor', f'Tensor({self.annotation})') + if self.name is None: + return type + else: + mb_default = '' + if self.default: + mb_default = f'={self.default}' + return f"{type} {self.name}{mb_default}" + + +@dataclass(frozen=True) +class Return: + name: Optional[str] + type: Type + annotation: Optional[Annotation] + + @staticmethod + def parse(arg: str) -> 'Return': + name: Optional[str] + if ' ' in arg: + type_and_annot, name = arg.rsplit(' ', 1) + else: + type_and_annot = arg + name = None + match = re.match(r'Tensor\((.+)\)(.*)', type_and_annot) + annotation: Optional[Annotation] + if match: + # If you update this, make sure the __str__ still works too + assert match.group(2) in ['', '?', '[]'], 'unrecognized alias analysis form with Tensor' + type_s = 'Tensor' + match.group(2) + annotation = Annotation.parse(match.group(1)) + else: + type_s = type_and_annot + annotation = None + type = Type.parse(type_s) + r = Return( + name=name, + type=type, + annotation=annotation, + ) + assert str(r) == arg, f'{str(r)} != {arg}' + return r + + @property + def is_write(self) -> bool: + return self.annotation is not None and self.annotation.is_write + + def __str__(self) -> str: + type = f'{self.type}' + if self.annotation: + assert type in ['Tensor', 'Tensor?', 'Tensor[]'] + type = type.replace('Tensor', f'Tensor({self.annotation})') + if self.name is None: + return type + else: + return f"{type} {self.name}" + + +# Names that validly are __iXXX__ indicating inplace operations. +# Taken from https://www.python.org/dev/peps/pep-0203/#new-methods +# NB: PyTorch hasn't actually implemented all of these +AUGMENTED_ASSIGNMENT_NAMES = ['add', 'sub', 'mul', 'div', 'mod', 'pow', 'lshift', 'rshift', 'and', 'xor', 'or'] + +# A BaseOperatorName is what we think of the operator name, without +# the overload name. 
Unusually, we don't represent this as just a +# string; instead, we directly represent a few important semantic +# bits of information we derive from the string: namely whether +# or not it's inplace (add_) and whether or not it's a double-underscore +# method (__add__) +@dataclass(frozen=True) +class BaseOperatorName: + base: str + inplace: bool + dunder_method: bool + + @staticmethod + def parse(op: str) -> 'BaseOperatorName': + assert op != '' + assert not op.endswith('_out'), \ + "_out suffix is reserved and not permitted for operator names; " \ + "did you mean to specify an out overload name instead?" + m = re.match(r'^__([^_]+)__$', op) + if m is not None: + dunder_method = True + base = m.group(1) + if any(base == f'i{n}' for n in AUGMENTED_ASSIGNMENT_NAMES): + inplace = True + base = base[1:] + else: + inplace = False + # temporary, this is not intrinsically true but + # has been historically true for dunder methods + # we support (but, if we ever got, say, __int__, this would + # be wrong!) + assert base[0] != 'i' + else: + dunder_method = False + base = op + if base[-1] == '_': + inplace = True + base = base[:-1] + else: + inplace = False + r = BaseOperatorName(base=base, inplace=inplace, dunder_method=dunder_method) + assert str(r) == op, f'{str(r)} != {op}' + return r + + def __str__(self) -> str: + if self.dunder_method: + i = 'i' if self.inplace else '' + return f'__{i}{self.base}__' + else: + i = '_' if self.inplace else '' + return f'{self.base}{i}' + +# Operator name is the base operator name along with the (typically not +# user visible) overload string. +@dataclass(frozen=True) +class OperatorName: + name: BaseOperatorName + overload_name: str + + @staticmethod + def parse(op_name: str) -> 'OperatorName': + if '.' in op_name: + name, overload_name = op_name.split('.', 1) + else: + name = op_name + overload_name = '' + r = OperatorName( + name=BaseOperatorName.parse(name), + overload_name=overload_name + ) + assert str(r) == op_name, f'{str(r)} != {op_name}' + return r + + def __str__(self) -> str: + if self.overload_name: + return f"{self.name}.{self.overload_name}" + else: + return f"{self.name}" + +# Helper functions for parsing argument lists (both inputs and returns) + +def parse_returns(return_decl: str) -> Sequence[Return]: + """ + Input: '()' + Output: [] + """ + if return_decl == '()': + return [] + if return_decl[0] == '(' and return_decl[-1] == ')': + return_decl = return_decl[1:-1] + returns = [] + for arg in return_decl.split(', '): + returns.append(Return.parse(arg)) + return returns + +def parse_arguments(args: str) -> Tuple[Sequence[Argument], Sequence[Argument], Sequence[Argument]]: + """ + Input: 'int x, int y, int z' + Output: positional args, kwarg only args + """ + arguments: List[Argument] = [] + kwarg_only_arguments: List[Argument] = [] + out_arguments: List[Argument] = [] + arguments_acc = arguments + + # TODO: Use a real parser here; this will get bamboozled + # by signatures that contain things like std::array (note the space) + for arg in args.split(', '): + if not arg: + continue + if arg == '*': + assert arguments_acc is arguments, "invalid syntax: kwarg-only specifier * can only occur once" + arguments_acc = kwarg_only_arguments + continue + parg = Argument.parse(arg) + # Currently, we rely directly on the invariant that there are NO + # kwarg-only mutating arguments. If you want to relax this, + # we will need a more semantic way of matching that takes + # into account return arguments. 
In that case, you will have + # to manage out_arguments computation a level up, in + # FunctionSchema. See Note [is_out_fn] + if parg.annotation is not None and parg.annotation.is_write: + if arguments_acc is arguments: + pass # do nothing + elif arguments_acc is kwarg_only_arguments: + arguments_acc = out_arguments + else: + assert arguments_acc is not out_arguments + arguments_acc.append(parg) + + return arguments, kwarg_only_arguments, out_arguments diff --git a/tools/setup_helpers/gen.py b/tools/setup_helpers/gen.py new file mode 100644 index 00000000000..bdb52ee44ef --- /dev/null +++ b/tools/setup_helpers/gen.py @@ -0,0 +1,11 @@ +# Little stub file to get BUILD.bazel to play along + +import os.path +import sys + +root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.insert(0, root) + +import tools.codegen.gen + +tools.codegen.gen.main() diff --git a/aten/src/ATen/common_with_cwrap.py b/tools/shared/cwrap_common.py similarity index 100% rename from aten/src/ATen/common_with_cwrap.py rename to tools/shared/cwrap_common.py
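As a quick check of the round-trip invariant the new model enforces, here is a minimal sketch (an editor's illustration, not part of the patch). It assumes the patch has been applied and the snippet is run from the repository root; the two schema strings are just examples of entries that appear in native_functions.yaml:

    from tools.codegen.model import FunctionSchema

    s = 'add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor'
    schema = FunctionSchema.parse(s)
    assert str(schema) == s                 # parse/print are lossless
    assert str(schema.name) == 'add.Tensor'
    assert not schema.is_out_fn()

    out = FunctionSchema.parse(
        'add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)')
    assert out.is_out_fn()                  # mutable kwarg-only arg => out variant
    assert [a.name for a in out.out_arguments] == ['out']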