diff --git a/.circleci/scripts/cpp_doc_push_script.sh b/.circleci/scripts/cpp_doc_push_script.sh
index e9b86e211e6..618b64c7f12 100755
--- a/.circleci/scripts/cpp_doc_push_script.sh
+++ b/.circleci/scripts/cpp_doc_push_script.sh
@@ -47,16 +47,11 @@ sudo apt-get -y install doxygen
 # Generate ATen files
 pushd "${pt_checkout}"
 pip install -r requirements.txt
-time python aten/src/ATen/gen.py \
+time python -m tools.codegen.gen \
   -s aten/src/ATen \
-  -d build/aten/src/ATen \
-  aten/src/ATen/Declarations.cwrap \
-  aten/src/THCUNN/generic/THCUNN.h \
-  aten/src/ATen/nn.yaml \
-  aten/src/ATen/native/native_functions.yaml
+  -d build/aten/src/ATen

 # Copy some required files
-cp aten/src/ATen/common_with_cwrap.py tools/shared/cwrap_common.py
 cp torch/_utils_internal.py tools/shared

 # Generate PyTorch files
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index f765f7614a1..2086d64e61a 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -131,13 +131,9 @@ jobs:
           time python setup.py --cmake-only build

           # Generate ATen files.
-          time python aten/src/ATen/gen.py \
+          time python -m tools.codegen.gen \
             -s aten/src/ATen \
-            -d build/aten/src/ATen \
-            aten/src/ATen/Declarations.cwrap \
-            aten/src/THCUNN/generic/THCUNN.h \
-            aten/src/ATen/nn.yaml \
-            aten/src/ATen/native/native_functions.yaml
+            -d build/aten/src/ATen

           # Generate PyTorch files.
           time python tools/setup_helpers/generate_code.py \
diff --git a/.gitignore b/.gitignore
index f1c870be40f..99180410987 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,9 +108,6 @@ env
 # macOS dir files
 .DS_Store

-# Symbolic files
-tools/shared/cwrap_common.py
-
 # Ninja files
 .ninja_deps
 .ninja_log
diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh
index 0b9c9209a80..bba8aa0e036 100755
--- a/.jenkins/caffe2/build.sh
+++ b/.jenkins/caffe2/build.sh
@@ -248,6 +248,8 @@ else
     export MAX_JOBS=`expr $(nproc) - 1`
   fi

+  pip install --user dataclasses
+
   $PYTHON setup.py install --user

   report_compile_cache_stats
diff --git a/.jenkins/pytorch/macos-common.sh b/.jenkins/pytorch/macos-common.sh
index f0b28bf20f6..27c9d4ccb35 100755
--- a/.jenkins/pytorch/macos-common.sh
+++ b/.jenkins/pytorch/macos-common.sh
@@ -20,7 +20,7 @@ if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
 fi
 export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
 source ${WORKSPACE_DIR}/miniconda3/bin/activate
-retry conda install -y mkl mkl-include numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja typing_extensions
+retry conda install -y mkl mkl-include numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja typing_extensions dataclasses

 # The torch.hub tests make requests to GitHub.
 #
diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
index 0212c553703..0ddf3b4b462 100644
--- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
@@ -21,8 +21,8 @@
 call %INSTALLER_DIR%\install_sccache.bat
 call %INSTALLER_DIR%\install_miniconda3.bat

-:: Install ninja
-if "%REBUILD%"=="" ( pip install -q "ninja==1.9.0" )
+:: Install ninja and other deps
+if "%REBUILD%"=="" ( pip install -q "ninja==1.9.0" dataclasses )

 git submodule sync --recursive
 git submodule update --init --recursive
diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
index ac0f018259f..17a3d39d076 100644
--- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
@@ -22,7 +22,7 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic
 if NOT "%BUILD_ENVIRONMENT%"=="" (
   :: We have to pin Python version to 3.6.7, until mkl supports Python 3.7
   :: Numba is pinned to 0.44.0 to avoid https://github.com/numba/numba/issues/4352
-  call conda install -y -q python=3.6.7 numpy mkl cffi pyyaml boto3 protobuf numba==0.44.0 scipy==1.5.0 typing_extensions
+  call conda install -y -q python=3.6.7 numpy mkl cffi pyyaml boto3 protobuf numba==0.44.0 scipy==1.5.0 typing_extensions dataclasses
   if %errorlevel% neq 0 ( exit /b %errorlevel% )
   call conda install -y -q -c conda-forge cmake
   if %errorlevel% neq 0 ( exit /b %errorlevel% )
diff --git a/BUILD.bazel b/BUILD.bazel
index f7be71ec624..9bedaef1676 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -106,17 +106,19 @@ cc_test(
     ],
 )

+# TODO: refactor this into its own library (but how to make
+# a binary based off of a module in a library?)
 py_binary(
     name = "gen",
-    srcs = ["aten/src/ATen/gen.py"],
+    srcs = ["tools/setup_helpers/gen.py"],
+    deps = [
+        ":tools_codegen"
+    ],
 )

 genrule(
     name = "generated_cpp",
     srcs = [
-        "aten/src/ATen/Declarations.cwrap",
-        "aten/src/THCUNN/generic/THCUNN.h",
-        "aten/src/ATen/nn.yaml",
         "aten/src/ATen/native/native_functions.yaml",
     ] + glob(["aten/src/ATen/templates/**"]),
     outs = [
@@ -126,8 +128,6 @@ genrule(
         "aten/src/ATen/CPUType.cpp",
         "aten/src/ATen/Functions.h",
         "aten/src/ATen/Functions.cpp",
-        "aten/src/ATen/LegacyTHFunctionsCPU.h",
-        "aten/src/ATen/LegacyTHFunctionsCPU.cpp",
         "aten/src/ATen/NativeFunctions.h",
         "aten/src/ATen/MkldnnCPUType.h",
         "aten/src/ATen/MkldnnCPUType.cpp",
@@ -141,14 +141,13 @@ genrule(
         "aten/src/ATen/core/TensorMethods.cpp",
         "aten/src/ATen/core/ATenOpList.cpp",
     ],
-    cmd = "$(location :gen) --source-path aten/src/ATen --install_dir `dirname $(location aten/src/ATen/Declarations.yaml)` aten/src/ATen/Declarations.cwrap aten/src/THCUNN/generic/THCUNN.h aten/src/ATen/nn.yaml aten/src/ATen/native/native_functions.yaml",
+    cmd = "$(location :gen) --source-path aten/src/ATen --install_dir `dirname $(location aten/src/ATen/Declarations.yaml)`",
     tools = [":gen"],
 )

 py_library(
-    name = "code_template",
-    srcs = ["aten/src/ATen/code_template.py"],
-    imports = ["aten"],
+    name = "tools_codegen",
+    srcs = glob(["tools/codegen/**/*.py"]),
 )

 py_library(
@@ -158,7 +157,7 @@ py_library(
         "tools/autograd/*.yaml",
         "tools/autograd/templates/*",
     ]),
-    deps = [":code_template"],
+    deps = [":tools_codegen"],
 )

 py_library(
diff --git a/README.md b/README.md
index 6e1fcfdb828..d2fbecdb3dd 100644
--- a/README.md
+++ b/README.md
@@ -169,7 +169,7 @@ If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xa
 Common

 ```bash
-conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests
+conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses
 ```

 On Linux
diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap
deleted file mode 100644
index 7325b8eb88f..00000000000
--- a/aten/src/ATen/Declarations.cwrap
+++ /dev/null
@@ -1,561 +0,0 @@
-[[
-  name: _th_masked_fill_
-  cuda_bool: True
-  cuda_bfloat16: True
-  cname: maskedFill
-  variants: function
-  backends:
-    - CUDA
-  return: self
-  options:
-    - arguments:
-      - THTensor* self
-      - THByteTensor* mask
-      - real value
-]]
-[[
-  name: _th_masked_fill_bool_
-  cuda_bool: True
-  cuda_bfloat16: True
-  cname: maskedFillBool
-  variants: function
-  backends:
-    - CUDA
-  return: self
-  options:
-    - arguments:
-      - THTensor* self
-      - THBoolTensor* mask
-      - real value
-]]
-[[
-  name: _th_masked_scatter_
-  cpu_bool: True
-  cuda_bool: True
-  cpu_bfloat16: True
-  cuda_bfloat16: True
-  cname: maskedCopy
-  variants: function
-  return: self
-  arguments:
-    - THTensor* self
-    - THByteTensor* mask
-    - THTensor* source
-]]
-[[
-  name: _th_masked_scatter_bool_
-  cpu_bool: True
-  cuda_bool: True
-  cpu_bfloat16: True
-  cuda_bfloat16: True
-  cname: maskedCopyBool
-  variants: function
-  return: self
-  arguments:
-    - THTensor* self
-    - THBoolTensor* mask
-    - THTensor* source
-]]
-[[
-  name: _th_nonzero
-  cname: nonzero
-  cpu_half: True
-  cpu_bool: True
-  cuda_bool: True
-  cpu_bfloat16: True
-  cuda_bfloat16: True
-  variants:
-    - function
-  return: argument 0
-  arguments:
-    - arg: THIndexTensor* result
-      output: True
-    - THTensor* self
-]]
-[[
-  name: _th_index_copy_
-  cname: indexCopy
-  cpu_bool: True
-  cuda_bool: True
-  variants:
function - return: argument 0 - arguments: - - THTensor* self - - long dim - - THIndexTensor* index - - THTensor* source -]] -[[ - name: _th_take - cpu_bool: True - cuda_bool: True - cname: take - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THIndexTensor* index -]] -[[ - name: _th_put_ - cpu_bool: True - cuda_bool: True - cname: put - variants: function - backends: - - CPU - - CUDA - return: argument 0 - arguments: - - THTensor* self - - THIndexTensor* index - - THTensor* source - - bool accumulate -]] -[[ - name: _th_index_fill_ - cpu_bool: True - cuda_bool: True - cname: indexFill - variants: function - return: argument 0 - options: - - arguments: - - THTensor* self - - long dim - - THIndexTensor* index - - real value -]] -[[ - name: _th_mode - variants: function - cname: mode - return: argument 0,1 - arguments: - - arg: THTensor* values - output: True - - arg: THIndexTensor* indices - output: True - - THTensor* self - - long dim - - bool keepdim -]] -[[ - name: _th_sort - cname: sort - cpu_half: True - variants: - - function - return: argument 0,1 - arguments: - - arg: THTensor* values - output: True - - arg: THIndexTensor* indices - output: True - - THTensor* self - - long dim - - bool descending -]] -[[ - name: _th_topk - cname: topk - cuda_bfloat16: True - backends: - - CUDA - variants: - - function - return: argument 0,1 - arguments: - - arg: THTensor* values - output: True - - arg: THIndexTensor* indices - output: True - - THTensor* self - - long k - - long dim - - bool largest - - bool sorted -]] -[[ - name: _th_var - types: - - floating_point - backends: - - CPU - - CUDA - variants: function - options: - - cname: var_all - return: accreal - arguments: - - THTensor* self - - bool unbiased -]] -[[ - name: _th_std - types: - - floating_point - backends: - - CPU - - CUDA - variants: function - options: - - cname: std_all - return: accreal - arguments: - - THTensor* self - - bool unbiased -]] -[[ - name: _th_renorm - cname: renorm - types: - - floating_point - backends: - - CPU - - CUDA - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - real p - - long dim - - real maxnorm -]] -[[ - name: _th_renorm_ - types: - - floating_point - backends: - - CPU - - CUDA - cname: renorm - variants: function - return: self - arguments: - - THTensor* self - - THTensor* self - - real p - - long dim - - real maxnorm -]] -[[ - name: _th_histc - cname: histc - types: - - Float - - Double - backends: - - CPU - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - long bins - - real min - - real max -]] -[[ - name: _th_trace - cname: trace - variants: - - function - return: accreal - arguments: - - THTensor* self - backends: - - CPU -]] -[[ - name: _th_fmod - return: argument 0 - variants: - - function - backends: - - CUDA - options: - - cname: fmod - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - real other - - cname: cfmod - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THTensor* other -]] -[[ - name: _th_fmod_ - return: argument 0 - variants: function - backends: - - CUDA - options: - - cname: fmod - arguments: - - THTensor* self - - THTensor* self - - real other - - cname: cfmod - arguments: - - THTensor* self - - THTensor* self - - THTensor* other -]] -[[ - name: _th_cross_kernel - cname: crossKernel - variants: - - function - 
backends: - - CUDA - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THTensor* other - - arg: int64_t dim -]] -[[ - name: _th_addr - cname: addr - cpu_bfloat16: True - variants: function - return: argument 0 - backends: [CPU] - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THTensor* vec1 - - THTensor* vec2 - - real beta - - real alpha -]] -[[ - name: _th_addr_ - cpu_bfloat16: True - cname: addr - return: self - variants: function - backends: [CPU] - arguments: - - THTensor* self - - THTensor* self - - THTensor* vec1 - - THTensor* vec2 - - real beta - - real alpha -]] -[[ -[[ - name: _th_bmm - cuda_bfloat16: True - cname: baddbmm - variants: - - function - backends: - - CUDA - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - argument 0 - - THTensor* self - - THTensor* mat2 - - CONSTANT AS_REAL(0) - - CONSTANT AS_REAL(1) -]] -[[ - name: _th_baddbmm - cuda_bfloat16: True - cname: baddbmm - variants: - - function - backends: - - CUDA - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - arg: THTensor* self - - THTensor* batch1 - - THTensor* batch2 - - real beta - - real alpha -]] -[[ - name: _th_gels - cname: gels - types: - - Float - - Double - backends: - - CPU - - CUDA - variants: - - function - return: argument 0,1 - arguments: - - arg: THTensor* res1 - output: True - - arg: THTensor* res2 - output: True - - THTensor* self - - THTensor* A -]] -[[ - name: _th_eig - cname: geev - types: - - Float - - Double - backends: - - CPU - - CUDA - variants: - - function - return: argument 0,1 - arguments: - - arg: THTensor* res1 - output: True - - arg: THTensor* res2 - output: True - - THTensor* self - - bool eigenvectors -]] -[[ - name: _th_potri - cname: potri - types: - - Float - - Double - backends: - - CPU - - CUDA - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* output - output: True - - THTensor* self - - bool upper -]] -[[ - name: _th_geqrf - cname: geqrf - types: - - Float - - Double - backends: - - CPU - - CUDA - variants: - - function - return: argument 0,1 - arguments: - - arg: THTensor* res1 - output: True - - arg: THTensor* res2 - output: True - - THTensor* self -]] -[[ - name: _th_orgqr - cname: orgqr - types: - - Float - - Double - backends: - - CPU - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THTensor* input2 -]] -[[ - name: _th_ormqr - cname: ormqr - types: - - Float - - Double - backends: - - CPU - variants: - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THTensor* input2 - - THTensor* input3 - - bool left - - bool transpose -]] -[[ - name: _th_multinomial_alias_setup - cname: multinomialAliasSetup - variants: - - function - types: - - floating_point - backends: - - CPU - - CUDA - return: argument 1,2 - arguments: - - arg: THTensor* probs - - arg: THIndexTensor* J - output: True - - arg: THTensor* q - output: True -]] -[[ - name: _th_multinomial_alias_draw - cname: multinomialAliasDraw - types: - - floating_point - backends: - - CPU - - CUDA - variants: - - function - return: argument 0 - arguments: - - arg: THIndexTensor* result - output: True - - THTensor* q - - THIndexTensor* J - - long num_samples - - c10::optional generator -]] -[[ - name: _th_copy_ignoring_overlaps_ - cname: copyIgnoringOverlaps - return: self - variants: function - backends: - - CUDA - arguments: - - THTensor* self 
- - THTensor* src -]] diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.cpp b/aten/src/ATen/LegacyTHFunctionsCPU.cpp new file mode 100644 index 00000000000..a7413033c5c --- /dev/null +++ b/aten/src/ATen/LegacyTHFunctionsCPU.cpp @@ -0,0 +1,1712 @@ +#include + +// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.cpp + +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +namespace legacy { +namespace cpu { + +namespace { + ScalarType infer_scalar_type(const Tensor & t) { + return t.scalar_type(); + } + ScalarType infer_scalar_type(const TensorList & tl) { + TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + return tl[0].scalar_type(); + } + + TensorOptions options(ScalarType s) { + return TensorOptions().dtype(s) + .device(DeviceType::CPU) + .layout(kStrided); + } + + Allocator* allocator() { + return getCPUAllocator(); + } +} + +Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_maskedCopy(self_, mask_, source_); + break; + } + case 
ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_maskedCopy(self_, mask_, source_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CPU, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); + THBFloat16Tensor_maskedCopy(self_, mask_, source_); + break; + } + default: + AT_ERROR("_th_masked_scatter_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, 
ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_maskedCopyBool(self_, mask_, source_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CPU, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); + THBFloat16Tensor_maskedCopyBool(self_, mask_, source_); + break; + } + default: + AT_ERROR("_th_masked_scatter_bool_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_nonzero_out(Tensor & result, const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { 
+ case ScalarType::Bool: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_nonzero(result_, self_); + break; + } + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_nonzero(result_, self_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_nonzero(result_, self_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_nonzero(result_, self_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_nonzero(result_, self_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_nonzero(result_, self_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_nonzero(result_, self_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_nonzero(result_, self_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THHalfTensor_nonzero(result_, self_); + break; + } + case ScalarType::BFloat16: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type); + THBFloat16Tensor_nonzero(result_, self_); + break; + } + default: + AT_ERROR("_th_nonzero_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_nonzero(const Tensor & self) { + // DeviceGuard omitted + auto 
dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_nonzero(result_, self_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_nonzero(result_, self_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_nonzero(result_, self_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_nonzero(result_, self_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_nonzero(result_, self_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_nonzero(result_, self_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_nonzero(result_, self_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_nonzero(result_, self_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THHalfTensor_nonzero(result_, self_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type); + THBFloat16Tensor_nonzero(result_, self_); + break; + } + default: + AT_ERROR("_th_nonzero not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, 
dispatch_scalar_type); + THByteTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_indexCopy(self_, dim, index_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_indexCopy(self_, dim, index_, source_); + break; + } + default: + AT_ERROR("_th_index_copy_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, 
dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THBoolTensor_take(result_, self_, index_); + break; + } + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THByteTensor_take(result_, self_, index_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THCharTensor_take(result_, self_, index_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THDoubleTensor_take(result_, self_, index_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THFloatTensor_take(result_, self_, index_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THIntTensor_take(result_, self_, index_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THLongTensor_take(result_, self_, index_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); + THShortTensor_take(result_, self_, index_); + break; + } + default: + AT_ERROR("_th_take_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor 
_th_take(const Tensor & self, const Tensor & index) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THBoolTensor_take(result_, self_, index_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THByteTensor_take(result_, self_, index_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THCharTensor_take(result_, self_, index_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THDoubleTensor_take(result_, self_, index_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THFloatTensor_take(result_, self_, index_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THIntTensor_take(result_, self_, index_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THLongTensor_take(result_, self_, index_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); + THShortTensor_take(result_, self_, index_); + break; + } + default: + AT_ERROR("_th_take not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, 
DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THBoolTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_put(self_, index_, source_, accumulate); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CPU, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_put(self_, index_, source_, accumulate); + break; + } + default: + AT_ERROR("_th_put_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} 
+Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toBool(); + THBoolTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toByte(); + THByteTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toChar(); + THCharTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toDouble(); + THDoubleTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toFloat(); + THFloatTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toInt(); + THIntTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toLong(); + THLongTensor_indexFill(self_, dim, index_, value_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CPU, ScalarType::Long); + auto value_ = value.toShort(); + THShortTensor_indexFill(self_, dim, index_, value_); + break; + } + default: + AT_ERROR("_th_index_fill_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +std::tuple _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch 
(dispatch_scalar_type) { + case ScalarType::Byte: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Char: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Double: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Float: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Int: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Long: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Short: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + default: + AT_ERROR("_th_mode_out not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(values, indices); +} +std::tuple 
_th_mode(const Tensor & self, int64_t dim, bool keepdim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto values_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto values = Tensor(c10::intrusive_ptr::reclaim(values_)); + auto indices_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto indices = Tensor(c10::intrusive_ptr::reclaim(indices_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_mode(values_, indices_, self_, dim, keepdim); + break; + } + default: + AT_ERROR("_th_mode not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(values, indices); +} +std::tuple _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Char: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Double: 
{ + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Float: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Int: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Long: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Short: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Half: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type); + THHalfTensor_sort(values_, indices_, self_, dim, descending); + break; + } + default: + AT_ERROR("_th_sort_out not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(values, indices); +} +std::tuple _th_sort(const Tensor & self, int64_t dim, bool descending) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto values_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto values = Tensor(c10::intrusive_ptr::reclaim(values_)); + auto indices_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, 
scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto indices = Tensor(c10::intrusive_ptr::reclaim(indices_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THByteTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THCharTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THIntTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THLongTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THShortTensor_sort(values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type); + THHalfTensor_sort(values_, indices_, self_, dim, descending); + break; + } + default: + AT_ERROR("_th_sort not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(values, indices); +} +Tensor _th_var(const Tensor & self, bool unbiased) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THDoubleTensor_var_all(self_, unbiased)), options(ScalarType::Double)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THFloatTensor_var_all(self_, unbiased)), options(ScalarType::Float)); + break; + } + default: + AT_ERROR("_th_var not supported on CPUType for ", dispatch_scalar_type); + } +} +Tensor _th_std(const Tensor & self, bool unbiased) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THDoubleTensor_std_all(self_, unbiased)), options(ScalarType::Double)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THFloatTensor_std_all(self_, unbiased)), options(ScalarType::Float)); + 
break; + } + default: + AT_ERROR("_th_std not supported on CPUType for ", dispatch_scalar_type); + } +} +Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_out", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + THDoubleTensor_renorm(result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_out", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THFloatTensor_renorm(result_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + THDoubleTensor_renorm(result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THFloatTensor_renorm(result_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + THDoubleTensor_renorm(self_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_", false, DeviceType::CPU, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THFloatTensor_renorm(self_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = 
checked_dense_tensor_unwrap(result, "result", 0, "_th_histc_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_histc_out", false, DeviceType::CPU, dispatch_scalar_type); + auto min_ = min.toDouble(); + auto max_ = max.toDouble(); + THDoubleTensor_histc(result_, self_, bins, min_, max_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_histc_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_histc_out", false, DeviceType::CPU, dispatch_scalar_type); + auto min_ = min.toFloat(); + auto max_ = max.toFloat(); + THFloatTensor_histc(result_, self_, bins, min_, max_); + break; + } + default: + AT_ERROR("_th_histc_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_histc", false, DeviceType::CPU, dispatch_scalar_type); + auto min_ = min.toDouble(); + auto max_ = max.toDouble(); + THDoubleTensor_histc(result_, self_, bins, min_, max_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_histc", false, DeviceType::CPU, dispatch_scalar_type); + auto min_ = min.toFloat(); + auto max_ = max.toFloat(); + THFloatTensor_histc(result_, self_, bins, min_, max_); + break; + } + default: + AT_ERROR("_th_histc not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_trace(const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THByteTensor_trace(self_)), options(ScalarType::Byte)); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THCharTensor_trace(self_)), options(ScalarType::Char)); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THDoubleTensor_trace(self_)), options(ScalarType::Double)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THFloatTensor_trace(self_)), options(ScalarType::Float)); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THIntTensor_trace(self_)), options(ScalarType::Int)); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, 
dispatch_scalar_type); + return at::scalar_tensor(convert(THLongTensor_trace(self_)), options(ScalarType::Long)); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); + return at::scalar_tensor(convert(THShortTensor_trace(self_)), options(ScalarType::Short)); + break; + } + default: + AT_ERROR("_th_trace not supported on CPUType for ", dispatch_scalar_type); + } +} +Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toByte(); + auto alpha_ = alpha.toByte(); + THByteTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toChar(); + auto alpha_ = alpha.toChar(); + THCharTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toDouble(); + auto alpha_ = alpha.toDouble(); + THDoubleTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toFloat(); + auto alpha_ = alpha.toFloat(); + THFloatTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); 
+ auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toInt(); + auto alpha_ = alpha.toInt(); + THIntTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toLong(); + auto alpha_ = alpha.toLong(); + THLongTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toShort(); + auto alpha_ = alpha.toShort(); + THShortTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::BFloat16: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toBFloat16(); + auto alpha_ = alpha.toBFloat16(); + THBFloat16Tensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + default: + AT_ERROR("_th_addr_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toByte(); + auto alpha_ = alpha.toByte(); + 
THByteTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toChar(); + auto alpha_ = alpha.toChar(); + THCharTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toDouble(); + auto alpha_ = alpha.toDouble(); + THDoubleTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toFloat(); + auto alpha_ = alpha.toFloat(); + THFloatTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toInt(); + auto alpha_ = alpha.toInt(); + THIntTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toLong(); + auto alpha_ = alpha.toLong(); + THLongTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toShort(); + auto alpha_ = alpha.toShort(); + THShortTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = 
checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toBFloat16(); + auto alpha_ = alpha.toBFloat16(); + THBFloat16Tensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + default: + AT_ERROR("_th_addr not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toByte(); + auto alpha_ = alpha.toByte(); + THByteTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toChar(); + auto alpha_ = alpha.toChar(); + THCharTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toDouble(); + auto alpha_ = alpha.toDouble(); + THDoubleTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toFloat(); + auto alpha_ = alpha.toFloat(); + THFloatTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toInt(); + auto alpha_ = alpha.toInt(); + THIntTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, 
"_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toLong(); + auto alpha_ = alpha.toLong(); + THLongTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toShort(); + auto alpha_ = alpha.toShort(); + THShortTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); + auto beta_ = beta.toBFloat16(); + auto alpha_ = alpha.toBFloat16(); + THBFloat16Tensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); + break; + } + default: + AT_ERROR("_th_addr_ not supported on CPUType for ", dispatch_scalar_type); + } + return self; +} +std::tuple _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_gels(res1_, res2_, self_, A_); + break; + } + case ScalarType::Float: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_gels(res1_, res2_, self_, A_); + break; + } + default: + AT_ERROR("_th_gels_out not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_gels(const Tensor & self, const Tensor & A) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto res1_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res1 = Tensor(c10::intrusive_ptr::reclaim(res1_)); + auto res2_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res2 = Tensor(c10::intrusive_ptr::reclaim(res2_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ 
= checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CPU, dispatch_scalar_type); + auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_gels(res1_, res2_, self_, A_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CPU, dispatch_scalar_type); + auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_gels(res1_, res2_, self_, A_); + break; + } + default: + AT_ERROR("_th_gels not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_geev(res1_, res2_, self_, eigenvectors); + break; + } + case ScalarType::Float: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_geev(res1_, res2_, self_, eigenvectors); + break; + } + default: + AT_ERROR("_th_eig_out not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_eig(const Tensor & self, bool eigenvectors) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto res1_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res1 = Tensor(c10::intrusive_ptr::reclaim(res1_)); + auto res2_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res2 = Tensor(c10::intrusive_ptr::reclaim(res2_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_geev(res1_, res2_, self_, eigenvectors); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_geev(res1_, res2_, self_, eigenvectors); + break; + } + default: + AT_ERROR("_th_eig not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto output_ = checked_dense_tensor_unwrap(output, "output", 0, "_th_potri_out", false, DeviceType::CPU, dispatch_scalar_type); + auto 
self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_potri(output_, self_, upper); + break; + } + case ScalarType::Float: { + auto output_ = checked_dense_tensor_unwrap(output, "output", 0, "_th_potri_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_potri(output_, self_, upper); + break; + } + default: + AT_ERROR("_th_potri_out not supported on CPUType for ", dispatch_scalar_type); + } + return output; +} +Tensor _th_potri(const Tensor & self, bool upper) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_potri(output_, self_, upper); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_potri(output_, self_, upper); + break; + } + default: + AT_ERROR("_th_potri not supported on CPUType for ", dispatch_scalar_type); + } + return output; +} +std::tuple _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_geqrf(res1_, res2_, self_); + break; + } + case ScalarType::Float: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_geqrf(res1_, res2_, self_); + break; + } + default: + AT_ERROR("_th_geqrf_out not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_geqrf(const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto res1_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res1 = Tensor(c10::intrusive_ptr::reclaim(res1_)); + auto res2_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res2 = Tensor(c10::intrusive_ptr::reclaim(res2_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf", false, 
DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_geqrf(res1_, res2_, self_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_geqrf(res1_, res2_, self_); + break; + } + default: + AT_ERROR("_th_geqrf not supported on CPUType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +Tensor & _th_orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_orgqr(result_, self_, input2_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_orgqr(result_, self_, input2_); + break; + } + default: + AT_ERROR("_th_orgqr_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_orgqr(const Tensor & self, const Tensor & input2) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_orgqr(result_, self_, input2_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_orgqr(result_, self_, input2_); + break; + } + default: + AT_ERROR("_th_orgqr not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_ormqr_out", false, DeviceType::CPU, 
dispatch_scalar_type); + auto input3_ = checked_dense_tensor_unwrap(input3, "input3", 3, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_ormqr(result_, self_, input2_, input3_, left, transpose); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + auto input3_ = checked_dense_tensor_unwrap(input3, "input3", 3, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_ormqr(result_, self_, input2_, input3_, left, transpose); + break; + } + default: + AT_ERROR("_th_ormqr_out not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input3_ = checked_dense_tensor_unwrap(input3, "input3", 3, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_ormqr(result_, self_, input2_, input3_, left, transpose); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + auto input3_ = checked_dense_tensor_unwrap(input3, "input3", 3, "_th_ormqr", false, DeviceType::CPU, dispatch_scalar_type); + THFloatTensor_ormqr(result_, self_, input2_, input3_, left, transpose); + break; + } + default: + AT_ERROR("_th_ormqr not supported on CPUType for ", dispatch_scalar_type); + } + return result; +} +std::tuple _th_multinomial_alias_setup_out(Tensor & J, Tensor & q, const Tensor & probs) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(J); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, dispatch_scalar_type); + THDoubleTensor_multinomialAliasSetup(probs_, J_, q_); + break; + } + case ScalarType::Float: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, ScalarType::Long); 
+            auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, dispatch_scalar_type);
+            THFloatTensor_multinomialAliasSetup(probs_, J_, q_);
+            break;
+        }
+        default:
+            AT_ERROR("_th_multinomial_alias_setup_out not supported on CPUType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor &, Tensor &>(J, q);
+}
+std::tuple<Tensor,Tensor> _th_multinomial_alias_setup(const Tensor & probs) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(probs);
+    auto J_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(ScalarType::Long)).release();
+    auto J = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(J_));
+    auto q_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto q = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(q_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup", false, DeviceType::CPU, dispatch_scalar_type);
+            THDoubleTensor_multinomialAliasSetup(probs_, J_, q_);
+            break;
+        }
+        case ScalarType::Float: {
+            auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup", false, DeviceType::CPU, dispatch_scalar_type);
+            THFloatTensor_multinomialAliasSetup(probs_, J_, q_);
+            break;
+        }
+        default:
+            AT_ERROR("_th_multinomial_alias_setup not supported on CPUType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor, Tensor>(J, q);
+}
+Tensor & _th_multinomial_alias_draw_out(Tensor & result, const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(result);
+
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, ScalarType::Long);
+            auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, dispatch_scalar_type);
+            auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, ScalarType::Long);
+            THDoubleTensor_multinomialAliasDraw(result_, q_, J_, num_samples, generator);
+            break;
+        }
+        case ScalarType::Float: {
+            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, ScalarType::Long);
+            auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, dispatch_scalar_type);
+            auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, ScalarType::Long);
+            THFloatTensor_multinomialAliasDraw(result_, q_, J_, num_samples, generator);
+            break;
+        }
+        default:
+            AT_ERROR("_th_multinomial_alias_draw_out not supported on CPUType for ", dispatch_scalar_type);
+    }
+    return result;
+}
+Tensor _th_multinomial_alias_draw(const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(q);
+    auto result_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(ScalarType::Long)).release();
+    auto result = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(result_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw", false, DeviceType::CPU, dispatch_scalar_type);
+            auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw", false, DeviceType::CPU, ScalarType::Long);
+            THDoubleTensor_multinomialAliasDraw(result_, q_, J_, num_samples, generator);
+            break;
+        }
+        case ScalarType::Float: {
+            auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw", false, DeviceType::CPU, dispatch_scalar_type);
+            auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw", false, DeviceType::CPU, ScalarType::Long);
+            THFloatTensor_multinomialAliasDraw(result_, q_, J_, num_samples, generator);
+            break;
+        }
+        default:
+            AT_ERROR("_th_multinomial_alias_draw not supported on CPUType for ", dispatch_scalar_type);
+    }
+    return result;
+}
+
+} // namespace th
+} // namespace legacy
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.h b/aten/src/ATen/LegacyTHFunctionsCPU.h
new file mode 100644
index 00000000000..1abca1b1f91
--- /dev/null
+++ b/aten/src/ATen/LegacyTHFunctionsCPU.h
@@ -0,0 +1,67 @@
+#pragma once
+
+// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.h
+
+#include
+#include
+#include
+
+namespace c10 {
+class Scalar;
+}
+namespace at {
+struct Generator;
+class Tensor;
+struct Type;
+} // namespace at
+
+namespace at {
+namespace native {
+namespace legacy {
+namespace cpu {
+
+Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source);
+Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source);
+Tensor & _th_nonzero_out(Tensor & result, const Tensor & self);
+Tensor _th_nonzero(const Tensor & self);
+Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source);
+Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index);
+Tensor _th_take(const Tensor & self, const Tensor & index);
+Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate);
+Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value);
+std::tuple<Tensor &,Tensor &> _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim);
+std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdim);
+std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending);
+std::tuple<Tensor,Tensor> _th_sort(const Tensor & self, int64_t dim, bool descending);
+Tensor _th_var(const Tensor & self, bool unbiased);
+Tensor _th_std(const Tensor & self, bool unbiased);
+Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max);
+Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max);
+Tensor _th_trace(const Tensor & self);
+Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
+Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
+Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
+std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
+std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
+std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors);
+std::tuple<Tensor,Tensor> _th_eig(const Tensor & self, bool eigenvectors);
+Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper);
+Tensor _th_potri(const Tensor & self, bool upper);
+std::tuple<Tensor &,Tensor &> _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self);
+std::tuple<Tensor,Tensor> _th_geqrf(const Tensor & self);
+Tensor & _th_orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2);
+Tensor _th_orgqr(const Tensor & self, const Tensor & input2);
+Tensor & _th_ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose);
+Tensor _th_ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose);
+std::tuple<Tensor &,Tensor &> _th_multinomial_alias_setup_out(Tensor & J, Tensor & q, const Tensor & probs);
+std::tuple<Tensor,Tensor> _th_multinomial_alias_setup(const Tensor & probs);
+Tensor & _th_multinomial_alias_draw_out(Tensor & result, const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator);
+Tensor _th_multinomial_alias_draw(const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator);
+
+} // namespace th
+} // namespace legacy
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h
new file mode 100644
index 00000000000..8e2410cc87e
--- /dev/null
+++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h
@@ -0,0 +1,111 @@
+#pragma once
+
+// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.h
+
+#include
+#include
+#include
+
+namespace c10 {
+class Scalar;
+}
+namespace at {
+struct Generator;
+class Tensor;
+struct Type;
+} // namespace at
+
+namespace at {
+namespace native {
+namespace legacy {
+namespace cuda {
+
+Tensor & _th_masked_fill_(Tensor & self, const Tensor & mask, Scalar value);
+Tensor & _th_masked_fill_bool_(Tensor & self, const Tensor & mask, Scalar value);
+Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source);
+Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source);
+Tensor & _th_nonzero_out(Tensor & result, const Tensor & self);
+Tensor _th_nonzero(const Tensor & self);
+Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source);
+Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index);
+Tensor _th_take(const Tensor & self, const Tensor & index);
+Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate);
+Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value);
+std::tuple<Tensor &,Tensor &> _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim);
+std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdim);
+std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending);
+std::tuple<Tensor,Tensor> _th_sort(const Tensor & self, int64_t dim, bool descending);
+std::tuple<Tensor &,Tensor &> _th_topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted);
+std::tuple<Tensor,Tensor> _th_topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted);
+Tensor _th_var(const Tensor & self, bool unbiased);
+Tensor _th_std(const Tensor & self, bool unbiased);
+Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor & _th_fmod_out(Tensor & result, const Tensor & self, Scalar other);
+Tensor _th_fmod(const Tensor & self, Scalar other);
+Tensor & _th_fmod_out(Tensor & result, const Tensor & self, const Tensor & other);
+Tensor _th_fmod(const Tensor & self, const Tensor & other);
+Tensor & _th_fmod_(Tensor & self, Scalar other);
+Tensor & _th_fmod_(Tensor & self, const Tensor & other);
+Tensor & _th_cross_kernel_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim);
+Tensor _th_cross_kernel(const Tensor & self, const Tensor & other, int64_t dim);
+Tensor & _th_bmm_out(Tensor & result, const Tensor & self, const Tensor & mat2);
+Tensor _th_bmm(const Tensor & self, const Tensor & mat2);
+Tensor & _th_baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha);
+Tensor _th_baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha);
+std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
+std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
+std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors);
+std::tuple<Tensor,Tensor> _th_eig(const Tensor & self, bool eigenvectors);
+Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper);
+Tensor _th_potri(const Tensor & self, bool upper);
+std::tuple<Tensor &,Tensor &> _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self);
+std::tuple<Tensor,Tensor> _th_geqrf(const Tensor & self);
+std::tuple<Tensor &,Tensor &> _th_multinomial_alias_setup_out(Tensor & J, Tensor & q, const Tensor & probs);
+std::tuple<Tensor,Tensor> _th_multinomial_alias_setup(const Tensor & probs);
+Tensor & _th_multinomial_alias_draw_out(Tensor & result, const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator);
+Tensor _th_multinomial_alias_draw(const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional<at::Generator> generator);
+Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src);
+Tensor & _thnn_multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
+Tensor _thnn_multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
+Tensor & _thnn_multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
+Tensor _thnn_multi_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
+std::tuple<Tensor &,Tensor &> _thnn_multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, int64_t reduction);
+std::tuple<Tensor,Tensor> _thnn_multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, int64_t reduction);
+Tensor & _thnn_multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target);
+Tensor _thnn_multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target);
+std::tuple<Tensor &,Tensor &> _thnn_nll_loss_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index);
+std::tuple<Tensor,Tensor> _thnn_nll_loss_forward(const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index);
+Tensor & _thnn_nll_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight);
+Tensor _thnn_nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight);
+std::tuple<Tensor &,Tensor &> _thnn_nll_loss2d_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index);
+std::tuple<Tensor,Tensor> _thnn_nll_loss2d_forward(const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index);
+Tensor & _thnn_nll_loss2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight);
+Tensor _thnn_nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight);
+Tensor & _thnn_glu_forward_out(Tensor & output, const Tensor & self, int64_t dim);
+Tensor _thnn_glu_forward(const Tensor & self, int64_t dim);
+Tensor & _thnn_glu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim);
+Tensor _thnn_glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim);
+std::tuple<Tensor &,Tensor &> _thnn_log_sigmoid_forward_out(Tensor & output, Tensor & buffer, const Tensor & self);
+std::tuple<Tensor,Tensor> _thnn_log_sigmoid_forward(const Tensor & self);
+Tensor & _thnn_log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer);
+Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer);
+Tensor & _thnn_rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
+Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
+Tensor & _thnn_rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training);
+Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training);
+Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
+std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding);
+std::tuple<Tensor,Tensor,Tensor> _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding);
+std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias,
const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones); +std::tuple _thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones, std::array output_mask); +Tensor & _thnn_conv_depthwise2d_forward_out(Tensor & output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation); +Tensor _thnn_conv_depthwise2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation); +std::tuple _thnn_conv_depthwise2d_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation); +std::tuple _thnn_conv_depthwise2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, std::array output_mask); + +} // namespace th +} // namespace legacy +} // namespace native +} // namespace at diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp new file mode 100644 index 00000000000..2b07a19de0e --- /dev/null +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -0,0 +1,4176 @@ +#include + +// @generated by aten/src/ATen/gen.py from LegacyTHFunctions.cpp + +#include +#include +#include +#include +#include +#include +#include +#include +#undef THNN_ +#undef THCIndexTensor_ +#include +#include +#include +#include + +namespace at { +namespace native { +namespace legacy { +namespace cuda { + +namespace { + ScalarType infer_scalar_type(const Tensor & t) { + return t.scalar_type(); + } + ScalarType infer_scalar_type(const TensorList & tl) { + TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + return tl[0].scalar_type(); + } + + TensorOptions options(ScalarType s) { + return TensorOptions().dtype(s) + .device(DeviceType::CUDA) + .layout(kStrided); + } + + Allocator* allocator() { + return at::cuda::getCUDADeviceAllocator(); + } +} + +Tensor & _th_masked_fill_(Tensor & self, const Tensor & mask, Scalar value) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toBool(); + THCudaBoolTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toByte(); + THCudaByteTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, 
DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toChar(); + THCudaCharTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toDouble(); + THCudaDoubleTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toFloat(); + THCudaTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toInt(); + THCudaIntTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toLong(); + THCudaLongTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toShort(); + THCudaShortTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toHalf(); + THCudaHalfTensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_", false, DeviceType::CUDA, ScalarType::Byte); + auto value_ = value.toBFloat16(); + THCudaBFloat16Tensor_maskedFill(globalContext().getTHCState(), self_, mask_, value_); + break; + } + default: + AT_ERROR("_th_masked_fill_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_masked_fill_bool_(Tensor & self, const Tensor & mask, Scalar value) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = 
checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toBool(); + THCudaBoolTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toByte(); + THCudaByteTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toChar(); + THCudaCharTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toDouble(); + THCudaDoubleTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toFloat(); + THCudaTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toInt(); + THCudaIntTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toLong(); + THCudaLongTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toShort(); + THCudaShortTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 
2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toHalf(); + THCudaHalfTensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_fill_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_fill_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto value_ = value.toBFloat16(); + THCudaBFloat16Tensor_maskedFillBool(globalContext().getTHCState(), self_, mask_, value_); + break; + } + default: + AT_ERROR("_th_masked_fill_bool_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, 
source_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); + break; + } + default: + AT_ERROR("_th_masked_scatter_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = 
checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, 
"_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); + break; + } + default: + AT_ERROR("_th_masked_scatter_bool_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_nonzero_out(Tensor & result, const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, 
DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::BFloat16: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + default: + AT_ERROR("_th_nonzero_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_nonzero(const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", 
false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_nonzero(globalContext().getTHCState(), result_, self_); + break; + } + default: + AT_ERROR("_th_nonzero not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Float: { + auto self_ = 
checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_copy_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 4, "_th_index_copy_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_indexCopy(globalContext().getTHCState(), self_, dim, index_, source_); + break; + } + default: + AT_ERROR("_th_index_copy_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaBoolTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, 
"self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaByteTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaCharTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaDoubleTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaIntTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaLongTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaShortTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); 
+ auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaHalfTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + default: + AT_ERROR("_th_take_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_take(const Tensor & self, const Tensor & index) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaBoolTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaByteTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaCharTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaDoubleTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaIntTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaLongTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + 
THCudaShortTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); + THCudaHalfTensor_take(globalContext().getTHCState(), result_, self_, index_); + break; + } + default: + AT_ERROR("_th_take not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBoolTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = 
checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_put_", false, DeviceType::CUDA, ScalarType::Long); + auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_put_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_put(globalContext().getTHCState(), self_, index_, source_, accumulate); + break; + } + default: + AT_ERROR("_th_put_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Bool: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toBool(); + THCudaBoolTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toByte(); + THCudaByteTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toChar(); + THCudaCharTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto 
value_ = value.toDouble(); + THCudaDoubleTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toFloat(); + THCudaTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toInt(); + THCudaIntTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toLong(); + THCudaLongTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toShort(); + THCudaShortTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CUDA, dispatch_scalar_type); + auto index_ = checked_dense_tensor_unwrap(index, "index", 3, "_th_index_fill_", false, DeviceType::CUDA, ScalarType::Long); + auto value_ = value.toHalf(); + THCudaHalfTensor_indexFill(globalContext().getTHCState(), self_, dim, index_, value_); + break; + } + default: + AT_ERROR("_th_index_fill_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +std::tuple _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Char: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_mode(globalContext().getTHCState(), values_, indices_, self_, 
dim, keepdim); + break; + } + case ScalarType::Double: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Float: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Int: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Long: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Short: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Half: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_mode_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + default: + AT_ERROR("_th_mode_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple<Tensor &,Tensor &>(values, indices); +} +std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto values_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto values = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(values_)); + auto indices_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto indices = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(indices_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_mode", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_mode(globalContext().getTHCState(), values_, indices_, self_, dim, keepdim); + break; + } + default: + AT_ERROR("_th_mode not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple<Tensor,Tensor>(values, indices); +} +std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Char: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, 
"_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Double: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Float: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Int: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Long: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Short: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Half: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + default: + AT_ERROR("_th_sort_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(values, 
indices); +} +std::tuple _th_sort(const Tensor & self, int64_t dim, bool descending) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto values_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto values = Tensor(c10::intrusive_ptr::reclaim(values_)); + auto indices_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto indices = Tensor(c10::intrusive_ptr::reclaim(indices_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_sort(globalContext().getTHCState(), values_, indices_, self_, dim, descending); + break; + } + default: + AT_ERROR("_th_sort not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(values, indices); +} +std::tuple _th_topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + 
THCudaByteTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Char: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Double: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Float: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Int: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Long: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Short: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Half: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, 
DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::BFloat16: { + auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_topk_out", false, DeviceType::CUDA, ScalarType::Long); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + default: + AT_ERROR("_th_topk_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple<Tensor &,Tensor &>(values, indices); +} +std::tuple<Tensor,Tensor> _th_topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto values_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto values = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(values_)); + auto indices_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto indices = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(indices_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_topk", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_topk(globalContext().getTHCState(), values_, indices_, self_, k, dim, largest, sorted); + break; + } + default: + AT_ERROR("_th_topk not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple<Tensor,Tensor>(values, indices); +} +Tensor _th_var(const Tensor & self, bool unbiased) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<double>(THCudaDoubleTensor_var_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Double)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<float>(THCudaTensor_var_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Float)); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<Half>(THCudaHalfTensor_var_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Half)); + break; + } + default: + AT_ERROR("_th_var not supported on CUDAType for ", dispatch_scalar_type); + } +} +Tensor _th_std(const Tensor & self, bool unbiased) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<double>(THCudaDoubleTensor_std_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Double)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<float>(THCudaTensor_std_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Float)); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CUDA, dispatch_scalar_type); + return at::scalar_tensor(convert<Half>(THCudaHalfTensor_std_all(globalContext().getTHCState(), self_, unbiased)), options(ScalarType::Half)); + break; + } + default: + AT_ERROR("_th_std not supported on CUDAType for ", dispatch_scalar_type); + } +} +Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + 
THCudaDoubleTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THCudaTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toHalf(); + auto maxnorm_ = maxnorm.toHalf(); + THCudaHalfTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + THCudaDoubleTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THCudaTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toHalf(); + auto maxnorm_ = maxnorm.toHalf(); + THCudaHalfTensor_renorm(globalContext().getTHCState(), result_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toDouble(); + auto maxnorm_ = maxnorm.toDouble(); + THCudaDoubleTensor_renorm(globalContext().getTHCState(), self_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toFloat(); + auto maxnorm_ = maxnorm.toFloat(); + THCudaTensor_renorm(globalContext().getTHCState(), self_, self_, p_, dim, maxnorm_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, 
"_th_renorm_", false, DeviceType::CUDA, dispatch_scalar_type); + auto p_ = p.toHalf(); + auto maxnorm_ = maxnorm.toHalf(); + THCudaHalfTensor_renorm(globalContext().getTHCState(), self_, self_, p_, dim, maxnorm_); + break; + } + default: + AT_ERROR("_th_renorm_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_fmod_out(Tensor & result, const Tensor & self, Scalar other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toByte(); + THCudaByteTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toChar(); + THCudaCharTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toDouble(); + THCudaDoubleTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toFloat(); + THCudaTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toInt(); + THCudaIntTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toLong(); + THCudaLongTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toShort(); + THCudaShortTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = 
checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toHalf(); + THCudaHalfTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_fmod(const Tensor & self, Scalar other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toByte(); + THCudaByteTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toChar(); + THCudaCharTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toDouble(); + THCudaDoubleTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toFloat(); + THCudaTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toInt(); + THCudaIntTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toLong(); + THCudaLongTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toShort(); + THCudaShortTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toHalf(); + THCudaHalfTensor_fmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_fmod_out(Tensor & result, const Tensor & self, const Tensor & other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = 
checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, 
"other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_fmod(const Tensor & self, const Tensor & other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); + 
THCudaHalfTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_fmod_(Tensor & self, Scalar other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toByte(); + THCudaByteTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toChar(); + THCudaCharTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toDouble(); + THCudaDoubleTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toFloat(); + THCudaTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toInt(); + THCudaIntTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toLong(); + THCudaLongTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toShort(); + THCudaShortTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = other.toHalf(); + THCudaHalfTensor_fmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_fmod_(Tensor & self, const Tensor & other) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + 
} + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); + break; + } + default: + AT_ERROR("_th_fmod_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _th_cross_kernel_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_crossKernel(globalContext().getTHCState(), 
result_, self_, other_, dim); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + default: + AT_ERROR("_th_cross_kernel_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_cross_kernel(const Tensor & self, const Tensor & other, int64_t dim) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + 
auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_cross_kernel", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_crossKernel(globalContext().getTHCState(), result_, self_, other_, dim); + break; + } + default: + AT_ERROR("_th_cross_kernel not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_bmm_out(Tensor & 
result, const Tensor & self, const Tensor & mat2) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, uint8_t(0), uint8_t(1)); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int8_t(0), int8_t(1)); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, double(0), double(1)); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, float(0), float(1)); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int(0), int(1)); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int64_t(0), int64_t(1)); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, 
"_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int16_t(0), int16_t(1)); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, Half(0), Half(1)); + break; + } + case ScalarType::BFloat16: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, BFloat16(0), BFloat16(1)); + break; + } + default: + AT_ERROR("_th_bmm_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_bmm(const Tensor & self, const Tensor & mat2) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, uint8_t(0), uint8_t(1)); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int8_t(0), int8_t(1)); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, double(0), double(1)); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, float(0), float(1)); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, 
"self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int(0), int(1)); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int64_t(0), int64_t(1)); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, int16_t(0), int16_t(1)); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, Half(0), Half(1)); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto mat2_ = checked_dense_tensor_unwrap(mat2, "mat2", 2, "_th_bmm", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaBFloat16Tensor_baddbmm(globalContext().getTHCState(), result_, result_, self_, mat2_, BFloat16(0), BFloat16(1)); + break; + } + default: + AT_ERROR("_th_bmm not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_baddbmm_out(Tensor & result, const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toByte(); + auto alpha_ = alpha.toByte(); + THCudaByteTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Char: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ 
= beta.toChar(); + auto alpha_ = alpha.toChar(); + THCudaCharTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toDouble(); + auto alpha_ = alpha.toDouble(); + THCudaDoubleTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toFloat(); + auto alpha_ = alpha.toFloat(); + THCudaTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Int: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toInt(); + auto alpha_ = alpha.toInt(); + THCudaIntTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Long: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toLong(); + auto alpha_ = alpha.toLong(); + THCudaLongTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Short: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = 
checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toShort(); + auto alpha_ = alpha.toShort(); + THCudaShortTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toHalf(); + auto alpha_ = alpha.toHalf(); + THCudaHalfTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::BFloat16: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toBFloat16(); + auto alpha_ = alpha.toBFloat16(); + THCudaBFloat16Tensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + default: + AT_ERROR("_th_baddbmm_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toByte(); + auto alpha_ = alpha.toByte(); + THCudaByteTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toChar(); + auto alpha_ = alpha.toChar(); + THCudaCharTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + 
break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toDouble(); + auto alpha_ = alpha.toDouble(); + THCudaDoubleTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toFloat(); + auto alpha_ = alpha.toFloat(); + THCudaTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toInt(); + auto alpha_ = alpha.toInt(); + THCudaIntTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toLong(); + auto alpha_ = alpha.toLong(); + THCudaLongTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toShort(); + auto alpha_ = alpha.toShort(); + THCudaShortTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type); + auto beta_ = beta.toHalf(); + auto alpha_ = alpha.toHalf(); + THCudaHalfTensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_); + 
            break;
+        }
+        case ScalarType::BFloat16: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto batch1_ = checked_dense_tensor_unwrap(batch1, "batch1", 2, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto batch2_ = checked_dense_tensor_unwrap(batch2, "batch2", 3, "_th_baddbmm", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto beta_ = beta.toBFloat16();
+            auto alpha_ = alpha.toBFloat16();
+            THCudaBFloat16Tensor_baddbmm(globalContext().getTHCState(), result_, self_, batch1_, batch2_, beta_, alpha_);
+            break;
+        }
+        default:
+            AT_ERROR("_th_baddbmm not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return result;
+}
+std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(self);
+
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaDoubleTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
+            break;
+        }
+        case ScalarType::Float: {
+            auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
+            break;
+        }
+        default:
+            AT_ERROR("_th_gels_out not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor &,Tensor &>(res1, res2);
+}
+std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(self);
+    auto res1_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto res1 = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(res1_));
+    auto res2_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto res2 = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(res2_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaDoubleTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
+            break;
+        }
+        case ScalarType::Float: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
+            break;
+        }
+        default:
+            AT_ERROR("_th_gels not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor,Tensor>(res1, res2);
+}
+std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(self);
+
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaDoubleTensor_geev(globalContext().getTHCState(), res1_, res2_, self_, eigenvectors);
+            break;
+        }
+        case ScalarType::Float: {
+            auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaTensor_geev(globalContext().getTHCState(), res1_, res2_, self_, eigenvectors);
+            break;
+        }
+        default:
+            AT_ERROR("_th_eig_out not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor &,Tensor &>(res1, res2);
+}
+std::tuple<Tensor,Tensor> _th_eig(const Tensor & self, bool eigenvectors) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(self);
+    auto res1_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto res1 = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(res1_));
+    auto res2_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto res2 = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(res2_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaDoubleTensor_geev(globalContext().getTHCState(), res1_, res2_, self_, eigenvectors);
+            break;
+        }
+        case ScalarType::Float: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaTensor_geev(globalContext().getTHCState(), res1_, res2_, self_, eigenvectors);
+            break;
+        }
+        default:
+            AT_ERROR("_th_eig not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return std::tuple<Tensor,Tensor>(res1, res2);
+}
+Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper) {
+    // DeviceGuard omitted
+    auto dispatch_scalar_type = infer_scalar_type(self);
+
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto output_ = checked_dense_tensor_unwrap(output, "output", 0, "_th_potri_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THCudaDoubleTensor_potri(globalContext().getTHCState(), output_, self_, upper);
+            break;
+        }
+        case ScalarType::Float: {
+            auto output_ = checked_dense_tensor_unwrap(output, "output", 0, "_th_potri_out", false, DeviceType::CUDA,
dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_potri(globalContext().getTHCState(), output_, self_, upper); + break; + } + default: + AT_ERROR("_th_potri_out not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor _th_potri(const Tensor & self, bool upper) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_potri(globalContext().getTHCState(), output_, self_, upper); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_potri", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_potri(globalContext().getTHCState(), output_, self_, upper); + break; + } + default: + AT_ERROR("_th_potri not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +std::tuple _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_geqrf(globalContext().getTHCState(), res1_, res2_, self_); + break; + } + case ScalarType::Float: { + auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_geqrf(globalContext().getTHCState(), res1_, res2_, self_); + break; + } + default: + AT_ERROR("_th_geqrf_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_geqrf(const Tensor & self) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + auto res1_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res1 = Tensor(c10::intrusive_ptr::reclaim(res1_)); + auto res2_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto res2 = Tensor(c10::intrusive_ptr::reclaim(res2_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_geqrf(globalContext().getTHCState(), res1_, res2_, self_); + break; + } + case 
ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_geqrf", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_geqrf(globalContext().getTHCState(), res1_, res2_, self_); + break; + } + default: + AT_ERROR("_th_geqrf not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(res1, res2); +} +std::tuple _th_multinomial_alias_setup_out(Tensor & J, Tensor & q, const Tensor & probs) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(J); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + case ScalarType::Float: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + case ScalarType::Half: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + default: + AT_ERROR("_th_multinomial_alias_setup_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(J, q); +} +std::tuple _th_multinomial_alias_setup(const Tensor & probs) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(probs); + auto J_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto J = Tensor(c10::intrusive_ptr::reclaim(J_)); + auto q_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto q = Tensor(c10::intrusive_ptr::reclaim(q_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + case ScalarType::Float: { + auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + case ScalarType::Half: { + auto probs_ = 
checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_multinomialAliasSetup(globalContext().getTHCState(), probs_, J_, q_); + break; + } + default: + AT_ERROR("_th_multinomial_alias_setup not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(J, q); +} +Tensor & _th_multinomial_alias_draw_out(Tensor & result, const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional generator) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(result); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaDoubleTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + case ScalarType::Float: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + case ScalarType::Half: { + auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw_out", false, DeviceType::CUDA, ScalarType::Long); + THCudaHalfTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + default: + AT_ERROR("_th_multinomial_alias_draw_out not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor _th_multinomial_alias_draw(const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional generator) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(q); + auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(ScalarType::Long)).release(); + auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw", false, DeviceType::CUDA, ScalarType::Long); + THCudaDoubleTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + case ScalarType::Float: { + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, 
"_th_multinomial_alias_draw", false, DeviceType::CUDA, ScalarType::Long); + THCudaTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + case ScalarType::Half: { + auto q_ = checked_dense_tensor_unwrap(q, "q", 1, "_th_multinomial_alias_draw", false, DeviceType::CUDA, dispatch_scalar_type); + auto J_ = checked_dense_tensor_unwrap(J, "J", 2, "_th_multinomial_alias_draw", false, DeviceType::CUDA, ScalarType::Long); + THCudaHalfTensor_multinomialAliasDraw(globalContext().getTHCState(), result_, q_, J_, num_samples, generator); + break; + } + default: + AT_ERROR("_th_multinomial_alias_draw not supported on CUDAType for ", dispatch_scalar_type); + } + return result; +} +Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src) { + // DeviceGuard omitted + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Byte: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaByteTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Char: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaCharTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaDoubleTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Int: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaIntTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Long: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaLongTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Short: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, 
"_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaShortTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); + THCudaHalfTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); + break; + } + default: + AT_ERROR("_th_copy_ignoring_overlaps_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +Tensor & _thnn_multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + default: + AT_ERROR("_thnn_multi_margin_loss_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor _thnn_multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + default: + AT_ERROR("_thnn_multi_margin_loss_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor & _thnn_multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + default: + AT_ERROR("_thnn_multi_margin_loss_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_multi_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto p_ = p.toDouble(); + auto margin_ = margin.toDouble(); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); + break; + } + default: + AT_ERROR("_thnn_multi_margin_loss_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +std::tuple _thnn_multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto output_ = checked_dense_tensor_unwrap(output, "output", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto output_ = checked_dense_tensor_unwrap(output, "output", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto 
is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto output_ = checked_dense_tensor_unwrap(output, "output", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto output_ = checked_dense_tensor_unwrap(output, "output", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 3, "_thnn_multilabel_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16MultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + default: + AT_ERROR("_thnn_multilabel_margin_loss_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, is_target); +} +std::tuple _thnn_multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, int64_t reduction) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + auto is_target_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto is_target = Tensor(c10::intrusive_ptr::reclaim(is_target_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + THNN_CudaDoubleMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, 
"target", 2, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + THNN_CudaMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + THNN_CudaHalfMultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multilabel_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + THNN_CudaBFloat16MultiLabelMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, is_target_, reduction); + break; + } + default: + AT_ERROR("_thnn_multilabel_margin_loss_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, is_target); +} +Tensor & _thnn_multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, 
dispatch_scalar_type); + THNN_CudaMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 5, "_thnn_multilabel_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16MultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + default: + AT_ERROR("_thnn_multilabel_margin_loss_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, 
"_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfMultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto is_target_ = checked_dense_tensor_unwrap(is_target, "is_target", 5, "_thnn_multilabel_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16MultiLabelMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, is_target_, reduction); + break; + } + default: + AT_ERROR("_thnn_multilabel_margin_loss_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +std::tuple _thnn_nll_loss_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto 
weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16ClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, total_weight); +} +std::tuple _thnn_nll_loss_forward(const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + auto total_weight_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto total_weight = Tensor(c10::intrusive_ptr::reclaim(total_weight_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16ClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, total_weight); +} +Tensor & _thnn_nll_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16ClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16ClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +std::tuple _thnn_nll_loss2d_forward_out(Tensor & output, Tensor & total_weight, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 5, "_thnn_nll_loss2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss2d_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, total_weight); +} +std::tuple _thnn_nll_loss2d_forward(const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + auto total_weight_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto total_weight = Tensor(c10::intrusive_ptr::reclaim(total_weight_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_nll_loss2d_forward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_nll_loss2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialClassNLLCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss2d_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, total_weight); +} +Tensor & _thnn_nll_loss2d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_nll_loss2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss2d_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Tensor & weight, int64_t reduction, int64_t ignore_index, const Tensor & total_weight) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? weight_ : NULL, total_weight_, ignore_index); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, ScalarType::Long); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 4, "_thnn_nll_loss2d_backward", true, DeviceType::CUDA, dispatch_scalar_type); + auto total_weight_ = checked_dense_tensor_unwrap(total_weight, "total_weight", 7, "_thnn_nll_loss2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialClassNLLCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, weight_ ? 
weight_ : NULL, total_weight_, ignore_index); + break; + } + default: + AT_ERROR("_thnn_nll_loss2d_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor & _thnn_glu_forward_out(Tensor & output, const Tensor & self, int64_t dim) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 2, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 2, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 2, "_thnn_glu_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + default: + AT_ERROR("_thnn_glu_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor _thnn_glu_forward(const Tensor & self, int64_t dim) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_glu_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfGatedLinear_updateOutput(globalContext().getTHCState(), self_, output_, dim); + break; + } + default: + AT_ERROR("_thnn_glu_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor & _thnn_glu_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, int64_t dim) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward_out", false, 
DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, dim); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, dim); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_glu_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, dim); + break; + } + default: + AT_ERROR("_thnn_glu_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, dim); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, dim); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_glu_backward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfGatedLinear_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, 
dim); + break; + } + default: + AT_ERROR("_thnn_glu_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +std::tuple _thnn_log_sigmoid_forward_out(Tensor & output, Tensor & buffer, const Tensor & self) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto output_ = checked_dense_tensor_unwrap(output, "output", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 1, "_thnn_log_sigmoid_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + default: + AT_ERROR("_thnn_log_sigmoid_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, buffer); +} +std::tuple _thnn_log_sigmoid_forward(const Tensor & self) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + auto buffer_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto buffer = Tensor(c10::intrusive_ptr::reclaim(buffer_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + case ScalarType::Half: { + auto self_ = 
checked_dense_tensor_unwrap(self, "self", 1, "_thnn_log_sigmoid_forward", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfLogSigmoid_updateOutput(globalContext().getTHCState(), self_, output_, buffer_); + break; + } + default: + AT_ERROR("_thnn_log_sigmoid_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, buffer); +} +Tensor & _thnn_log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 3, "_thnn_log_sigmoid_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_); + break; + } + default: + AT_ERROR("_thnn_log_sigmoid_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = 
Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(grad_input_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaDoubleLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_);
+            break;
+        }
+        case ScalarType::Float: {
+            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_);
+            break;
+        }
+        case ScalarType::Half: {
+            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto buffer_ = checked_dense_tensor_unwrap(buffer, "buffer", 3, "_thnn_log_sigmoid_backward", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaHalfLogSigmoid_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, buffer_);
+            break;
+        }
+        default:
+            AT_ERROR("_thnn_log_sigmoid_backward not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return grad_input;
+}
+Tensor & _thnn_rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<Generator> generator) {
+    const OptionalDeviceGuard device_guard(device_of(self));
+    auto dispatch_scalar_type = infer_scalar_type(self);
+
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        case ScalarType::Float: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        case ScalarType::Half: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
+            THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        default:
+            AT_ERROR("_thnn_rrelu_with_noise_forward_out not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return output;
+}
+Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<Generator> generator) {
+    const OptionalDeviceGuard device_guard(device_of(self));
+    auto dispatch_scalar_type = infer_scalar_type(self);
+    auto output_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
+    auto output = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(output_));
+    switch (dispatch_scalar_type) {
+        case ScalarType::Double: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        case ScalarType::Float: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        case ScalarType::Half: {
+            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
+            auto lower_ = lower.toDouble();
+            auto upper_ = upper.toDouble();
+            THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
+            break;
+        }
+        default:
+            AT_ERROR("_thnn_rrelu_with_noise_forward not supported on CUDAType for ", dispatch_scalar_type);
+    }
+    return output;
+}
+Tensor & _thnn_rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) {
+    const OptionalDeviceGuard device_guard(device_of(self));
+    auto dispatch_scalar_type = infer_scalar_type(self);
+
+ switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 6, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 6, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 6, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + default: + AT_ERROR("_thnn_rrelu_with_noise_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, 
dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaDoubleRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaHalfRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); + break; + } + default: + AT_ERROR("_thnn_rrelu_with_noise_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return grad_input; +} +Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional generator) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, 
"_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type); + auto lower_ = lower.toDouble(); + auto upper_ = upper.toDouble(); + THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator); + break; + } + default: + AT_ERROR("_thnn_rrelu_with_noise_forward_ not supported on CUDAType for ", dispatch_scalar_type); + } + return self; +} +std::tuple _thnn_conv2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + default: + AT_ERROR("_thnn_conv2d_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, columns, ones); +} +std::tuple _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + auto columns_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto columns = Tensor(c10::intrusive_ptr::reclaim(columns_)); + auto ones_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto ones = Tensor(c10::intrusive_ptr::reclaim(ones_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + THNN_CudaDoubleSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + THNN_CudaSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + THNN_CudaHalfSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + THNN_CudaBFloat16SpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + break; + } + default: + AT_ERROR("_thnn_conv2d_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(output, columns, ones); +} +std::tuple _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = 
checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaDoubleSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaDoubleSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? 
grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaHalfSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaHalfSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? 
grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaBFloat16SpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaBFloat16SpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + default: + AT_ERROR("_thnn_conv2d_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(grad_input, grad_weight, grad_bias); +} +std::tuple _thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones, std::array output_mask) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = output_mask[0] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_input_)); + auto grad_weight_ = output_mask[1] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; + auto grad_weight = Tensor(c10::intrusive_ptr::reclaim(grad_weight_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_weight_)); + auto grad_bias_ = output_mask[2] ? 
c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; + auto grad_bias = Tensor(c10::intrusive_ptr::reclaim(grad_bias_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_bias_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaDoubleSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaDoubleSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? 
grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaHalfSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaHalfSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaBFloat16SpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); + if (grad_weight_ || grad_bias_) THNN_CudaBFloat16SpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? 
grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); + break; + } + default: + AT_ERROR("_thnn_conv2d_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(grad_input, grad_weight, grad_bias); +} +Tensor & _thnn_conv_depthwise2d_forward_out(Tensor & output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto output_ = checked_dense_tensor_unwrap(output, "output", 7, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaDoubleSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto output_ = checked_dense_tensor_unwrap(output, "output", 7, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto output_ = checked_dense_tensor_unwrap(output, "output", 7, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaHalfSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto output_ = checked_dense_tensor_unwrap(output, "output", 7, "_thnn_conv_depthwise2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); + THNN_CudaBFloat16SpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + default: + AT_ERROR("_thnn_conv_depthwise2d_forward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +Tensor _thnn_conv_depthwise2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); + auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + THNN_CudaDoubleSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Float: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + THNN_CudaSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Half: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + THNN_CudaHalfSpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::BFloat16: { + auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv_depthwise2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); + auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv_depthwise2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + THNN_CudaBFloat16SpatialDepthwiseConvolution_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? 
bias_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + default: + AT_ERROR("_thnn_conv_depthwise2d_forward not supported on CUDAType for ", dispatch_scalar_type); + } + return output; +} +std::tuple _thnn_conv_depthwise2d_backward_out(Tensor & grad_input, Tensor & grad_weight, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaDoubleSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaDoubleSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? 
grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaHalfSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaHalfSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 7, "_thnn_conv_depthwise2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); + if (grad_input_) THNN_CudaBFloat16SpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? 
grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaBFloat16SpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + default: + AT_ERROR("_thnn_conv_depthwise2d_backward_out not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(grad_input, grad_weight); +} +std::tuple _thnn_conv_depthwise2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, std::array output_mask) { + const OptionalDeviceGuard device_guard(device_of(self)); + auto dispatch_scalar_type = infer_scalar_type(self); + auto grad_input_ = output_mask[0] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; + auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_input_)); + auto grad_weight_ = output_mask[1] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; + auto grad_weight = Tensor(c10::intrusive_ptr::reclaim(grad_weight_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_weight_)); + switch (dispatch_scalar_type) { + case ScalarType::Double: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + if (grad_input_) THNN_CudaDoubleSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaDoubleSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? 
grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Float: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + if (grad_input_) THNN_CudaSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::Half: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + if (grad_input_) THNN_CudaHalfSpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaHalfSpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? 
grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + case ScalarType::BFloat16: { + auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv_depthwise2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); + auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); + auto stride_ = check_intlist<2>(stride, "stride", 5); + auto padding_ = check_intlist<2>(padding, "padding", 6); + auto dilation_ = check_intlist<2>(dilation, "dilation", 7); + if (grad_input_) THNN_CudaBFloat16SpatialDepthwiseConvolution_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + if (grad_weight_) THNN_CudaBFloat16SpatialDepthwiseConvolution_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], dilation_[1], dilation_[0]); + break; + } + default: + AT_ERROR("_thnn_conv_depthwise2d_backward not supported on CUDAType for ", dispatch_scalar_type); + } + return std::tuple(grad_input, grad_weight); +} + +} // namespace th +} // namespace legacy +} // namespace native +} // namespace at diff --git a/aten/src/ATen/cwrap_parser.py b/aten/src/ATen/cwrap_parser.py deleted file mode 100644 index 27bbbd7140f..00000000000 --- a/aten/src/ATen/cwrap_parser.py +++ /dev/null @@ -1,38 +0,0 @@ -import yaml -import copy - -try: - # use faster C loader if available - from yaml import CLoader as Loader -except ImportError: - from yaml import Loader - -# follows similar logic to cwrap, ignores !inc, and just looks for [[]] - - -def parse(filename): - with open(filename, 'r') as file: - declaration_lines = [] - declarations = [] - in_declaration = False - for line in file.readlines(): - line = line.rstrip() - if line == '[[': - declaration_lines = [] - in_declaration = True - elif line == ']]': - in_declaration = False - declaration = yaml.load('\n'.join(declaration_lines), Loader=Loader) - declarations.append(declaration) - elif in_declaration: - declaration_lines.append(line) - declarations = [process_declaration(declaration) for declaration in declarations] - return declarations - -def process_declaration(declaration): - declaration = copy.deepcopy(declaration) - if "arguments" in declaration: - declaration["schema_order_arguments"] = copy.deepcopy(declaration["arguments"]) - if "options" in declaration: - declaration["options"] = [process_declaration(option) for option in declaration["options"]] - return declaration diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py deleted file mode 100644 index f996e73e5d9..00000000000 --- a/aten/src/ATen/function_wrapper.py +++ /dev/null @@ -1,1544 +0,0 @@ -# HEY! Trying to understand what this file does? Read -# "what has to be done to add a Operation ..." first! 
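For context on the parser deleted above: cwrap_parser.parse() just collects the lines between a bare `[[` and `]]`, YAML-loads each block, and process_declaration() copies `arguments` into `schema_order_arguments`. A minimal sketch of running it on a toy block (the field values below are invented for illustration, not taken from the real Declarations.cwrap, and it assumes aten/src/ATen is importable):

import tempfile

import cwrap_parser  # the module deleted above

toy = """\
[[
  name: _th_example
  cname: example
  arguments:
    - THTensor* self
]]
"""

with tempfile.NamedTemporaryFile('w', suffix='.cwrap', delete=False) as f:
    f.write(toy)
    path = f.name

decls = cwrap_parser.parse(path)
print(decls[0]['name'])                    # _th_example
print(decls[0]['schema_order_arguments'])  # ['THTensor* self']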
- -import re -import copy -from code_template import CodeTemplate - - -from typing import Any, Dict, List, Optional, Set, Tuple, NamedTuple - -try: - from mypy_extensions import TypedDict -except ImportError: - # Avoid the dependency on the mypy_extensions package. - # It is required, however, for type checking. - def TypedDict(name, attrs, total=True): # type: ignore - return Dict[Any, Any] - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# what has to be done to add a Operation ... -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -# TH functions are generated into at::legacy::cpu and at::legacy::cuda, -# where they can be called directly by a native function, they can be wrapped -# by a native function that handles dispatch - -LEGACY_TH_DECLARATION = CodeTemplate("""\ -${return_type} ${api_name}(${formals}); -""") - -LEGACY_TH_DEFINITION = CodeTemplate("""\ -${return_type} ${api_name}(${formals}) { - ${device_guard_declaration} - ${type_definition_body} -} -""") - -LEGACY_TH_DEFINITION_SWITCH_STATEMENT = CodeTemplate("""\ -${dispatch_scalar_type_declaration} -${switch_prologue} -switch (dispatch_scalar_type) { - ${cases} - default: - AT_ERROR("${api_name} not supported on ${Type} for ", dispatch_scalar_type); -} -${switch_epilogue} -""") - -LEGACY_TH_DEFINITION_CASE = CodeTemplate("""\ -case ScalarType::${ScalarName}: { - ${case_body} - break; -} -""") - -# Native functions are generated and registered on the dispatcher. We register the -# function on Backend::Undefined if it does not have backend dependent dispatch. -# In this case, it will be called for all backends, but can be overwritten on a -# per backend basis. -NATIVE_DISPATCH_DECLARATION = CodeTemplate("""\ -${return_type} ${type_wrapper_name}(${native_formals}); -""") - -NATIVE_DISPATCH_DEFINITION_DEFAULT = CodeTemplate("""\ -${return_type} ${type_wrapper_name}(${native_formals}) { - ${device_guard_declaration} - ${return_call} at::native::${native_type_method_dispatch}(${actuals}); -} -""") - -NATIVE_DISPATCH_DEFINITION_CPU_BACKEND = CodeTemplate("""\ -${return_type} ${type_wrapper_name}(${native_formals}) { - ${return_call} at::native::${native_type_method_dispatch}(${actuals}); -} -""") - -NATIVE_DISPATCH_DEFINITION_GENERIC_BACKEND = CodeTemplate("""\ -${return_type} ${type_wrapper_name}(${native_formals}) { - ${device_init} - ${device_guard_declaration} - ${return_call} at::native::${native_type_method_dispatch}(${actuals}); -} -""") - -# A schema registration specifies alias analysis for an operator, but doesn't -# actually provide an implementation. Although our registration API allows you -# to specify all of this information at a function registration site, it's -# better to do it once at a schema registration so that we don't have to -# repeat ourselves everywhere else. -SCHEMA_REGISTRATION = CodeTemplate("""\ -m.def("${unqual_schema_string}"); -""") - -# NOTE[UnboxedOnly] Many of our codegen templates currently exist twice, once -# in an _UNBOXEDONLY_ variant and once without _UNBOXEDONLY_. This is because -# ops that are `use_c10_dispatcher: full` need different c++ code than ops -# that aren't `use_c10_dispatcher: full` yet. The _UNBOXEDONLY_ variants -# are for ops that aren't `use_c10_dispatcher: full` yet and those code templates -# can be deleted once all ops are `use_c10_dispatcher: full`. -# If you update one of the templates, you likely also have to update the other. 
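A rough illustration of how these ${...} templates are filled in: CodeTemplate (from code_template.py) substitutes an environment dict plus keyword overrides into the placeholders, and additionally knows how to join and indent list values. The sketch below uses the stdlib string.Template as a stand-in, with made-up values shaped like what the generator passes:

from string import Template

# Same shape as NATIVE_DISPATCH_DEFINITION_DEFAULT above, minus CodeTemplate's
# list/indentation handling.
demo = Template("""\
${return_type} ${type_wrapper_name}(${native_formals}) {
  ${device_guard_declaration}
  ${return_call} at::native::${native_type_method_dispatch}(${actuals});
}
""")

print(demo.substitute(
    return_type='Tensor',
    type_wrapper_name='add_Tensor',   # hypothetical wrapper name
    native_formals='const Tensor & self, const Tensor & other, Scalar alpha',
    device_guard_declaration='// DeviceGuard omitted',
    return_call='return',
    native_type_method_dispatch='add',
    actuals='self, other, alpha',
))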
- -# NB: Specifiction of the namespace is handled by the enclosing -# TORCH_LIBRARY macro invocation -# See NOTE[UnboxedOnly] -DEFAULT_UNBOXEDONLY_FUNCTION_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", - torch::CppFunction::makeUnboxedOnly(&TypeDefault::${type_wrapper_name})); -""") - -DEFAULT_FUNCTION_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", - c10::impl::hacky_wrapper_for_legacy_signatures<${schema_order_cpp_signature}>(TORCH_FN(TypeDefault::${type_wrapper_name}))); -""") - -# NB: In the ordinary, TypeDerived code generation work flow, specification -# of the backend is handled by the enclosing block, so the torch::dispatch -# invocation here is strictly unnecessary. However, in the fbcode mobile -# only workflow using per-op registration, these registrations will get dumped -# in a TORCH_LIBRARY_FRAGMENT that does not have an ambient backend. So -# the torch::dispatch specification here is important! See -# Note [Redundancy in registration code is OK] for how we handle redundant info. -BACKEND_UNBOXEDONLY_FUNCTION_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", - torch::dispatch(DispatchKey::${Backend}, - torch::CppFunction::makeUnboxedOnly(&${Type}::${type_wrapper_name})) -); -""") - -BACKEND_FUNCTION_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", - torch::dispatch(DispatchKey::${Backend}, - c10::impl::hacky_wrapper_for_legacy_signatures<${schema_order_cpp_signature}>( - TORCH_FN(${Type}::${type_wrapper_name}))) -); -""") - -# add non-virtual declaration to TensorBody.h -TENSOR_METHOD_DECLARATION = CodeTemplate("""\ -${return_type} ${api_name}(${method_formals_with_defaults}) const; -""") - -# add non-virtual declaration to Tensor.cpp -TENSOR_METHOD_DEFINITION = CodeTemplate("""\ - -// ${schema_string} -${return_type} Tensor::${api_name}(${method_formals}) const { - static auto op = c10::Dispatcher::singleton() - .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") - .typed<${tensor_method_cpp_signature}>(); - return op.call(${tensor_method_actuals}); -} -""") - -# add a method declaration in Functions.h -FUNCTION_DECLARATION = CodeTemplate("""\ -CAFFE2_API ${return_type} ${api_name}(${formals_with_defaults}); -""") - -# add a method declaration in Functions.h -DEPRECATED_FUNCTION_DECLARATION = CodeTemplate("""\ -C10_DEPRECATED CAFFE2_API ${return_type} ${api_name}(${formals_with_defaults}); -""") - -# add method definition in Functions.h -FUNCTION_DEFINITION = CodeTemplate("""\ - -// ${schema_string} -${return_type} ${api_name}(${formals}) { - static auto op = c10::Dispatcher::singleton() - .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") - .typed<${function_cpp_signature}>(); - return op.call(${function_actuals}); -} -""") - -IFDEF_BLOCK = CodeTemplate("""\ -#ifdef ${ifdef_guard} -${content} -#endif -""") - -# add a native declaration for a native function -NATIVE_DECLARATION = CodeTemplate("""\ -CAFFE2_API ${return_type} ${native_type_method_dispatch}(${native_formals_with_defaults}); -""") - -CALL_TEMPLATE = CodeTemplate("${cname}(${actuals})") - -OPERATOR_NAME = CodeTemplate("aten::${operator_name}") - -OPERATOR_NAME_FULL = CodeTemplate("""\ - {"aten::${operator_name}", "${overload_name}"}, -""") - -# scalar_name, c_type, accreal, is_floating_type -scalar_types = [ - ('Bool', 'bool', 'BoolAccrealNotDefined', False), - ('Byte', 'uint8_t', 'Long', False), - ('Char', 'int8_t', 'Long', False), - ('Double', 
'double', 'Double', True), - ('Float', 'float', 'Double', True), - ('Int', 'int', 'Long', False), - ('Long', 'int64_t', 'Long', False), - ('Short', 'int16_t', 'Long', False), - ('Half', 'Half', 'Double', True), - ('BFloat16', 'BFloat16', 'BFloat16AccrealNotDefined', True), - ('ComplexFloat', 'ComplexFloat', 'ComplexDouble', False), - ('ComplexDouble', 'ComplexDouble', 'ComplexDouble', False), -] - -class NYIError(Exception): - """Indicates we don't support this declaration yet""" - - __slots__ = ['reason'] - - def __init__(self, reason): - self.reason = reason - - -TYPE_FORMAL_GENERIC = { - 'THTensor*': 'Tensor &', - 'THByteTensor*': 'Tensor &', - 'THIndexTensor*': 'Tensor &', - 'THBoolTensor*': 'Tensor &', - 'IntArrayRefSize': 'IntArrayRef', - 'accreal': 'Scalar', - 'real': 'Scalar', - 'long': 'int64_t', -} - -DYNAMIC_TYPE = { - 'THTensor*': 'Tensor', - 'THByteTensor*': 'ByteTensor', - 'THBoolTensor*': 'BoolTensor', - 'THIndexTensor*': 'IndexTensor', - 'IntArrayRefSize': 'IntArrayRef', - 'accreal': 'accreal', - 'real': 'real', - 'long': 'int64_t', -} - -NATIVE_DYNAMIC_TYPE = { - 'Tensor &': 'Tensor', - 'const Tensor &': 'Tensor', -} - -TYPE_RETURN = { - 'THTensor*': 'Tensor', - 'THIndexTensor*': 'Tensor', - 'THByteTensor*': 'Tensor', - 'THBoolTensor*': 'Tensor', - 'real': 'Tensor', - 'accreal': 'Tensor', - 'long': 'int64_t', -} - -CHECKED_CAST = { - 'THTensor*': - CodeTemplate( - 'checked_dense_tensor_unwrap(' - '${arg_name}, "${arg_name}", ${arg_pos}, "${api_name}", ${null_okay}, ' - 'DeviceType::${DeviceType}, ${scalar_type})'), - 'THByteTensor*': - CodeTemplate( - 'checked_dense_tensor_unwrap(' - '${arg_name}, "${arg_name}", ${arg_pos}, "${api_name}", ${null_okay}, ' - 'DeviceType::${DeviceType}, ScalarType::Byte)'), - 'THBoolTensor*': - CodeTemplate( - 'checked_dense_tensor_unwrap(' - '${arg_name}, "${arg_name}", ${arg_pos}, "${api_name}", ${null_okay}, ' - 'DeviceType::${DeviceType}, ScalarType::Bool)'), - 'THIndexTensor*': - CodeTemplate( - 'checked_dense_tensor_unwrap(' - '${arg_name}, "${arg_name}", ${arg_pos}, "${api_name}", ${null_okay}, ' - 'DeviceType::${DeviceType}, ScalarType::Long)'), - 'real': CodeTemplate('${arg_name}.to${ScalarName}()'), - 'accreal': CodeTemplate('${arg_name}.to${AccScalarName}()'), - 'TensorList': CodeTemplate( - 'checked_dense_tensor_list_unwrap(${arg_name},"${arg_name}",${arg_pos}, ' - 'DeviceType::${DeviceType}, ${scalar_type})'), - 'IntArrayRef': CodeTemplate('check_intlist<${size}>(${arg_name}, "${arg_name}", ${arg_pos})') -} - -CHECKED_USE = { - 'THTensor*': '{}_', - 'THIndexTensor*': '{}_', - 'THByteTensor*': '{}_', - 'THBoolTensor*': '{}_', - 'TensorList': "{0}_.data(), {0}_.size()", -} - -CHECKED_USE_NULLABLE = CodeTemplate('${arg_name}_ ? 
${usage} : NULL') - -ALLOC_NOARGS_WRAP = { - 'THTensor*': 'c10::make_intrusive' - '(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),' - 'DispatchKey::${Backend}, scalarTypeToTypeMeta(${ScalarName})).release()', - 'THByteTensor*': 'c10::make_intrusive' - '(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),' - 'DispatchKey::${Backend}, scalarTypeToTypeMeta(ScalarType::Byte)).release()', - 'THBoolTensor*': 'c10::make_intrusive' - '(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),' - 'DispatchKey::${Backend}, scalarTypeToTypeMeta(ScalarType::Bool)).release()', - 'THIndexTensor*': 'c10::make_intrusive' - '(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),' - 'DispatchKey::${Backend}, scalarTypeToTypeMeta(ScalarType::Long)).release()', -} - -# Replacements for constants when calling into TH -CONSTANT_REPLACEMENTS = [ - ('AS_REAL', '${ScalarType}'), -] - -# Replacements for constants in header file function definitions -HEADER_CONSTANT_REPLACEMENTS = [ - (r'AS_REAL\((.*)\)', r'\1'), -] - - -class nested_dict(object): - def __init__(self, base, parent): - self.base, self.parent = base, parent - - def __getitem__(self, x): - r = self.base.get(x) - if r is not None: - return r - return self.parent[x] - - -Environment = TypedDict('Environment', { - 'state': str, - 'ScalarType': str, - 'ScalarName': str, - 'THTensor': str, - 'THType': str, - 'Backend': str, - 'DeviceType': str, - 'AccScalarName': str, -}) - -TopEnvironment = TypedDict('TopEnvironment', { - 'type_registrations': List[str], - 'type_headers': List[str], - 'function_registrations': List[str], - 'aten_ops': List[str], - 'type_method_declarations': List[str], - 'type_method_definitions': List[str], - 'tensor_method_declarations': List[str], - 'tensor_method_definitions': List[str], - 'function_declarations': List[str], - 'function_definitions': List[str], - 'type_ids': List[str], - 'native_function_declarations': List[str], -}) - -# A Declarations.cwrap formal argument -# type can contain THTensor* types -# NOTE: this must contain all 'AtFormal' attributes, because FunctionOption -# doesn't differentiate between whether we have AtFormals or THFormals -THFormal = TypedDict('THFormal', { - 'name': str, - 'type': str, - 'dynamic_type': str, - 'kwarg_only': bool, - 'is_nullable': bool, - 'default': str, - 'output': bool, - 'size': int, - 'annotation': str, - 'allocate': bool, - 'mask': bool, -}, total=False) - -# Generic ATen formal or native_functions.yaml formal argument. -# type can contain Tensor& reference types. -AtFormal = TypedDict('AtFormal', { - 'name': str, - 'type': str, - 'dynamic_type': str, - 'kwarg_only': bool, - 'is_nullable': bool, - 'default': str, - 'output': bool, - 'size': int, - 'annotation': str, -}, total=False) - -# Note [field_name versus name] -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# What is the difference between "field_name" and "name"? -# -# Return values of ATen operators always have a name: if it is not -# explicitly assigned a name inside native_functions.yaml like func: -# myop() -> (Tensor indices, Tensor value), then the codegen will -# automatically assign it a name like result0, or name might be -# specified inside Declarations.cwrap. We don't want these assigned -# names to become part of the public API when we return a namedtuple for -# any such multiple-return function. -# -# Thus field_name is like name, but it is defined only when there is a -# name specified in native_functions.yaml. 
If field_name is defined, -# then the codegen would generate code to return namedtuple. Otherwise, -# it would just return tuple. - -ReturnType = TypedDict('ReturnType', { - 'name': str, - # See Note [field_name versus name] - 'field_name': str, - 'type': str, - 'dynamic_type': str, -}, total=False) - -ReturnDecl = TypedDict('ReturnDecl', { - 'kind': str, - 'type': str, - 'arguments': List[int], -}, total=False) - -# Represents a buffer in nn.yaml -NNBuffer = TypedDict('NNBuffer', { - 'name': str, -}) - -FunctionOption = TypedDict('FunctionOption', { - 'actuals': List[str], - 'schema_order_actuals': List[str], - 'api_name': str, - # Like api_name, but it is the name of the internal - # CPUType/CUDAType/TypeDefault function that wraps - # the actual native call. This name is NOT user - # visible and is mangled with the overload name - 'type_wrapper_name': str, - 'arguments': List[THFormal], - # 'schema_order_arguments' is like 'arguments' but keeps them in the - # order they are defined in the JIT function schema while - # 'arguments' does some modifications (e.g. reorders out arguments - # and packs TensorOptions) - 'schema_order_arguments': List[THFormal], - 'backend_types': Dict[str, List[str]], - 'backends': List[str], - 'buffers': List[NNBuffer], - # cimpls is really a List[FunctionOption] - 'cimpls': List[Any], - 'cname': str, - # explicitly specify whether the function is a factory function or other special category - 'category_override': str, - 'condition': str, - 'device_guard': bool, - 'device_guard_declaration': str, - 'dispatch_scalar_type_declaration': str, - 'use_c10_dispatcher': str, - 'manual_kernel_registration': bool, - 'with_gil': bool, - 'cpu_half': bool, - 'cpu_bfloat16': bool, - 'cuda_bfloat16': bool, - 'deprecated': bool, - 'cpu_bool': bool, - 'cuda_bool': bool, - # See Note [field_name versus name] - 'field_name': str, - 'formals_list': List[AtFormal], - 'formals_with_defaults': List[str], - 'native_formals_with_defaults': List[str], - 'formals': List[str], - 'native_formals': List[str], - 'formals_types': List[str], - 'cpp_signature': str, - # 'schema_order_cpp_signature' is like 'cpp_signature' but keeps them in the - # order they are defined in the JIT function schema while - # 'cpp_signature' does some modifications (e.g. 
reorders out arguments - # and packs TensorOptions) - 'schema_order_cpp_signature': str, - 'inplace': bool, - 'matches_jit_signature': bool, - # This controls whether or not we generate the interface in Type or - # TypeExtendedInterface - 'extended_method': bool, - 'method_actuals': List[str], - 'schema_order_method_actuals': List[str], - 'method_formals_with_defaults': List[str], - 'method_formals': List[str], - 'mode': str, - 'python_module': str, - 'name': str, - 'operator_name': str, - 'overload_name': str, - 'native_type_method_dispatch': str, - # options should be List[FunctionOption] - 'options': Any, - 'schema_string': str, - 'return_call': str, - 'return_type': str, - 'return': ReturnDecl, - 'returns': List[ReturnType], - 'sparse': bool, - 'type_definition_body': List[str], - 'type_method_definition_dispatch': str, - 'variants': str, -}) - -OutputDeclaration = NamedTuple('OutputDeclaration', [ - ('name', str), - ('operator_name', str), - ('overload_name', str), - ('use_c10_dispatcher', str), - ('manual_kernel_registration', bool), - ('category_override', str), - ('matches_jit_signature', bool), - ('schema_string', str), - ('arguments', List[AtFormal]), - ('schema_order_cpp_signature', str), - # 'schema_order_arguments' is like 'arguments' but keeps them in the - # order they are defined in the JIT function schema while - # 'arguments' does some modifications (e.g. reorders out arguments - # and packs TensorOptions) - ('schema_order_arguments', List[AtFormal]), - ('method_of', List[str]), - ('mode', str), - ('python_module', str), - ('buffers', Optional[List[str]]), - ('returns', List[ReturnType]), - ('inplace', bool), - ('is_factory_method', bool), - ('abstract', bool), - ('device_guard', bool), - ('with_gil', bool), - ('deprecated', bool), -]) - -FunctionCode = NamedTuple('FunctionCode', [ - ('definition', str), - ('declaration', str), -]) - -OpRegistration = NamedTuple('OpRegistration', [ - ('operator_name', str), - ('registration_code', str), - ('schema_registration_code', str), -]) - - -def device_guard(option, dispatch_options, dispatch_tensor): - # For factory methods the `DeviceGuard` is already in the template. 
- if option.get('device_guard', True): - if dispatch_options: - return 'const DeviceGuard device_guard({}.device());'.format(dispatch_options['name']) - if dispatch_tensor: - return 'const OptionalDeviceGuard device_guard(device_of({}));'.format(dispatch_tensor) - return '// DeviceGuard omitted' - - -def dispatch_scalar_type(option, dispatch_options, dispatch_tensor): - if dispatch_options: - return 'auto dispatch_scalar_type = typeMetaToScalarType({}.dtype());'.format(dispatch_options['name']) - if dispatch_tensor: - return 'auto dispatch_scalar_type = infer_scalar_type({});'.format(dispatch_tensor) - return '// dispatch_scalar_type omitted' - - -def is_real_argument_to_wrapper(argument): - # type: (THFormal) -> bool - return not argument.get('output', False) and\ - argument['type'] != 'CONSTANT' and\ - argument['type'] != 'argument' - - -def is_mutable_formal_argument(argument, option): - # type: (THFormal, FunctionOption) -> bool - return argument.get('output') or option['inplace'] and argument['name'] == 'self' - - -def check_methods_do_not_start_with_underscore(name, is_method): - if name in {'_values', '_indices', '_nnz', '_dimI', '_dimV', '_coalesced_', - '_version'}: - return - if is_method and name.startswith('_') and not name.startswith('__') and not name.startswith('_th_'): - message = "Function '{}' starts with a single underscore and is ".format(name) - message += "configured to have a method on Tensor. Functions that start with " - message += " a single underscore should only be functions in the at:: " - message += "namespace and not methods on Tensor!" - raise RuntimeError(message) - - -def to_return_type(arg, option): - # type: (THFormal, FunctionOption) -> ReturnType - t = arg['type'] - rt = TYPE_RETURN.get(t, t) - if rt == 'Tensor' and not arg.get('allocate'): - rt = rt + ' &' - if not is_mutable_formal_argument(arg, option): - rt = 'const ' + rt - return { - 'name': arg['name'], - 'type': rt, - 'dynamic_type': DYNAMIC_TYPE.get(arg['type'], arg['type']), - } - - -def is_any_tensor_type(formal): - return (formal['dynamic_type'] == 'Tensor' or formal['dynamic_type'] == 'ByteTensor' - or formal['dynamic_type'] == 'IndexTensor' or formal['dynamic_type'] == 'BoolTensor') - - -def find_tensors(formals): - # type: (List[AtFormal]) -> List[str] - return [formal['name'] for formal in formals if is_any_tensor_type(formal)] - - -def find_tensorlists(formals): - # type: (List[AtFormal]) -> List[str] - return [formal['name'] for formal in formals if formal['dynamic_type'] == 'TensorList'] - - -def find_dispatch_tensor(formals): - # type: (List[AtFormal]) -> Optional[str] - # Determine legacy TH-style single dispatch tensor. - # - # Also used to determine what tensor should be used to provide a default - # DeviceGuard. Unlike dispatch, we don't guard on ALL tensor arguments - # (because this is not actually a thing you can do.) Guarding on the - # first argument is best effort to help people avoid doing this - # themselves. 
- - for formal in formals: - if formal['name'] == 'self' and is_any_tensor_type(formal) and not formal.get('is_nullable', False): - return formal['name'] - # otherwise dispatch to the first Tensor or TensorList - for formal in formals: - if 'TensorList' == formal['dynamic_type'] or is_any_tensor_type(formal) and \ - not formal.get('is_nullable', False): - return formal['name'] - - return None - - -def is_multidispatch_formal(formal): - # type: (AtFormal) -> bool - return formal['dynamic_type'] in ['TensorOptions', 'TensorList'] or is_any_tensor_type(formal) - - -def find_multidispatch_formals(formals): - # type: (List[AtFormal]) -> List[AtFormal] - # Compute the list of all arguments which should be considered - # for multiple dispatch. Note that this doesn't completely replace - # find_dispatch_tensor because we use the "dispatch tensor" to determine - # device guards. TensorOptions is included as part of this calculation. - # - # The interaction of multiple dispatch with TensorOptions - # is quite interesting. In particular, suppose I have: - # - # cuda_tensor.new_like(1, device='cpu') - # - # Multiple dispatch will attempt a dispatch to CUDA, even though - # the end tensor that should be produced here is a CPU one. The - # upshot is that if you have an operator with mixed TensorOptions - # and Tensor arguments, you MUST only ever register it generically. - return [f for f in formals if is_multidispatch_formal(f)] - - -def find_formal_by_type(formal_name, formals): - # type: (str,List[AtFormal]) -> Optional[AtFormal] - for formal in formals: - if formal_name == formal['dynamic_type']: - return formal - return None - - -def format_formal(f): - # type: (AtFormal) -> str - return '{} {}'.format(f['type'], f['name']) - - -def formal_with_default(f): - # type: (AtFormal) -> str - s = format_formal(f) - v = f.get('default') - if v is None: - return s - if isinstance(v, bool): - v = str(v).lower() - return '{}={}'.format(s, v) - - -def gen_dispatch_key_init(var_name, formals): - # type: (str, List[AtFormal]) -> List[str] - topt_formals = [] - non_topt_formals = [] - for f in find_multidispatch_formals(formals): - if f['dynamic_type'] == 'TensorOptions': - topt_formals.append(f) - else: - non_topt_formals.append(f) - - if len(topt_formals) == 1 and non_topt_formals == []: - topt = topt_formals[0] - return ['DispatchKey {} = {}.computeDispatchKey();'.format(var_name, topt['name'])] - - subexprs = [] - for f in topt_formals: - subexprs.append('DispatchKeySet({}.computeDispatchKey())'.format(f['name'])) - if non_topt_formals != []: - args = ', '.join([f['name'] for f in non_topt_formals]) - subexprs.append('c10::detail::multi_dispatch_key_set({})'.format(args)) - return [ - 'DispatchKeySet _dk_set = {};'.format(' | '.join(subexprs)), - 'DispatchKeySet _dk_mask = c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, DispatchKey::BackendSelect);', - 'DispatchKey {} = c10::impl::dispatchTypeId(_dk_set, _dk_mask);'.format(var_name), - ] - - -def is_factory(option): - # type: (FunctionOption) -> bool - formals = option['formals_list'] - return find_formal_by_type('TensorOptions', formals) is not None and 'method' not in option['variants'] - - -def gen_device_init(option, backend_type_env): - # type: (FunctionOption, Environment) -> List[str] - # generate a device init statement, if the passed function option is a Tensor factory. 
- # - if is_factory(option): - name = option['name'] - device_type = backend_type_env['DeviceType'] - if device_type == 'CUDA' or device_type == 'HIP': - return ['globalContext().lazyInit{}();'.format(device_type)] - return [] - -# TODO The maybe_unwrap_optional_tensors is only needed because our at::native::xxx functions -# still take "Tensor" instead of "optional", so we need CPUType, TypeDefault, ... -# to do the same. Once at::native::xxx are converted, we can remove use_optional_tensor -# and use the use_optional_tensor=True behavior always. -def maybe_unwrap_optional_tensors(option, formals, args): - assert len(formals) == len(args), \ - "Assert we didn't screw up with method_args removing self but forgetting to remove it from formals" - if option['use_c10_dispatcher'] == 'full': - def maybe_unwrap_optional_tensor(formal, arg): - if formal['dynamic_type'] == 'Tensor' and formal['is_nullable']: - return "{}.has_value() ? *{} : at::Tensor()".format(arg, arg) - else: - return arg - return [maybe_unwrap_optional_tensor(formal, arg) for (formal, arg) in zip(formals, args)] - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - return args - -def create_generic(top_env, declarations): - # type: (TopEnvironment, List[FunctionOption]) -> Tuple[List[OutputDeclaration], List[OpRegistration]] - # translates defaults from cwrap types to C++ values - def translate_default(argument, type_str, default): - # type: (THFormal, str, Any) -> Any - if default is None: - # cause the default constructor for the object to run - return '{}' - for pattern, replacement in HEADER_CONSTANT_REPLACEMENTS: - default = re.sub(pattern, replacement, str(default)) - if type_str in {'Scalar', 'int64_t', 'double'}: - try: - return int(default) - except Exception: - try: - return float(default) - except Exception: - return default - elif type_str == 'bool': - assert default.lower() in ['true', 'false'] - return default.lower() == 'true' - else: - return default - - # change from THTensor* to Tensor & so we get how it will appear - # in the aten argument list... 
- def translate_formal(argument, option): - # type: (THFormal, FunctionOption) -> AtFormal - type_str = TYPE_FORMAL_GENERIC.get(argument['type'], argument['type']) - if type_str == 'Tensor &' and not is_mutable_formal_argument(argument, option): - type_str = 'const ' + type_str - translated = { - 'name': argument['name'], - 'type': type_str, - 'dynamic_type': DYNAMIC_TYPE.get(argument['type'], argument['type']), - } # type: AtFormal - if 'default' in argument: - default = translate_default(argument, type_str, argument['default']) - translated['default'] = default - if argument.get('output'): - translated['output'] = True - if argument.get('size'): - translated['size'] = argument['size'] - if argument.get('is_nullable') is not None: - translated['is_nullable'] = argument['is_nullable'] - return translated - - def get_formals(option, schema_order, include_constants=False): - # type: (FunctionOption, bool, bool) -> List[AtFormal] - seen = set() # type: Set[str] - pos_args = [] # type: List[THFormal] - kwd_args = [] # type: List[THFormal] - - def insert(argument): - # type: (THFormal) -> None - if argument['name'] not in seen: - seen.add(argument['name']) - # there are no kwarg_only THFormals - pos_args.append(argument) - - def has_output_mask(argument): - # type: (THFormal) -> bool - return argument.get('allocate', False) and argument.get('mask', False) - - if schema_order: - arguments = copy.deepcopy(option['schema_order_arguments']) - else: - arguments = copy.deepcopy(option['arguments']) - for argument in arguments: - if argument.get('output') and not argument.get('allocate', False): - insert(argument) - for argument in arguments: - if include_constants and argument['type'] == 'CONSTANT': - insert(argument) - elif is_real_argument_to_wrapper(argument): - insert(argument) - if any(has_output_mask(arg) for arg in arguments): - mask_size = sum(has_output_mask(arg) for arg in arguments) - insert({ - 'name': 'output_mask', - # NB: Lack of space in comma works around parsing - # problem in gen_variable_type.py - 'type': 'std::array'.format(mask_size), - 'default': '{{' + ', '.join(['true'] * mask_size) + '}}', - }) - - result = pos_args + kwd_args - return [translate_formal(argument, option) for argument in result] - - def get_return_types(option): - # type: (FunctionOption) -> List[ReturnType] - ret = option['return'] - if ret['kind'] == 'arguments': - argument_indices = ret['arguments'] - if len(argument_indices) == 1: - the_arg = option['arguments'][argument_indices[0]] - return [to_return_type(the_arg, option)] - else: - return [to_return_type(option['arguments'][idx], option) - for idx in argument_indices] - elif ret['kind'] == 'type': - return [{ - 'type': TYPE_RETURN.get(ret['type'], ret['type']), - 'dynamic_type': DYNAMIC_TYPE.get(ret['type'], ret['type']), - }] - else: - raise Exception("format_return_type") - - def format_return_type(return_types): - # type: (List[ReturnType]) -> str - if len(return_types) == 0: - return "void" - elif len(return_types) == 1: - return return_types[0]['type'] - return "std::tuple<{}>".format(','.join(r['type'] for r in return_types)) - - def process_schema_order_actual(schema_order_actual): - if schema_order_actual == 'dtype': - return 'optTypeMetaToScalarType(options.dtype_opt())' - elif schema_order_actual == 'layout': - return 'options.layout_opt()' - elif schema_order_actual == 'device': - return 'options.device_opt()' - elif schema_order_actual == 'pin_memory': - return 'options.pinned_memory_opt()' - elif schema_order_actual == 'memory_format': - 
return 'c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)' - else: - return schema_order_actual - - def process_legacy_th_option(option): - # type: (FunctionOption) -> None - # Mutably populate option with derived values computed from values - # passed in to option. - option['inplace'] = re.search( - '(^__i|[^_]_$)', option['api_name']) is not None - - # print(yaml.dump(option)) - formals = get_formals(option, False) - schema_order_formals = get_formals(option, True) - option['formals_list'] = formals - option['formals'] = [format_formal(f) for f in formals] - option['formals_with_defaults'] = [formal_with_default(f) for f in formals] - option['returns'] = get_return_types(option) - option['return_type'] = format_return_type(option['returns']) - option['return_call'] = 'return ' if option['return_type'] != 'void' else '' - option['actuals'] = [f['name'] for f in formals] - - option['method_formals'] = [format_formal(f) for f in formals - if f['name'] != 'self'] - option['method_formals_with_defaults'] = ( - [formal_with_default(f) for f in formals if f['name'] != 'self']) - # *this is 'const Tensor&' since all Tensor methods are const and must - # be const_casted to be accepted as native function's non-const argument - option['method_actuals'] = [ - f['name'] if f['name'] != 'self' else 'const_cast(*this)' for f in formals] - - assert 'method' not in option['variants'], 'TH functions cannot be methods' - is_function = 'function' in option['variants'] - # NB: TH functions don't support multiple dispatch - dispatch_tensor = find_dispatch_tensor(formals) - is_namespace_function = is_function and dispatch_tensor is not None - - if option['mode'] == 'TH': - option['device_guard'] = False - option['device_guard_declaration'] = device_guard(option, False, dispatch_tensor) - option['dispatch_scalar_type_declaration'] = dispatch_scalar_type(option, False, dispatch_tensor) - - assert option['extended_method'], 'Expected legacy operator to be an extended method' - - def native_get_formals(option, schema_order, use_optional_tensor, include_constants=False): - # type: (FunctionOption, bool, bool, bool) -> List[AtFormal] - - # TODO The use_optional_tensor argument is only needed because our at::native::xxx functions - # still take "Tensor" instead of "optional", so we need CPUType, TypeDefault, ... - # to do the same. Once at::native::xxx are converted, we can remove use_optional_tensor - # and use the use_optional_tensor=True behavior always. 
- - seen = set() # type: Set[str] - pos_args = [] - kwd_args = [] - - def insert(argument): - # type: (AtFormal) -> None - if argument['name'] not in seen: - seen.add(argument['name']) - if argument.get('kwarg_only', False): - kwd_args.append(argument) - else: - pos_args.append(argument) - - if schema_order: - arguments = option['schema_order_arguments'] - else: - arguments = option['arguments'] - for argument in arguments: - insert(argument) - - # not clear we need dynamic_type translation as we can specify the correct type - # directly in native functions - def add_dynamic_type(argument, option): - # type: (AtFormal, FunctionOption) -> AtFormal - argument['dynamic_type'] = NATIVE_DYNAMIC_TYPE.get(argument['type'], argument['type']) - return argument - - result = pos_args + kwd_args - result = [add_dynamic_type(argument, option) for argument in result] - - # ensure we get reference-type formals when appropriate - def native_translate_formals(argument, option): - # type: (AtFormal, FunctionOption) -> AtFormal - argument = copy.deepcopy(argument) - - def translate_map(const): - # type: (bool) -> Dict[str, str] - return { - 'Tensor': 'const Tensor &' if const else 'Tensor &', - 'Type': 'const Type &' if const else 'Type &', - 'TensorOptions': 'const TensorOptions &' if const else 'TensorOptions &', - 'TensorList': 'TensorList', - } - - if argument.get('is_nullable') and argument['type'] not in translate_map(False).keys(): - argument['type'] = "c10::optional<{}>".format(argument['type']) - elif use_optional_tensor and argument.get('is_nullable') and argument['type'] == 'Tensor': - argument['type'] = "const c10::optional&" - - - # Note: the 'self' trap is here only to preserve the const arg 0 for set_data. - # I.e., the signature of the cpp implementation currently fits the code - # generated from a misread schema, but the alias annotation is the truth. - # TODO fix the signature of set_data's cpp impl to match correct codegen from - # the current schema. - # then remove this - if argument['name'] == 'self': - is_mutable = option['inplace'] - else: - is_mutable = '!' in (argument['annotation'] or '') - - if is_mutable: - argument['type'] = translate_map(False).get(argument['type'], argument['type']) - else: - argument['type'] = translate_map(True).get(argument['type'], argument['type']) - - return argument - - result = [native_translate_formals(argument, option) for argument in result] - return result - - # this can return multiple return types in a list, e.g. ['Tensor', 'Tensor'] - def native_get_return_types(option): - # type: (FunctionOption) -> List[ReturnType] - ret = option['return'] - - return_types = [] # List[ReturnType] - for t_raw in ret: - # See Note [field_name versus name] - field_name = None - if isinstance(t_raw, str): - t = t_raw - name = None - else: - t = t_raw['type'] - name = t_raw['name'] - if 'field_name' in t_raw: - field_name = t_raw['field_name'] - - # can't actually return a TensorList (since it's a reference object) - actual_return_type = {'TensorList': 'std::vector'}.get(t, t) - - if actual_return_type == 'Tensor' and (option['inplace'] or option['api_name'].endswith('_out')): - # follow normal ATen convention of returning Tensor & for inplace functions. 
- actual_return_type = 'Tensor &' - - rtype = { - 'type': actual_return_type, - 'dynamic_type': NATIVE_DYNAMIC_TYPE.get(t, t), - } # type: ReturnType - if name is not None: - rtype['name'] = name - if field_name is not None: - rtype['field_name'] = field_name - return_types.append(rtype) - - return return_types - - def process_native(option): - # type: (FunctionOption) -> Optional[OutputDeclaration] - valid_modules = {'nn', 'fft', 'linalg'} - assert (option['python_module'] == '' or - option['python_module'] in valid_modules), \ - "Found python_module of {} for decl {}, but only \'\' string, \'nn\' and \'fft\' are supported".format( - option['python_module'], option['name']) - use_optional_tensors_in_cpp_frontend = option['use_c10_dispatcher'] == 'full' - formals = native_get_formals(option, False, use_optional_tensors_in_cpp_frontend) - native_formals = native_get_formals(option, False, False) - schema_order_formals = native_get_formals(option, True, use_optional_tensors_in_cpp_frontend) - option['formals_list'] = formals - option['formals'] = [format_formal(f) for f in formals] - option['native_formals'] = [format_formal(f) for f in native_formals] - option['formals_with_defaults'] = [formal_with_default(f) for f in formals] - option['native_formals_with_defaults'] = [formal_with_default(f) for f in native_formals] - option['returns'] = native_get_return_types(option) - option['return_type'] = format_return_type(option['returns']) - option['return_call'] = 'return ' if option['return_type'] != 'void' else '' - option['actuals'] = [f['name'] for f in formals] - option['schema_order_actuals'] = [f['name'] for f in schema_order_formals] - - option['formals_types'] = [f['type'] for f in option['formals_list']] - - option['cpp_signature'] = "{} ({})".format(option['return_type'], ", ".join(option['formals_types'])) - option['schema_order_cpp_signature'] = "{} ({})".format( - option['return_type'], - ", ".join([f['type'] for f in schema_order_formals])) - - option['method_formals'] = [format_formal(f) for f in formals - if f['name'] != 'self'] - option['method_formals_with_defaults'] = ( - [formal_with_default(f) for f in formals if f['name'] != 'self']) - # *this is 'const Tensor&' since all Tensor methods are const and must - # be const_casted to be accepted as native function's non-const argument - option['method_actuals'] = [ - f['name'] if f['name'] != 'self' else 'const_cast(*this)' for f in formals] - option['schema_order_method_actuals'] = [ - f['name'] if f['name'] != 'self' else 'const_cast(*this)' for f in schema_order_formals] - - if find_formal_by_type('TensorOptions', formals) is not None: - option['schema_order_actuals'] = [ - process_schema_order_actual(actual) for actual in option['schema_order_actuals']] - option['schema_order_method_actuals'] = [ - process_schema_order_actual(actual) for actual in option['schema_order_method_actuals']] - - def gen_tensor_method(option, formals): - # type: (Any, List[AtFormal]) -> FunctionCode - def swizzle_self(f): # blegh - if f['name'] == 'self': - fc = f.copy() - fc['name'] = '*this' - return fc - else: - return f - - dispatch_key_var_name = '_dk' - dispatch_key_init = gen_dispatch_key_init(dispatch_key_var_name, [swizzle_self(f) for f in formals]) - - method_actuals = maybe_unwrap_optional_tensors(option, formals, option['method_actuals']) - - # See NOTE[UnboxedOnly] - if option['use_c10_dispatcher'] == 'full': - tensor_method_actuals = option['schema_order_method_actuals'] - tensor_method_cpp_signature = 
option['schema_order_cpp_signature'] - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - tensor_method_actuals = option['method_actuals'] - tensor_method_cpp_signature = option['cpp_signature'] - - method_definition = TENSOR_METHOD_DEFINITION.substitute( - option, - tensor_method_actuals=tensor_method_actuals, - tensor_method_cpp_signature=tensor_method_cpp_signature - ) - return FunctionCode( - declaration=TENSOR_METHOD_DECLARATION.substitute(option), - definition=method_definition) - - def gen_namespace_function(option, multidispatch_formals): - # type: (Any, List[AtFormal]) -> FunctionCode - - dispatch_key_var_name = '_dk' - dispatch_key_init = gen_dispatch_key_init(dispatch_key_var_name, formals) - - declaration = DEPRECATED_FUNCTION_DECLARATION if option['deprecated'] else FUNCTION_DECLARATION - fn_declaration = declaration.substitute(option) - - actuals = maybe_unwrap_optional_tensors(option, formals, option['actuals']) - - # See NOTE[UnboxedOnly] - if option['use_c10_dispatcher'] == 'full': - function_actuals = option['schema_order_actuals'] - function_cpp_signature = option['schema_order_cpp_signature'] - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - function_actuals = option['actuals'] - function_cpp_signature = option['cpp_signature'] - - fn_definition = FUNCTION_DEFINITION.substitute( - option, - function_actuals=function_actuals, - function_cpp_signature=function_cpp_signature) - - return FunctionCode(definition=fn_definition, declaration=fn_declaration) - - assert find_formal_by_type('Type', formals) is None, \ - "Found Type argument in {}({}). Use TensorOptions instead.".format( - option['name'], ", ".join(option['method_formals_with_defaults'])) - - type_method_dispatch = option['type_method_definition_dispatch'] - - is_method = 'method' in option['variants'] - is_namespace_function = 'function' in option['variants'] - # For method-only entries, the first argument should be self - if is_method and not is_namespace_function: - assert formals[0]['name'] == 'self' - is_factory_method = is_factory(option) - - check_methods_do_not_start_with_underscore(option['name'], is_method) - - # NB: Device guard and scalar type generated code is still based on the - # first argument. Scalar type test will be removed once TH is removed. - # If you need more complex device guard behavior, you should disable - # device guard and then manually add the guards you need. - dispatch_options = find_formal_by_type('TensorOptions', formals) - guard_tensor = None if dispatch_options else find_dispatch_tensor(formals) - option['device_guard_declaration'] = device_guard(option, dispatch_options, guard_tensor) - option['dispatch_scalar_type_declaration'] = dispatch_scalar_type(option, dispatch_options, guard_tensor) - - top_env['aten_ops'].append(OPERATOR_NAME_FULL.substitute(option)) - - option['native_type_method_dispatch'] = type_method_dispatch - - # Note [Abstract ATen methods] - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # An abstract ATen method is one whose dispatch differs between - # types. These are implemented in derived types (with a - # standard (throwing) definition in Type). A concrete ATen - # method is one which has the same dispatch for all types; - # we just implement it in the base Type. This is exposed - # in Declarations.yaml via a field named 'abstract'. 
- abstract = False - op_registrations.append(OpRegistration( - operator_name=OPERATOR_NAME.substitute(option), - registration_code=SCHEMA_REGISTRATION.substitute(option), - schema_registration_code=SCHEMA_REGISTRATION.substitute(option))) - if isinstance(type_method_dispatch, dict): - abstract = True - # Having manual_kernel_registration for an abstract method doesn't make sense. - assert not option['manual_kernel_registration'] - else: - top_env['type_method_declarations'].append(NATIVE_DISPATCH_DECLARATION.substitute(option)) - top_env['type_method_definitions'].append(NATIVE_DISPATCH_DEFINITION_DEFAULT.substitute(option)) - if not option['manual_kernel_registration']: - # See NOTE[UnboxedOnly] - if option['use_c10_dispatcher'] == 'full': - op_registrations.append(OpRegistration( - operator_name=OPERATOR_NAME.substitute(option), - registration_code=DEFAULT_FUNCTION_REGISTRATION.substitute(option), - schema_registration_code=SCHEMA_REGISTRATION.substitute(option))) - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - op_registrations.append(OpRegistration( - operator_name=OPERATOR_NAME.substitute(option), - registration_code=DEFAULT_UNBOXEDONLY_FUNCTION_REGISTRATION.substitute(option), - schema_registration_code=SCHEMA_REGISTRATION.substitute(option))) - - # generate the at::native function declarations (i.e. what the user will implement) - if isinstance(type_method_dispatch, dict): - generated_native_functions = [] # type: List[str] - for key in sorted(type_method_dispatch.keys()): - value = type_method_dispatch[key] - # skip functions in different namespace, e.g. legacy::cpu - if "::" in value: - continue - if value not in generated_native_functions: - option['native_type_method_dispatch'] = value - top_env['native_function_declarations'].append(NATIVE_DECLARATION.substitute(option)) - generated_native_functions.append(value) - else: - top_env['native_function_declarations'].append(NATIVE_DECLARATION.substitute(option)) - - method_of = ['Type'] - if is_method: - code = gen_tensor_method(option, formals) - top_env['tensor_method_declarations'].append(code.declaration) - top_env['tensor_method_definitions'].append(code.definition) - method_of.append('Tensor') - - if is_namespace_function: - code = gen_namespace_function(option, formals) - top_env['function_definitions'].append(code.definition) - top_env['function_declarations'].append(code.declaration) - method_of.append('namespace') - - return OutputDeclaration( - name=option['api_name'], - operator_name=option['operator_name'], - overload_name=option['overload_name'], - use_c10_dispatcher=option['use_c10_dispatcher'], - manual_kernel_registration=option['manual_kernel_registration'], - schema_order_cpp_signature=option['schema_order_cpp_signature'], - category_override=option['category_override'], - matches_jit_signature=option["matches_jit_signature"], - schema_string=option["schema_string"], - arguments=formals, - schema_order_arguments=schema_order_formals, - method_of=method_of, - mode=option['mode'], - python_module=option['python_module'], - buffers=None, - returns=option['returns'], - inplace=option['inplace'], - is_factory_method=is_factory_method, - # See Note [Abstract ATen methods] - abstract=abstract, - device_guard=option.get('device_guard', True), - with_gil=option.get('with_gil', False), - deprecated=option['deprecated'], - ) - - output_declarations = [] # type: List[OutputDeclaration] - op_registrations = [] # type: List[OpRegistration] - for declaration in declarations: - 
output_options = [] # type: List[OutputDeclaration] - for option in declaration['options']: - option["matches_jit_signature"] = declaration["matches_jit_signature"] - option["schema_string"] = declaration["schema_string"] - try: - if option['mode'] != 'native': - # Mutably populate option with values - process_legacy_th_option(option) - else: - output_option = process_native(option) - if output_option: - output_options.append(output_option) - except NYIError: - option['skip'] = True - output_declarations.extend(output_options) - - return output_declarations, op_registrations - - -def create_derived(backend_type_env, declarations): - # type: (Environment, List[FunctionOption]) -> Tuple[List[str], List[str], List[OpRegistration], List[str], List[str]] - type_object_declarations = [] # type: List[str] - type_object_definitions = [] # type: List[str] - op_registrations = [] # type: List[OpRegistration] - legacy_th_declarations = [] # type: List[str] - legacy_th_definitions = [] # type: List[str] - is_cuda = 'CUDA' in backend_type_env['Backend'] - - def requires_checked_cast(argument): - # type: (THFormal) -> bool - if argument['type'] == 'IntArrayRef': - return 'size' in argument - return argument['type'] in CHECKED_CAST - - def nullable_argument(argument): - # type: (THFormal) -> bool - return argument.get('is_nullable', False) - - def get_argument(env, argument, option): - # type: (Environment, THFormal, FunctionOption) -> str - if requires_checked_cast(argument): - checked_use = CHECKED_USE.get( - argument['type'], '{}_').format(argument['name']) - if nullable_argument(argument): - checked_use = CHECKED_USE_NULLABLE.substitute( - env={}, arg_name=argument['name'], usage=checked_use) - return checked_use - elif argument['type'] == 'CONSTANT': - v = str(argument.get('default', argument['name'])) - for pattern, replacement in CONSTANT_REPLACEMENTS: - v = re.sub(pattern, replacement, v) - return CodeTemplate(v).substitute(env) - # e.g. argument 0, i.e. repeat the 0th argument in this position... - elif argument['type'] == 'argument': - index = int(argument['name']) - return get_argument(env, option['arguments'][index], option) - else: - return argument['name'] - - def get_arguments(env, arguments, option): - # type: (Environment, List[THFormal], FunctionOption) -> List[str] - return [get_argument(env, argument, option) - for argument in arguments] - - def allocate_arg(arg, output_count, backend, scalar_name): - # type: (THFormal, int, str, str) -> List[str] - name = arg['name'] - allocation = CodeTemplate(ALLOC_NOARGS_WRAP[arg['type']]).substitute(Backend=backend, ScalarName=scalar_name) - tensor_arg = '{}_'.format(name) - if arg.get('mask', False): - allocation = 'output_mask[{}] ? {} : nullptr'.format(output_count, allocation) - tensor_arg = ('{}_ == nullptr ? 
(TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*){}_' - .format(name, name)) - intrusive_ptr_type = 'c10::intrusive_ptr' - return [ - 'auto {}_ = {};'.format(name, allocation), - 'auto {} = Tensor({}::reclaim({}));'.format(name, intrusive_ptr_type, tensor_arg), - ] - - def handle_call(env, option, cimpl): - # type: (Environment, FunctionOption, FunctionOption) -> str - is_nn = option['mode'] == 'NN' - actuals = get_arguments(env, cimpl['arguments'], option) - if is_cuda or is_nn: - actuals = ['globalContext().getTHCState()'] + actuals - - cname = cimpl['cname'] - if option.get('sparse', False): - if is_cuda: - cname = 'THCS' + env['ScalarName'] + "Tensor_" + cname - else: - cname = env['THTensor'].replace('TH', 'THS') + '_' + cname - elif is_nn: - cname = 'THNN_{}'.format(env['THType']) + cname - else: - cname = env['THTensor'] + '_' + cname - - call = CALL_TEMPLATE.substitute(actuals=actuals, cname=cname) - if cimpl.get('condition') is not None: - call = 'if ({}) {}'.format(cimpl['condition'], call) - return call - - def emit_body(env, option, scalar_type_cases): - # type: (Environment, FunctionOption, List[str]) -> List[str] - body = [] # type: List[str] - - switch_prologue = [] # type: List[str] - output_count = 0 - cases = [] - - for arg in option['arguments']: - # make a new allocation of TensorImpl, then wrap a Tensor around it. - if arg.get('allocate', False): - switch_prologue += allocate_arg(arg, output_count, env['Backend'], 'dispatch_scalar_type') - output_count += 1 - - for scalar_name, c_type, accreal, _ in scalar_types: - if scalar_name in scalar_type_cases: - case_body = [] # type: List[str] - # arguments are potentially duplicated because of one argument - # referencing another - seen_names = set() # type: Set[str] - count = 0 - - case_env = { - 'Backend': env['Backend'], - 'DeviceType': env['DeviceType'], - 'state': env['state'], - 'ScalarType': c_type, - 'ScalarName': scalar_name, - 'AccScalarName': accreal, - 'THType': scalar_name, - 'THTensor': 'TH{}Tensor'.format(scalar_name) - } # type: Environment - if case_env['Backend'] == 'CUDA': - sname = '' if scalar_name == "Float" else scalar_name - case_env['THType'] = 'Cuda{}'.format(sname) - case_env['THTensor'] = 'THCuda{}Tensor'.format(sname) - - for arg in option['arguments']: - if is_real_argument_to_wrapper(arg): - count += 1 - - # only generated checked casts the first time we see it - if arg['name'] not in seen_names and requires_checked_cast(arg): - seen_names.add(arg['name']) - - # make a new allocation of TensorImpl, then wrap a Tensor around it. - if not arg.get('allocate', False): - # special case where we allow undefined Tensors, and thus - # the checked cast succeeds even if the Tensor is not - # defined - null_okay = 'true' if nullable_argument(arg) else 'false' - - # extract the TensorImpl from an existing tensor - check_cast = CHECKED_CAST[arg['type']].substitute( - case_env, arg_name=arg['name'], arg_pos=count, - api_name=option['api_name'], null_okay=null_okay, - size=arg.get('size'), scalar_type='dispatch_scalar_type') - case_body.append("auto {}_ = {};".format( - arg['name'], check_cast)) - - # cimpls, if it exists, contains the underlying C function names and - # arguments. 
Otherwise use option - cimpls = option.get('cimpls', [option]) - calls = [handle_call(case_env, option, cimpl) for cimpl in cimpls] - - ret = option['return'] - - if ret['kind'] == 'arguments': - case_body.extend([call + ';' for call in calls]) - # return handled later - elif ret['kind'] == 'type': - assert len(calls) == 1 - call = calls[0] - - # return the same underlying Tensor type for both real and accreal; this ensures - # e.g. x.sum(0) and x.sum() return the same type. We explicitly cast to the - # ScalarType before constructing the scalar_tensor to avoid overflow checking. - if ret['type'] == 'accreal' or ret['type'] == 'real': - return_scalar = ('return at::scalar_tensor(convert<${ScalarType}>(${call}), ' - 'options(ScalarType::${ScalarName}));') - case_body.append(CodeTemplate(return_scalar).substitute(case_env, call=call)) - else: - case_body.append("return {};".format(call)) - else: - raise Exception("NYI - return handling") - - cases.append(LEGACY_TH_DEFINITION_CASE.substitute(case_env, case_body=case_body)) - switch_epilogue = '' - if ret['kind'] == 'arguments': - arguments_indices = ret['arguments'] - arguments = [option['arguments'][argi] - for argi in arguments_indices] - if len(arguments_indices) == 1: - arg = arguments[0] - switch_epilogue = "return {};".format(arg['name']) - else: - types = [to_return_type(arg, option)['type'] - for arg in arguments] - # TODO: check for move semantics... - names = [arg['name'] for arg in arguments] - switch_epilogue = CodeTemplate("return std::tuple<${types}>(${names});").substitute( - types=types, names=names) - body.append(LEGACY_TH_DEFINITION_SWITCH_STATEMENT.substitute(env, cases=cases, - switch_prologue=switch_prologue, - switch_epilogue=switch_epilogue)) - return body - - def process_legacy_th_option(option): - # type: (FunctionOption) -> None - backend = backend_type_env['Backend'] - if backend in option['backend_types']: - env = nested_dict(option, backend_type_env) - body = emit_body(env, option, option['backend_types'][backend]) # type: ignore - option['type_definition_body'] = body - # These type ignores arise from the fact that a nested_dict - # technically isn't a Mapping, as it doesn't implement - # enough methods. I could fix this with a Protocol but - # then I need typing_extensions which isn't currently - # a build dep. - legacy_th_declarations.append( - LEGACY_TH_DECLARATION.substitute(env)) # type: ignore - legacy_th_definitions.append( - LEGACY_TH_DEFINITION.substitute(env)) # type: ignore - - def process_native(option): - # type: (FunctionOption) -> None - dispatch = option['type_method_definition_dispatch'] - env = nested_dict(option, backend_type_env) - - if isinstance(dispatch, dict): - # If we're here, then our native_functions.yaml entry has dispatch configuration. - # Having manual kernel registration doesn't make sense. 
- assert not option['manual_kernel_registration'] - backend = backend_type_env['Backend'] - if backend in option['backend_types']: - - native_dispatch = dispatch.get(backend) - - type_object_declarations.append( - NATIVE_DISPATCH_DECLARATION.substitute(env)) - - option['native_type_method_dispatch'] = native_dispatch - option['device_init'] = gen_device_init(option, backend_type_env) - - if backend in ['CPU', 'SparseCPU', 'QuantizedCPU', 'MkldnnCPU']: - # Omit the device guard entirely in these cases - def_backend = NATIVE_DISPATCH_DEFINITION_CPU_BACKEND - else: - def_backend = NATIVE_DISPATCH_DEFINITION_GENERIC_BACKEND - - type_object_definitions.append(def_backend.substitute(env)) - - if native_dispatch: - # See NOTE[UnboxedOnly] - if option['use_c10_dispatcher'] == 'full': - op_registrations.append(OpRegistration( - operator_name=OPERATOR_NAME.substitute(option), - registration_code=BACKEND_FUNCTION_REGISTRATION.substitute(env), - schema_registration_code=SCHEMA_REGISTRATION.substitute(option))) - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - op_registrations.append(OpRegistration( - operator_name=OPERATOR_NAME.substitute(option), - registration_code=BACKEND_UNBOXEDONLY_FUNCTION_REGISTRATION.substitute(env), - schema_registration_code=SCHEMA_REGISTRATION.substitute(option))) - - for declaration in declarations: - for option in declaration['options']: - if not option.get('skip', False): - try: - if option['mode'] == 'NN' and option.get('cimpls') is None: - continue - if option['mode'] != 'native': - process_legacy_th_option(option) - else: - process_native(option) - except NYIError: - pass - return (type_object_declarations, type_object_definitions, op_registrations, - legacy_th_declarations, legacy_th_definitions) diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py deleted file mode 100644 index d3605bf385a..00000000000 --- a/aten/src/ATen/gen.py +++ /dev/null @@ -1,545 +0,0 @@ - -import argparse -import os - -import yaml -from collections import defaultdict -from collections import OrderedDict - -import sys -from os import path -sys.path.append(path.dirname(path.abspath(__file__))) - -import cwrap_parser -import nn_parse -import native_parse -import preprocess_declarations -import function_wrapper -import gen_backend_select_register - -from code_template import CodeTemplate - - -# This file is the top-level entry point for code generation in ATen. -# It takes an arbitrary number of arguments specifying metadata files to -# process (.cwrap, .yaml and .h) and outputs a number generated header -# and cpp files in ATen/ (see invocations of 'write' for each file that -# is written.) It is invoked from cmake; look for the 'cwrap_files' -# variable for an up-to-date list of files which are passed. 
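# [Editor's illustrative sketch -- not part of the original patch.] As described above,
# the generator routes its positional inputs to different parsers purely by filename
# suffix (see filter_by_extension further down). A minimal standalone version of that
# routing, using hypothetical file names:
def split_inputs(files):
    cwrap = [f for f in files if f.endswith('.cwrap')]
    nn = [f for f in files if f.endswith('nn.yaml') or f.endswith('.h')]
    native = [f for f in files if f.endswith('native_functions.yaml')]
    return cwrap, nn, native

# split_inputs(['Declarations.cwrap', 'nn.yaml', 'THCUNN.h', 'native_functions.yaml'])
# -> (['Declarations.cwrap'], ['nn.yaml', 'THCUNN.h'], ['native_functions.yaml'])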
- -parser = argparse.ArgumentParser(description='Generate ATen source files') -parser.add_argument('files', help='cwrap files', nargs='+') - -parser.add_argument( - '-s', - '--source-path', - help='path to source directory for ATen', - default='.') -parser.add_argument( - '-o', - '--output-dependencies', - help='output a list of dependencies into the given file and exit') -parser.add_argument( - '-d', '--install_dir', help='output directory', default='ATen') -parser.add_argument( - '--rocm', - action='store_true', - help='reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly') -parser.add_argument( - '--vulkan', - action='store_true', - help='Generate Vulkan backend functions') -parser.add_argument( - '--op_registration_whitelist', - nargs='*', - help='filter op registrations by the whitelist (if set); ' - 'each item is `namespace`::`operator name` without overload name; ' - 'e.g.: aten::empty aten::conv2d ...') -parser.add_argument( - '--backend_whitelist', - nargs='*', - help='filter dispatch backend by the whitelist (if set), ' - 'e.g.: CPU CUDA QuantizedCPU ...') -parser.add_argument( - '--per_op_registration', - action='store_true', - help='group function registrations by op name and write to separate files; ' - 'must also set --op_registration_whitelist param') -parser.add_argument( - '--force_schema_registration', - action='store_true', - help='force it to generate schema-only registrations for all ops, including' - 'those that are not listed on --op_registration_whitelist') -options = parser.parse_args() - -# NB: It is mandatory to NOT use os.path.join here, as the install directory -# will eventually be ingested by cmake, which does not respect Windows style -# path slashes. If you switch this to use os.path.join, you'll get an error -# like: -# -# Syntax error in cmake code when parsing string -# -# C:/Jenkins/workspace/pytorch-builds/pytorch-win-ws2016-cuda9-cudnn7-py3-build/build/aten/src/ATen\core/TensorMethods.h -# -# Invalid character escape '\c'. 
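# [Editor's illustrative note -- not part of the original patch.] The point of the NB
# above: cmake chokes on backslash escapes, so the install path is joined with a literal
# '/' rather than os.path.join. With a hypothetical Windows-style prefix:
import ntpath
install_dir = 'C:/w/build/aten/src/ATen'         # hypothetical prefix
cmake_safe = install_dir + '/core'               # forward slashes only -> safe for cmake
windows_join = ntpath.join(install_dir, 'core')  # 'C:/w/build/aten/src/ATen\\core' -> breaks cmake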
-core_install_dir = options.install_dir + '/core' if options.install_dir is not None else None -if options.install_dir is not None and not os.path.exists(options.install_dir): - os.makedirs(options.install_dir) -if core_install_dir is not None and not os.path.exists(core_install_dir): - os.makedirs(core_install_dir) - - -class FileManager(object): - def __init__(self, install_dir=None): - self.install_dir = install_dir if install_dir else options.install_dir - self.filenames = set() - self.outputs_written = False - self.undeclared_files = [] - - def will_write(self, filename): - filename = '{}/{}'.format(self.install_dir, filename) - if self.outputs_written: - raise Exception("'will_write' can only be called before " + - "the call to write_outputs, refactor so outputs are registered " + - "before running the generators") - self.filenames.add(filename) - - def _write_if_changed(self, filename, contents): - try: - with open(filename, 'r') as f: - old_contents = f.read() - except IOError: - old_contents = None - if contents != old_contents: - with open(filename, 'w') as f: - f.write(contents) - - def write_outputs(self, filename): - """Write a file containing the list of all outputs which are - generated by this script.""" - self._write_if_changed( - filename, - ''.join(name + ";" for name in sorted(self.filenames))) - self.outputs_written = True - - def write(self, filename, s, env=None): - filename = '{}/{}'.format(self.install_dir, filename) - if isinstance(s, CodeTemplate): - assert env is not None - comment = "@" + "generated by aten/src/ATen/gen.py" - if s.filename: - comment += " from {}".format(os.path.basename(s.filename)) - env['generated_comment'] = comment - s = s.substitute(env) - self._write_if_changed(filename, s) - if filename not in self.filenames: - self.undeclared_files.append(filename) - else: - self.filenames.remove(filename) - - def check_all_files_written(self): - if len(self.undeclared_files) > 0: - raise Exception( - "trying to write files {} which are not ".format(self.undeclared_files) + - "in the list of outputs this script produces. 
" + - "use will_write to add them.") - if len(self.filenames) > 0: - raise Exception("Outputs declared with 'will_write' were " + - "never written: {}".format(self.filenames)) - - -TEMPLATE_PATH = options.source_path + "/templates" -TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.cpp") -SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") -TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") -TYPE_DEFAULT_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.h") -TYPE_DEFAULT_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.cpp") -OPS_ALREADY_MOVED_TO_C10_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/ATenOpList.cpp") -BACKEND_SELECT_REGISTER_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/BackendSelectRegister.cpp") -SCHEMA_REGISTER_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SchemaRegister.cpp") -TENSOR_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TensorBody.h") -TENSOR_METHODS_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TensorMethods.cpp") - -FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Functions.h") -FUNCTIONS_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/Functions.cpp") - -LEGACY_TH_FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/LegacyTHFunctions.h") -LEGACY_TH_FUNCTIONS_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/LegacyTHFunctions.cpp") - -NATIVE_FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/NativeFunctions.h") - -PER_OP_REGISTRATION_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/PerOpRegistration.cpp") - -core_file_manager = FileManager(core_install_dir) -file_manager = FileManager() -cuda_file_manager = FileManager() - -def backend_to_devicetype(backend): - if backend == 'QuantizedCPU': - return 'CPU' - elif backend == 'QuantizedCUDA': - return 'CUDA' - return backend - -backends = ['CPU', 'CUDA'] -densities = ['Dense', 'Sparse', 'Mkldnn'] # TODO: layout instead of densities? 
-
-quantized_backends = ['QuantizedCPU', 'QuantizedCUDA']
-
-# scalar_name, c_type, accreal, is_floating_type
-quantized_scalar_types = [
-    ('QInt8', 'qint8', 'QInt8AccrealNotDefined', 'QInt8IsFloatingTypeNotDefined'),
-    ('QUInt8', 'quint8', 'QUInt8AccrealNotDefined', 'QUInt8IsFloatingTypeNotDefined'),
-    ('QInt32', 'qint32', 'QInt32AccrealNotDefined', 'Qint32IsFloatingTypeNotDefined'),
-]
-
-# whitelist used to filter op registrations for custom build
-if options.op_registration_whitelist is not None:
-    op_registration_whitelist = set(options.op_registration_whitelist)
-else:
-    op_registration_whitelist = None
-
-# shared environment for non-derived base classes TensorBody.h Storage.h
-top_env = {
-    'cpu_type_headers': [],
-    'cuda_type_headers': [],
-    'function_registrations': [],
-    'aten_ops': [],
-    'type_method_declarations': [],
-    'type_method_definitions': [],
-    'tensor_method_declarations': [],
-    'tensor_method_definitions': [],
-    'function_declarations': [],
-    'function_definitions': [],
-    'type_ids': [],
-    'native_function_declarations': [],
-}
-
-
-def is_whitelisted_backend(backend):
-    return options.backend_whitelist is None or backend in options.backend_whitelist
-
-def is_cuda_backend(backend):
-    return backend in ("QuantizedCUDA", "CUDA")
-
-def dict_representer(dumper, data):
-    return dumper.represent_dict(data.items())
-
-
-def postprocess_output_declarations(output_declarations):
-    # ensure each return has a name associated with it
-    for decl in output_declarations:
-        has_named_ret = False
-        for n, ret in enumerate(decl.returns):
-            if 'name' not in ret:
-                assert not has_named_ret
-                if decl.inplace:
-                    ret['name'] = 'self'
-                elif len(decl.returns) == 1:
-                    ret['name'] = 'out'
-                else:
-                    ret['name'] = 'out' + str(n)
-            else:
-                has_named_ret = True
-
-    def remove_key_if_none(dictionary, key):
-        if key in dictionary.keys() and dictionary[key] is None:
-            del dictionary[key]
-        return dictionary
-
-    return [remove_key_if_none(decl._asdict(), 'buffers')
-            for decl in output_declarations]
-
-
-def format_yaml(data):
-    if options.output_dependencies:
-        # yaml formatting is slow so don't do it if we will ditch it.
-        return ""
-    noalias_dumper = yaml.dumper.SafeDumper
-    noalias_dumper.ignore_aliases = lambda self, data: True
-    # Support serializing OrderedDict
-    noalias_dumper.add_representer(OrderedDict, dict_representer)
-    # Some yaml parsers (e.g. Haskell's) don't understand line breaks.
-    # width=float('Inf') turns off optional line breaks and improves
-    # the portability of the outputted yaml.
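# [Editor's illustrative sketch -- not part of the original patch.] A self-contained
# version of the dumper configuration described above, for a toy document, assuming
# PyYAML is installed:
import yaml
from collections import OrderedDict

class NoAliasDumper(yaml.SafeDumper):
    def ignore_aliases(self, data):
        return True    # never emit YAML anchors/aliases

NoAliasDumper.add_representer(
    OrderedDict, lambda dumper, data: dumper.represent_dict(data.items()))

toy = [OrderedDict([('name', 'add'), ('variants', ['function', 'method'])])]
print(yaml.dump(toy, default_flow_style=False, Dumper=NoAliasDumper, width=float('Inf')))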
- return yaml.dump(data, default_flow_style=False, Dumper=noalias_dumper, width=float('Inf')) - - -def add_op_registrations(per_type_registrations, per_op_registrations, schema_registrations, op_registrations): - for op_registration in op_registrations: - opname = op_registration.operator_name - registration = op_registration.registration_code - - # collect schema registration for all ops (whitelisted or not) - if schema_registrations is not None: - schema_registrations.append(op_registration.schema_registration_code) - - # apply whitelist - if op_registration_whitelist is not None and opname not in op_registration_whitelist: - continue - if options.per_op_registration: - # per op registration - per_op_registrations[opname].append(registration) - else: - # per type registration - per_type_registrations.append(registration) - - -def generate_storage_type_and_tensor(backend, density, declarations, per_op_registrations, schema_registrations): - env = {} - density_tag = density if density != 'Dense' else '' - env['Density'] = density - env['Type'] = "{}{}Type".format(density_tag, backend) - env['DeviceType'] = backend_to_devicetype(backend) - env['Backend'] = density_tag + backend - if not is_whitelisted_backend(env['Backend']): - return - env['storage_tensor_headers'] = [] - if density != 'Sparse': - env['storage_tensor_headers'] = ['#include '] - - # used for generating switch logic for external functions - tag = density_tag + backend - env['TypeID'] = 'TypeID::' + tag - top_env['type_ids'].append(tag + ',') - - env['legacy_th_headers'] = [] - if is_cuda_backend(backend): - env['extra_cuda_headers'] = [] - env['extra_cuda_headers'].append('#include ') - if options.rocm: - env['th_headers'] = [ - '#include ', - '#include ', - '#include ', - '#undef THNN_', - '#undef THCIndexTensor_', - ] - env['extra_cuda_headers'].append('#include ') - env['extra_cuda_headers'].append('#include ') - env['extra_cuda_headers'].append('#include ') - else: - env['th_headers'] = [ - '#include ', - '#include ', - '#include ', - '#undef THNN_', - '#undef THCIndexTensor_', - ] - env['extra_cuda_headers'].append('#include ') - env['extra_cuda_headers'].append('#include ') - env['extra_cuda_headers'].append('#include ') - env['state'] = ['globalContext().getTHCState()'] - env['isCUDA'] = 'true' - env['storage_device'] = 'return storage->device;' - env['Generator'] = 'CUDAGeneratorImpl' - env['allocator'] = 'at::cuda::getCUDADeviceAllocator()' - else: - env['th_headers'] = [ - '#include ', - '#include ', - ] - env['extra_cuda_headers'] = [] - env['state'] = [] - env['isCUDA'] = 'false' - env['storage_device'] = 'throw std::runtime_error("CPU storage has no device");' - env['Generator'] = 'CPUGeneratorImpl' - env['allocator'] = 'getCPUAllocator()' - - declarations, definitions, op_registrations, th_declarations, th_definitions = function_wrapper.create_derived( - env, declarations) - env['type_derived_method_declarations'] = declarations - env['type_derived_method_definitions'] = definitions - env['legacy_th_declarations'] = th_declarations - env['legacy_th_definitions'] = th_definitions - env['function_registrations'] = [] - add_op_registrations(env['function_registrations'], per_op_registrations, schema_registrations, op_registrations) - - fm = file_manager - if env['DeviceType'] == 'CUDA': - fm = cuda_file_manager - - if env['Backend'] == 'CPU' or env['Backend'] == 'CUDA': - env['namespace'] = env['Backend'].lower() - env['legacy_th_headers'].append('#include ") - fm.write('LegacyTHFunctions' + env['Backend'] + ".h", 
LEGACY_TH_FUNCTIONS_H, env) - fm.write('LegacyTHFunctions' + env['Backend'] + ".cpp", LEGACY_TH_FUNCTIONS_CPP, env) - - if density != 'Sparse': - fm.write(env['Type'] + ".cpp", TYPE_DERIVED_CPP, env) - else: - fm.write(env['Type'] + ".cpp", SPARSE_TYPE_DERIVED_CPP, env) - fm.write(env['Type'] + ".h", TYPE_DERIVED_H, env) - - if env['DeviceType'] == 'CPU' or env['DeviceType'] == 'Vulkan': - top_env['cpu_type_headers'].append( - '#include '.format(env['Type'])) - else: - assert env['DeviceType'] == 'CUDA' - top_env['cuda_type_headers'].append( - '#include '.format(env['Type'])) - - -# yields (backend, density) tuples -def iterate_types(): - for backend in backends: - for density in densities: - if density == 'Mkldnn' and backend != 'CPU': - continue - else: - yield (backend, density) - for backend in quantized_backends: - yield (backend, 'Dense') - if options.vulkan: - yield('Vulkan', 'Dense') - - -def gen_per_op_registration_filename(opname): - return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-')) - - -################### -# declare what files will be output _before_ we do any work -# so that the script runs quickly when we are just querying the -# outputs -def declare_outputs(): - core_files = ['TensorBody.h', 'TensorMethods.cpp', 'ATenOpList.cpp'] - for f in core_files: - core_file_manager.will_write(f) - files = ['Declarations.yaml', 'TypeDefault.cpp', 'TypeDefault.h', - 'Functions.h', 'Functions.cpp', 'NativeFunctions.h', 'BackendSelectRegister.cpp'] - for f in files: - file_manager.will_write(f) - for backend, density in iterate_types(): - full_backend = backend if density == "Dense" else density + backend - if not is_whitelisted_backend(full_backend): - continue - fm = file_manager - if is_cuda_backend(backend): - fm = cuda_file_manager - for kind in ["Type"]: - if kind != 'Type' and density == "Sparse": - # No Storage or Tensor for sparse - continue - fm.will_write("{}{}.h".format(full_backend, kind)) - fm.will_write("{}{}.cpp".format(full_backend, kind)) - if backend == 'CPU' or backend == 'CUDA': - fm.will_write("LegacyTHFunctions{}.h".format(backend)) - fm.will_write("LegacyTHFunctions{}.cpp".format(backend)) - - if options.per_op_registration: - if op_registration_whitelist is None: - raise Exception("Must set --op_registration_whitelist for per-op registration.") - for whitelisted_op in op_registration_whitelist: - fname = gen_per_op_registration_filename(whitelisted_op) - file_manager.will_write(fname) - - if options.force_schema_registration: - file_manager.will_write('SchemaRegister.cpp') - - -def filter_by_extension(files, *extensions): - filtered_files = [] - for file in files: - for extension in extensions: - if file.endswith(extension): - filtered_files.append(file) - return filtered_files - - -def generate_per_op_registration(per_op_registrations): - if not options.per_op_registration: - return - - # Ensure all whitelisted operators have a corresponding registration file. - # Generate an empty placeholder file for nonexistent operators, which might - # be registered manually instead of via codegen. - # This can simplify the custom BUCK build which consumes the output of this - # script, since it can uniformly create per-op build targets and dependencies - # without having to know the subtle difference about op registration. - # Manually registered operators might call codegen registered operators thus - # we cannot simply ignore them when calculating transitive dependencies for - # custom build. 
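# [Editor's illustrative sketch -- not part of the original patch.] The per-op file
# naming used above (gen_per_op_registration_filename) simply swaps ':' for '-', and the
# loop just below backfills an empty registration list for any whitelisted op that
# produced no codegen registrations:
def per_op_filename(opname):
    return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-'))

assert per_op_filename('aten::add') == 'pt_op_register_aten--add.cpp'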
- for whitelisted_op in op_registration_whitelist: - if whitelisted_op not in per_op_registrations: - per_op_registrations[whitelisted_op] = [] - - for opname, function_registrations in per_op_registrations.items(): - fname = gen_per_op_registration_filename(opname) - file_manager.write(fname, PER_OP_REGISTRATION_CPP, { - 'extra_headers': top_env['cpu_type_headers'] + top_env['cuda_type_headers'], - 'function_registrations': function_registrations, - }) - - -def generate_schema_registration(schema_registrations): - if not options.force_schema_registration: - return - file_manager.write('SchemaRegister.cpp', SCHEMA_REGISTER_CPP, { - 'schema_registrations': sorted(set(schema_registrations)), - }) - - -def generate_outputs(): - cwrap_files = filter_by_extension(options.files, '.cwrap') - nn_files = filter_by_extension(options.files, 'nn.yaml', '.h') - native_files = filter_by_extension(options.files, 'native_functions.yaml') - - declarations = [d - for file in cwrap_files - for d in cwrap_parser.parse(file)] - - declarations += nn_parse.run(nn_files) - declarations += native_parse.run(native_files) - declarations = preprocess_declarations.run(declarations) - - per_op_registrations = defaultdict(list) if options.per_op_registration else None - schema_registrations = [] if options.force_schema_registration else None - - # note: this will fill in top_env['type/tensor_method_declarations/definitions'] - # and modify the declarations to include any information that will all_backends - # be used by function_wrapper.create_derived - output_declarations, op_registrations = function_wrapper.create_generic( - top_env, declarations) - output_declarations = postprocess_output_declarations(output_declarations) - file_manager.write("Declarations.yaml", format_yaml(output_declarations)) - - gen_backend_select_register.register_backend_select_methods(declarations, BACKEND_SELECT_REGISTER_CPP, file_manager) - - add_op_registrations( - top_env['function_registrations'], per_op_registrations, schema_registrations, op_registrations) - - for backend, density in iterate_types(): - generate_storage_type_and_tensor( - backend, density, declarations, per_op_registrations, schema_registrations) - - core_files = { - 'TensorBody.h': TENSOR_H, - 'TensorMethods.cpp': TENSOR_METHODS_CPP, - 'ATenOpList.cpp': OPS_ALREADY_MOVED_TO_C10_CPP, - } - - for core_file, core_template_file in core_files.items(): - core_file_manager.write(core_file, core_template_file, top_env) - - file_manager.write('TypeDefault.h', TYPE_DEFAULT_H, top_env) - file_manager.write('TypeDefault.cpp', TYPE_DEFAULT_CPP, top_env) - - file_manager.write('Functions.h', FUNCTIONS_H, top_env) - file_manager.write('Functions.cpp', FUNCTIONS_CPP, top_env) - - file_manager.write('NativeFunctions.h', NATIVE_FUNCTIONS_H, top_env) - - generate_per_op_registration(per_op_registrations) - generate_schema_registration(schema_registrations) - - file_manager.check_all_files_written() - cuda_file_manager.check_all_files_written() - -declare_outputs() -if options.output_dependencies is not None: - file_manager.write_outputs(options.output_dependencies) - core_file_manager.write_outputs(options.output_dependencies + "-core") - cuda_file_manager.write_outputs(options.output_dependencies + "-cuda") -else: - generate_outputs() diff --git a/aten/src/ATen/gen_backend_select_register.py b/aten/src/ATen/gen_backend_select_register.py deleted file mode 100644 index 3ffb1d412f9..00000000000 --- a/aten/src/ATen/gen_backend_select_register.py +++ /dev/null @@ -1,111 +0,0 @@ -# This 
script generates BackendSelectRegister.cpp which is being used for dispatching purposes. -# -# TLDR: most operators take one or more Tensors as arguments, and dispatch keys extracted from -# these Tensors determine which kernel (operator implementation) the dispatcher actually invokes. -# E.g., calling add() on two CUDA Tensors will dispatch to the CUDA implementation of add(), -# and so on. -# -# But factory functions don't take Tensors, so we need to get dispatch keys from other arguments. -# Rather than teaching the dispatcher how to extract dispatch keys from types besides Tensor, we -# register an extra kernel for each factory op, under the `BackendSelect` dispatch key. This key -# has higher precedence than dispatch keys for actual backends, so a BackendSelect kernel will -# front-run other kernels registered for the same op. -# -# It's the responsibility of the BackendSelect factory kernels to extract the "real" dispatch -# key from non-Tensor arguments, and redispatch using this key. Here, we generate implementations -# that obtain the key from the TensorOptions argument that's passed to all Tensor factory ops. -# -# BackendSelectRegister.cpp will contain both the BackendSelect kernels and registrations for -# all factory functions that have 'backend_select' flag in its native_functions.yaml definition. - -from code_template import CodeTemplate -from function_wrapper import gen_dispatch_key_init - -GENERATED_COMMENT = CodeTemplate( - "@" + "generated from ${filename}") - -# See NOTE[UnboxedOnly] in function_wrapper.py -UNBOXEDONLY_FUNCTION_REGISTRATION = CodeTemplate("""\ - m.impl_UNBOXED("aten::${op_name_with_overload_name}", ${function_name}); -""") - -FUNCTION_REGISTRATION = CodeTemplate("""\ - m.impl("aten::${op_name_with_overload_name}", - c10::impl::hacky_wrapper_for_legacy_signatures<${schema_order_cpp_signature}>( - TORCH_FN(${function_name}))); -""") - -FUNCTION_DEFINITION = CodeTemplate("""\ -// ${schema_string} -Tensor ${function_name}(${method_formals}) { - static auto op = c10::Dispatcher::singleton() - .findSchemaOrThrow("aten::${name}", "${overload_name}") - .typed<${function_cpp_signature}>(); - ${dispatch_key_init} - return op.callWithDispatchKey(_dk, ${function_actuals}); -} -""") - - -def needs_backend_select(declaration_option): - # We register an op under the BackendSelect dispatch key - # if a TensorOptions argument has been gathered from its declared args - # We skip all the 'new_*' and '*_like' ops as they are special cased and avoid dispatching. 
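# [Editor's illustrative sketch -- not part of the original patch.] Reduced to a
# standalone predicate with hypothetical declaration dicts, the check described above
# behaves like this:
def toy_needs_backend_select(option):
    if option['name'].endswith('_like') or option['name'].startswith('new_'):
        return False
    return any(a.get('dynamic_type') == 'TensorOptions' for a in option['arguments'])

assert toy_needs_backend_select(
    {'name': 'empty', 'arguments': [{'dynamic_type': 'TensorOptions'}]})
assert not toy_needs_backend_select(
    {'name': 'empty_like', 'arguments': [{'dynamic_type': 'TensorOptions'}]})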
- # See TypeDefault.cpp - if declaration_option['name'].endswith('_like') or declaration_option['name'].startswith('new_'): - return False - - return any(a.get('dynamic_type') == 'TensorOptions' for a in declaration_option['arguments']) - -def register_backend_select_methods(declarations, template_path, file_manager): - backend_select_method_definitions = [] - backend_select_function_registrations = [] - - for decl in declarations: - for option in decl["options"]: - if needs_backend_select(option): - name = option['name'] - op_name_with_overload_name = option['name'] - if option.get('overload_name', '') != '': - name = "{0}_{1}".format(name, option['overload_name']) - op_name_with_overload_name = "{0}.{1}".format(op_name_with_overload_name, option['overload_name']) - - if option['use_c10_dispatcher'] == 'full': - func_reg = FUNCTION_REGISTRATION.substitute(schema_string=option['schema_string'], - op_name_with_overload_name=op_name_with_overload_name, - function_name=name, - schema_order_cpp_signature=option['schema_order_cpp_signature']) - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - func_reg = UNBOXEDONLY_FUNCTION_REGISTRATION.substitute(schema_string=option['schema_string'], - op_name_with_overload_name=op_name_with_overload_name, - function_name=name) - - dispatch_key_init = gen_dispatch_key_init('_dk', option['formals_list']) - - # See NOTE[UnboxedOnly] in function_wrapper.py - if option['use_c10_dispatcher'] == 'full': - function_cpp_signature = option['schema_order_cpp_signature'] - function_actuals = option['schema_order_actuals'] - else: - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - function_cpp_signature = option['cpp_signature'] - function_actuals = option['actuals'] - method_def = FUNCTION_DEFINITION.substitute(function_name=name, - schema_string=option['schema_string'], - method_formals=option['formals_with_defaults'], - name=option['name'], - overload_name=option['overload_name'], - dispatch_key_init=dispatch_key_init, - function_cpp_signature=function_cpp_signature, - function_actuals=function_actuals) - - backend_select_function_registrations.append(func_reg) - backend_select_method_definitions.append(method_def) - - env = {} - env['backend_select_method_definitions'] = backend_select_method_definitions - env['backend_select_function_registrations'] = backend_select_function_registrations - - env['generated_comment'] = GENERATED_COMMENT.substitute(filename=template_path) - file_manager.write('BackendSelectRegister.cpp', template_path, env) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 79575d46357..436619da4b4 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3166,7 +3166,7 @@ CPU: roll_cpu CUDA: roll_cuda -# default int[] value [0,1] should not add space after comma, since native_parse.py uses ', ' to split args +# default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args - func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor use_c10_dispatcher: full @@ -5773,7 +5773,7 @@ CPU: foreach_tensor_add_scalar_kernel_slow CUDA: foreach_tensor_add_scalar_kernel_cuda -- func: _foreach_add_.Scalar(Tensor[](a!) 
self, Scalar scalar) -> () +- func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_guard: False variants: function dispatch: diff --git a/aten/src/ATen/native_parse.py b/aten/src/ATen/native_parse.py deleted file mode 100644 index e8206e72141..00000000000 --- a/aten/src/ATen/native_parse.py +++ /dev/null @@ -1,482 +0,0 @@ -from __future__ import print_function -import re -import yaml -import pprint -import sys -import copy - -try: - # use faster C loader if available - from yaml import CLoader as Loader -except ImportError: - from yaml import Loader - -# [temp translations] -# We're currently incrementally moving from the custom func schema to the -# JIT signature schema incrementally. This will reduce overall complexity -# and increase compliance between these components. So for now we do simple -# type translations to continue to emit the legacy func schema for further -# processing by downstream tools. This will helps us avoid having to prematurely -# change all downstream tools to detect these new types. -def type_argument_translations(arg): - type_and_name = [a.strip() for a in arg.rsplit(' ', 1)] - name = '' - if len(type_and_name) > 1: - name = type_and_name[1] - t = type_and_name[0] - name = name.split('=') - default = None - nullable = False - size = None # Only applies to int[\d+] and Tensor[\d+] arguments - if len(name) > 1: - default = name[1] - name = name[0] - - match = re.match(r'(Tensor.*)\((.+)\)(.*)', t) - annotation = None - if match: - t = match.group(1) + match.group(3) - annotation = match.group(2) - - # XXX: is_nullable flag can only annotate entire type as optional type, - # need to special case Generator? logic to make ? only available in jit - # TODO: deprecate is_nullable global flag, and parse the type - # to support annotating complicated types with optional annotation - nullable = '?' in t - - # This enables "Generator? x = None and translates to legacy - # "Generator x = nullptr". See [temp translations]. - if t == 'Generator?' and default == 'None': - t = 'Generator' - default = 'c10::nullopt' - # Enables Tensor[] by translating to legacy TensorList. - elif t == 'Tensor[]' or t == 'Tensor?[]': - t = 'TensorList' - # Enables int[] by translating to legacy IntArrayRef. - elif t == 'int[]': - t = 'IntArrayRef' - elif t == 'int[]?': - t = 'IntArrayRef?' - # Enables int by translating to legacy int64_t. - elif t == 'int': - t = 'int64_t' - elif t == 'int?': - t = 'int64_t?' - elif t == 'int64_t': - raise RuntimeError("Please use int and not int64_t. " - "See [temp translations] for details.") - elif t == 'int64_t?': - raise RuntimeError("Please use int? and not int64_t?. " - "See [temp translations] for details.") - # Enables Dimname[] by translating to legacy DimnameList. - elif t == 'Dimname[]': - t = 'DimnameList' - elif t == 'Dimname[]?': - t = 'DimnameList?' - # Enables float by translating to legacy double. - elif t == 'float': - t = 'double' - elif t == 'float?': - t = 'double?' - elif t == 'float[]': - t = 'ArrayRef' - elif t == 'float[]?': - t = 'ArrayRef?' - # Enables str by translating to legacy std::string. - elif t == 'str': - t = 'std::string' - elif t == 'double': - raise RuntimeError("Please use float and not double. " - "See [temp translations] for details.") - # Enables int[x] by translating to legacy IntArrayRef[x]. 
See [temp translations] - elif re.match(r'int\[(\d+)\]\?', t): - match = re.match(r'int\[(\d+)\]\?', t) - t = 'IntArrayRef' - size = int(match.group(1)) - elif re.match(r'int\[(\d+)\]', t): - match = re.match(r'int\[(\d+)\]', t) - t = 'IntArrayRef' - size = int(match.group(1)) - # Enables bool[x] by translating to legacy std::array. See [temp translations] - elif re.match(r'bool\[(\d+)\]', t): - match = re.match(r'bool\[(\d+)\]', t) - t = 'std::array'.format(match.group(1)) - elif re.match(r'std::array', t): - raise RuntimeError("Please use array notation, e.g. bool[3] and not std::array." - "See [temp translations] for details.") - # Enables Dimname[x] by translating to DimnameList[x]. See [temp translations] - elif re.match(r'Dimname\[(\d+)\]', t): - match = re.match(r'Dimname\[(\d+)\]', t) - t = 'DimnameList' - size = int(match.group(1)) - - if not default: - pass - # This enables Tensor? x=None and translates to legacy - # "Tensor? x={}". See [temp translations]. - elif t.startswith('Tensor?') and default == 'None': - default = "{}" - elif default == 'True': - default = True - elif default == 'False': - default = False - elif default == 'true': - raise RuntimeError("Please use True and not true. " - "See [temp translations] for details.") - elif default == 'false': - raise RuntimeError("Please use False and not false. " - "See [temp translations] for details.") - # Enables default argument [] by translating to legacy {}. - # See [temp translations] - elif default == '[]': - default = '{}' - # Enables lists by translating to legacy {.*}. - # See [temp translations] - elif re.match(r'\[.*\]', default): - default = "{" + default[1:-1] + "}" - elif default == 'None': - default = 'c10::nullopt' - # The JIT signature schema uses Mean, but in particular C++ needs - # the legacy at::Reduction::Mean. So we'll continue emiting that until - # we change this at either a JIT schema or C++ level. 
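# [Editor's illustrative note -- not part of the original patch.] Two of the rewrites
# above, exercised directly: the fixed size is pulled out of "int[2]" notation, and a
# schema list default such as "[1,1]" becomes the C++ brace form "{1,1}".
import re
m = re.match(r'int\[(\d+)\]', 'int[2]')
assert m is not None and int(m.group(1)) == 2   # -> t = 'IntArrayRef', size = 2
default = '[1,1]'
assert '{' + default[1:-1] + '}' == '{1,1}'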
- elif default == 'Mean': - default = 'at::Reduction::Mean' - elif default == 'contiguous_format': - default = 'MemoryFormat::Contiguous' - elif default == 'per_tensor_affine': - default = 'QScheme::PER_TENSOR_AFFINE' - else: - try: - default = int(default) - except ValueError: - try: - default = float(default) - except ValueError: - pass - - return t, name, default, nullable, size, annotation - - -def parse_arguments(args): - arguments = [] - kwarg_only = False - - if len(args.strip()) == 0: - return arguments - - # TODO: Use a real parser here; this will get bamboozled - # by signatures that contain things like std::array (note the space) - for arg_idx, arg in enumerate(args.split(', ')): - type_and_name = [a.strip() for a in arg.rsplit(' ', 1)] - if type_and_name == ['*']: - assert not kwarg_only - kwarg_only = True - continue - - t, name, default, nullable, size, annotation = type_argument_translations(arg) - - argument_dict = {'type': t.rstrip('?'), 'name': name, 'is_nullable': nullable, 'annotation': annotation} - if size: - argument_dict['size'] = size - if default is not None: - argument_dict['default'] = default - if kwarg_only: - argument_dict['kwarg_only'] = True - arguments.append(argument_dict) - - return arguments - -def process_arguments(arguments, func_variants, declaration, func_return): - is_out_fn = False - arguments_out = [] - arguments_other = [] - for argument in arguments: - if argument['type'] == "Tensor" and \ - argument['annotation'] and \ - re.match(r'^(.*!)$', argument['annotation']) and \ - argument.get('kwarg_only'): - argument['output'] = True - argument['kwarg_only'] = False - arguments_out.append(argument) - is_out_fn = True - else: - arguments_other.append(argument) - - arguments = arguments_out + arguments_other - - name = declaration['name'] - if is_out_fn: - declaration['name'] += "_out" - - # Reverse splat of TensorOptions - # As we move towards the JIT function schema for native_functions.yaml we need to support - # the expanded version of TensorOptions. For now we discover whether there are three - # types and names of keyword arguments: "ScalarType dtype", "Layout layout" and "Device device" - # Each, if set, must have default arguments set to long or float, strided and "cpu" respectively. - # They must appear in this order and in this order only in order for us to be able to process them. - # In the future we will get rid of this specific processing as downstream consumers start relying - # less on the content of Declarations.yaml. If you want to support more than this you'll - # potentially have to extend the JIT. 
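# [Editor's illustrative sketch -- not part of the original patch.] The "reverse splat"
# described above recognizes the scattered dtype/layout/device/pin_memory keyword
# arguments and folds them back into a single TensorOptions argument. A much-simplified,
# name-only version of that idea:
TOPT_NAMES = ('dtype', 'layout', 'device', 'pin_memory')

def collapse_tensor_options(arg_names):
    if tuple(arg_names[-4:]) == TOPT_NAMES:
        return arg_names[:-4] + ['options']
    return arg_names

assert collapse_tensor_options(['size', 'dtype', 'layout', 'device', 'pin_memory']) == ['size', 'options']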
- - supported_topt_arguments = [ - [ - {'name': 'dtype', 'type': 'ScalarType', 'is_nullable': False, 'annotation': None}, - {'name': 'layout', 'type': 'Layout', 'is_nullable': False, 'annotation': None}, - {'name': 'device', 'type': 'Device', 'is_nullable': False, 'annotation': None}, - {'name': 'pin_memory', 'type': 'bool', 'is_nullable': False, 'annotation': None, 'default': False}, - ] - ] - supported_topt_arguments.append(copy.deepcopy(supported_topt_arguments[0])) - for arg in supported_topt_arguments[1]: - arg.update({'kwarg_only': True}) - supported_topt_arguments.append(copy.deepcopy(supported_topt_arguments[1])) - for arg in supported_topt_arguments[2]: - arg.update({'default': 'c10::nullopt', 'is_nullable': True}) - # add explicit support for what is needed for tril_indices / triu_indices - supported_topt_arguments.append( - [ - {'name': 'dtype', 'type': 'ScalarType', 'annotation': None, 'kwarg_only': True, - 'default': 'long', 'is_nullable': True}, - {'name': 'layout', 'type': 'Layout', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - {'name': 'device', 'type': 'Device', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - {'name': 'pin_memory', 'type': 'bool', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - ] - ) - supported_topt_arguments.append( - [ - {'name': 'dtype', 'type': 'ScalarType', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - {'name': 'layout', 'type': 'Layout', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - {'name': 'device', 'type': 'Device', 'annotation': None, 'kwarg_only': True, - 'default': 'c10::nullopt', 'is_nullable': True}, - {'name': 'pin_memory', 'type': 'bool', 'annotation': None, 'kwarg_only': True, - 'default': False, 'is_nullable': True}, - ] - ) - - corresponding_topts = [ - {'type': 'TensorOptions', 'name': 'options', 'is_nullable': False, 'annotation': None}, - ] - corresponding_topts.append(corresponding_topts[0].copy()) - corresponding_topts[1]['kwarg_only'] = True - corresponding_topts.append(corresponding_topts[1].copy()) - corresponding_topts[2]['default'] = '{}' - corresponding_topts.append( - {'type': 'TensorOptions', 'name': 'options', 'is_nullable': False, 'annotation': None, - 'kwarg_only': True, 'default': 'at::kLong'}) - corresponding_topts.append( - {'type': 'TensorOptions', 'name': 'options', 'is_nullable': False, 'annotation': None, - 'kwarg_only': True}) - - def check_topt_representation(topt_representation): - for idx, supported_topt in enumerate(supported_topt_arguments): - matches = all(topt_representation[i] == topt for i, topt in enumerate(supported_topt)) - if matches: - return corresponding_topts[idx] - return None - - def is_tensor_option(argument): - return argument['name'] in ['dtype', 'layout', 'device', 'pin_memory'] - - new_arguments = [] - idx = 0 - while idx < len(arguments): - argument = arguments[idx] - number_of_arguments = len(supported_topt_arguments[0]) - if is_tensor_option(argument) and len(arguments) - idx >= number_of_arguments: - topt_representation = [] - for i in range(number_of_arguments): - argument = arguments[idx] - if not is_tensor_option(argument): - break - topt_representation.append(argument) - idx += 1 - if len(topt_representation) == number_of_arguments: - merged_argument = check_topt_representation(topt_representation) - assert merged_argument, \ - "Unsupported combination of 
TensorOptions {}, the only currently supported combinations are {}"\ - .format(str(topt_representation), str(supported_topt_arguments)) - new_arguments.append(merged_argument) - else: - new_arguments += topt_representation - else: - new_arguments.append(argument) - idx += 1 - - arguments = new_arguments - - # Sanity checks - - # TODO: convention is that the ith-argument correspond to the i-th return, but it would - # be better if we just named everything and matched by name. - for arg_idx, argument in enumerate(arguments_out): - assert argument['annotation'] == func_return[arg_idx]['annotation'], \ - "For func {} writeable keyword Tensor arguments need to have a matching return Tensor. Further, " \ - "the ith-argument needs to correspond to the i-th return.".format(name) - - assert len(arguments_out) <= len(func_return), "func {} must return at least as many Tensors " \ - "as can be passed as output.".format(name) - - if name.endswith('_out'): - raise RuntimeError("Native function {} may not be suffixed with _out as we transition to a unified schema. " - "Otherwise you will cause confusion amongst consumers of native functions.".format(name)) - - if is_out_fn and func_variants not in [[], 'function', ['function']]: - raise RuntimeError("Native functions with output MUST be declared with only the function variant; " - "e.g., variants: function; otherwise you will tickle a Python argument binding bug " - "(which usually manifests itself as the result variable being undefined.) " - "The culprit was: {}".format(name)) - if not is_out_fn: - assert len(arguments_out) == 0, "func {} is not marked as output yet contains output " \ - "keyword arguments".format(name) - - # TODO: Explicit checking for void is a hack and should disappear after a more - # functionally complete implementation of Tensor aliases. - if declaration['inplace'] and len(func_return) > 0: - found_self = False - for arg_idx, argument in enumerate(arguments): - if argument['name'] == "self": - assert argument['annotation'] and argument['annotation'].endswith("!"), \ - "Inplace function \"{}\" needs to annotate Tensor argument named self " \ - "as mutable.".format(name) - found_self = True - assert argument['annotation'] == func_return[arg_idx]['annotation'], \ - "Inplace function annotations of function {} need to match between " \ - "input and correponding output.".format(name) - assert argument['name'] == func_return[arg_idx]['name'] or \ - argument['name'] == func_return[arg_idx]['name'] + "_return" - assert argument['type'] == func_return[arg_idx]['type'] - assert found_self, "Inplace function \"{}\" needs Tensor argument named self.".format(name) - - return arguments - - -def parse_return_arguments(return_decl, inplace, func_decl): - arguments = [] - if return_decl == '()': - return arguments - - # TODO: Use a real parser here; this will get bamboozled - # by signatures that contain things like std::array (note the space) - if return_decl[0] == '(' and return_decl[-1] == ')': - return_decl = return_decl[1:-1] - - multiple_args = len(return_decl.split(', ')) > 1 - for arg_idx, arg in enumerate(return_decl.split(', ')): - t, name, default, nullable, size, annotation = type_argument_translations(arg) - # name of arguments and name of return sometimes have collision - # in this case, we rename the return name to _return. 
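# [Editor's illustrative sketch -- not part of the original patch.] The collision rule
# described above, pulled out into a tiny helper and driven with hypothetical schema
# strings:
def disambiguate_return_name(name, func_schema):
    inputs = func_schema.split('->')[0]
    return name + '_return' if name in inputs else name

assert disambiguate_return_name(
    'indices', 'topk(Tensor self, int k) -> (Tensor values, Tensor indices)') == 'indices'
assert disambiguate_return_name(
    'self', 'abs_(Tensor(a!) self) -> Tensor(a!)') == 'self_return'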
- return_name = name - if name in func_decl['func'].split('->')[0]: - return_name = name + "_return" - argument_dict = {'type': t, 'name': return_name, 'annotation': annotation} - if name: - # See Note [field_name versus name] - argument_dict['field_name'] = name - else: - if t == "Tensor" and inplace: - assert annotation and annotation.endswith("!"), \ - "Return Tensor of function \"{}\" flagged as inplace needs to be " \ - "annotated as mutable".format(func_decl['func']) - argument_dict['name'] = 'self' - elif t == "TensorList" and inplace: - assert annotation and annotation.endswith("!"), \ - "Return TensorList of function \"{}\" flagged as inplace needs to be " \ - "annotated as mutable".format(func_decl['func']) - argument_dict['name'] = 'self' - else: - argument_dict['name'] = 'result' if not multiple_args else 'result' + str(arg_idx) - argument_dict['output'] = True - arguments.append(argument_dict) - return arguments - - -def parse_dispatch(name, dispatch): - """ - Parse a dictionary like {"CPU, CUDA": "blah"} - into {"CPU": "blah", "CUDA": "blah"} - """ - if not isinstance(dispatch, dict): - return dispatch - r = {} - for old_k, v in dispatch.items(): - ks = old_k.split(',') - for k in ks: - k = k.strip() - assert k not in r, "{}, {}".format(name, k) - r[k] = v - return r - - -def parse_native_yaml(path): - with open(path, 'r') as f: - return yaml.load(f, Loader=Loader) - - -def propagate_field_names(output_arguments, return_arguments): - if output_arguments: - for i, r in enumerate(return_arguments): - if 'field_name' in r: - output_arguments[i]['field_name'] = r['field_name'] - - -def run(paths): - declarations = [] - for path in paths: - for func in parse_native_yaml(path): - declaration = {'mode': 'native'} - try: - declaration['schema_string'] = "aten::" + func['func'] - if '->' in func['func']: - func_decl, return_decl = [x.strip() for x in func['func'].split('->')] - else: - raise Exception('Expected return declaration') - fn_name, arguments = func_decl.split('(', 1) - if '.' 
in fn_name: - fn_name, overload_name = fn_name.split('.', 1) - else: - overload_name = '' - assert arguments[-1] == ")", "Expecting closing ) for {}".format(func['func']) - arguments = arguments[:-1] # Expect closing ) - declaration['name'] = func.get('name', fn_name) - declaration['operator_name'] = func.get('name', fn_name) - declaration['overload_name'] = func.get('overload_name', overload_name) - declaration['inplace'] = re.search('(^__i|[^_]_$)', fn_name) is not None - return_arguments = parse_return_arguments(return_decl, declaration['inplace'], func) - schema_order_arguments = parse_arguments(arguments) - arguments = process_arguments(schema_order_arguments, func.get('variants', []), declaration, return_arguments) - output_arguments = [x for x in arguments if x.get('output')] - propagate_field_names(output_arguments, return_arguments) - declaration['return'] = return_arguments if len(output_arguments) == 0 else output_arguments - declaration['variants'] = func.get('variants', ['function']) - declaration['matches_jit_signature'] = func.get('matches_jit_signature', True) - declaration['cpu_half'] = func.get('cpu_half', False) - declaration['cpu_bfloat16'] = func.get('cpu_bfloat16', False) - declaration['cuda_bfloat16'] = func.get('cuda_bfloat16', False) - declaration['cpu_bool'] = func.get('cpu_bool', False) - declaration['cuda_bool'] = func.get('cuda_bool', False) - declaration['deprecated'] = func.get('deprecated', False) - declaration['device_guard'] = func.get('device_guard', True) - declaration['use_c10_dispatcher'] = func.get('use_c10_dispatcher', 'with_codegenerated_unboxing_wrapper') - assert declaration['use_c10_dispatcher'] in ['with_codegenerated_unboxing_wrapper', 'full'] - declaration['manual_kernel_registration'] = func.get('manual_kernel_registration', False) - declaration['category_override'] = func.get('category_override', '') - declaration['arguments'] = func.get('arguments', arguments) - declaration['schema_order_arguments'] = func.get('schema_order_arguments', schema_order_arguments) - declaration['type_method_definition_dispatch'] = \ - parse_dispatch(fn_name, func.get('dispatch', declaration['name'])) - declaration['python_module'] = func.get('python_module', '') - declarations.append(declaration) - except Exception as e: - msg = '''Exception raised in processing function: -{func} -Generated partial declaration: -{decl}'''.format(func=pprint.pformat(func), decl=pprint.pformat(declaration)) - print(msg, file=sys.stderr) - raise e - - return declarations diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml deleted file mode 100644 index a95de7be719..00000000000 --- a/aten/src/ATen/nn.yaml +++ /dev/null @@ -1,62 +0,0 @@ -# Loss functions - -- name: _thnn_multi_margin_loss(Tensor self, LongTensor target, Scalar p, Scalar margin, Tensor? weight, int64_t reduction) - cname: MultiMarginCriterion - -- name: _thnn_multilabel_margin_loss(Tensor self, LongTensor target, int64_t reduction=at::Reduction::Mean) - cname: MultiLabelMarginCriterion - buffers: [is_target] - CUDA: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - -- name: _thnn_nll_loss(Tensor self, LongTensor target, Tensor? 
weight, int64_t reduction, int64_t ignore_index) - cname: ClassNLLCriterion - buffers: [total_weight] - CPU: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - CUDA: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - -- name: _thnn_nll_loss2d(Tensor self, LongTensor target, Tensor? weight, int64_t reduction, int64_t ignore_index) - cname: SpatialClassNLLCriterion - buffers: [total_weight] - CUDA: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - -# Activation functions - -- name: _thnn_glu(Tensor self, int64_t dim) - cname: GatedLinear - -- name: _thnn_log_sigmoid(Tensor self) - cname: LogSigmoid - buffers: [buffer] - -# NOTE: we treat noise as an input (it's really a buffer) because the codegen -# can't handle in-place functions that have buffers -- name: _thnn_rrelu_with_noise(Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, Generator? generator=None) - cname: RReLU - has_inplace: True - -# Convolutions - -- name: _thnn_conv2d(Tensor self, Tensor weight, IntArrayRef[2] kernel_size, Tensor? bias, IntArrayRef[2] stride, IntArrayRef[2] padding) - cname: SpatialConvolutionMM - buffers: [columns, ones] - CPU: - forward_scalar_types: ['Float', 'Double', 'Long', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'BFloat16'] - CUDA: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - -- name: _thnn_conv_depthwise2d(Tensor self, Tensor weight, IntArrayRef[2] kernel_size, Tensor? bias, IntArrayRef[2] stride, IntArrayRef[2] padding, IntArrayRef[2] dilation) - cname: SpatialDepthwiseConvolution - buffers: [] - CUDA: - forward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] - backward_scalar_types: ['Float', 'Double', 'Half', 'BFloat16'] diff --git a/aten/src/ATen/nn_parse.py b/aten/src/ATen/nn_parse.py deleted file mode 100644 index 33d78abf61d..00000000000 --- a/aten/src/ATen/nn_parse.py +++ /dev/null @@ -1,388 +0,0 @@ -import copy -import re -import common_with_cwrap -import yaml -from collections import OrderedDict, defaultdict - -try: - # use faster C loader if available - from yaml import CLoader as Loader -except ImportError: - from yaml import Loader - - -# matches `name`, `params` in `name(params)` -NAME_PARAM_REGEX = r'(\w+)\((.*)\)' - - -def argument_to_declaration(param, func=None): - arg = {} - arg['type'], name = param.split(' ') - if (arg['type'].endswith('?')): - arg['is_nullable'] = True - arg['type'] = arg['type'].rstrip('?') - if arg['type'] == 'Tensor': - arg['type'] = 'THTensor*' - elif arg['type'] == 'LongTensor': - arg['type'] = 'THIndexTensor*' - elif arg['type'] == 'Scalar': - arg['type'] = 'accreal' - elif arg['type'] == 'Generator': - arg['type'] = 'c10::optional' - - match = re.match(r'IntArrayRef\[(\d+)\]', arg['type']) - if match: - arg['type'] = 'IntArrayRef' - arg['size'] = int(match.group(1)) - - if '=' in name: - name, default = name.split('=') - arg['optional'] = True - arg['default'] = default - arg['name'] = name - - return arg - - -def output_arguments(thnn_function): - cname = thnn_function.name - output_args = [] - - # function_wrapper expects everything in a declaration to be in - # the base type (i.e. THTensor*), but if we pull a THCUNN only - # implementation, it will have THCTensor* as the arg type. 
So we - # strip the THC here before returning - def map_to_th_type(t): - if t.startswith('THC'): - t = t.replace('THC', 'TH') - return t - - def is_output_arg(arg_name, func_name): - if arg_name == 'output' and 'updateOutput' in cname: - return True - if name in {'gradInput', 'gradWeight', 'gradBias', 'gradGrid'}: - return True - if arg_name == 'indices' and 'updateOutput' in cname and 'Unpool' not in cname: - # indices is an output argument in pooling and an input in unpooling - return True - return False - - for arg in thnn_function.arguments: - name = arg.name - if is_output_arg(name, cname): - desc = { - 'type': map_to_th_type(arg.type), - 'name': camel_to_snake(name), - 'output': True, - } - if name.startswith('grad_'): - desc['is_nullable'] = True - output_args.append(desc) - return output_args - - -def get_return(args): - indices = [str(idx) for idx, arg in enumerate(args) if arg.get('output')] - return 'argument {}'.format(','.join(indices)) - - -ARGUMENT_MAPPINGS = { - 'k': 'kernel_size', - 'd': 'stride', - 'pad': 'padding', - 'p': 'padding', - 'o': 'output_size', - 'osize': 'output_size', - 'output': 'output_size', # as a prefix e.g. outputW - 'isize': 'input_size', - 'dilation': 'dilation', - 'adj': 'output_padding', - 'a': 'output_padding', -} - -DIMENSION_OFFSET = { - 'width': -1, - 'height': -2, - 'B': 0, - 'C': 1, - 'W': -1, - 'H': -2, - 'T': -3, - 'left': 0, - 'right': 1, - 'top': 2, - 'bottom': 3, - 'front': 4, - 'back': 5, -} - -SUBSTITUTIONS = { - 'input': 'self', - 'weights': 'weight', - 'train': 'training', - 'val': 'value', - 'lambda': 'lambd', - 'negval': 'negative_slope', -} - - -def camel_to_snake(name): - # from https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() - - -def get_thnn_args(thnn_function, params, inplace): - params_by_name = {p['name']: p for p in params} - - def arg_expr(prefix, suffix): - # e.g kW, kH - name = ARGUMENT_MAPPINGS[prefix] - if name not in params_by_name: - raise RuntimeError('missing arg "{}" in {}'.format(name, thnn_function.name)) - param = params_by_name[name] - if param['type'] == 'IntArrayRef' and 'size' in param: - name = name + '_' - # NB: We calculate the dimension based on the name of - # the argument, not its positional order. This means - # that we may reorder arguments to get them in - # the right place; e.g., if a THNN implementation - # has arguments in the order kernelW, kernelH, we - # will generate a caller that is kernel[1], kernel[0] - # to order them in the correct way. - index = DIMENSION_OFFSET[suffix] - if index < 0: - index += param['size'] - expr = '{}[{}]'.format(name, index) - return {'type': 'EXPRESSION', 'name': expr} - - thnn_args = [] - for arg in thnn_function.arguments: - name = arg.name - if name == 'state': - continue - if inplace and name == 'output': - name = 'self' - aten_name = camel_to_snake(SUBSTITUTIONS.get(name, name)) - parts = aten_name.split('_') - if aten_name in params_by_name: - param = params_by_name[aten_name] - if arg.is_optional: - param['is_nullable'] = True - thnn_args.append(copy.deepcopy(param)) - elif len(parts) == 2 and parts[0] in ARGUMENT_MAPPINGS and parts[1] in DIMENSION_OFFSET: - # e.g. 
pad_left - thnn_args.append(arg_expr(parts[0], parts[1])) - elif name[-1] in DIMENSION_OFFSET and name[:-1] in ARGUMENT_MAPPINGS: - # e.g kW, kH - thnn_args.append(arg_expr(name[:-1], name[-1])) - elif name == 'owidth' or name == 'oheight': - thnn_args.append(arg_expr(name[0], name[1:])) - elif name == 'scale': - thnn_args.append({'type': 'EXPRESSION', 'name': '1'}) - elif name == 'inplace': - thnn_args.append({'type': 'EXPRESSION', 'name': str(inplace).lower()}) - else: - raise RuntimeError("{}: can't find binding for '{}'" - .format(thnn_function.name, name)) - return thnn_args - - -def remove_unused_args(args, thnn_args): - """Returns the subset of args whose name appears in thnn_args""" - def clean_name(name): - name = name[:name.index('[')] if '[' in name else name - if name.endswith('_'): - name = name[:-1] - return name - uses = set([clean_name(arg['name']) for arg in thnn_args]) - uses.add('output_mask') - args = [arg for arg in args if arg['name'] in uses] - for arg in args: - if 'default' in arg: - del arg['default'] - return args - - -def unique_args(argslist): - result = [] - seen = set() - for args in argslist: - for arg in args: - if arg['name'] in seen: - continue - seen.add(arg['name']) - result.append(arg) - return result - - -def function_info(name, arguments, cimpls, buffers, backends, inplace, backend_types): - """ - cimpls contains information use to call into THNN: - cname: THNN function name - arguments: arguments to functional call - condition: [optional] guard around call - """ - return { - 'mode': 'NN', - 'name': name, - 'cpu_bfloat16': True if backend_types is not None and 'CPU' in backend_types and - 'BFloat16' in backend_types['CPU'] else False, - 'cuda_bfloat16': True if backend_types is not None and 'CUDA' in backend_types and - 'BFloat16' in backend_types['CUDA'] else False, - 'backend_types': backend_types, - 'arguments': arguments, - 'schema_order_arguments': copy.deepcopy(arguments), - 'return': 'argument 0' if inplace else get_return(arguments), - 'buffers': buffers, - 'backends': backends, - 'cimpls': cimpls, - 'variants': ['function'], - } - -def base_declaration(func, thnn_function, backends, backend_types, inplace=False): - """Creates the NN function without any buffers in it's signature""" - name, params = re.match(NAME_PARAM_REGEX, func['name']).groups() - if inplace: - name += '_' - params = params.split(', ') - arguments = [argument_to_declaration(a, func) for a in params] - if not inplace: - arguments += output_arguments(thnn_function) - buffers = [argument_to_declaration('Tensor ' + buf) - for buf in func.get('buffers', [])] - - return function_info(name, arguments, None, buffers, backends, inplace, backend_types) - -def forward_declaration(base, thnn_function, backend_types, inplace=False): - name = '{}_forward'.format(base['name']) - if inplace: - name += '_' - - arguments = [copy.deepcopy(arg) for arg in base['arguments'] - if not arg.get('output')] - - arguments += output_arguments(thnn_function) - for buffer in base['buffers']: - buffer = copy.deepcopy(buffer) - buffer['output'] = True - arguments.append(buffer) - - thnn_args = get_thnn_args(thnn_function, arguments, inplace) - arguments = remove_unused_args(arguments, thnn_args) - cimpl = {'cname': thnn_function.name, 'arguments': thnn_args} - - return function_info(name, arguments, [cimpl], [], base['backends'], inplace, backend_types) - -def backward_declaration(base, thnn_functions, backend_types): - name = '{}_backward'.format(base['name']) - - arguments = [] - 
arguments.append({'type': 'THTensor*', 'name': 'grad_output'}) - arguments += [copy.deepcopy(arg) for arg in base['arguments'] - if arg['name'] != 'inplace'] - arguments += base['buffers'] - - # outputs from the forward may be inputs to the backwards - for arg in arguments: - if 'output' in arg: - del arg['output'] - - arguments += unique_args([output_arguments(f) for f in thnn_functions]) - - def initialize_output_arg(arg): - # the mask array specifies which return values to compute - arg['mask'] = True - arg['is_nullable'] = True - - is_batch_norm_backward = '_backward' in thnn_functions[0].name - grad_params = [] - if len(thnn_functions) > 1 or is_batch_norm_backward: - for arg in arguments: - if arg.get('output', False): - initialize_output_arg(arg) - if 'Tensor' in arg['type'] and arg['name'].startswith('grad_') and \ - 'input' not in arg['name'] and 'output' not in arg['name']: - grad_params.append(arg['name']) - - thnn_args = [get_thnn_args(f, arguments, False) for f in thnn_functions] - arguments = remove_unused_args(arguments, unique_args(thnn_args)) - cimpls = [] - - def get_condition(func): - # only call into the THNN functions if the output args are not null - if '_updateGradInput' in func.name: - return 'grad_input_' - if '_accGradParameters' in func.name: - return ' || '.join(p + '_' for p in grad_params) - return None - - for func, args in zip(thnn_functions, thnn_args): - cimpl = {'cname': func.name, 'arguments': args} - if len(thnn_functions) > 1: - cimpl['condition'] = get_condition(func) - cimpls.append(cimpl) - - output_args = [arg for arg in arguments if arg.get('output', False)] - - return function_info(name, arguments, cimpls, [], base['backends'], False, backend_types) - - -def parse_nn_yaml(filename): - with open(filename, 'r') as f: - return yaml.load(f, Loader=Loader) - - -include_only = '(updateOutput|updateGradInput|accGradParameters|backward)$' -exclude = 'LookupTable' - - -def run(paths): - function_backends = defaultdict(list) - header_functions = OrderedDict() - - headers = [p for p in paths if p.endswith('.h')] - yamls = [p for p in paths if p.endswith('.yaml')] - - for path in headers: - backend = 'CUDA' if re.search('THCU', path) else 'CPU' - for func in common_with_cwrap.parse_header(path): - if re.search(include_only, func.name) is None or re.search(exclude, func.name) is not None: - continue - function_backends[func.name].append(backend) - if func.name not in header_functions: - header_functions[func.name] = func - - bwd_suffixes = ['_updateGradInput', '_accGradParameters', '_backward'] - - declarations = [] - for path in yamls: - for func in parse_nn_yaml(path): - cname = func['cname'] - backends = function_backends[cname + '_updateOutput'] - - fwd_function = header_functions[cname + '_updateOutput'] - bwd_functions = [] - for suffix in bwd_suffixes: - if cname + suffix in header_functions: - bwd_functions.append(header_functions[cname + suffix]) - - default_scalar_types = ['Float', 'Double', 'Half'] # Half will be stripped for CPU backend - forward_backend_types = {} - backward_backend_types = {} - for backend in backends: - backend_props = func.get(backend, {}) - forward_backend_types[backend] = backend_props.get('forward_scalar_types', default_scalar_types) - backward_backend_types[backend] = backend_props.get('backward_scalar_types', default_scalar_types) - - base = base_declaration(func, fwd_function, backends, None) - declarations.append(forward_declaration(base, fwd_function, forward_backend_types)) - if bwd_functions: - 
declarations.append(backward_declaration(base, bwd_functions, backward_backend_types)) - - - if func.get('has_inplace', False): - declarations.append(base_declaration(func, fwd_function, backends, forward_backend_types, True)) - declarations.append(forward_declaration(base, fwd_function, forward_backend_types, True)) - - return declarations diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py deleted file mode 100644 index 1c18144ba9f..00000000000 --- a/aten/src/ATen/preprocess_declarations.py +++ /dev/null @@ -1,213 +0,0 @@ -import re -from copy import deepcopy -from function_wrapper import TYPE_FORMAL_GENERIC -import common_with_cwrap - -type_map = { - 'floating_point': [ - 'Float', - 'Double', - 'Half', - 'BFloat16', - ], - 'integral': [ - 'Byte', - 'Char', - 'Short', - 'Int', - 'Long', - 'Bool', - ], - 'quantized': [ - 'QInt8', - 'QUInt8', - 'QInt32', - ] -} - -all_types = type_map['floating_point'] + type_map['integral'] + type_map['quantized'] -type_map['all'] = all_types - -all_backends = ['CPU', 'CUDA', 'SparseCPU', 'SparseCUDA', 'MkldnnCPU', 'QuantizedCPU', 'QuantizedCUDA', 'Vulkan'] -default_backends = ['CPU', 'CUDA'] - - -def process_types_and_backends(option): - # if specific pairs were not listed, then enumerate them - # based on the backend and type attributes - # if backend or type is not defined, it is assumed to be all of them - if 'backend_types' not in option: - backends = option.get('backends', default_backends) - if isinstance(option.get('type_method_definition_dispatch'), dict): - backends = option.get('type_method_definition_dispatch').keys() - backends = set(backends) - - backend_types = {} - for backend in backends: - if backend in ('QuantizedCPU', 'QuantizedCUDA'): - backend_types[backend] = type_map['quantized'] - else: - backend_types[backend] = option.get('types', all_types) - else: - backend_types = option['backend_types'] - - # expand type alias (integral, floating_point, all) - def expand(types): - ret = [] - for t in types: - if t in type_map: - ret.extend(type_map[t]) - else: - assert(t in all_types) - ret.append(t) - return ret - - for backend in backend_types.keys(): - assert backend in all_backends, "{} {}".format(backend, option['name']) - backend_types[backend] = set(expand(backend_types[backend])) - - # special case remove Half for cpu unless it is explicitly enabled - if not option.get('cpu_half', False): - if 'CPU' in backend_types: - backend_types['CPU'].discard('Half') - - # special case remove BFloat16 for cpu and cuda unless it is explicitly enabled - if not option.get('cpu_bfloat16', False): - if 'CPU' in backend_types: - backend_types['CPU'].discard('BFloat16') - - if not option.get('cuda_bfloat16', False): - if 'CUDA' in backend_types: - backend_types['CUDA'].discard('BFloat16') - - # special cases remove bool for cpu and cuda unless it is explicitly enabled - if not option.get('cpu_bool', False): - if 'CPU' in backend_types: - backend_types['CPU'].discard('Bool') - - if not option.get('cuda_bool', False): - if 'CUDA' in backend_types: - backend_types['CUDA'].discard('Bool') - - # sort the result for easy reading - for backend in backend_types.keys(): - backend_types[backend] = sorted(backend_types[backend]) - option['backend_types'] = backend_types - - -def exclude(declaration): - return 'only_register' in declaration or declaration.get('name') == 'ndimension' - - -def add_variants(option): - option.setdefault('variants', ['method']) - -# if we have 'output' arguments, generate a variant where -# 
we mark oututs as allocate = True, and where the method variant -# is disabled... - - -def handle_outputs_taken_as_arguments(options): - new_options = [] - - def is_nullable(arg): - return (arg['type'] in {'THIntegerTensor*', 'THTensor*'} and - arg.get('default', '') in {None, 'NULL', 'nullptr'}) - - def should_generate_out_variant(option): - if 'function' in option['variants'] and option['mode'] != 'native': - # don't generate _out variants for in-place functions - return re.search('(^__i|[^_]_$)', option['api_name']) is None - return False - - for option in options: - for arg in option['arguments']: - # mark arguments which can be null - if is_nullable(arg): - arg['is_nullable'] = True - - if any('output' in arg for arg in option['arguments']): - allocate_option = deepcopy(option) - # the allocating option needs to be marked - for arg in allocate_option['arguments']: - if 'output' in arg: - arg['allocate'] = True - - # the original option, which takes arguments for the results, - # is no longer a method, and has _out added to indicte it takes - # output arguments - if should_generate_out_variant(option): - if 'method' in option['variants']: - option['variants'].remove('method') - option['api_name'] += '_out' - new_options.append(option) - - new_options.append(allocate_option) - else: - new_options.append(option) - return new_options - - -def sanitize_return(option): - ret = option['return'] - m = re.match(r'argument (\d+(,\d+)*)', ret) - if m is not None: - arguments = [int(x) for x in m.group(1).split(',')] - option['return'] = {'kind': 'arguments', 'arguments': arguments} - elif ret == 'self': - option['return'] = {'kind': 'arguments', 'arguments': []} - for i, x in enumerate(option['arguments']): - if x['name'] == 'self': - option['return']['arguments'].append(i) - break - else: - option['return'] = {'kind': 'type', 'type': option['return']} - - -def set_mode(option): - option['mode'] = option.get('mode', 'TH') - - -def is_extended_method(option): - if 'method' in option['variants']: - return False - else: - return True - - -def run(declarations): - declarations = [d for d in declarations if not exclude(d)] - non_extended_methods = set() - for declaration in declarations: - common_with_cwrap.set_declaration_defaults(declaration) - declaration['options'] = [deepcopy(o) for o in declaration['options']] - declaration['options'] = common_with_cwrap.filter_unique_options( - declaration['options'], - allow_kwarg=False, - type_to_signature=TYPE_FORMAL_GENERIC, - remove_self=True) - - common_with_cwrap.sort_by_number_of_args(declaration) - - for option in declaration['options']: - set_mode(option) - if option['mode'] != 'native': - sanitize_return(option) - process_types_and_backends(option) - add_variants(option) - if not is_extended_method(option): - non_extended_methods.add(option['api_name']) - declaration['options'] = handle_outputs_taken_as_arguments( - declaration['options']) - # We (very unfortunately) have overloaded virtual methods. Because - # of C++'s rules, we cannot move one overload without doing some - # extra work to make sure that overload in a superclass and an - # overload in a subclass resolve together. I've chosen to resolve - # this problem simply by moving ALL overloads of a method which - # occurs in Tensor to Type. This is why we have to first compute - # which methods *names* go on type, and then move ALL overloads - # of this name to Type. 
- for declaration in declarations: - for option in declaration['options']: - option['extended_method'] = option['api_name'] not in non_extended_methods - return declarations diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index c00fcc789aa..bdda67afb1f 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -304,10 +304,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) # Generate files set(TOOLS_PATH "${TORCH_ROOT}/tools") - configure_file("${TORCH_ROOT}/aten/src/ATen/common_with_cwrap.py" - "${TOOLS_PATH}/shared/cwrap_common.py" - COPYONLY) - configure_file("${TORCH_SRC_DIR}/_utils_internal.py" "${TOOLS_PATH}/shared/_utils_internal.py" COPYONLY) diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index aa2d0f7ddbc..791df87e3a0 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -36,10 +36,10 @@ if args.aten_root: if not os.path.exists(args.aten_root): raise ValueError('aten_root ({}) does not exist'.format( args.aten_root)) - sys.path.append(os.path.join(args.aten_root, 'src', 'ATen')) - from code_template import CodeTemplate as CT + sys.path.append(os.path.join(args.aten_root, '..')) # TODO: fix this + from tools.codegen.code_template import CodeTemplate as CT else: - from src.ATen.code_template import CodeTemplate as CT # type: ignore[import,no-redef] + from tools.codegen.code_template import CodeTemplate as CT # type: ignore[import,no-redef] OP_TEMPLATE = CT.from_file( os.path.join(args.template_dir, 'aten_op_template.h')) diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index 9116dd2e317..61501a1105d 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -144,13 +144,7 @@ if(INTERN_BUILD_ATEN_OPS) endforeach() list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp}) - set(cwrap_files - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/Declarations.cwrap - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/THCUNN/generic/THCUNN.h - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/nn.yaml - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml) - - file(GLOB all_python "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/*.py") + file(GLOB all_python "${CMAKE_CURRENT_LIST_DIR}/../tools/codegen/*.py") set(GEN_ROCM_FLAG) if(USE_ROCM) @@ -189,11 +183,10 @@ if(INTERN_BUILD_ATEN_OPS) endif() set(GEN_COMMAND - "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/gen.py + "${PYTHON_EXECUTABLE}" -m tools.codegen.gen --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen ${GEN_ROCM_FLAG} - ${cwrap_files} ${CUSTOM_BUILD_FLAGS} ${GEN_VULKAN_FLAGS} ) @@ -202,6 +195,7 @@ if(INTERN_BUILD_ATEN_OPS) COMMAND ${GEN_COMMAND} --output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt RESULT_VARIABLE RETURN_VALUE + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. ) if(NOT RETURN_VALUE EQUAL 0) message(STATUS ${generated_cpp}) @@ -219,7 +213,10 @@ if(INTERN_BUILD_ATEN_OPS) add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} ${core_generated_cpp} COMMAND ${GEN_COMMAND} - DEPENDS ${all_python} ${all_templates} ${cwrap_files}) + DEPENDS ${all_python} ${all_templates} + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. + ) # Generated headers used from a CUDA (.cu) file are # not tracked correctly in CMake. 
We make the libATen.so depend explicitly diff --git a/docs/cpp/source/check-doxygen.sh b/docs/cpp/source/check-doxygen.sh index 454ea228dd5..b258a412141 100755 --- a/docs/cpp/source/check-doxygen.sh +++ b/docs/cpp/source/check-doxygen.sh @@ -14,16 +14,9 @@ command -v doxygen >/dev/null 2>&1 || { echo >&2 "doxygen is not supported. Abor pushd "$(dirname "$0")/../../.." -cp aten/src/ATen/common_with_cwrap.py tools/shared/cwrap_common.py cp torch/_utils_internal.py tools/shared -python aten/src/ATen/gen.py \ - -s aten/src/ATen \ - -d build/aten/src/ATen \ - aten/src/ATen/Declarations.cwrap \ - aten/src/THCUNN/generic/THCUNN.h \ - aten/src/ATen/nn.yaml \ - aten/src/ATen/native/native_functions.yaml +python -m tools.codegen.gen python tools/setup_helpers/generate_code.py \ --declarations-path build/aten/src/ATen/Declarations.yaml \ diff --git a/mypy-strict.ini b/mypy-strict.ini index 21563d4be91..95a8d599606 100644 --- a/mypy-strict.ini +++ b/mypy-strict.ini @@ -29,5 +29,4 @@ warn_return_any = True implicit_reexport = False strict_equality = True -files = - aten/src/ATen/code_template.py +files = tools/codegen/gen.py diff --git a/mypy.ini b/mypy.ini index d2765089197..9b73e839d29 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,7 +17,6 @@ check_untyped_defs = True files = torch, caffe2, - aten/src/ATen/function_wrapper.py, test/test_complex.py, test/test_futures.py, test/test_torch.py, diff --git a/requirements.txt b/requirements.txt index 29fb620ec62..07127f738ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ requests setuptools six typing_extensions +dataclasses diff --git a/setup.py b/setup.py index 508dcdd94e9..8c060a1c5e3 100644 --- a/setup.py +++ b/setup.py @@ -351,8 +351,8 @@ def build_deps(): # Use copies instead of symbolic files. # Windows has very poor support for them. - sym_files = ['tools/shared/cwrap_common.py', 'tools/shared/_utils_internal.py'] - orig_files = ['aten/src/ATen/common_with_cwrap.py', 'torch/_utils_internal.py'] + sym_files = ['tools/shared/_utils_internal.py'] + orig_files = ['torch/_utils_internal.py'] for sym_file, orig_file in zip(sym_files, orig_files): same = False if os.path.exists(sym_file): @@ -368,7 +368,7 @@ def build_deps(): ################################################################################ # the list of runtime dependencies required by this built package -install_requires = ['future', 'typing_extensions'] +install_requires = ['future', 'typing_extensions', 'dataclasses'] missing_pydep = ''' Missing build dependency: Unable to `import {importname}`. diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index bccaa21b811..c566e6f9026 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -25,6 +25,8 @@ from torch._C import parse_schema # 1: date until which the allowlist entry is valid # 2: (optional) function argument regex # ] +# +# NB: function name DOES NOT include overload name! 
allow_list = [ ("c10_experimental", datetime.date(2222, 1, 1)), # We export some functions and classes for test_jit.py directly from libtorch.so, @@ -69,9 +71,11 @@ allow_list = [ ("aten::gcd", datetime.date(2020, 7, 30)), ("aten::unflatten", datetime.date(2020, 8, 14)), ("aten::linalg_outer", datetime.date(2020, 8, 30)), + # WARNING: overload name here doesn't do anything ("aten::linalg_outer.out", datetime.date(2020, 8, 30)), ("aten::_compute_linear_combination", datetime.date(2020, 9, 1)), ("__getstate__", datetime.date(2020, 9, 1), "Conv[23]dPackedParams"), + ("aten::_foreach_add_", datetime.date(2020, 10, 1)), ] diff --git a/test/test_type_hints.py b/test/test_type_hints.py index 3f6e1215a10..55c080e2cc5 100644 --- a/test/test_type_hints.py +++ b/test/test_type_hints.py @@ -215,7 +215,7 @@ class TestTypeHints(TestCase): finally: os.chdir(cwd) if result != 0: - self.fail("mypy failed: {}".format(stdout)) + self.fail("mypy failed: {} {}".format(stdout, stderr)) @unittest.skipIf(not HAVE_MYPY, "need mypy") def test_run_mypy_strict(self): @@ -237,7 +237,7 @@ class TestTypeHints(TestCase): finally: os.chdir(cwd) if result != 0: - self.fail("mypy failed: {}".format(stdout)) + self.fail("mypy failed: {} {}".format(stdout, stderr)) if __name__ == '__main__': run_tests() diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 98d9c463581..82d908de618 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -154,7 +154,6 @@ def load_aten_declarations(path): if has_tensoroptions_argument(declaration): declaration['schema_order_args'] = [process_schema_order_arg(arg) for arg in declaration['schema_order_args']] declaration['api_name'] = declaration['name'] - # NB: keep this in sync with common_with_cwrap.py if declaration.get('overload_name'): declaration['type_wrapper_name'] = "{}_{}".format( declaration['name'], declaration['overload_name']) diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 4ea27f01cc4..834916a72b5 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -35,11 +35,7 @@ import re from .gen_variable_type import should_trace from .utils import write, is_tensor_method -try: - from src.ATen.code_template import CodeTemplate -except ImportError: - from tools.shared.module_loader import import_module - CodeTemplate = import_module('code_template', 'aten/src/ATen/code_template.py').CodeTemplate +from tools.codegen.code_template import CodeTemplate # # declarations blocklist diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index c2b8688de91..7329495ac99 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -216,7 +216,15 @@ ${return_type} ${type_wrapper_name}(${formals}) { } """) -# See NOTE[UnboxedOnly] in function_wrapper.py +# NOTE[UnboxedOnly] Many of our codegen templates currently exist twice, once +# in an _UNBOXEDONLY_ variant and once without _UNBOXEDONLY_. This is because +# ops that are `use_c10_dispatcher: full` need different c++ code than ops +# that aren't `use_c10_dispatcher: full` yet. The _UNBOXEDONLY_ variants +# are for ops that aren't `use_c10_dispatcher: full` yet and those code templates +# can be deleted once all ops are `use_c10_dispatcher: full`. +# If you update one of the templates, you likely also have to update the other. 
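To make NOTE[UnboxedOnly] above concrete, here is a small editorial sketch (not part of the patch) of how generation code can choose between such a template pair based on an op's `use_c10_dispatcher` setting. `UNBOXEDONLY_WRAPPER_REGISTRATION` is copied from this file; the body of the plain `WRAPPER_REGISTRATION` counterpart and the `wrapper_registration_template` helper are assumptions for illustration only.

from tools.codegen.code_template import CodeTemplate

# Legacy variant, verbatim from this file: registers an unboxed-only kernel.
UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\
m.impl_UNBOXED("${unqual_operator_name_with_overload}", &${class_type}::${type_wrapper_name});
""")

# Assumed shape of the non-_UNBOXEDONLY_ counterpart used for
# `use_c10_dispatcher: full` ops (illustrative body, not the real template).
WRAPPER_REGISTRATION = CodeTemplate("""\
m.impl("${unqual_operator_name_with_overload}",
       TORCH_FN(${class_type}::${type_wrapper_name}));
""")

def wrapper_registration_template(use_c10_dispatcher_full: bool) -> CodeTemplate:
    # Ops that are `use_c10_dispatcher: full` take the boxed-compatible path;
    # everything else still needs the _UNBOXEDONLY_ variant until migrated.
    return WRAPPER_REGISTRATION if use_c10_dispatcher_full else UNBOXEDONLY_WRAPPER_REGISTRATION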
+ +# See NOTE[UnboxedOnly] UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\ m.impl_UNBOXED("${unqual_operator_name_with_overload}", &${class_type}::${type_wrapper_name}); """) @@ -366,7 +374,7 @@ ${return_type} ${api_name}(${declaration_formals}); // {"schema": "${schema_stri # TraceType templates # TODO: change `redispatch` to `NoTracerDispatchMode` + regular `call`. -# See NOTE[UnboxedOnly] in function_wrapper.py +# See NOTE[UnboxedOnly] UNBOXED_TRACE_DISPATCH = CodeTemplate("""\ static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") diff --git a/tools/autograd/utils.py b/tools/autograd/utils.py index 96add9ad385..92f8fe89f56 100644 --- a/tools/autograd/utils.py +++ b/tools/autograd/utils.py @@ -9,11 +9,7 @@ __all__ = [ 'split_name_params', 'write', ] -try: - from src.ATen.code_template import CodeTemplate -except ImportError: - from tools.shared.module_loader import import_module - CodeTemplate = import_module('code_template', 'aten/src/ATen/code_template.py').CodeTemplate +from tools.codegen.code_template import CodeTemplate # You should use these lines, rather than doing it manually. # Especially if you see this error! diff --git a/tools/codegen/__init__.py b/tools/codegen/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tools/codegen/api/__init__.py b/tools/codegen/api/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py new file mode 100644 index 00000000000..452c3721ab9 --- /dev/null +++ b/tools/codegen/api/cpp.py @@ -0,0 +1,241 @@ +from tools.codegen.model import * +from tools.codegen.api.types import TensorOptionsArguments, CppArgument, ThisArgument +import tools.codegen.local as local +from typing import Optional, Sequence, Union, Callable, List + +# This file describes the translation of JIT schema to the public C++ +# API, which is what people use when they call functions like at::add. +# +# Prominent characteristics of the C++ API: +# +# - dtype, layout, device and pin_memory are collected into +# a single C++ type TensorOptions (the legacy dispatcher API +# also has this, but tensor options is really most relevant +# for the C++ API; it makes calling kwarg factory functions +# pleasant) +# +# - for 'use_c10_dispatcher: full' functions, optional tensors are +# represented explicitly using c10::optional +# +# - defaulting lives here (in fact, the dispatcher is completely +# oblivious of defaults!) +# +# BTW: policy on name collisions: we try not to have types with +# collisions, but functions are fair game to collide + +def name(func: FunctionSchema) -> str: + name = str(func.name.name) + if func.is_out_fn(): + name += '_out' + return name + +# Translation of "value types" in JIT schema to C++ API type. Value +# types look the same no matter if they are argument types are return +# types. Returns None if the type in question is not a value type. 
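Before the implementation of `valuetype_type` just below, a quick editorial sanity sketch of the mapping it is meant to produce. It assumes `Type.parse` (used later in this file for `ScalarType`, `Layout`, and friends) also accepts these schema type strings; the expected results are read directly off the cases in the function body.

from tools.codegen.model import Type
import tools.codegen.api.cpp as cpp

assert cpp.valuetype_type(Type.parse('int')) == 'int64_t'
assert cpp.valuetype_type(Type.parse('float')) == 'double'
assert cpp.valuetype_type(Type.parse('int?')) == 'c10::optional<int64_t>'
assert cpp.valuetype_type(Type.parse('Tensor')) is None  # Tensor is not a value type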
+def valuetype_type(t: Type) -> Optional[str]: + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + return None + elif t.name == BaseTy.int: + return 'int64_t' + elif t.name == BaseTy.float: + return 'double' + elif t.name == BaseTy.str: + return 'std::string' + elif t.name in [BaseTy.bool, BaseTy.QScheme, BaseTy.Scalar, + BaseTy.ScalarType, BaseTy.Generator, BaseTy.Storage, + BaseTy.Layout, BaseTy.Device, BaseTy.MemoryFormat, + BaseTy.Dimname, BaseTy.ConstQuantizerPtr]: + # These C++ names line up with their schema names + return t.name.name + else: + raise AssertionError(f"unsupported type: {t}") + elif isinstance(t, OptionalType): + elem = valuetype_type(t.elem) + if elem is None: + return None + return f"c10::optional<{elem}>" + elif isinstance(t, ListType): + if str(t.elem) == 'bool': + assert t.size is not None + return f"std::array" + else: + return None + else: + raise AssertionError(f"unrecognized type {repr(t)}") + +# Translation of types occuring in JIT arguments to a C++ argument type. +def argumenttype_type(t: Type, *, mutable: bool) -> str: + # If it's a value type, do the value type translation + r = valuetype_type(t) + if r is not None: + return r + + if str(t) == 'Tensor' and mutable and local.hack_const_mutable_self(): + return 'const Tensor &' + + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + if mutable: + return 'Tensor &' + else: + return 'const Tensor &' + else: + raise AssertionError(f"base type should have been value type {t}") + elif isinstance(t, OptionalType): + if str(t.elem) == 'Tensor': + if mutable: + return 'Tensor &' # TODO: fix this discrepancy + else: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + return 'const c10::optional&' + else: + return 'const Tensor &' + elem = argumenttype_type(t.elem, mutable=mutable) + return f"c10::optional<{elem}>" + elif isinstance(t, ListType): + # TODO: remove these special cases, ArrayRef fallthrough works fine + if str(t.elem) == 'int': + return "IntArrayRef" + elif str(t.elem) == 'Tensor': + return "TensorList" + elif str(t.elem) == 'Dimname': + return "DimnameList" + # TODO: do something reasonable about lists of optional tensors + elif not local.use_c10_dispatcher() is UseC10Dispatcher.full and str(t.elem) == 'Tensor?': + return "TensorList" + elem = argumenttype_type(t.elem, mutable=mutable) + # TODO: explicitly qualify namespace here + return f"ArrayRef<{elem}>" + else: + raise AssertionError(f"unrecognized type {repr(t)}") + +# Translate a JIT argument into its C++ type +def argument_type(a: Argument) -> str: + return argumenttype_type(a.type, mutable=a.is_write) + +# Translation of a (non-multi) return type from JIT to C++ +def returntype_type(t: Type, *, mutable: bool) -> str: + r = valuetype_type(t) + if r is not None: + return r + + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + if mutable: + return 'Tensor &' + else: + return 'Tensor' + elif isinstance(t, ListType): + elem = returntype_type(t.elem, mutable=mutable) + assert t.size is None, f"fixed size list returns not supported: {t}" + return f"std::vector<{elem}>" + + raise AssertionError(f"unrecognized return type {t}") + +# Translation of a single return to its C++ type +def return_type(r: Return) -> str: + return returntype_type(r.type, mutable=r.is_write) + +# Translation of a full (possibly multi) return from JIT to its C++ type +def returns_type(rs: Sequence[Return]) -> str: + if len(rs) == 0: + return 'void' + elif len(rs) == 1: + return return_type(rs[0]) + else: + args = ','.join(map(return_type, rs)) 
+ return f'std::tuple<{args}>' + +JIT_TO_CPP_DEFAULT = { + 'False': 'false', + 'True': 'true', + 'None': 'c10::nullopt', # UGH this one is type directed + 'Mean': 'at::Reduction::Mean', + '[]': '{}', + '[0,1]': '{0,1}', # TODO: stop special casing + 'contiguous_format': 'MemoryFormat::Contiguous', +} + +# Convert a JIT default into C++ expression representing the default +def default_expr(d: str, t: Type) -> str: + if d == 'None' and str(t) == 'Tensor?': + return '{}' + return JIT_TO_CPP_DEFAULT.get(d, d) + +# Convert an argument into its C++ API form +def argument(a: Union[Argument, TensorOptionsArguments, ThisArgument]) -> CppArgument: + if isinstance(a, Argument): + return CppArgument( + type=argument_type(a), + name=a.name, + default=default_expr(a.default, a.type) if a.default is not None else None, + argument=a, + ) + elif isinstance(a, ThisArgument): + return CppArgument( + type=argument_type(a.argument), + name="const_cast(*this)", # this is an abuse but it's convenient + default=None, + argument=a, + ) + elif isinstance(a, TensorOptionsArguments): + default = None + if all(x.default == "None" for x in a.all()): + default = '{}' + elif a.dtype.default == "long": + default = 'at::kLong' # TODO: this is wrong + return CppArgument( + type='const TensorOptions &', + name='options', + default=default, + argument=a, + ) + else: + assert_never(a) + +def group_arguments( + func: FunctionSchema, *, method: bool = False +) -> Sequence[Union[Argument, TensorOptionsArguments, ThisArgument]]: + args: List[Union[Argument, ThisArgument, TensorOptionsArguments]] = [] + args.extend(func.out_arguments) + + if method: + args.extend(ThisArgument(a) if a.name == "self" else a for a in func.arguments) + else: + args.extend(func.arguments) + + # group up arguments for tensor options + + def pred(name: str, ty: Type) -> Callable[[Argument], bool]: + return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] + predicates = [ # order matters + pred('dtype', Type.parse('ScalarType')), + pred('layout', Type.parse('Layout')), + pred('device', Type.parse('Device')), + pred('pin_memory', Type.parse('bool')), + ] + + i = 0 + while i < len(func.kwarg_only_arguments): + # If there is enough space... 
+ if i <= len(func.kwarg_only_arguments) - len(predicates): + # And the next len(predicates) arguments look like TensorOptions arguments + if all(p(a) for p, a in zip(predicates, func.kwarg_only_arguments[i : i + len(predicates)])): + # Group them together as one argument + args.append(TensorOptionsArguments( + dtype=func.kwarg_only_arguments[i], + layout=func.kwarg_only_arguments[i + 1], + device=func.kwarg_only_arguments[i + 2], + pin_memory=func.kwarg_only_arguments[i + 3], + )) + i += len(predicates) + continue + args.append(func.kwarg_only_arguments[i]) + i += 1 + + return args + +# Convert arguments to C++ API form +def arguments(func: FunctionSchema, *, method: bool = False) -> Sequence[CppArgument]: + return list(map(argument, group_arguments(func, method=method))) diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py new file mode 100644 index 00000000000..34960534275 --- /dev/null +++ b/tools/codegen/api/dispatcher.py @@ -0,0 +1,109 @@ +from tools.codegen.model import * + +from tools.codegen.api.types import CppArgument, DispatcherExpr, TensorOptionsArguments, \ + DispatcherArgument, ThisArgument, LegacyDispatcherArgument +import tools.codegen.api.cpp as cpp +import tools.codegen.api.legacy_dispatcher as legacy_dispatcher +import tools.codegen.local as local + +import itertools +from typing import Sequence, Optional + +# This file describes the translation of JIT schema to the dispatcher +# API, the *unboxed* calling convention by which invocations through +# the dispatcher are made. Historically, the dispatcher API matched +# the C++ API, but with the establishment of the boxed API, we've +# made changes to the dispatcher API to so that the unboxed API +# better aligns with the boxed API. The dispatcher API hooks heavily +# into our template based boxing/unboxing machinery, so changes +# to this convention will usually need template updates too. +# +# Prominent characteristics of the dispatcher API: +# +# - 'use_c10_dispatcher: full' controls whether or not we actually +# use the modern calling convention or not. When use_c10_dispatcher +# is not enabled, we don't use the template machinery. +# +# - dtype, layout, device and pin_memory are represented as separate +# arguments. +# + +def argumenttype_type(t: Type, *, mutable: bool) -> str: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + # This is a faux amis. If it makes sense in the future to add + # more special cases here, or invert things so cpp.argument_type + # calls this, or just completely inline the function, please do + # it. + return cpp.argumenttype_type(t, mutable=mutable) + else: + # This is real sharing. If you're modifying this path, ask + # yourself why you are changing the legacy dispatcher protocol + # here and not in legacy_dispatcher. + return legacy_dispatcher.argumenttype_type(t, mutable=mutable) + +def argument_type(a: Argument) -> str: + return argumenttype_type(a.type, mutable=a.is_write) + +def returns_type(rs: Sequence[Return]) -> str: + # At present, there is no difference. But there could be! 
+ return cpp.returns_type(rs) + +def argument(a: Argument) -> DispatcherArgument: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + return DispatcherArgument( + type=argument_type(a), + name=a.name, + argument=a, + ) + else: + la = legacy_dispatcher.argument(a) + return DispatcherArgument( + type=la.type, + name=la.name, + argument=la.argument, + ) + +def arguments(func: FunctionSchema) -> Sequence[DispatcherArgument]: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + return list(map(argument, itertools.chain(func.out_arguments, func.arguments, func.kwarg_only_arguments))) + else: + return [ + DispatcherArgument(type=la.type, name=la.name, argument=la.argument) + for la in legacy_dispatcher.arguments(func) + ] + +# Given a set of CppArguments in scope, return a sequence of dispatcher +# expressions that translate the cpp API into dispatcher API +def cppargument_exprs(a: CppArgument, *, tensor_options: Optional[CppArgument]) -> Sequence[DispatcherExpr]: + if isinstance(a.argument, TensorOptionsArguments): + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + ta = a.argument + return [ + DispatcherExpr(type=argument_type(ta.dtype), expr=f'optTypeMetaToScalarType({a.name}.dtype_opt())'), + DispatcherExpr(type=argument_type(ta.layout), expr=f'{a.name}.layout_opt()'), + DispatcherExpr(type=argument_type(ta.device), expr=f'{a.name}.device_opt()'), + DispatcherExpr(type=argument_type(ta.pin_memory), expr=f'{a.name}.pinned_memory_opt()'), # weird discrep + ] + else: + return [DispatcherExpr(type='const TensorOptions &', expr=a.name)] + elif isinstance(a.argument, Argument): + if a.name == 'memory_format' and tensor_options is not None and local.use_c10_dispatcher() is UseC10Dispatcher.full: + return [DispatcherExpr( + type=argument_type(a.argument), + expr=f'c10::impl::check_tensor_options_and_extract_memory_format({tensor_options.name}, {a.name})') + ] + else: + return [DispatcherExpr(type=argument_type(a.argument), expr=a.name)] + elif isinstance(a.argument, ThisArgument): + return [DispatcherExpr(type=argument_type(a.argument.argument), expr=a.name)] + else: + assert_never(a.argument) + +def cpparguments_exprs(args: Sequence[CppArgument]) -> Sequence[DispatcherExpr]: + tensor_options = next((a for a in args if isinstance(a.argument, TensorOptionsArguments)), None) + return [r for a in args for r in cppargument_exprs(a, tensor_options=tensor_options)] + +# I don't think this is entirely sound, but it should be reasonably +# close +def legacydispatcherarguments_exprs(args: Sequence[LegacyDispatcherArgument]) -> Sequence[DispatcherExpr]: + return cpparguments_exprs([CppArgument(type=a.type, name=a.name, default=None, argument=a.argument) for a in args]) diff --git a/tools/codegen/api/legacy_dispatcher.py b/tools/codegen/api/legacy_dispatcher.py new file mode 100644 index 00000000000..db3d26c84fd --- /dev/null +++ b/tools/codegen/api/legacy_dispatcher.py @@ -0,0 +1,74 @@ +from tools.codegen.model import * + +from tools.codegen.api.types import TensorOptionsArguments, LegacyDispatcherArgument, ThisArgument +import tools.codegen.api.cpp as cpp + +from typing import Union, Sequence + +# This file describes the translation of JIT schema to the legacy +# dispatcher API. This looks a lot like the C++ API (which +# makes historical sense, because historically the dispatcher API +# and the C++ API exactly matched), but over time we have +# evolved the C++ API without actually changing our native:: +# kernels. To be deleted eventually. 
Dispatcher calls use +# this when you are not use_c10_dispatcher: full. + +def name(func: FunctionSchema) -> str: + name = str(func.name.name) + # TODO: delete this! + if func.is_out_fn(): + name += '_out' + if func.name.overload_name: + name += f'_{func.name.overload_name}' + return name + +def argumenttype_type(t: Type, *, mutable: bool) -> str: + if str(t) == 'Tensor?': + if mutable: + return 'Tensor &' + else: + return 'const Tensor &' + elif str(t) == 'Tensor?[]': + return 'TensorList' + return cpp.argumenttype_type(t, mutable=mutable) + +def returns_type(rs: Sequence[Return]) -> str: + return cpp.returns_type(rs) + +def argument_type(a: Argument) -> str: + return argumenttype_type(a.type, mutable=a.is_write) + +def argument(a: Union[Argument, ThisArgument, TensorOptionsArguments]) -> LegacyDispatcherArgument: + if isinstance(a, Argument): + return LegacyDispatcherArgument( + type=argument_type(a), + name=a.name, + default=cpp.default_expr(a.default, a.type) if a.default is not None else None, + argument=a, + ) + elif isinstance(a, ThisArgument): + # Erase ThisArgument from the distinction + return LegacyDispatcherArgument( + type=argument_type(a.argument), + name=a.argument.name, + default=None, + argument=a.argument, + ) + elif isinstance(a, TensorOptionsArguments): + # TODO: expunge this logic entirely + default = None + if all(x.default == "None" for x in a.all()): + default = '{}' + elif a.dtype.default == "long": + default = 'at::kLong' # TODO: this is wrong + return LegacyDispatcherArgument( + type='const TensorOptions &', + name='options', + default=default, + argument=a, + ) + else: + assert_never(a) + +def arguments(func: FunctionSchema) -> Sequence[LegacyDispatcherArgument]: + return list(map(argument, cpp.group_arguments(func))) diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py new file mode 100644 index 00000000000..cb315cfc752 --- /dev/null +++ b/tools/codegen/api/types.py @@ -0,0 +1,95 @@ +from tools.codegen.model import * +from dataclasses import dataclass +from typing import Optional, Union, Sequence + +# Represents the implicit *this argument for method calls in C++ API +@dataclass(frozen=True) +class ThisArgument: + argument: Argument + +# Bundle of arguments that represent a TensorOptions in the C++ API. +@dataclass(frozen=True) +class TensorOptionsArguments: + dtype: Argument + layout: Argument + device: Argument + pin_memory: Argument + + def all(self) -> Sequence[Argument]: + return [self.dtype, self.layout, self.device, self.pin_memory] + +# Describe a argument (e.g., the x in "f(int x)") in the C++ API +@dataclass(frozen=True) +class CppArgument: + # C++ type, e.g., int + type: str + # C++ name, e.g., x + name: str + # Only used by the header, but we work it out in all cases anyway + default: Optional[str] + # The JIT argument(s) this formal was derived from. May + # correspond to multiple arguments if this is TensorOptions! + # May also correspond to the implicit *this argument! 
+ argument: Union[Argument, TensorOptionsArguments, ThisArgument] + + # Default string representation prints the most elaborated form + # of the formal + def __str__(self) -> str: + mb_default = "" + if self.default is not None: + mb_default = f"={self.default}" + return f"{self.type} {self.name}{mb_default}" + + # However, you might also find the version with no default useful + def str_no_default(self) -> str: + return f"{self.type} {self.name}" + +@dataclass(frozen=True) +class CppExpr: + type: str + expr: str + +@dataclass(frozen=True) +class DispatcherExpr: + type: str + expr: str + +@dataclass(frozen=True) +class LegacyDispatcherExpr: + type: str + expr: str + +@dataclass(frozen=True) +class DispatcherArgument: + type: str + name: str + # dispatcher NEVER has defaults + argument: Union[Argument, TensorOptionsArguments] + # TensorOptionsArguments can occur when not using full c10 dispatch + + def __str__(self) -> str: + return f"{self.type} {self.name}" + +@dataclass(frozen=True) +class LegacyDispatcherArgument: + type: str + name: str + # Legacy dispatcher arguments have defaults for some reasons (e.g., + # the function prototypes in CPUType.h are defaulted). There isn't + # really any good reason to do this, as these functions are only + # ever called from a context where all defaulted arguments are + # guaranteed to be given explicitly. + # TODO: Remove this + default: Optional[str] + argument: Union[Argument, TensorOptionsArguments] + + # Convention here is swapped because arguably legacy + # dispatcher shouldn't have defaults... + def __str__(self) -> str: + return f"{self.type} {self.name}" + + def str_with_default(self) -> str: + mb_default = "" + if self.default is not None: + mb_default = f"={self.default}" + return f"{self.type} {self.name}{mb_default}" diff --git a/aten/src/ATen/code_template.py b/tools/codegen/code_template.py similarity index 100% rename from aten/src/ATen/code_template.py rename to tools/codegen/code_template.py diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py new file mode 100644 index 00000000000..a67901ea594 --- /dev/null +++ b/tools/codegen/gen.py @@ -0,0 +1,1111 @@ +import os +import contextlib +import textwrap +import itertools +from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, DefaultDict, Union, Sequence +import yaml +from enum import Enum +from collections import OrderedDict +import argparse +import pathlib +import functools + +from tools.codegen.code_template import CodeTemplate +from tools.codegen.model import * +from tools.codegen.api.types import * +import tools.codegen.api.cpp as cpp +import tools.codegen.api.dispatcher as dispatcher +import tools.codegen.api.legacy_dispatcher as legacy_dispatcher +import tools.codegen.local as local + +try: + # use faster C loader if available + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader # type: ignore + +# Welcome to the ATen code generator v2! The ATen code generator is +# responsible for parsing native_functions.yaml and then generating +# various generated files (e.g., TypeDefault.cpp) based on the operators +# defined in this file. This means that the code generator knows how to +# parse function schema, and then translate this into various C++ types +# and boilerplate code. +# +# Some things to know about this file when you modify it: +# +# - This file has STRICT mypy typechecking. 
Typecheck it with +# `mypy --config mypy-strict.ini` in the root source directory +# +# - Most of the heavy lifting lives in external modules: +# - 'model' has the data model for native_functions.yaml. The classes +# in those file represent what you see when you look at +# a native_functions.yaml +# - 'api' has conversions for how to translate JIT schema into +# the various C++ APIs that the codegen interacts with. There +# are in fact THREE different C++ APIs: the public C++ API, +# the dispatcher API, and the legacy disaptcher API. See each +# of these respective files for more information + + +# Note [Byte-for-byte compatibility] +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Some special cases we have made in this codegen have been strictly +# to make sure that git diff -w reports no changes, but we believe +# they are not semantically meaningful. After landing the new codegen, +# we should remove these special cases + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# HELPER FUNCTIONS +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +# Conveniently add error context to exceptions raised. Lets us +# easily say that an error occurred while processing a specific +# context. +@contextlib.contextmanager +def context(msg: str) -> Iterator[None]: + try: + yield + except Exception as e: + # TODO: this does the wrong thing with KeyError + msg = textwrap.indent(msg, ' ') + msg = f'{e.args[0]}\n{msg}' if e.args else msg + e.args = (msg,) + e.args[1:] + raise + +# A custom loader for YAML to let us also keep track of line numbers +# of each entry in the YAML file +class LineLoader(Loader): + def construct_mapping(self, node, deep=False): # type: ignore + mapping = super().construct_mapping(node, deep=deep) # type: ignore + # Add 1 so line numbering starts at 1 + mapping['__line__'] = node.start_mark.line + 1 + return mapping + +# Parse native_functions.yaml into a sequence of NativeFunctions +def parse_native_yaml(path: str) -> List[NativeFunction]: + with open(path, 'r') as f: + es = yaml.load(f, Loader=LineLoader) + assert isinstance(es, list) + rs: List[NativeFunction] = [] + for e in es: + assert isinstance(e.get('__line__'), int), e + loc = Location(path, e['__line__']) + funcs = e.get('func') + with context(f'in {loc}:\n {funcs}'): + rs.append(NativeFunction.from_yaml(e, loc)) + return rs + +T = TypeVar('T') +S = TypeVar('S') + +# Given a function that operates on NativeFunction, wrap it into a new function +# that sets some appropriate context managers for that native function. +# YOU MUST WRAP FUNCTIONS IN THIS for calls to api modules to be sound +# (you will get an error if we try to access the local variables without having +# set them). 
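A minimal editorial usage sketch for the decorator defined just below, written as if it lived in this module (so `NativeFunction`, `local`, and `parse_native_yaml` are in scope); `emit_comment` is a made-up callback, but the pattern mirrors the compute_* functions later in this file.

@with_native_function
def emit_comment(f: NativeFunction) -> str:
    # Inside the wrapper it is sound to consult the per-function context,
    # e.g. local.use_c10_dispatcher(), and any exception raised here gets
    # the "in {f.loc}" context added to its message.
    return f"// aten::{f.func}"

# e.g.: list(map(emit_comment, parse_native_yaml('aten/src/ATen/native/native_functions.yaml')))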
+def with_native_function(func: Callable[[NativeFunction], T]) -> Callable[[NativeFunction], T]: + @functools.wraps(func) + def wrapper(f: NativeFunction) -> T: + with context(f'in {f.loc}:\n {f.func}'): + with local.parametrize( + use_c10_dispatcher=f.use_c10_dispatcher, + # See Note [Byte-for-byte compatibility] + hack_const_mutable_self=str(f.func.name) in ["set_data", "retain_grad"], + ): + return func(f) + return wrapper + +# These two functions purposely return generators in analogy to map() +# so that you don't mix up when you need to list() them + +# Map over function that may return None; omit Nones from output sequence +def mapMaybe(func: Callable[[T], Optional[S]], xs: Sequence[T]) -> Iterator[S]: + for x in xs: + r = func(x) + if r is not None: + yield r + +# Map over function that returns sequences and cat them all together +def concatMap(func: Callable[[T], Sequence[S]], xs: Sequence[T]) -> Iterator[S]: + for x in xs: + for r in func(x): + yield r + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# C++ CODE GENERATION +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +# Most functions in this section are curried: they consist of a function +# that takes some parameters (e.g., what is to be generated) which itself +# returns a function that actually maps NativeFunction to the code +# to be generated. This pattern makes it convenient to use map, concatMap +# and similar functional combinators. + +# Many of these functions share logic for defining both the definition +# and declaration (for example, the function signature is the same), so +# we organize them into one function that takes a Target to say which +# code we want. +Target = Enum('Target', ('DEFINITION', 'DECLARATION', 'REGISTRATION')) + +# Generates {dispatch}Type.cpp and {dispatch}Type.h (e.g., CPUType.cpp +# and CPUType.h). This function is also reused to implement per-operator +# registration. It also generates TypeDefault.cpp and TypeDefault.h when +# dispatch is None. +# +# {dispatch}Type.cpp +# - The primary function of this file is to register all of the +# implementations for the given dispatch key to the dispatcher, +# so they are available for use in PyTorch. If dispatch is +# None, we generate schema (def) registrations and catchall +# registrations. +# - The secondary function of this file is to generate a wrapper +# around functions. In CPUType these wrappers do nothing +# (and should be removed), but in other cases they handle +# DeviceGuard. A small extra benefit of wrappers is they +# are not overloaded, so they can be used in the registration +# API without having to disambiguate which overload you want +# (as would be the case if you directly registered native:: +# functions). +# +# {dispatch}Type.h +# - In principle, this file shouldn't exist at all; historically, +# it existed so that we could directly access these functions +# outside of the registration API for the implementation of +# static dispatch. Should be deleted now! +# +# This function is also used for a secondary purpose: the registration +# logic is also reused to implement per-operator registration. +def compute_type_method( + dispatch: Optional[str], *, + target: Target, + # Which operators to actually generate code for. If None, generate + # code for all operators + op_registration_whitelist: Optional[Set[str]], + # Only valid for generating registrations. 
If True, only generate + # def() invocations (for schema registration); do not generate + # any impl() invocations for, e.g., catch-all kernels + def_only: bool = False +) -> Callable[[NativeFunction], Optional[str]]: + + if def_only: + assert target is Target.REGISTRATION and dispatch is None + + @with_native_function + def func(f: NativeFunction) -> Optional[str]: + if dispatch is not None: + if f.dispatch is None or dispatch not in f.dispatch: + return None + else: + if f.dispatch is not None and target is not Target.REGISTRATION: + return None + + if op_registration_whitelist is not None and \ + f"aten::{f.func.name.name}" not in op_registration_whitelist and target is Target.REGISTRATION: + return None + + name = legacy_dispatcher.name(f.func) + returns_type = legacy_dispatcher.returns_type(f.func.returns) + args = legacy_dispatcher.arguments(f.func) + args_str = ', '.join(map(str, args)) + + if target is Target.DECLARATION: + return f"{returns_type} {name}({args_str});" + elif target is Target.DEFINITION: + if f.dispatch is None: + cpp_name = cpp.name(f.func) + impl_name = f"at::native::{cpp_name}" + else: + assert dispatch is not None + impl_name = f"at::native::{f.dispatch[dispatch]}" + + args_exprs_str = ', '.join(map(lambda a: a.name, args)) + + # See Note [Byte-for-byte compatibility] + # (return void_func() is valid C++) + return_kw = " return " + if returns_type == "void": + return_kw = " " + + cuda_guard = "" + if dispatch is None or 'CUDA' in dispatch or 'Vulkan' == dispatch: + self_args = (a for a in f.func.arguments if a.name == "self") + + # There is precedence for which argument we use to do + # device guard. This describes the precedence order. + candidate_args = itertools.chain(self_args, f.func.out_arguments, f.func.arguments) + + # Only tensor like arguments are eligible + device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) + + # See Note [Byte-for-byte compatibility] + # I wasn't able to figure out the internal logic for + # these device guards + if str(f.func.name) == "_thnn_fused_lstm_cell_backward": + device_of = "cx" + elif str(f.func.name) == "_thnn_differentiable_lstm_cell_backward": + device_of = "input_gates" + + has_tensor_options = any(isinstance(a.argument, TensorOptionsArguments) for a in args) + + # TODO: There is probably a simpler version of this that + # works just as well. 
+ if f.device_guard and (dispatch is None or 'Vulkan' == dispatch) and has_tensor_options: + cuda_guard = """\ + const DeviceGuard device_guard(options.device()); +""" + # See Note [Byte-for-byte compatibility] + if dispatch is not None: + cuda_guard = f"\n{cuda_guard}" + elif f.device_guard and dispatch is not None and 'CUDA' in dispatch and has_tensor_options: + cuda_guard = """\ + globalContext().lazyInitCUDA(); + const DeviceGuard device_guard(options.device()); +""" + elif f.device_guard and device_of is not None: + cuda_guard = f"""\ + const OptionalDeviceGuard device_guard(device_of({device_of})); +""" + # See Note [Byte-for-byte compatibility] + if dispatch is not None: + cuda_guard = f"\n{cuda_guard}" + else: + cuda_guard = """\ + // DeviceGuard omitted +""" + # See Note [Byte-for-byte compatibility] + if dispatch is not None: + cuda_guard = f"\n{cuda_guard}" + + return f"""\ +{returns_type} {name}({args_str}) {{ +{cuda_guard}{return_kw}{impl_name}({args_exprs_str}); +}} +""" + + elif target is Target.REGISTRATION: + assert returns_type == dispatcher.returns_type(f.func.returns) + dispatcher_args = dispatcher.arguments(f.func) + dispatcher_args_types_str = ', '.join(map(lambda a: a.type, dispatcher_args)) + if dispatch is None: + type_name = f'TypeDefault::{name}' + else: + type_name = f'{dispatch}Type::{name}' + + # def registration only happens in TypeDefault + def_registration = "" + if dispatch is None: + def_registration = f'm.def("{f.func}");\n' + + impl_registration = "" + if not def_only and not f.manual_kernel_registration and (dispatch is not None or f.dispatch is None): + # Figure out which signature the function is + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + # See Note [Byte-for-byte compatibility] + if dispatch is not None: + nl = "\n" + else: + nl = "" + + payload = "c10::impl::hacky_wrapper_for_legacy_signatures<" \ + f"{returns_type} ({dispatcher_args_types_str})>({nl}TORCH_FN({type_name}))" + + else: + payload = f"torch::CppFunction::makeUnboxedOnly(&{type_name})" + + # Annotate it with dispatch information if necessary + # + # NB: In the ordinary, TypeDerived code generation work flow, specification + # of the backend is handled by the enclosing block, so the torch::dispatch + # invocation here is strictly unnecessary. However, in the fbcode mobile + # only workflow using per-op registration, these registrations will get dumped + # in a TORCH_LIBRARY_FRAGMENT that does not have an ambient backend. So + # the torch::dispatch specification here is important! See + # Note [Redundancy in registration code is OK] for how we handle redundant info. + if dispatch is not None: + payload = f"torch::dispatch(DispatchKey::{dispatch},\n{payload})\n" + + impl_registration = f'm.impl("{f.func.name}",\n{payload});\n' + + return f"{def_registration}{impl_registration}" + else: + assert_never(target) + + return func + +# Generates Function.cpp and Function.h. These files provide the +# functional public C++ API, and the scaffolding to call into +# the dispatcher from these functions. See also compute_tensor_method. 
+def compute_function(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: + @with_native_function + def go(f: NativeFunction) -> Optional[str]: + if f.manual_kernel_registration: + return None + if Variant.function not in f.variants: + return None + + name = cpp.name(f.func) + + cpp_returns_type = cpp.returns_type(f.func.returns) + cpp_args = cpp.arguments(f.func) + cpp_args_str = ', '.join(map(str, cpp_args)) + + if target is Target.DECLARATION: + return f"CAFFE2_API {cpp_returns_type} {name}({cpp_args_str});" + + assert target is Target.DEFINITION + + dispatcher_exprs = dispatcher.cpparguments_exprs(cpp_args) + cpp_args_str_no_default = ', '.join(map(lambda a: a.str_no_default(), cpp_args)) + dispatcher_returns_type = dispatcher.returns_type(f.func.returns) + dispatcher_types_str = ', '.join(map(lambda a: a.type, dispatcher_exprs)) + dispatcher_exprs_str = ', '.join(map(lambda a: a.expr, dispatcher_exprs)) + + return f""" +// aten::{f.func} +{cpp_returns_type} {name}({cpp_args_str_no_default}) {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({dispatcher_types_str})>(); + return op.call({dispatcher_exprs_str}); +}} +""" + return go + +# Generates TensorBody.h (sic) and TensorMethods.cpp. These files provide the +# object-oriented (method-based) public C++ API, and the scaffolding to call into +# the dispatcher from these functions. See also compute_function. +def compute_tensor_method(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: + @with_native_function + def go(f: NativeFunction) -> Optional[str]: + if Variant.method not in f.variants: + return None + + assert not f.func.is_out_fn() + assert len(f.func.arguments) > 0 + assert sum(a.name == 'self' for a in f.func.arguments) == 1 + + name = cpp.name(f.func) + cpp_returns_type = cpp.returns_type(f.func.returns) + cpp_args = cpp.arguments(f.func, method=True) + cpp_args_exclude_this = [a for a in cpp_args if not isinstance(a.argument, ThisArgument)] + cpp_args_exclude_this_str = ', '.join(str(a) for a in cpp_args_exclude_this) + + if target is Target.DECLARATION: + return f"{cpp_returns_type} {name}({cpp_args_exclude_this_str}) const;" + + assert target is Target.DEFINITION + + dispatcher_exprs = dispatcher.cpparguments_exprs(cpp_args) + cpp_args_exclude_this_str_no_default = ', '.join(a.str_no_default() for a in cpp_args_exclude_this) + dispatcher_returns_type = dispatcher.returns_type(f.func.returns) + dispatcher_types_str = ', '.join(map(lambda a: a.type, dispatcher_exprs)) + dispatcher_exprs_str = ', '.join(map(lambda a: a.expr, dispatcher_exprs)) + + return f""" +// aten::{f.func} +{cpp_returns_type} Tensor::{name}({cpp_args_exclude_this_str_no_default}) const {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({dispatcher_types_str})>(); + return op.call({dispatcher_exprs_str}); +}} +""" + + return go + +# Generates ATenOpList.cpp, a runtime accessible list of all aten +# operators. +# TODO: This was historically used to help some JIT interop code +# figure out whether or not to treat aten namespace'd operators +# one way or another, we should reevaluate if this is actually needed. 
+@with_native_function +def compute_aten_op(f: NativeFunction) -> str: + return f'{{"aten::{f.func.name.name}", "{f.func.name.overload_name}"}},' + +# Generates NativeFunctions.h, a list of forward declarations of all +# actual kernel definitions we keep in aten/src/ATen/native/ +@with_native_function +def compute_native_function_declaration(f: NativeFunction) -> List[str]: + if f.dispatch is None: + ns = [cpp.name(f.func)] + else: + ns = list(f.dispatch.values()) + + rs = [] + # Sometimes a function name shows up multiple times; only generate + # it once! + seen = set() + for n in ns: + if n in seen: + continue + if "legacy::" in n: + continue + seen.add(n) + returns_type = legacy_dispatcher.returns_type(f.func.returns) + args = legacy_dispatcher.arguments(f.func) + rs.append(f"CAFFE2_API {returns_type} {n}({', '.join(map(lambda a: a.str_with_default(), args))});") + + return rs + +# Generates BackendSelectRegister.cpp, a series of kernels which provide +# specialized computation of dispatch key for operator signatures which cannot +# be easily done automatically using templating. +def compute_backend_select(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: + @with_native_function + def go(f: NativeFunction) -> Optional[str]: + if str(f.func.name.name).endswith('_like') or str(f.func.name.name).startswith('new_'): + return None + + name = legacy_dispatcher.name(f.func) + legacy_dispatcher_returns_type = legacy_dispatcher.returns_type(f.func.returns) + legacy_dispatcher_args = legacy_dispatcher.arguments(f.func) + + if not any(isinstance(a.argument, TensorOptionsArguments) for a in legacy_dispatcher_args): + return None + + legacy_dispatcher_tensor_args = [ + a for a in legacy_dispatcher_args + if isinstance(a.argument, Argument) and a.argument.type.is_tensor_like() + ] + + dispatcher_returns_type = dispatcher.returns_type(f.func.returns) + dispatcher_args = dispatcher.arguments(f.func) + dispatcher_exprs = dispatcher.legacydispatcherarguments_exprs(legacy_dispatcher_args) + + if target is Target.DEFINITION: + # See Note [Byte-for-byte compatibility] + # I don't think there's actually a good reason to generate + # these two cases differently + if legacy_dispatcher_tensor_args: + tensor_args = ', '.join(a.name for a in legacy_dispatcher_tensor_args) + compute_dk = f"""\ +DispatchKeySet _dk_set = DispatchKeySet(options.computeDispatchKey()) | c10::detail::multi_dispatch_key_set({tensor_args}); + DispatchKeySet _dk_mask = c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, DispatchKey::BackendSelect); + DispatchKey _dk = c10::impl::dispatchTypeId(_dk_set, _dk_mask);""" + else: + compute_dk = "DispatchKey _dk = options.computeDispatchKey();" + return f"""\ +// aten::{f.func} +{legacy_dispatcher_returns_type} {name}({', '.join(a.str_with_default() for a in legacy_dispatcher_args)}) {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({', '.join(a.type for a in dispatcher_args)})>(); + {compute_dk} + return op.callWithDispatchKey(_dk, {', '.join(a.expr for a in dispatcher_exprs)}); +}} +""" + elif target is Target.REGISTRATION: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + return f"""m.impl("aten::{f.func.name}", + c10::impl::hacky_wrapper_for_legacy_signatures<{dispatcher_returns_type} ({', '.join(a.type for a in dispatcher_args)})>( + TORCH_FN({name})));""" + else: + return f"""m.impl_UNBOXED("aten::{f.func.name}", {name});""" + elif target is 
Target.DECLARATION: + raise AssertionError() + else: + assert_never(target) + return go + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# YAML CODE GENERATION +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +def dict_representer(dumper: Any, data: Any) -> Any: + return dumper.represent_dict(data.items()) + +def format_yaml(data: object) -> str: + noalias_dumper = yaml.dumper.SafeDumper + noalias_dumper.ignore_aliases = lambda self, data: True # type: ignore + # Support serializing OrderedDict + noalias_dumper.add_representer(OrderedDict, dict_representer) # type: ignore + # Some yaml parsers (e.g. Haskell's) don't understand line breaks. + # width=float('Inf') turns off optional line breaks and improves + # the portability of the outputted yaml. + return yaml.dump(data, default_flow_style=False, Dumper=noalias_dumper, width=float('Inf')) # type: ignore + +# For some reason, some defaults we write to YAML are written as native +# YAML objects, rather than doing them uniformly as strings. This +# function detects those cases and converts them into native Python +# objects. +def pythonify_default(s: str) -> object: + if s == 'true': + return True + elif s == 'false': + return False + + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + return s + +# What is a dynamic type? Over time, the semantic meaning of +# dynamic type has degraded to meaninglessness (in the old days, +# it captured dtype-ness of types, but that has gone away with +# the removal of TH). These days, it's mostly the same thing as +# the C++ API argument type, except that Tensor and Tensor? +# arguments simply present as Tensor. +# +# TODO: Get rid of dynamic_type, after getting tools/autograd +# to use the new codegen framework +def dynamic_type(t: Type) -> str: + if isinstance(t, OptionalType): + return dynamic_type(t.elem) + # Note we don't use t.is_tensor_like() here because it would + # also include Tensor[] + if str(t) == 'Tensor': + return 'Tensor' + return cpp.argumenttype_type(t, mutable=False) + +def compute_method_of_yaml(variants: Set[Variant]) -> List[str]: + # This is written out explicitly to ensure that Tensor and + # namespace are put into the list in the right order + method_of = ['Type'] + if Variant.method in variants: + method_of.append('Tensor') + if Variant.function in variants: + method_of.append('namespace') + return method_of + +def compute_returns_yaml(f: NativeFunction) -> Tuple[List[Dict[str, str]], Dict[str, str]]: + # Note [name and field_name] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~ + # To understand name_to_field_name, we must first talk about this + # schema: + # + # lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) + # + # There is something very odd about this schema: it is an out + # variant of the function (that is to say, it will convert into + # at::lstsq_out() in the C++ API), but the names of the output + # return arguments don't match the keyword argument names of + # the inputs. It TURNS OUT that in this situation, the historical + # Declarations.yaml we want to output is this (abbreviated to + # only show relevant fields): + # + # arguments: + # ... + # - field_name: solution + # name: X + # - field_name: QR + # name: qr + # ... + # + # returns: + # - field_name: solution + # name: X + # - field_name: QR + # name: qr + # + # The name of the return fields is stored in 'field_name', and the + # name of the arguments is stored in 'name'. 
So when we process + # arguments, we need a way to get at the corresponding return. At + # the moment, this is most conveniently done by constructing a + # mapping from name (the argument concept) to field_name (the + # return concept) while processing return arguments, since we don't + # directly maintain this correspondence in the modeling of function + # schema itself. + # + # See also https://github.com/pytorch/pytorch/issues/43114 + name_to_field_name: Dict[str, str] = {} + + # Compute the returns field of the YAML entry + returns = [] + for i, r in enumerate(f.func.returns): + # If we have an inplace function, the return argument is + # implicitly named self. + # TODO: Consider incorporating this into the data model + if f.func.name.name.inplace: + assert i == 0, "illegal inplace function with multiple returns" + name = 'self' + # If this is an out function, the name is the name of the + # corresponding out argument (r.name will get recorded + # in field_name later.) + elif f.func.is_out_fn(): + name = f.func.out_arguments[i].name + # If the return argument is explicitly named... + elif r.name: + # See Note [Byte-for-byte compatibility] + # + # Check if it would conflict with an existing argument. + # Downstream codegen assumes that return names and argument + # names don't conflict with each other, so we disambiguate + # this case (by adding a trailing _return). Notice that + # historically, the collision check was buggy: it just did a + # straight string-contains test on the entirety of the + # inputs part of the format string, meaning that it also + # picked up occurrences of the argument name in the NAME of + # the function, as well as substring occurrences of the name + # in arguments. We have simulated the old logic here...
but a more correct version is simply + # name_conflict = any(r.name == a.name for a in f.func.schema_order_arguments()) + if buggy_name_conflict and not f.func.is_out_fn(): + name = f'{r.name}_return' + else: + name = r.name + # If there is no explicit name, we just name the output result, + # unless it's a multi-return, in which case it's result0, + # result1, etc (zero-indexed) + else: + name = 'result' if len(f.func.returns) == 1 else f'result{i}' + + ret = { + 'dynamic_type': dynamic_type(r.type), + 'name': name, + 'type': cpp.return_type(r), + } + + if r.name: + # See Note [name and field_name] + ret['field_name'] = r.name + if f.func.is_out_fn(): + name_to_field_name[f.func.out_arguments[i].name] = r.name + + returns.append(ret) + + return returns, name_to_field_name + +# arguments in yaml roughly corresponds to the public C++ API +def compute_cpp_argument_yaml(cpp_a: CppArgument, *, schema_order: bool, kwarg_only_set: Set[str], + out_arg_set: Set[str], name_to_field_name: Dict[str, str]) -> object: + if isinstance(cpp_a.argument, TensorOptionsArguments): + arg: Dict[str, object] = { + 'annotation': None, + 'dynamic_type': 'TensorOptions', + 'is_nullable': False, + 'name': cpp_a.name, + 'type': cpp_a.type, + 'kwarg_only': True, + } + if cpp_a.default is not None: + arg['default'] = cpp_a.default + return arg + elif isinstance(cpp_a.argument, ThisArgument): + raise AssertionError() + elif isinstance(cpp_a.argument, Argument): + return compute_argument_yaml( + cpp_a.argument, schema_order=schema_order, + kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) + +def compute_argument_yaml(a: Argument, *, schema_order: bool, kwarg_only_set: Set[str], + out_arg_set: Set[str], name_to_field_name: Dict[str, str]) -> object: + arg: Dict[str, object] = { + 'annotation': str(a.annotation) if a.annotation else None, + 'dynamic_type': dynamic_type(a.type), + 'is_nullable': a.type.is_nullable(), + 'name': a.name, + 'type': cpp.argument_type(a), + } + if a.default is not None: + arg['default'] = pythonify_default(cpp.default_expr(a.default, a.type)) + if a.name in kwarg_only_set: + arg['kwarg_only'] = True + # See Note [Byte-for-byte compatibility] + # The default value of kwarg_only is False; this case exists for + # byte-for-byte compatibility + elif a.name in out_arg_set: + arg['kwarg_only'] = False + if a.name in out_arg_set: + arg['output'] = True + # See Note [Byte-for-byte compatibility] + # This is probably a bug in the original implementation, where + # the specification of allocate was not properly propagated to + # the schema-order arguments. 
In any case, this field + # is redundant with the output field + if not schema_order: + arg['allocate'] = True + # See Note [name and field_name] + if a.name in name_to_field_name: + arg['field_name'] = name_to_field_name[a.name] + # Historically, booleans don't get their size recorded, because it + # is already built into the cpp type (e.g., std::array) + l = a.type.is_list_like() + if l is not None and l.size is not None and str(l.elem) != 'bool': + arg['size'] = l.size + return arg + +@with_native_function +def compute_declaration_yaml(f: NativeFunction) -> object: + returns, name_to_field_name = compute_returns_yaml(f) + + # These sets are used to conveniently test if an argument is a + # kwarg-only or out argument + kwarg_only_set = set(a.name for a in f.func.kwarg_only_arguments) + out_arg_set = set(a.name for a in f.func.out_arguments) + + cpp_args = cpp.arguments(f.func) + arguments = [ + compute_cpp_argument_yaml( + cpp_a, schema_order=False, + kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) + for cpp_a in cpp_args + ] + + # See Note [Byte-for-byte compatibility] + # NB: NOT actually schema order. This is almost certainly a BUG. + schema_order_jit_arguments = list(itertools.chain(f.func.arguments, f.func.out_arguments, f.func.kwarg_only_arguments)) + + schema_order_arguments = [ + compute_argument_yaml( + a, schema_order=True, + kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) + for a in schema_order_jit_arguments + ] + + cpp_schema_order_types = [cpp.argument(a).type for a in schema_order_jit_arguments] + cpp_returns = cpp.returns_type(f.func.returns) + schema_order_cpp_signature = f"{cpp_returns} ({', '.join(cpp_schema_order_types)})" + + is_factory_method = any(isinstance(a.argument, TensorOptionsArguments) for a in cpp_args) \ + and Variant.method not in f.variants + + return OrderedDict([ + ('name', cpp.name(f.func)), + ('operator_name', str(f.func.name.name)), + ('overload_name', str(f.func.name.overload_name)), + ('use_c10_dispatcher', f.use_c10_dispatcher.name), + ('manual_kernel_registration', f.manual_kernel_registration), + ('category_override', f.category_override if f.category_override is not None else ''), + ('matches_jit_signature', True), + ('schema_string', f'aten::{f.func}'), + ('arguments', arguments), + ('schema_order_cpp_signature', schema_order_cpp_signature), + ('schema_order_arguments', schema_order_arguments), + ('method_of', compute_method_of_yaml(f.variants)), + ('mode', 'native'), + ('python_module', '' if f.python_module is None else f.python_module), + ('returns', returns), + ('inplace', f.func.name.name.inplace), + ('is_factory_method', is_factory_method), + # Note [Abstract ATen methods] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # An abstract ATen method is one whose dispatch differs between + # types. These are implemented in derived types (with a + # standard (throwing) definition in Type). A concrete ATen + # method is one which has the same dispatch for all types; + # we just implement it in the base Type. This is exposed + # in Declarations.yaml via a field named 'abstract'. + # + # Although this is what we have historically exposed, it is + # actually not all that useful for end users, who are also interested + # whether or not there is an explicit entry in derivatives.yaml + # for the entry or not (as this affects whether or not the operation is + # overrideable or not.) Once this all gets cleaned up, this + # property will be obsolete. 
+ ('abstract', f.dispatch is not None), + ('device_guard', f.device_guard), + ('with_gil', False), + ('deprecated', False), + ]) + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# RUN IT ALL +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +@functools.lru_cache(maxsize=None) +def _read_template(template_fn: str) -> CodeTemplate: + return CodeTemplate.from_file(template_fn) + +# A small abstraction for writing out generated files and keeping track +# of what files have been written (so you can write out a list of output +# files) +class FileManager: + install_dir: str + template_dir: str + dry_run: bool + filenames: Set[str] + + def __init__(self, install_dir: str, template_dir: str, dry_run: bool) -> None: + self.install_dir = install_dir + self.template_dir = template_dir + self.filenames = set() + self.dry_run = dry_run + + def _write_if_changed(self, filename: str, contents: str) -> None: + old_contents: Optional[str] + try: + with open(filename, 'r') as f: + old_contents = f.read() + except IOError: + old_contents = None + if contents != old_contents: + with open(filename, 'w') as f: + f.write(contents) + + def write_with_template(self, filename: str, template_fn: str, + env_callable: Callable[[], Union[str, Dict[str, object]]]) -> None: + filename = '{}/{}'.format(self.install_dir, filename) + assert filename not in self.filenames, "duplicate file write {filename}" + self.filenames.add(filename) + if not self.dry_run: + env = env_callable() + if isinstance(env, dict): + # TODO: Update the comment reference to the correct location + comment = "@" + "generated by aten/src/ATen/gen.py" + comment += " from {}".format(os.path.basename(template_fn)) + env['generated_comment'] = comment + template = _read_template(os.path.join(self.template_dir, template_fn)) + self._write_if_changed(filename, template.substitute(env)) + elif isinstance(env, str): + self._write_if_changed(filename, env) + else: + assert_never(env) + + + def write(self, filename: str, env_callable: Callable[[], Union[str, Union[str, Dict[str, object]]]]) -> None: + self.write_with_template(filename, filename, env_callable) + + def write_outputs(self, filename: str) -> None: + """Write a file containing the list of all outputs which are + generated by this script.""" + self._write_if_changed( + filename, + ''.join(name + ";" for name in sorted(self.filenames))) + +def main() -> None: + parser = argparse.ArgumentParser(description='Generate ATen source files') + parser.add_argument( + '-s', + '--source-path', + help='path to source directory for ATen', + default='aten/src/ATen') + parser.add_argument( + '-o', + '--output-dependencies', + help='output a list of dependencies into the given file and exit') + parser.add_argument( + '-d', '--install_dir', help='output directory', + default='build/aten/src/ATen') + parser.add_argument( + '--rocm', + action='store_true', + help='reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly') + # TODO: remove this, we should just unconditionally generate Vulkan + parser.add_argument( + '--vulkan', + action='store_true', + help='Generate Vulkan backend functions') + parser.add_argument( + '--op_registration_whitelist', + nargs='*', + help='filter op registrations by the whitelist (if set); ' + 'each item is `namespace`::`operator name` without overload name; ' + 'e.g.: aten::empty aten::conv2d ...') + parser.add_argument( + '--backend_whitelist', + nargs='*', + help='filter dispatch backend by the whitelist (if set), ' + 'e.g.: 
CPU CUDA QuantizedCPU ...') + parser.add_argument( + '--per_op_registration', + action='store_true', + help='group function registrations by op name and write to separate files; ' + 'must also set --op_registration_whitelist param') + parser.add_argument( + '--force_schema_registration', + action='store_true', + help='force it to generate schema-only registrations for all ops, including' + 'those that are not listed on --op_registration_whitelist') + options = parser.parse_args() + + op_registration_whitelist: Optional[Set[str]] + if options.op_registration_whitelist is not None: + op_registration_whitelist = set(options.op_registration_whitelist) + else: + op_registration_whitelist = None + + native_functions = parse_native_yaml(os.path.join(options.source_path, 'native/native_functions.yaml')) + + template_dir = os.path.join(options.source_path, "templates") + + # NB: It is mandatory to NOT use os.path.join here, as the install directory + # will eventually be ingested by cmake, which does not respect Windows style + # path slashes. If you switch this to use os.path.join, you'll get an error + # like: + # + # Syntax error in cmake code when parsing string + # + # C:/Jenkins/workspace/pytorch-builds/pytorch-win-ws2016-cuda9-cudnn7-py3-build/build/aten/src/ATen\core/TensorMethods.h + # + # Invalid character escape '\c'. + core_install_dir = f'{options.install_dir}/core' + pathlib.Path(core_install_dir).mkdir(parents=True, exist_ok=True) + + def make_file_manager(install_dir: str) -> FileManager: + return FileManager(install_dir=install_dir, template_dir=template_dir, dry_run=options.output_dependencies) + + core_fm = make_file_manager(core_install_dir) + cpu_fm = make_file_manager(options.install_dir) + cuda_fm = make_file_manager(options.install_dir) + + extra_cuda_headers = '''\ +#include +#include +#include +#include ''' + if options.rocm: + extra_cuda_headers = '''\ +#include +#include +#include +#include ''' + + backends = ["CPU", "SparseCPU", "MkldnnCPU", "CUDA", "SparseCUDA", "QuantizedCPU", "QuantizedCUDA"] + if options.vulkan: + backends.append("Vulkan") + if options.backend_whitelist: + backends = [b for b in backends if b in options.backend_whitelist] + + for dispatch in backends: + h_template = 'TypeDerived.h' + cpp_template = 'TypeDerived.cpp' + # TODO: delete this special case + if 'Sparse' in dispatch: + cpp_template = 'SparseTypeDerived.cpp' + + fm = cuda_fm if 'CUDA' in dispatch else cpu_fm + + fm.write_with_template(f'{dispatch}Type.h', h_template, lambda: { + 'Type': f'{dispatch}Type', + 'extra_cuda_headers': extra_cuda_headers if 'CUDA' in dispatch else '', # TODO: remove this + 'type_derived_method_declarations': list(mapMaybe( + compute_type_method(dispatch, target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), + native_functions + )), + }) + fm.write_with_template(f'{dispatch}Type.cpp', cpp_template, lambda: { + 'Type': f'{dispatch}Type', + # TODO: remove this + 'extra_cuda_headers': extra_cuda_headers if 'CUDA' in dispatch else '', + # TODO: remove this + 'storage_tensor_headers': '#include ', + # TODO: remove this + 'Generator': 'CUDAGeneratorImpl' if 'CUDA' in dispatch else 'CPUGeneratorImpl', + 'legacy_th_headers': + '#include ' if dispatch == "CPU" else + '#include ' if dispatch == "CUDA" else + '', + 'Backend': dispatch, + 'type_derived_method_definitions': list(mapMaybe( + compute_type_method(dispatch, target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), + native_functions + )), + 'function_registrations': 
list(mapMaybe( + compute_type_method( + dispatch, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), + native_functions + )) if not options.per_op_registration else [], + }) + del fm + + cpu_fm.write('TypeDefault.h', lambda: { + 'type_method_declarations': list(mapMaybe( + compute_type_method(None, target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), + native_functions)), + }) + cpu_fm.write('TypeDefault.cpp', lambda: { + 'type_method_definitions': list(mapMaybe( + compute_type_method(None, target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), + native_functions)), + 'function_registrations': list(mapMaybe( + compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), + native_functions)) if not options.per_op_registration else [], + }) + cpu_fm.write('Functions.h', lambda: { + 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), native_functions)), + }) + cpu_fm.write('Functions.cpp', lambda: { + 'function_definitions': list(mapMaybe(compute_function(target=Target.DEFINITION), native_functions)), + }) + core_fm.write('TensorBody.h', lambda: { + 'tensor_method_declarations': list(mapMaybe(compute_tensor_method(target=Target.DECLARATION), native_functions)), + }) + core_fm.write('TensorMethods.cpp', lambda: { + 'tensor_method_definitions': list(mapMaybe(compute_tensor_method(target=Target.DEFINITION), native_functions)), + }) + core_fm.write('ATenOpList.cpp', lambda: { + 'aten_ops': list(mapMaybe(compute_aten_op, native_functions)), + }) + cpu_fm.write('NativeFunctions.h', lambda: { + 'native_function_declarations': list(concatMap(compute_native_function_declaration, native_functions)), + }) + cpu_fm.write('BackendSelectRegister.cpp', lambda: { + 'backend_select_method_definitions': + list(mapMaybe(compute_backend_select(target=Target.DEFINITION), native_functions)), + 'backend_select_function_registrations': + list(mapMaybe(compute_backend_select(target=Target.REGISTRATION), native_functions)), + }) + + if options.force_schema_registration: + def computeSchemaRegister() -> Dict[str, object]: + schema_registrations = list(mapMaybe( + compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=None, def_only=True), + native_functions)) + # See Note [Byte-for-byte compatibility] + schema_registrations.sort() + return { + 'schema_registrations': schema_registrations, + } + cpu_fm.write('SchemaRegister.cpp', computeSchemaRegister) + + if options.per_op_registration: + def gen_per_op_registration_filename(opname: str) -> str: + return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-')) + + if op_registration_whitelist is None: + raise Exception("Must set --op_registration_whitelist for per-op registration.") + + # First, group all native functions by unoverloaded operator name + grouped_functions : DefaultDict[str, List[NativeFunction]] = DefaultDict(list) + for f in native_functions: + grouped_functions[f"aten::{f.func.name.name}"].append(f) + extra_headers = [] + for b in backends: + extra_headers.append(f'#include ') + + # Next, generate registration for each one + for name in op_registration_whitelist: + def computePerOpRegistration() -> Dict[str, object]: + fs = grouped_functions[name] + registrations: List[str] = [] + for mb_dispatch in itertools.chain([None], backends): + # or you could pass in op_registration_whitelist, it doesn't + # matter! 
+ # NB: Use of compute_type_method here is kind of an abuse; + # this is why we have to unconditionally write in + # torch::dispatch in the registration when it should be + # contextually clear + registrations.extend( + mapMaybe( + compute_type_method(mb_dispatch, target=Target.REGISTRATION, op_registration_whitelist=None), + fs)) + return { + 'extra_headers': extra_headers, + 'function_registrations': registrations, + } + + cpu_fm.write_with_template( + gen_per_op_registration_filename(name), 'PerOpRegistration.cpp', computePerOpRegistration) + + cpu_fm.write('Declarations.yaml', lambda: format_yaml(list(map(compute_declaration_yaml, native_functions)))) + + if options.output_dependencies: + cpu_fm.write_outputs(options.output_dependencies) + core_fm.write_outputs(f"{options.output_dependencies}-core") + cuda_fm.write_outputs(f"{options.output_dependencies}-cuda") + +if __name__ == '__main__': + main() diff --git a/tools/codegen/local.py b/tools/codegen/local.py new file mode 100644 index 00000000000..9244cb181ae --- /dev/null +++ b/tools/codegen/local.py @@ -0,0 +1,49 @@ +import threading +from contextlib import contextmanager +from typing import Optional, Iterator + +from tools.codegen.model import UseC10Dispatcher + +# Simple dynamic scoping implementation. The name "parametrize" comes +# from Racket. +# +# WARNING WARNING: LOOKING TO EDIT THIS FILE? Think carefully about +# why you need to add a toggle to the global behavior of code +# generation. The parameters here should really only be used +# for "temporary" situations, where we need to temporarily change +# the codegen in some cases because we cannot conveniently update +# all call sites, and are slated to be eliminated once all call +# sites are eliminated. If you don't have a plan for how to get there, +# DON'T add a new entry here. + +class Locals(threading.local): + use_c10_dispatcher: Optional[UseC10Dispatcher] = None + hack_const_mutable_self: bool = False +_locals = Locals() + +# The use_c10_dispatcher field in native_functions.yaml is used to +# control codegen behavior, so that we can handle cases that the +# Dispatcher templating logic can't handle. In the terminal +# state, use_c10_dispatcher should always be UseC10Dispatcher.full +# and this flag can be eliminated. +def use_c10_dispatcher() -> UseC10Dispatcher: + assert _locals.use_c10_dispatcher is not None, \ + "need to initialize local.use_c10_dispatcher with local.parametrize" + return _locals.use_c10_dispatcher + +# This is used to maintain compat, see Note [Byte-for-byte compatibility] +# It can be removed when we drop compat.
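# Like use_c10_dispatcher() above, the accessor below is only meaningful
# inside a local.parametrize(...) block. An illustrative call site (an
# editor's sketch, not part of the patch), assuming `f` is a NativeFunction:
#
#   with local.parametrize(use_c10_dispatcher=f.use_c10_dispatcher,
#                          hack_const_mutable_self=False):
#       ...  # queries like local.use_c10_dispatcher() now reflect f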
+def hack_const_mutable_self() -> bool: + return _locals.hack_const_mutable_self + +@contextmanager +def parametrize(*, use_c10_dispatcher: UseC10Dispatcher, hack_const_mutable_self: bool) -> Iterator[None]: + old_use_c10_dispatcher = _locals.use_c10_dispatcher + old_hack_const_mutable_self = _locals.hack_const_mutable_self + try: + _locals.use_c10_dispatcher = use_c10_dispatcher + _locals.hack_const_mutable_self = hack_const_mutable_self + yield + finally: + _locals.use_c10_dispatcher = old_use_c10_dispatcher + _locals.hack_const_mutable_self = old_hack_const_mutable_self diff --git a/tools/codegen/model.py b/tools/codegen/model.py new file mode 100644 index 00000000000..de553704d5d --- /dev/null +++ b/tools/codegen/model.py @@ -0,0 +1,766 @@ +import re + +from dataclasses import dataclass +from typing import List, Sequence, Dict, Optional, Iterator, Tuple, Set, NoReturn +from enum import Enum +import itertools + +# A little trick from https://github.com/python/mypy/issues/6366 +# for getting mypy to do exhaustiveness checking +# TODO: put this somewhere else, maybe +def assert_never(x: NoReturn) -> NoReturn: + raise AssertionError("Unhandled type: {}".format(type(x).__name__)) + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# DATA MODEL +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# Some general principles for our data model. +# +# - Stop using C++ data types as the internal data representation +# format. Instead, the internal data structures are centered +# around JIT schema representation. This avoid a big problem +# with the old codegen where we read in all the types from +# native_functions.yaml and then immediately had to retranslate +# them into C++ types. +# +# - More semantic data representation. Instead of representing +# everything as dicts and strings, we define dataclasses for +# every interesting entity the code generation has to deal with. +# These dataclasses have strong semantic invariants: for example, +# we generally require them to roundtrip losslessly into the +# form they were parsed from. These structures are immutable +# and you're expected to populate information once during +# construction. + +# Represent a source location; used for better error reporting +@dataclass(frozen=True) +class Location: + file: str + line: int + + def __str__(self) -> str: + return "{}:{}".format(self.file, self.line) + +# Valid values of the 'variants' field in native_functions.yaml +Variant = Enum('Variant', ('function', 'method')) + +UseC10Dispatcher = Enum('UseC10Dispatcher', ( + 'full', + 'with_codegenerated_unboxing_wrapper' +)) + +# The basic input to the code generation is native_functions.yaml. +# The name "native", BTW, comes from the distinction between native +# functions and legacy TH functions. The legacy TH functions are gone, +# but the "native" descriptor has stuck. +# +# NativeFunction models a single entry in native_functions.yaml. Its +# fields roughly correspond to what you would see in the YAML itself, +# but after canonicalization and parsing has occurred. +# +# You can see some of the overall design patterns for how we setup +# dataclasses in this class, but we will defer a complete discussion +# of this at FunctionSchema. +@dataclass(frozen=True) +class NativeFunction: + # The function schema of the operator in question. This schema + # has been parsed; see FunctionSchema for more about its structure. + # (This type is quoted as we are forward referencing a type + # defined later in the file. 
I opted for this ordering of the + # classes for expository clarity.) + func: 'FunctionSchema' + + # Corresponds to the 'use_c10_dispatcher' field. The default + # is 'with_codegenerated_unboxing_wrapper' + use_c10_dispatcher: UseC10Dispatcher + + # Whether or not to omit automatic generation of a DeviceGuard + device_guard: bool + + # What python module to put the function in + python_module: Optional[str] + + # TODO: figure out what this does + category_override: Optional[str] + + # If no variants are specified in native_functions.yaml, this is + # assumed to be {'function'}. + variants: Set[Variant] + + # Whether or not we should skip generating registrations for + # this kernel. This is a bit of a double-edged sword, as manual + # registrations don't participate in codegen-based selective build! + manual_kernel_registration: bool + + # Distinguish between a missing dispatch dict (historically, this + # means to register a catch-all kernel) and a present but empty + # dispatch dict (this means register nothing; arguably, this should + # subsume manual_kernel_registration). + # + # TODO: str key could be replaced with more explicit enum + dispatch: Optional[Dict[str, str]] + + # The location in the YAML file were this native function entry was + # defined. This is for conveniently reporting error messages! + loc: 'Location' + + # NB: The benefit of defining a dataclass is that we automatically get + # a constructor defined for all the fields we specify. No need + # to explicitly write it out. + + @staticmethod + def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': + """ + Parse a NativeFunction from a dictionary as directly parsed + from native_functions.yaml + """ + e = ei.copy() + + funcs = e.pop('func') + assert isinstance(funcs, str), f'not a str: {funcs}' + func = FunctionSchema.parse(funcs) + + use_c10_dispatcher_s = e.pop('use_c10_dispatcher', None) + if use_c10_dispatcher_s is None: + use_c10_dispatcher = UseC10Dispatcher.with_codegenerated_unboxing_wrapper + elif use_c10_dispatcher_s == 'full': + use_c10_dispatcher = UseC10Dispatcher.full + else: + raise AssertionError( + f'use_c10_dispatcher must be unset or set to full, got {use_c10_dispatcher}') + + variants_s = e.pop('variants', 'function') + assert isinstance(variants_s, str) + variants: Set[Variant] = set() + for v in variants_s.split(', '): + if v == 'function': + variants.add(Variant.function) + elif v == 'method': + variants.add(Variant.method) + else: + raise AssertionError(f'illegal variant {v}') + + manual_kernel_registration = e.pop('manual_kernel_registration', False) + assert isinstance(manual_kernel_registration, bool), f'not a bool: {manual_kernel_registration}' + + device_guard = e.pop('device_guard', True) + assert isinstance(device_guard, bool), f'not a bool: {device_guard}' + + python_module = e.pop('python_module', None) + assert python_module is None or isinstance(python_module, str), f'not a str: {python_module}' + + category_override = e.pop('category_override', None) + assert category_override is None or isinstance(category_override, str), f'not a str: {category_override}' + + raw_dispatch = e.pop('dispatch', None) + assert raw_dispatch is None or isinstance(raw_dispatch, dict), e + dispatch: Optional[Dict[str, str]] = None + if raw_dispatch is not None: + dispatch = {} + for ks, v in raw_dispatch.items(): + if ks == '__line__': + continue # not worth tracking line numbers for dispatch entries + assert isinstance(ks, str), e + assert isinstance(v, str), e + for k in ks.split(","): 
+ dispatch[k.strip()] = v + + e.pop('__line__') + assert not e, f"leftover entries: {e}" + + return NativeFunction( + func=func, + use_c10_dispatcher=use_c10_dispatcher, + variants=variants, + manual_kernel_registration=manual_kernel_registration, + python_module=python_module, + category_override=category_override, + dispatch=dispatch, + device_guard=device_guard, + loc=loc, + ) + + # __post_init__ functions in dataclasses can be used to do extra + # validation after construction. + # + # Notice that we don't do any type validation here. In fact, we + # rely exclusively on mypy to check if you've done types correctly! + # Validation is for nontrivial invariants that cannot be (conveniently) + # encoded in the type system. + def __post_init__(self) -> None: + if self.func.out_arguments: + assert self.variants == {Variant.function}, "Native functions with out arguments MUST " \ + "be declared with only function variant; e.g., variants: function; " \ + "otherwise you will tickle a Python argument binding bug " \ + "(which usually manifests itself as the result variable being undefined.)" + +# The function schema is undoubtedly the most important data structure +# in all of the codegen, as it defines the type signature for operators, +# and most of the code generation we do is type directed (e.g., look at +# the types, decide what to do. Think about how we code generate +# C++ function stubs!) +# +# We will also see in this class the general structure for how we model +# data in this code generation. A few notable properties to point out +# ahead of time: +# +# - These dataclasses are a *lossless* representation of the strings +# they are parsed from. In fact, we assert that given the +# information stored in the dataclass, we can exactly reconstruct +# the string we parsed from (and assert this inside the parse +# definition). There are a few reasons for this: +# +# - If you find that it is difficult to reconstruct the string +# given a dataclass, that is a clue that you are data +# representation is wrong. +# +# - It helps ensure that all relevant information is present +# in the dataclass, so that downstream users aren't tempted +# to reparse the original string to get some information +# that was omitted. +# +# - It forces you to represent the data in-memory in the same way +# it is recorded textually, which makes the dataclasses easier +# to understand for someone who is familiar with the +# textual format. (As a tradeoff, it means you have to model +# the syntax, even when it is inconvenient. But maybe that means +# the syntax is bad!) If you don't understand the internal +# representation, go look at the printing code to see how +# it maps onto the surface syntax! +# +# - It makes it easy to test the parsing code, as parsing code +# that is inconsistent with the string code will fail early +# and loudly. (As a tradeoff, it makes the parsing code a bit +# brittle (in particular, with trivial whitespace changes you +# are likely to trigger an assert error). +# +# In general, try to make the __str__ code as simple as possible +# (even at the cost of more complex parsing logic.) Additionally, +# try to minimize redundancy in data representation. (Precomputed +# fields are OK though: they are defined as a simple function on +# the canonical representation in question.) +# +# - These dataclasses are all frozen; once constructed their +# values never change. This makes it easy to tell where any +# given data came from: just look to the constructor. 
As a +# tradeoff, you can't easily "decorate" a schema with extra +# information from a post-facto analysis. We impose this +# restriction to make these structures more understandable. +# +@dataclass(frozen=True) +class FunctionSchema: + # The name of the operator this function schema describes. + name: 'OperatorName' + + # NB: Sequence here is intentional, to make it read only + arguments: Sequence['Argument'] + kwarg_only_arguments: Sequence['Argument'] # but not including out args + # Unlike in the previous codegen, we have factored out 'out' arguments + # in the canonical representation, removing them from kwarg + # arguments. This choice is justified by numerous downstream + # transformations which treat out arguments specially; additionally, + # you can see that canonicity is not violated! + out_arguments: Sequence['Argument'] # these are also kwarg-only + + # TODO: Need to handle collisions with argument names at some point + returns: Sequence['Return'] + + def schema_order_arguments(self) -> Iterator['Argument']: + return itertools.chain(self.arguments, self.kwarg_only_arguments, self.out_arguments) + + @staticmethod + def parse(func: str) -> 'FunctionSchema': + # We should probably get a proper parser here + assert ' -> ' in func, "function schema missing return type (spaces are mandatory)" + func_decl, return_decl = [x.strip() for x in func.split(' -> ')] + ops, args = func_decl.split('(', 1) + assert args[-1] == ")", "Expecting closing )" + args = args[:-1] + name = OperatorName.parse(ops) + arguments, kwarg_only_arguments, out_arguments = parse_arguments(args) + returns = parse_returns(return_decl) + r = FunctionSchema( + name=name, + arguments=arguments, + kwarg_only_arguments=kwarg_only_arguments, + out_arguments=out_arguments, + returns=returns + ) + assert str(r) == func, f'{str(r)} != {func}' + return r + + def __post_init__(self) -> None: + for arg, ret in zip(self.out_arguments, self.returns): + assert arg.annotation == ret.annotation, \ + "Out arguments must have matching return Tensor; furthermore, " \ + "the ith-argument needs to correspond to the ith return" + if self.out_arguments: + assert len(self.out_arguments) == len(self.returns), \ + "Must return as many arguments as there are out arguments" + if self.name.name.inplace: + # TODO: fixme + if str(self.name) not in [ + '_amp_non_finite_check_and_unscale_', + '_foreach_add_.Scalar']: + assert len(self.returns) == 1 + + def is_out_fn(self) -> bool: + # Note [is_out_fn] + # + # out functions are the variants which take an explicit out= argument + # to populate into. We need to know if a schema corresponds to an + # out function for several reasons: + # + # - They codegen differently in C++ API + # - codegen to at::add_out rather than at::add + # - out argument is moved to front of C++ argument list + # + # out functions are DEFINED to be any function with a keyword-only + # argument that is mutable. In principle, this could lead to a + # false positive if you define a function that mutates a + # kwarg only argument, but this isn't the "true" output of this + # function. A more robust definition that would work in this + # case would also look at: + # + # - The output types. Out functions take in the arguments + # they mutate and then return them again; this is sort + # of "definitionally" what makes something an out function. + # Historically, we DO check this for consistency. + # - Correspondence with pure variant. 
An out function + # should have a signature equivalent to its pure variant, + # but just with extra kwargs for the output elements. This + # is difficult to actually check for and historically + # we only do this check in tools/ + return bool(self.out_arguments) + + def __str__(self) -> str: + all_arguments: List[str] = [] + all_arguments.extend(map(str, self.arguments)) + if self.kwarg_only_arguments or self.out_arguments: + all_arguments.append('*') + all_arguments.extend(map(str, self.kwarg_only_arguments)) + all_arguments.extend(map(str, self.out_arguments)) + all_arguments_str = ', '.join(all_arguments) + if len(self.returns) == 1: + returns = str(self.returns[0]) # omit parentheses + else: + returns = '(' + ', '.join(map(str, self.returns)) + ')' + return f'{self.name}({all_arguments_str}) -> {returns}' + +# Here is the rest of the data model, described more briefly. + +# Simplified version for what actually shows up in built-ins. +# Look at alias_info.h for expanded syntax. If you need the structure, +# you also need to make this structure recursive so it can be lined +# up with the type components too. For primitives this isn't really +# necessary +@dataclass(frozen=True) +class Annotation: + # Typically only has one element. Not actually a set so + # we can conveniently assume it is canonically ordered + alias_set: Sequence[str] + is_write: bool + + @staticmethod + def parse(ann: str) -> 'Annotation': + m = re.match(r'^([a-z])(!?)$', ann) + assert m is not None, f'unrecognized alias annotation {ann}' + alias_set = [m.group(1)] + is_write = m.group(2) == '!' + r = Annotation(alias_set=alias_set, is_write=is_write) + assert str(r) == ann, f'{r} != {ann}' + return r + + def __str__(self) -> str: + alias_set = '|'.join(self.alias_set) + is_write = '!' if self.is_write else '' + return f'{alias_set}{is_write}' + +# The base class for the type system. This is also loosely modeled +# off of jit_type.h, but we've simplified the hierarchy to focus +# in on the aspects of the type system that matter for code generation +# (for example, there's no SingleElementType subclass anymore). +# You never actually construct a Type; usually it's going to be one +# of the subclasses. If Python had ADTs this would be one! +@dataclass(frozen=True) +class Type: + @staticmethod + def parse(t: str) -> 'Type': + r = Type._parse(t) + assert str(r) == t, f'{r} != {t}' + return r + + @staticmethod + def _parse(t: str) -> 'Type': + m = re.match(r'^(.+)\?$', t) + if m is not None: + return OptionalType(Type.parse(m.group(1))) + m = re.match(r'^(.+)\[([0-9]+)?\]$', t) + if m is not None: + size = int(m.group(2)) if m.group(2) is not None else None + return ListType(elem=Type.parse(m.group(1)), size=size) + try: + return BaseType(BaseTy[t]) + except KeyError: + raise RuntimeError(f"unrecognized type {t}") + + def __str__(self) -> str: + raise NotImplementedError + + # WARNING: These concepts are not very well-defined. For example, + # is "int?" nullable? How about "int?[]". 
They are defined + # so we can conveniently generate legacy Declarations.yaml but + # really we should probably just remove these at some point + + def is_tensor_like(self) -> bool: + raise NotImplementedError + + def is_nullable(self) -> bool: + raise NotImplementedError + + def is_list_like(self) -> Optional['ListType']: + raise NotImplementedError + +# Base types are simple, atomic types with no further structure +BaseTy = Enum('BaseTy', ( + 'Generator', + 'ScalarType', + 'Tensor', + 'int', + 'Dimname', + 'float', + 'str', + 'bool', + 'Layout', + 'Device', + 'Scalar', + 'MemoryFormat', + 'QScheme', + 'Storage', + 'ConstQuantizerPtr', # TODO: rename +)) + +@dataclass(frozen=True) +class BaseType(Type): + name: BaseTy + + def __str__(self) -> str: + return f'{self.name.name}' + + def is_tensor_like(self) -> bool: + return self.name == BaseTy.Tensor + + def is_nullable(self) -> bool: + return False + + def is_list_like(self) -> Optional['ListType']: + return None + +# Optional types may be specified, or may also be validly given None +@dataclass(frozen=True) +class OptionalType(Type): + elem: Type + + def __str__(self) -> str: + return f'{self.elem}?' + + def is_tensor_like(self) -> bool: + return self.elem.is_tensor_like() + + def is_nullable(self) -> bool: + return True + + def is_list_like(self) -> Optional['ListType']: + return self.elem.is_list_like() + +# List types specify that we may have multiples of an element. We +# also support explicit sizes on list types, but these have +# some nontrivial semantics! (However, for C++ API purposes, explicit +# sizes are mostly erased from the type system.) +# +# DANGER WILL ROBINSON: C++ elaboration depends on elem type; e.g., +# int[] elaborates differently than bool[3]! +@dataclass(frozen=True) +class ListType(Type): + elem: Type + size: Optional[int] + + def __str__(self) -> str: + size = f'{self.size}' if self.size else '' + return f'{self.elem}[{size}]' + + def is_tensor_like(self) -> bool: + return self.elem.is_tensor_like() + + def is_nullable(self) -> bool: + return self.elem.is_nullable() + + def is_list_like(self) -> Optional['ListType']: + return self + +@dataclass(frozen=True) +class Argument: + # NB: I didn't put kwarg_only as a boolean field here, unlike + # c10::Argument, so that printing works correctly + + name: str + type: Type + default: Optional[str] + + # The semantics of the annotation field are a little strange. + # + # Alias annotations parametrize Tensors (since Tensors are the only things + # that can alias.) This motivates why I write Tensor(a!)? (and not, for + # example, Tensor?(a!)), because the (a!) describes aliasing on the tensor, + # which may be optional (i.e., the alias annotation should bind first to + # Tensor, before the optional postfix annotation). + # + # However, despite being a property of Tensor, we (and c10::Argument) + # store the annotation at the top level of the Argument, rather than + # inside the embedded Tensor type. In the C++ version of this + # class, we then go through great lengths to mimic the type + # structure in the annotation structure so we can correlate + # annotations with types. + # + # Now, it turns out, in all applications in code generation, the + # structure of annotated types is very simple. So we just hard + # code it here. But if we ever do get anything more complex, this + # model will have to change! 
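#
# For instance (illustrative): in a schema fragment like "Tensor(a!) self",
# the "(a!)" parses to Annotation(alias_set=['a'], is_write=True) and is
# stored in this field, not inside the Tensor type it annotates.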
+ annotation: Optional[Annotation] + + @staticmethod + def parse(arg: str) -> 'Argument': + name: str + default: Optional[str] + type_and_annot, name_and_default = arg.rsplit(' ', 1) + if '=' in name_and_default: + name, default = name_and_default.split('=') + else: + name = name_and_default + default = None + # TODO: deduplicate annotation matching with Return + match = re.match(r'Tensor\((.+)\)(.*)', type_and_annot) + annotation: Optional[Annotation] + if match: + # If you update this, make sure the __str__ still works too + assert match.group(2) in ['', '?', '[]'], 'unrecognized alias analysis form with Tensor' + type_s = 'Tensor' + match.group(2) + annotation = Annotation.parse(match.group(1)) + else: + type_s = type_and_annot + annotation = None + type = Type.parse(type_s) + r = Argument( + name=name, + type=type, + default=default, + annotation=annotation, + ) + assert str(r) == arg, f'{str(r)} != {arg}' + return r + + @property + def is_write(self) -> bool: + return self.annotation is not None and self.annotation.is_write + + def __str__(self) -> str: + type = f'{self.type}' + if self.annotation: + assert type in ['Tensor', 'Tensor?', 'Tensor[]'] + type = type.replace('Tensor', f'Tensor({self.annotation})') + if self.name is None: + return type + else: + mb_default = '' + if self.default: + mb_default = f'={self.default}' + return f"{type} {self.name}{mb_default}" + + +@dataclass(frozen=True) +class Return: + name: Optional[str] + type: Type + annotation: Optional[Annotation] + + @staticmethod + def parse(arg: str) -> 'Return': + name: Optional[str] + if ' ' in arg: + type_and_annot, name = arg.rsplit(' ', 1) + else: + type_and_annot = arg + name = None + match = re.match(r'Tensor\((.+)\)(.*)', type_and_annot) + annotation: Optional[Annotation] + if match: + # If you update this, make sure the __str__ still works too + assert match.group(2) in ['', '?', '[]'], 'unrecognized alias analysis form with Tensor' + type_s = 'Tensor' + match.group(2) + annotation = Annotation.parse(match.group(1)) + else: + type_s = type_and_annot + annotation = None + type = Type.parse(type_s) + r = Return( + name=name, + type=type, + annotation=annotation, + ) + assert str(r) == arg, f'{str(r)} != {arg}' + return r + + @property + def is_write(self) -> bool: + return self.annotation is not None and self.annotation.is_write + + def __str__(self) -> str: + type = f'{self.type}' + if self.annotation: + assert type in ['Tensor', 'Tensor?', 'Tensor[]'] + type = type.replace('Tensor', f'Tensor({self.annotation})') + if self.name is None: + return type + else: + return f"{type} {self.name}" + + +# Names that validly are __iXXX__ indicating inplace operations. +# Taken from https://www.python.org/dev/peps/pep-0203/#new-methods +# NB: PyTorch hasn't actually implemented all of these +AUGMENTED_ASSIGNMENT_NAMES = ['add', 'sub', 'mul', 'div', 'mod', 'pow', 'lshift', 'rshift', 'and', 'xor', 'or'] + +# A BaseOperatorName is what we think of the operator name, without +# the overload name. 
Unusually, we don't represent this as just a +# string; instead, we directly represent a few important semantic +# bits of information we derive from the string: namely whether +# or not it's inplace (add_) and whether or not it's a double-underscore +# method (__add__) +@dataclass(frozen=True) +class BaseOperatorName: + base: str + inplace: bool + dunder_method: bool + + @staticmethod + def parse(op: str) -> 'BaseOperatorName': + assert op != '' + assert not op.endswith('_out'), \ + "_out suffix is reserved and not permitted for operator names; " \ + "did you mean to specify an out overload name instead?" + m = re.match(r'^__([^_]+)__$', op) + if m is not None: + dunder_method = True + base = m.group(1) + if any(base == f'i{n}' for n in AUGMENTED_ASSIGNMENT_NAMES): + inplace = True + base = base[1:] + else: + inplace = False + # temporary, this is not intrinsically true but + # has been historically true for dunder methods + # we support (but, if we ever got, say, __int__, this would + # be wrong!) + assert base[0] != 'i' + else: + dunder_method = False + base = op + if base[-1] == '_': + inplace = True + base = base[:-1] + else: + inplace = False + r = BaseOperatorName(base=base, inplace=inplace, dunder_method=dunder_method) + assert str(r) == op, f'{str(r)} != {op}' + return r + + def __str__(self) -> str: + if self.dunder_method: + i = 'i' if self.inplace else '' + return f'__{i}{self.base}__' + else: + i = '_' if self.inplace else '' + return f'{self.base}{i}' + +# Operator name is the base operator name along with the (typically not +# user visible) overload string. +@dataclass(frozen=True) +class OperatorName: + name: BaseOperatorName + overload_name: str + + @staticmethod + def parse(op_name: str) -> 'OperatorName': + if '.' in op_name: + name, overload_name = op_name.split('.', 1) + else: + name = op_name + overload_name = '' + r = OperatorName( + name=BaseOperatorName.parse(name), + overload_name=overload_name + ) + assert str(r) == op_name, f'{str(r)} != {op_name}' + return r + + def __str__(self) -> str: + if self.overload_name: + return f"{self.name}.{self.overload_name}" + else: + return f"{self.name}" + +# Helper functions for parsing argument lists (both inputs and returns) + +def parse_returns(return_decl: str) -> Sequence[Return]: + """ + Input: '()' + Output: [] + """ + if return_decl == '()': + return [] + if return_decl[0] == '(' and return_decl[-1] == ')': + return_decl = return_decl[1:-1] + returns = [] + for arg in return_decl.split(', '): + returns.append(Return.parse(arg)) + return returns + +def parse_arguments(args: str) -> Tuple[Sequence[Argument], Sequence[Argument], Sequence[Argument]]: + """ + Input: 'int x, int y, int z' + Output: positional args, kwarg only args + """ + arguments: List[Argument] = [] + kwarg_only_arguments: List[Argument] = [] + out_arguments: List[Argument] = [] + arguments_acc = arguments + + # TODO: Use a real parser here; this will get bamboozled + # by signatures that contain things like std::array (note the space) + for arg in args.split(', '): + if not arg: + continue + if arg == '*': + assert arguments_acc is arguments, "invalid syntax: kwarg-only specifier * can only occur once" + arguments_acc = kwarg_only_arguments + continue + parg = Argument.parse(arg) + # Currently, we rely directly on the invariant that there are NO + # kwarg-only mutating arguments. If you want to relax this, + # we will need a more semantic way of matching that takes + # into account return arguments. 
In that case, you will have + # to manage out_arguments computation a level up, in + # FunctionSchema. See Note [is_out_fn] + if parg.annotation is not None and parg.annotation.is_write: + if arguments_acc is arguments: + pass # do nothing + elif arguments_acc is kwarg_only_arguments: + arguments_acc = out_arguments + else: + assert arguments_acc is not out_arguments + arguments_acc.append(parg) + + return arguments, kwarg_only_arguments, out_arguments diff --git a/tools/setup_helpers/gen.py b/tools/setup_helpers/gen.py new file mode 100644 index 00000000000..bdb52ee44ef --- /dev/null +++ b/tools/setup_helpers/gen.py @@ -0,0 +1,11 @@ +# Little stub file to get BUILD.bazel to play along + +import os.path +import sys + +root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.insert(0, root) + +import tools.codegen.gen + +tools.codegen.gen.main() diff --git a/aten/src/ATen/common_with_cwrap.py b/tools/shared/cwrap_common.py similarity index 100% rename from aten/src/ATen/common_with_cwrap.py rename to tools/shared/cwrap_common.py
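As a quick check of the round-trip invariant the new model enforces, here is a minimal sketch (an editor's illustration, not part of the patch). It assumes the patch has been applied and the snippet is run from the repository root; the two schema strings are just examples of entries that appear in native_functions.yaml:

    from tools.codegen.model import FunctionSchema

    s = 'add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor'
    schema = FunctionSchema.parse(s)
    assert str(schema) == s                 # parse/print are lossless
    assert str(schema.name) == 'add.Tensor'
    assert not schema.is_out_fn()

    out = FunctionSchema.parse(
        'add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)')
    assert out.is_out_fn()                  # mutable kwarg-only arg => out variant
    assert [a.name for a in out.out_arguments] == ['out']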