Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)

TH: Clean up dead code (#60655)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60655
Test Plan: Imported from OSS
Reviewed By: albanD
Differential Revision: D29371717
Pulled By: ngimel
fbshipit-source-id: faa71b1d4a15450c78e12aa917daec853057bce9

parent 4a7d281119
commit 42c8439b6e
@@ -332,11 +332,9 @@ filegroup(
filegroup(
    name = "th_srcs",
    srcs = [
        "aten/src/TH/THBlas.cpp",
        "aten/src/TH/THGeneral.cpp",
        "aten/src/TH/THStorageFunctions.cpp",
        "aten/src/TH/THTensor.cpp",
        "aten/src/TH/THTensorMoreMath.cpp",
    ],
)
@@ -546,10 +544,6 @@ header_template_rule(
    src = "aten/src/TH/THGeneral.h.in",
    out = "aten/src/TH/THGeneral.h",
    substitutions = {
        "#cmakedefine USE_BLAS": "#define USE_BLAS",
        "#cmakedefine USE_LAPACK": "#define USE_LAPACK",
        "#cmakedefine BLAS_F2C": "/* #undef BLAS_F2C */",
        "#cmakedefine BLAS_USE_CBLAS_DOT": "#define BLAS_USE_CBLAS_DOT",
    },
)
@@ -1,15 +1,12 @@
set(Aten_TH_AVX_extra_src)

set(hdr
  THGeneral.h THHalf.h THStorage.h THStorageFunctions.h THTensor.h THTensorApply.h THBlas.h
  THVector.h )
  THGeneral.h THHalf.h THStorage.h THStorageFunctions.h THTensor.h)

set(ATen_TH_SRCS
  ${CMAKE_CURRENT_SOURCE_DIR}/THGeneral.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/THStorageFunctions.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/THTensor.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/THTensorMoreMath.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/THBlas.cpp
)
# Remember that PARENT_SCOPE variables are not in the current scope
set(ATen_TH_SRCS ${ATen_TH_SRCS} PARENT_SCOPE)
@@ -36,7 +33,6 @@ configure_file(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h")

install(FILES
  TH.h
  THBlas.h
  ${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h
  THGenerateAllTypes.h
  THGenerateBFloat16Type.h
@@ -62,17 +58,12 @@ install(FILES
  THStorage.h
  THStorageFunctions.h
  THTensor.h
  THTensorApply.h
  THTensorDimApply.h
  THVector.h
  THHalf.h
  THTensor.hpp
  THStorageFunctions.hpp
  DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH")

install(FILES
  generic/THBlas.cpp
  generic/THBlas.h
  generic/THStorage.cpp
  generic/THStorage.h
  generic/THStorageCopy.cpp
@@ -80,8 +71,5 @@ install(FILES
  generic/THTensor.cpp
  generic/THTensor.h
  generic/THTensor.hpp
  generic/THTensorMath.h
  generic/THVector.h
  # See Note [TH abstraction violation]
  generic/THTensorFastGetSet.hpp
  DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH/generic")
@@ -3,11 +3,7 @@

#include <TH/THGeneral.h>

#include <TH/THBlas.h>
#include <TH/THVector.h>
#include <TH/THStorageFunctions.h>
#include <TH/THTensor.h>
#include <TH/THTensorApply.h>
#include <TH/THTensorDimApply.h>

#endif
@@ -1,13 +0,0 @@
#include <TH/THBlas.h>

// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THBlas.cpp>
#include <TH/THGenerateAllTypes.h>

// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THBlas.cpp>
#include <TH/THGenerateBFloat16Type.h>

// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THBlas.cpp>
#include <TH/THGenerateHalfType.h>
@@ -1,17 +0,0 @@
#ifndef TH_BLAS_INC
#define TH_BLAS_INC

#include <TH/THGeneral.h>

#define THBlas_(NAME) TH_CONCAT_4(TH,Real,Blas_,NAME)

#include <TH/generic/THBlas.h>
#include <TH/THGenerateAllTypes.h>

#include <TH/generic/THBlas.h>
#include <TH/THGenerateBFloat16Type.h>

#include <TH/generic/THBlas.h>
#include <TH/THGenerateHalfType.h>

#endif
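Aside (not part of the diff): THBlas_(NAME) above relies on TH_CONCAT_4 token pasting, so each re-inclusion of the generic header under a different Real yields a distinct symbol such as THFloatBlas_swap. A minimal, self-contained C++ sketch of that naming scheme (standard library only, not PyTorch code):

// Sketch of the TH_CONCAT_4-style token pasting behind THBlas_(NAME).
#include <cstdio>

#define TH_CONCAT_4_EXPAND(a, b, c, d) a##b##c##d
#define TH_CONCAT_4(a, b, c, d) TH_CONCAT_4_EXPAND(a, b, c, d)
#define THBlas_(NAME) TH_CONCAT_4(TH, Real, Blas_, NAME)

// With Real defined as Float, THBlas_(swap) expands to THFloatBlas_swap.
#define Real Float
void THBlas_(swap)(int n, float* x, float* y) {
  for (int i = 0; i < n; i++) { float t = x[i]; x[i] = y[i]; y[i] = t; }
}
#undef Real

int main() {
  float a[2] = {1.0f, 2.0f}, b[2] = {3.0f, 4.0f};
  THFloatBlas_swap(2, a, b);           // call the generated name directly
  std::printf("%g %g\n", a[0], a[1]);  // prints: 3 4
}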
@@ -68,21 +68,6 @@ void _THAssertionFailed(const char *file, const int line, const char *exp, const
  _THError(file, line, "Assertion `%s' failed. %s", exp, msg);
}

void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data)
{
  threadErrorHandler = new_handler;
  threadErrorHandlerData = data;
}

void THSetDefaultErrorHandler(THErrorHandlerFunction new_handler, void *data)
{
  if (new_handler)
    defaultErrorHandler = new_handler;
  else
    defaultErrorHandler = defaultErrorHandlerFunction;
  defaultErrorHandlerData = data;
}

/* Torch Arg Checking Handling */
static void defaultArgErrorHandlerFunction(int argNumber, const char *msg, void *data)
{
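Aside (not part of the diff): THSetErrorHandler installs a handler for the calling thread, with THSetDefaultErrorHandler as the process-wide fallback. A hedged sketch of how a frontend could have used it, written against the THErrorHandlerFunction signature shown elsewhere in this diff; illustrative only, since these handler entry points are part of the dead code this commit targets:

// Illustrative only: routing TH errors into C++ exceptions via the old API.
#include <TH/THGeneral.h>   // declares THSetErrorHandler in the pre-cleanup tree
#include <stdexcept>
#include <string>

static void throwingHandler(const char* msg, void* /*data*/) {
  // Replace TH's default error handling with a C++ exception.
  throw std::runtime_error(std::string("TH error: ") + msg);
}

void installThrowingHandler() {
  // Affects the calling thread only; nullptr is the user-data pointer.
  THSetErrorHandler(throwingHandler, nullptr);
}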
@@ -125,42 +110,6 @@ void _THArgCheck(const char *file, int line, int condition, int argNumber, const
  }
}

void THSetArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data)
{
  threadArgErrorHandler = new_handler;
  threadArgErrorHandlerData = data;
}

void THSetDefaultArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data)
{
  if (new_handler)
    defaultArgErrorHandler = new_handler;
  else
    defaultArgErrorHandler = defaultArgErrorHandlerFunction;
  defaultArgErrorHandlerData = data;
}

// NOLINTNEXTLINE(modernize-use-nullptr,cppcoreguidelines-avoid-non-const-global-variables)
static __thread void (*torchGCFunction)(void *data) = NULL;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
static __thread void *torchGCData;

/* Optional hook for integrating with a garbage-collected frontend.
 *
 * If torch is running with a garbage-collected frontend (e.g. Lua),
 * the GC isn't aware of TH-allocated memory so may not know when it
 * needs to run. These hooks trigger the GC to run in two cases:
 *
 * (1) When a memory allocation (malloc, realloc, ...) fails
 * (2) When the total TH-allocated memory hits a dynamically-adjusted
 *     soft maximum.
 */
void THSetGCHandler( void (*torchGCFunction_)(void *data), void *data )
{
  torchGCFunction = torchGCFunction_;
  torchGCData = data;
}

void* THAlloc(ptrdiff_t size)
{
  if(size < 0)
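Aside (not part of the diff): the comment block above documents the GC hook protocol, and THRealloc below shows the retry path (run the hook once when realloc fails, then retry the allocation). A hedged sketch of registering such a hook through THSetGCHandler; the FrontendState type and its collect callback are hypothetical stand-ins for a garbage-collected frontend:

// Illustrative only: wiring a hypothetical GC'd frontend into the TH hook.
#include <TH/THGeneral.h>   // declares THSetGCHandler in the pre-cleanup tree

struct FrontendState {
  void (*collect)(FrontendState*);  // hypothetical frontend GC entry point
};

static void runFrontendGC(void* data) {
  auto* state = static_cast<FrontendState*>(data);
  state->collect(state);  // release frontend-held buffers so TH can retry realloc
}

void hookGCIntoTH(FrontendState* state) {
  THSetGCHandler(runFrontendGC, state);
}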
@@ -169,63 +118,7 @@ void* THAlloc(ptrdiff_t size)
  return c10::alloc_cpu(size);
}

void* THRealloc(void *ptr, ptrdiff_t size)
{
  if(!ptr)
    return(THAlloc(size));

  if(size == 0)
  {
    THFree(ptr);
    // NOLINTNEXTLINE(modernize-use-nullptr)
    return NULL;
  }

  if(size < 0)
    THError("$ Torch: invalid memory size -- maybe an overflow?");

  // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
  void *newptr = realloc(ptr, size);

  if(!newptr && torchGCFunction) {
    torchGCFunction(torchGCData);
    // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
    newptr = realloc(ptr, size);
  }

  if(!newptr)
    THError("$ Torch: not enough memory: you tried to reallocate %dGB. Buy new RAM!", size/1073741824);

  return newptr;
}

void THFree(void *ptr)
{
  c10::free_cpu(ptr);
}

THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim) {
  const int L = TH_DESC_BUFF_LEN;
  THDescBuff buf;
  char *str = buf.str;
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t i;
  int64_t n = 0;
  n += snprintf(str, L-n, "[");

  for (i = 0; i < ndim; i++) {
    if (n >= L) break;
    n += snprintf(str+n, L-n, "%" PRId64, size[i]);
    if (i < ndim-1) {
      n += snprintf(str+n, L-n, " x ");
    }
  }

  if (n < L - 2) {
    snprintf(str+n, L-n, "]");
  } else {
    snprintf(str+L-5, 5, "...]");
  }

  return buf;
}
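Aside (not part of the diff): _THSizeDesc above renders a shape as a bracketed, "x"-separated string and truncates to "...]" when the 64-byte THDescBuff would overflow. An approximate, standalone restatement of that formatting (standard library only), useful for seeing what the helper produced:

// Approximate restatement of the _THSizeDesc formatting shown above.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

std::string sizeDesc(const std::vector<int64_t>& size) {
  const size_t kMaxLen = 64;  // mirrors TH_DESC_BUFF_LEN
  std::string s = "[";
  for (size_t i = 0; i < size.size(); i++) {
    s += std::to_string(size[i]);
    if (i + 1 < size.size()) s += " x ";
  }
  s += "]";
  if (s.size() > kMaxLen) {   // emulate the "...]" truncation
    s.resize(kMaxLen - 4);
    s += "...]";
  }
  return s;
}

int main() {
  std::printf("%s\n", sizeDesc({2, 3, 4}).c_str());  // prints: [2 x 3 x 4]
}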
@@ -21,11 +21,6 @@
#include <mkl_vsl.h>
#endif

#cmakedefine USE_BLAS
#cmakedefine USE_LAPACK
#cmakedefine BLAS_F2C
#cmakedefine BLAS_USE_CBLAS_DOT

# define TH_EXTERNC extern "C"

// Note(jiayq): copied from ATen/core/Macros.h. Because internal build of TH
@@ -72,26 +67,12 @@
typedef void (*THErrorHandlerFunction)(const char *msg, void *data);
typedef void (*THArgErrorHandlerFunction)(int argNumber, const char *msg, void *data);

#define TH_DESC_BUFF_LEN 64
typedef struct {
    char str[TH_DESC_BUFF_LEN];
} THDescBuff;


TH_API THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim);
TH_API TH_NO_RETURN void _THError(const char *file, const int line, const char *fmt, ...);
TH_API void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...);
TH_API void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data);
TH_API void THSetDefaultErrorHandler(THErrorHandlerFunction new_handler, void *data);
TH_API void _THArgCheck(const char *file, int line, int condition, int argNumber, const char *fmt, ...);
TH_API void THSetArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data);
TH_API void THSetDefaultArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data);
TH_API void* THAlloc(ptrdiff_t size);
TH_API void* THRealloc(void *ptr, ptrdiff_t size);
TH_API void THFree(void *ptr);
TH_API void THSetGCHandler( void (*torchGCHandlerFunction)(void *data), void *data );
// this hook should only be called by custom allocator functions
TH_API void THHeapUpdate(ptrdiff_t size);

#define THError(...) _THError(__FILE__, __LINE__, __VA_ARGS__)
|
|||
|
|
@@ -36,42 +36,3 @@ void THTensor_setStorage(THTensor *self, THStorage *storage_, ptrdiff_t storageO
|
|||
c10::raw::intrusive_ptr::incref(storage_);
|
||||
THTensor_wrap(self).set_(at::Storage(c10::intrusive_ptr<at::StorageImpl>::reclaim(storage_)), storageOffset_, size_, stride_);
|
||||
}
|
||||
|
||||
void THTensor_resize(THTensor *self, at::IntArrayRef size, at::IntArrayRef stride)
|
||||
{
|
||||
if (stride.data()) {
|
||||
THArgCheck(stride.size() == size.size(), 3, "invalid stride");
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
THAssert(size.size() <= INT_MAX);
|
||||
#endif
|
||||
THTensor_resizeNd(self, size.size(), size.data(), stride.data());
|
||||
}
|
||||
|
||||
void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, const int64_t *stride)
|
||||
{
|
||||
TORCH_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative");
|
||||
at::IntArrayRef sizes(size, nDimension);
|
||||
at::optional<at::IntArrayRef> strides;
|
||||
if (stride) {
|
||||
strides = at::IntArrayRef(stride, nDimension);
|
||||
}
|
||||
at::native::resize_impl_cpu_(self, sizes, strides);
|
||||
}
|
||||
|
||||
// NB: Steals ownership of storage
|
||||
void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) {
|
||||
// Caffe2 might have tensors whose storages are null, but we
|
||||
// don't allow it in PyTorch.
|
||||
AT_ASSERT(storage);
|
||||
|
||||
// We used to allow this, but this breaks device caching.
|
||||
// Let's put an actual error message for this one.
|
||||
TORCH_CHECK(tensor->storage().device() == storage->device(),
|
||||
"Attempted to set the storage of a tensor on device \"", tensor->storage().device(),
|
||||
"\" to a storage on different device \"", storage->device(),
|
||||
"\". This is no longer allowed; the devices must match.");
|
||||
tensor->set_storage_keep_dtype(
|
||||
at::Storage(c10::intrusive_ptr<THStorage>::reclaim(storage)));
|
||||
}
|
||||
|
|
|
|||
|
|
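Aside (not part of the diff): the THTensor_resize / THTensor_resizeNd bodies in the hunk above simply forward to at::native::resize_impl_cpu_, which is why they are candidates for removal. A minimal sketch, assuming only the public ATen API, of the equivalent call:

// Minimal sketch of the ATen-level equivalent of THTensor_resize.
#include <ATen/ATen.h>

void resizeExample() {
  at::Tensor t = at::zeros({2, 3});
  t.resize_({4, 5});  // roughly what THTensor_resize(self, {4, 5}, {}) did
  TORCH_CHECK(t.size(0) == 4 && t.size(1) == 5);
}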
@@ -2,7 +2,6 @@
#define TH_TENSOR_INC

#include <TH/THStorageFunctions.h>
#include <TH/THTensorApply.h>

#define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME)
@@ -21,20 +20,4 @@
|
|||
|
||||
#include <TH/generic/THTensor.h>
|
||||
#include <TH/THGenerateBFloat16Type.h>
|
||||
|
||||
/* maths */
|
||||
#include <TH/generic/THTensorMath.h>
|
||||
#include <TH/THGenerateAllTypes.h>
|
||||
|
||||
#include <TH/generic/THTensorMath.h>
|
||||
#include <TH/THGenerateBoolType.h>
|
||||
|
||||
#include <TH/generic/THTensorMath.h>
|
||||
#include <TH/THGenerateHalfType.h>
|
||||
|
||||
#include <TH/generic/THTensorMath.h>
|
||||
#include <TH/THGenerateBFloat16Type.h>
|
||||
|
||||
#include <TH/generic/THTensorMath.h>
|
||||
#include <TH/THGenerateComplexTypes.h>
|
||||
#endif
|
||||
|
|
|
|||
|
|
@@ -82,14 +82,6 @@ inline int64_t THTensor_sizeLegacyNoScalars(const THTensor *self, int dim)
|
|||
return self->dim() == 0 ? 1 : self->size(dim);
|
||||
}
|
||||
|
||||
#include <TH/generic/THTensorFastGetSet.hpp>
|
||||
#include <TH/THGenerateAllTypes.h>
|
||||
|
||||
#include <TH/generic/THTensorFastGetSet.hpp>
|
||||
#include <TH/THGenerateComplexTypes.h>
|
||||
|
||||
#include <TH/generic/THTensorFastGetSet.hpp>
|
||||
#include <TH/THGenerateBFloat16Type.h>
|
||||
|
||||
inline std::vector<int64_t> THTensor_sizesLegacyNoScalars(const THTensor *self) {
|
||||
if (self->dim() == 0) {
|
||||
|
|
@@ -98,20 +90,7 @@ inline std::vector<int64_t> THTensor_sizesLegacyNoScalars(const THTensor *self)
|
|||
return self->sizes().vec();
|
||||
}
|
||||
}
|
||||
|
||||
inline std::vector<int64_t> THTensor_stridesLegacyNoScalars(const THTensor *self) {
|
||||
if (self->dim() == 0) {
|
||||
return {1};
|
||||
} else {
|
||||
return self->strides().vec();
|
||||
}
|
||||
}
|
||||
|
||||
// NB: Steals ownership of storage
|
||||
TH_API void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage);
|
||||
|
||||
TH_API void THTensor_free(THTensor *self);
|
||||
TH_API void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, const int64_t *stride);
|
||||
|
||||
TH_CPP_API void THTensor_resize(THTensor *self, at::IntArrayRef size, at::IntArrayRef stride);
|
||||
TH_CPP_API void THTensor_setStorage(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, at::IntArrayRef size_, at::IntArrayRef stride_);
|
||||
|
|
|
|||
|
|
@@ -1,309 +0,0 @@
|
|||
#ifndef TH_TENSOR_APPLY_INC
|
||||
#define TH_TENSOR_APPLY_INC
|
||||
|
||||
#include <ATen/Parallel.h>
|
||||
|
||||
/*
|
||||
* The basic strategy for apply is as follows:
|
||||
*
|
||||
* 1. Starting with the outermost index, loop until we reach a dimension where the
|
||||
* data is no longer contiguous, i.e. the stride at that dimension is not equal to
|
||||
* the size of the tensor defined by the outer dimensions. Let's call this outer
|
||||
* (contiguous) tensor A. Note that if the Tensor is contiguous, then A is equal
|
||||
* to the entire Tensor. Let's call the inner tensor B.
|
||||
*
|
||||
* 2. We loop through the indices in B, starting at its outermost dimension. For
|
||||
* example, if B is a 2x2 matrix, then we do:
|
||||
*
|
||||
* B[0][0]
|
||||
* B[0][1]
|
||||
* B[1][0]
|
||||
* B[1][1]
|
||||
*
|
||||
* We set the offset into the underlying storage as (storageOffset + stride_B * index_B),
|
||||
* i.e. basically we compute the offset into the storage as we would normally for a
|
||||
* Tensor. But because we are guaranteed the subsequent data is contiguous in memory, we
|
||||
* can simply loop for sizeof(A) iterations and perform the operation, without having to
|
||||
* follow the order described by the strides of A.
|
||||
*
|
||||
* 3. As an optimization, we merge dimensions of A that are contiguous in memory. For
|
||||
* example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, then the first two
|
||||
* dimensions can be merged for the purposes of APPLY, reducing the number of nested
|
||||
* loops.
|
||||
*/
|
||||
|
||||
#define __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, ALLOW_CONTIGUOUS) \
|
||||
TYPE *TENSOR##_data = NULL; \
|
||||
int64_t *TENSOR##_counter = NULL, *TENSOR##_sizes = NULL, *TENSOR##_strides = NULL, *TENSOR##_dimOffset = NULL; \
|
||||
int64_t TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i, TENSOR##_n; \
|
||||
int TENSOR##_contiguous = ALLOW_CONTIGUOUS && DIM < 0; \
|
||||
TENSOR##_n = 1; \
|
||||
for(TENSOR##_i = 0; TENSOR##_i < TENSOR->dim(); TENSOR##_i++) \
|
||||
TENSOR##_n *= TENSOR->size(TENSOR##_i); \
|
||||
\
|
||||
if(TENSOR->is_empty()) \
|
||||
TH_TENSOR_APPLY_hasFinished = 1; \
|
||||
else \
|
||||
{ \
|
||||
TENSOR##_data = THTensor_getStoragePtr(TENSOR)->data<TYPE>()+TENSOR->storage_offset(); \
|
||||
TENSOR##_size = 1; \
|
||||
TENSOR##_stride = 1; \
|
||||
for(TENSOR##_i = THTensor_nDimensionLegacyAll(TENSOR)-1; TENSOR##_i >= 0; TENSOR##_i--) { \
|
||||
if(THTensor_sizeLegacyNoScalars(TENSOR, TENSOR##_i) != 1) { \
|
||||
if(THTensor_strideLegacyNoScalars(TENSOR, TENSOR##_i) == TENSOR##_size && TENSOR##_i != DIM) \
|
||||
TENSOR##_size *= THTensor_sizeLegacyNoScalars(TENSOR, TENSOR##_i); \
|
||||
else{ \
|
||||
TENSOR##_contiguous = 0; \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (!TENSOR##_contiguous) { \
|
||||
/* Find the dimension of contiguous sections */ \
|
||||
TENSOR##_dim = 1; \
|
||||
for(TENSOR##_i = THTensor_nDimensionLegacyAll(TENSOR)-2; TENSOR##_i >= 0; TENSOR##_i--) \
|
||||
{ \
|
||||
if(TENSOR->stride(TENSOR##_i) != TENSOR->stride(TENSOR##_i+1) * TENSOR->size(TENSOR##_i+1) || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \
|
||||
TENSOR##_dim++; \
|
||||
} \
|
||||
/* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \
|
||||
TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*(3*TENSOR##_dim)); \
|
||||
TENSOR##_sizes = TENSOR##_counter + TENSOR##_dim; \
|
||||
TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \
|
||||
TH_TENSOR_dim_index = TENSOR##_dim-1; \
|
||||
TENSOR##_dimOffset = (DIM == THTensor_nDimensionLegacyAll(TENSOR)-1) ? &TENSOR##_i : &TENSOR##_counter[DIM]; \
|
||||
TENSOR##_sizes[TH_TENSOR_dim_index] = THTensor_sizeLegacyNoScalars(TENSOR, THTensor_nDimensionLegacyAll(TENSOR)-1); \
|
||||
TENSOR##_strides[TH_TENSOR_dim_index] = THTensor_strideLegacyNoScalars(TENSOR, THTensor_nDimensionLegacyAll(TENSOR)-1); \
|
||||
/* TENSOR##_counter tracks where we are in the storage. The offset into the */ \
|
||||
/* storage is given by storage_offset + (i * j), where i is the stride */ \
|
||||
/* vector and j is tensor_counter vector. This sets the starting position for the loop. */ \
|
||||
for(TENSOR##_i = TENSOR##_dim-1; TENSOR##_i >= 0; --TENSOR##_i) { \
|
||||
TENSOR##_counter[TENSOR##_i] = 0; \
|
||||
} \
|
||||
for(TENSOR##_i = THTensor_nDimensionLegacyAll(TENSOR)-2; TENSOR##_i >= 0; --TENSOR##_i) { \
|
||||
if (TENSOR->stride(TENSOR##_i) == TENSOR->stride(TENSOR##_i+1) * TENSOR->size(TENSOR##_i+1) && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \
|
||||
TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR##_i) * TENSOR##_sizes[TH_TENSOR_dim_index]; \
|
||||
if (DIM != THTensor_nDimensionLegacyAll(TENSOR)-1 && TENSOR##_i < DIM) \
|
||||
TENSOR##_dimOffset--; \
|
||||
} else { \
|
||||
--TH_TENSOR_dim_index; \
|
||||
TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR##_i); \
|
||||
TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride(TENSOR##_i); \
|
||||
} \
|
||||
} \
|
||||
/* Size of the inner most section */ \
|
||||
TENSOR##_size = TENSOR##_sizes[TENSOR##_dim-1]; \
|
||||
/* Stride of the inner most section */ \
|
||||
TENSOR##_stride = TENSOR##_strides[TENSOR##_dim-1]; \
|
||||
} \
|
||||
else{\
|
||||
TENSOR##_dim = 1;\
|
||||
TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*3);\
|
||||
TENSOR##_sizes = TENSOR##_counter + 1;\
|
||||
TENSOR##_strides = TENSOR##_counter + 2;\
|
||||
TENSOR##_sizes[0] = TENSOR##_n;\
|
||||
TENSOR##_strides[0] = 1;\
|
||||
TENSOR##_size = TENSOR##_sizes[0];\
|
||||
TENSOR##_stride = TENSOR##_strides[0];\
|
||||
}\
|
||||
} \
|
||||
TENSOR##_i = 0;
|
||||
|
||||
#define __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, ALWAYS_UPDATE) \
|
||||
if(TENSOR##_i == TENSOR##_size || ALWAYS_UPDATE) \
|
||||
{ \
|
||||
if(TENSOR##_contiguous) \
|
||||
break; \
|
||||
\
|
||||
if(TENSOR##_dim == 1) \
|
||||
break; \
|
||||
\
|
||||
/* Reset pointer to beginning of loop */ \
|
||||
TENSOR##_data -= TENSOR##_size*TENSOR##_stride; \
|
||||
for(TENSOR##_i = TENSOR##_dim-2; TENSOR##_i >= 0; TENSOR##_i--) \
|
||||
{ \
|
||||
TENSOR##_counter[TENSOR##_i]++; \
|
||||
/* Jump ahread by the stride of this dimension */ \
|
||||
TENSOR##_data += TENSOR##_strides[TENSOR##_i]; \
|
||||
\
|
||||
if(TENSOR##_counter[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]) \
|
||||
{ \
|
||||
if(TENSOR##_i == 0) \
|
||||
{ \
|
||||
TH_TENSOR_APPLY_hasFinished = 1; \
|
||||
break; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Reset the pointer to the beginning of the chunk defined by this dimension */ \
|
||||
TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR##_strides[TENSOR##_i]; \
|
||||
TENSOR##_counter[TENSOR##_i] = 0; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
break; \
|
||||
} \
|
||||
TENSOR##_i = 0; \
|
||||
} \
|
||||
|
||||
#define TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIM, CODE) \
|
||||
{ \
|
||||
int TH_TENSOR_APPLY_hasFinished = 0; \
|
||||
int64_t TH_TENSOR_dim_index = 0; \
|
||||
__TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \
|
||||
__TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \
|
||||
__TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, DIM, 1) \
|
||||
\
|
||||
int elements_equal = 1; \
|
||||
if(TENSOR1##_n != TENSOR2##_n) { \
|
||||
elements_equal = 0; \
|
||||
} \
|
||||
else if(TENSOR1##_n != TENSOR3##_n) { \
|
||||
elements_equal = 0; \
|
||||
} \
|
||||
if (elements_equal == 0) { \
|
||||
AT_ERROR("inconsistent tensor size, expected ", \
|
||||
#TENSOR1, " ", TENSOR1->sizes(), ", ", \
|
||||
#TENSOR2, " ", TENSOR2->sizes(), " and ", \
|
||||
#TENSOR3, " ", TENSOR3->sizes(), " to have the same " \
|
||||
"number of elements, but got ", TENSOR1##_n, ", ", \
|
||||
TENSOR2##_n, " and ", TENSOR3##_n, " elements respectively"); \
|
||||
} \
|
||||
\
|
||||
while(!TH_TENSOR_APPLY_hasFinished) \
|
||||
{ \
|
||||
/* Loop through the inner most region of the Tensor */ \
|
||||
for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 et pas TENSOR##_dim! */ \
|
||||
{ \
|
||||
CODE \
|
||||
} \
|
||||
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \
|
||||
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \
|
||||
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR3, 0) \
|
||||
} \
|
||||
if(TENSOR1##_counter != NULL) \
|
||||
THFree(TENSOR1##_counter); \
|
||||
if(TENSOR2##_counter != NULL) \
|
||||
THFree(TENSOR2##_counter); \
|
||||
if(TENSOR3##_counter != NULL) \
|
||||
THFree(TENSOR3##_counter); \
|
||||
}
|
||||
|
||||
#define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
|
||||
TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, -1, CODE)
|
||||
|
||||
#define TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, DIM, CODE) \
|
||||
{ \
|
||||
int TH_TENSOR_APPLY_hasFinished = 0; \
|
||||
int64_t TH_TENSOR_dim_index = 0; \
|
||||
__TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \
|
||||
__TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \
|
||||
\
|
||||
if(TENSOR1##_n != TENSOR2##_n) { \
|
||||
AT_ERROR("inconsistent tensor size, expected ", \
|
||||
#TENSOR1, " ", TENSOR1->sizes(), " and ", \
|
||||
#TENSOR2, " ", TENSOR2->sizes(), \
|
||||
" to have the same number of elements, but got ", \
|
||||
TENSOR1##_n, " and ", TENSOR2##_n, " elements respectively"); \
|
||||
} \
|
||||
while(!TH_TENSOR_APPLY_hasFinished) \
|
||||
{ \
|
||||
/* Loop through the inner most region of the Tensor */ \
|
||||
for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! */ \
|
||||
{ \
|
||||
CODE \
|
||||
} \
|
||||
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \
|
||||
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \
|
||||
} \
|
||||
if(TENSOR1##_counter != NULL) \
|
||||
THFree(TENSOR1##_counter); \
|
||||
if(TENSOR2##_counter != NULL) \
|
||||
THFree(TENSOR2##_counter); \
|
||||
}
|
||||
|
||||
#define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
|
||||
TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, -1, CODE)
|
||||
|
||||
#define TH_TENSOR_APPLY_D(TYPE, TENSOR, DIM, CODE) \
|
||||
{ \
|
||||
int TH_TENSOR_APPLY_hasFinished = 0; \
|
||||
int64_t TH_TENSOR_dim_index = 0; \
|
||||
__TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, 0) \
|
||||
\
|
||||
while(!TH_TENSOR_APPLY_hasFinished) \
|
||||
{ \
|
||||
/* Loop through the inner most region of the Tensor */ \
|
||||
for(; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 et pas TENSOR##_dim! */ \
|
||||
{ \
|
||||
CODE \
|
||||
} \
|
||||
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, 1) \
|
||||
} \
|
||||
THFree(TENSOR##_counter); \
|
||||
}
|
||||
|
||||
#define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \
|
||||
TH_TENSOR_APPLY_D(TYPE, TENSOR, -1, CODE)
|
||||
|
||||
|
||||
/*
|
||||
* Calcuate the memory offset of an element in a tensor. The strategy is below:
|
||||
*
|
||||
* 1. convert the line index(the index of the element) to the indexs(coordinates) in the tensor.
|
||||
* It can hinted by a classical problem: Getting each individual digit from a whole integer(Decimal base).
|
||||
* A N-digit decimal base number could be view as a N-dimension tensor and the sizes of the tensor are 10.
|
||||
* So the value the whole integer is the line index. And the digits could be viewed as the indexes in
|
||||
* different dimensions.
|
||||
*
|
||||
* 2. convert the indexs(coordinates) in the tensor to the memory offset.
|
||||
*
|
||||
* You can get the detailes in the for-statement iterations.
|
||||
*
|
||||
* The macro is only used in the first element in each thread. For the rest, the memory offset could update
|
||||
* according to info of the tensor in order to get better performance. So we should also record the each
|
||||
* indexs in coresponding dimension of first element.
|
||||
* The recorded info is stored in the TENSOR##_counter_tmp.
|
||||
*
|
||||
*/
|
||||
#define __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR) \
|
||||
int64_t *TENSOR##_counter_tmp = (int64_t*)THAlloc(sizeof(int64_t) * TENSOR##_dim); \
|
||||
ptrdiff_t TENSOR##_memory_offset = 0; \
|
||||
ptrdiff_t TENSOR##_quot = line_index_start; \
|
||||
for (TENSOR##_i = TENSOR##_dim-1; TENSOR##_i>=0; --TENSOR##_i) { \
|
||||
TENSOR##_counter_tmp[TENSOR##_i] = TENSOR##_quot%TENSOR##_sizes[TENSOR##_i]; \
|
||||
TENSOR##_quot /= TENSOR##_sizes[TENSOR##_i]; \
|
||||
TENSOR##_memory_offset += TENSOR##_counter_tmp[TENSOR##_i] * TENSOR##_strides[TENSOR##_i]; \
|
||||
}
|
||||
|
||||
/*
|
||||
* The macro update the indexes in each dimension of the elements except for the first one allocated in
|
||||
* each thread.
|
||||
* For a tensor, if the index of some dimension reaches the size of the corresponding dimension. It will carry and clear.
|
||||
* If the index of next high dimension does do, the index of next high dimension should carry and clear, too.
|
||||
*
|
||||
* The momery offset calculatation is a little confusing. If current index carries, the current index is set to 0. So
|
||||
* the offset should decrease by size*stride of the last dimension. Then the index next high dimension increases by 1. So
|
||||
* the offset should increase by stride of next high dimension.
|
||||
*/
|
||||
#define __TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR) \
|
||||
if(TENSOR##_i == TENSOR##_size && TENSOR##_dim > 1){ /*reaches the edge*/ \
|
||||
int TENSOR##_carry_coord = 1; /*set carry flag to true*/ \
|
||||
TENSOR##_start = 0; /*the current index be cleared to 0*/\
|
||||
TENSOR##_data -= TENSOR##_size * TENSOR##_stride; /*the momery offset reset to the first one in current dimension */\
|
||||
for(TENSOR##_i = TENSOR##_dim - 2; (TENSOR##_i >= 0) && (TENSOR##_carry_coord); TENSOR##_i--){ \
|
||||
TENSOR##_counter_tmp[TENSOR##_i]++; /*the index of next high dimension update*/ \
|
||||
TENSOR##_data += TENSOR##_strides[TENSOR##_i]; /*memory offset increase by stride of next high dimension*/\
|
||||
if(TENSOR##_counter_tmp[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]){ /*The next high dimension also carry, continue
|
||||
to clear and carry*/ \
|
||||
TENSOR##_data -= TENSOR##_sizes[TENSOR##_i] * TENSOR##_strides[TENSOR##_i]; \
|
||||
TENSOR##_counter_tmp[TENSOR##_i] = 0; \
|
||||
} else { \
|
||||
TENSOR##_carry_coord = 0; \
|
||||
} \
|
||||
} \
|
||||
} else { \
|
||||
TENSOR##_start = TENSOR##_i; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
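Aside (not part of the diff): the deleted TH_TENSOR_APPLY machinery above walks a strided tensor with per-dimension counters, collapsing contiguous runs where it can. A standalone C++ sketch (no TH macros, contiguity collapsing omitted) of the core counter-and-strides traversal, shown on a transposed view:

// Standalone sketch of the counter-based strided traversal implemented by the
// deleted TH_TENSOR_APPLY macros.
#include <cstdint>
#include <cstdio>
#include <vector>

void applyStrided(float* data, const std::vector<int64_t>& sizes,
                  const std::vector<int64_t>& strides, void (*fn)(float&)) {
  const size_t ndim = sizes.size();  // assumes ndim >= 1 and no empty dims
  std::vector<int64_t> counter(ndim, 0);
  float* ptr = data;
  bool finished = false;
  while (!finished) {
    fn(*ptr);
    // Bump the innermost counter; when a dimension is exhausted, rewind its
    // pointer contribution, reset its counter, and carry into the next one.
    int64_t d = static_cast<int64_t>(ndim) - 1;
    for (; d >= 0; d--) {
      counter[d]++;
      if (counter[d] < sizes[d]) {
        ptr += strides[d];
        break;
      }
      counter[d] = 0;
      ptr -= (sizes[d] - 1) * strides[d];
    }
    if (d < 0) finished = true;  // carried past the outermost dimension: done
  }
}

int main() {
  float buf[6] = {0, 1, 2, 3, 4, 5};
  // A transposed 2x3 view of buf: sizes {3, 2}, strides {1, 3}.
  applyStrided(buf, {3, 2}, {1, 3}, [](float& x) { std::printf("%g ", x); });
  std::printf("\n");  // prints: 0 3 1 4 2 5
}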
|
@@ -1,329 +0,0 @@
|
|||
#ifndef TH_TENSOR_DIM_APPLY_INC
|
||||
#define TH_TENSOR_DIM_APPLY_INC
|
||||
|
||||
// This is an example of SIZE_CHECK argument passable to TH_TENSOR_DIM_APPLY3.
|
||||
// The TENSOR1, TENSOR2, TENSOR3, DIMENSION will be expanded the same way as
|
||||
// TH_TENSOR_DIM_APPLY3.
|
||||
// Specifically, this check ensures that TENSOR1, TENSOR2, TENSOR3 have same
|
||||
// size except for DIMENSION.
|
||||
#define TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \
|
||||
{ \
|
||||
int shape_check_flag = 0; \
|
||||
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
|
||||
{ \
|
||||
if (TH_TENSOR_DIM_APPLY_i == DIMENSION) \
|
||||
continue; \
|
||||
if (TENSOR1->size(TH_TENSOR_DIM_APPLY_i) != TENSOR2->size(TH_TENSOR_DIM_APPLY_i)) { \
|
||||
shape_check_flag = 1; \
|
||||
break; \
|
||||
} \
|
||||
if(TENSOR1->size(TH_TENSOR_DIM_APPLY_i) != TENSOR3->size(TH_TENSOR_DIM_APPLY_i)) { \
|
||||
shape_check_flag = 1; \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
if (shape_check_flag == 1) { \
|
||||
AT_ERROR("Expected ", #TENSOR1, " ", TENSOR1->sizes(), ", ", #TENSOR2, " ", TENSOR2->sizes(), " and ", #TENSOR3, " ", TENSOR3->sizes(), " to have the same size apart from dimension ", DIMENSION); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define TH_TENSOR_DIM_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIMENSION, SIZE_CHECK, CODE) \
|
||||
{ \
|
||||
TYPE1 *TENSOR1##_data = NULL; \
|
||||
TH_UNUSED int64_t TENSOR1##_stride = 0, TENSOR1##_size = 0; \
|
||||
TYPE2 *TENSOR2##_data = NULL; \
|
||||
TH_UNUSED int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \
|
||||
TYPE3 *TENSOR3##_data = NULL; \
|
||||
TH_UNUSED int64_t TENSOR3##_stride = 0, TENSOR3##_size = 0; \
|
||||
int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \
|
||||
int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \
|
||||
int TH_TENSOR_DIM_APPLY_i; \
|
||||
\
|
||||
if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \
|
||||
THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyNoScalars(TENSOR1)); \
|
||||
int same_dims = 1; \
|
||||
if( THTensor_nDimensionLegacyNoScalars(TENSOR1) != THTensor_nDimensionLegacyNoScalars(TENSOR2) ) { \
|
||||
same_dims = 0; \
|
||||
} \
|
||||
if( THTensor_nDimensionLegacyNoScalars(TENSOR1) != THTensor_nDimensionLegacyNoScalars(TENSOR3) ) { \
|
||||
same_dims = 0; \
|
||||
} \
|
||||
if (same_dims == 0) { \
|
||||
AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), ", ", #TENSOR2, " ", TENSOR2->sizes(), " and ", #TENSOR3, " ",TENSOR3->sizes() , " to have the same number of dimensions"); \
|
||||
} \
|
||||
SIZE_CHECK(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \
|
||||
\
|
||||
if (TH_TENSOR_DIM_APPLY_hasFinished) { \
|
||||
return; \
|
||||
} \
|
||||
TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \
|
||||
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
|
||||
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
|
||||
\
|
||||
TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data<TYPE1>()+(TENSOR1)->storage_offset(); \
|
||||
TENSOR1##_stride = THTensor_strideLegacyNoScalars((TENSOR1), DIMENSION); \
|
||||
TENSOR1##_size = THTensor_sizeLegacyNoScalars((TENSOR1), DIMENSION); \
|
||||
\
|
||||
TENSOR2##_data = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+(TENSOR2)->storage_offset(); \
|
||||
TENSOR2##_stride = THTensor_strideLegacyNoScalars((TENSOR2), DIMENSION); \
|
||||
TENSOR2##_size = THTensor_sizeLegacyNoScalars((TENSOR2), DIMENSION); \
|
||||
\
|
||||
TENSOR3##_data = THTensor_getStoragePtr(TENSOR3)->data<TYPE3>()+(TENSOR3)->storage_offset(); \
|
||||
TENSOR3##_stride = THTensor_strideLegacyNoScalars((TENSOR3), DIMENSION); \
|
||||
TENSOR3##_size = THTensor_sizeLegacyNoScalars((TENSOR3), DIMENSION); \
|
||||
\
|
||||
while(!TH_TENSOR_DIM_APPLY_hasFinished) \
|
||||
{ \
|
||||
CODE \
|
||||
\
|
||||
if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \
|
||||
break; \
|
||||
\
|
||||
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
|
||||
{ \
|
||||
if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
|
||||
{ \
|
||||
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \
|
||||
{ \
|
||||
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
|
||||
break; \
|
||||
} \
|
||||
continue; \
|
||||
} \
|
||||
\
|
||||
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \
|
||||
TENSOR1##_data += THTensor_strideLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i); \
|
||||
TENSOR2##_data += THTensor_strideLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i); \
|
||||
TENSOR3##_data += THTensor_strideLegacyNoScalars(TENSOR3, TH_TENSOR_DIM_APPLY_i); \
|
||||
\
|
||||
if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == THTensor_sizeLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i)) \
|
||||
{ \
|
||||
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \
|
||||
{ \
|
||||
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
|
||||
break; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i); \
|
||||
TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i); \
|
||||
TENSOR3##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR3, TH_TENSOR_DIM_APPLY_i); \
|
||||
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
THFree(TH_TENSOR_DIM_APPLY_counter); \
|
||||
}
|
||||
|
||||
/**
|
||||
* Similar to DIM_APPLY(...) but we maintain two sets of pointers: one for the first tensor
|
||||
* and one for the second. The two tensors must have the same shape, other than at the
|
||||
* specified DIMENSION. This function makes it easy to store the output from reducing the
|
||||
* TENSOR at index. For example, in the sum example described below, we could instead do:
|
||||
*
|
||||
* int64_t i = 0;
|
||||
* TYPE1 sum;
|
||||
*
|
||||
* for (i = 0; i < TENSOR1##_size; ++i) {
|
||||
* sum += TENSOR1##_data[i * TENSOR1##_stride]
|
||||
* }
|
||||
* *TENSOR2##_data = (TYPE2) sum;
|
||||
*
|
||||
* In particular, we guarantee that the offset into TENSOR2 will be what you would get if
|
||||
* you applied all of the index values used to generate the offset into TENSOR1.
|
||||
*/
|
||||
#define TH_TENSOR_DIM_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, DIMENSION, CODE) \
|
||||
{ \
|
||||
TYPE1 *TENSOR1##_data = NULL; \
|
||||
TH_UNUSED int64_t TENSOR1##_stride = 0, TENSOR1##_size = 0; \
|
||||
TYPE2 *TENSOR2##_data = NULL; \
|
||||
TH_UNUSED int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \
|
||||
int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \
|
||||
int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \
|
||||
int TH_TENSOR_DIM_APPLY_i; \
|
||||
\
|
||||
if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \
|
||||
THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyAll(TENSOR1)); \
|
||||
if( THTensor_nDimensionLegacyNoScalars(TENSOR1) != THTensor_nDimensionLegacyNoScalars(TENSOR2)) { \
|
||||
AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same number of dimensions"); \
|
||||
} \
|
||||
TH_UNUSED int shape_check_flag = 0; \
|
||||
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
|
||||
{ \
|
||||
if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
|
||||
continue; \
|
||||
if(THTensor_sizeLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i) != THTensor_sizeLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i)) { \
|
||||
AT_ERROR("Expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same size in dimension ", DIMENSION); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (TH_TENSOR_DIM_APPLY_hasFinished) { \
|
||||
return; \
|
||||
} \
|
||||
TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \
|
||||
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
|
||||
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
|
||||
\
|
||||
TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data<TYPE1>()+(TENSOR1)->storage_offset(); \
|
||||
TENSOR1##_stride = THTensor_strideLegacyNoScalars((TENSOR1), DIMENSION); \
|
||||
TENSOR1##_size = THTensor_sizeLegacyNoScalars(TENSOR1, DIMENSION); \
|
||||
\
|
||||
TENSOR2##_data = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+(TENSOR2)->storage_offset(); \
|
||||
TENSOR2##_stride = THTensor_strideLegacyNoScalars((TENSOR2), DIMENSION); \
|
||||
TENSOR2##_size = THTensor_sizeLegacyNoScalars(TENSOR2, DIMENSION); \
|
||||
\
|
||||
while(!TH_TENSOR_DIM_APPLY_hasFinished) \
|
||||
{ \
|
||||
CODE \
|
||||
\
|
||||
if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \
|
||||
break; \
|
||||
\
|
||||
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
|
||||
{ \
|
||||
if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
|
||||
{ \
|
||||
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \
|
||||
{ \
|
||||
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
|
||||
break; \
|
||||
} \
|
||||
continue; \
|
||||
} \
|
||||
\
|
||||
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \
|
||||
TENSOR1##_data += THTensor_strideLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i); \
|
||||
TENSOR2##_data += THTensor_strideLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i); \
|
||||
\
|
||||
if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == THTensor_sizeLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i)) \
|
||||
{ \
|
||||
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \
|
||||
{ \
|
||||
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
|
||||
break; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i); \
|
||||
TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i); \
|
||||
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
THFree(TH_TENSOR_DIM_APPLY_counter); \
|
||||
}
|
||||
|
||||
/**
|
||||
* The basic idea for DIM_APPLY: Given a TENSOR and a DIMENSION, provide access to the data stored
|
||||
* at all sets of dimension values other than DIMENSION, such that we can get all the values at those
|
||||
* fixed indices for the various values at DIMENSION.
|
||||
*
|
||||
* Suppose we have a 2x3x4 Tensor A, and we have DIMENSION=2. Then we will hit CODE (2x3) times, and the
|
||||
* pointer into storage will be at:
|
||||
*
|
||||
* A[0][0]
|
||||
* A[0][1]
|
||||
* A[0][2]
|
||||
* A[1][0]
|
||||
* A[1][1]
|
||||
* A[1][2]
|
||||
*
|
||||
* And at each point, we can access the data for each of the four elements of the Tensor via
|
||||
* TENSOR##_stride. So for example, if we wanted to sum the elements there, we could do:
|
||||
*
|
||||
* int64_t i = 0;
|
||||
* TYPE sum;
|
||||
* for (i = 0; i < TENSOR##_size; i++) {
|
||||
* sum += TENSOR##_data[i * TENSOR##_stride]
|
||||
* }
|
||||
*
|
||||
* Note that we don't have to have DIMENSION be the last tensor. If we have DIMENSION=1, then we will hit the
|
||||
* code (2x4) times, with pointer into the storage at:
|
||||
*
|
||||
* offset +
|
||||
* stride_0 * 0 + stride_2 * 0
|
||||
* stride_0 * 1 + stride_2 * 0
|
||||
* stride_0 * 0 + stride_2 * 1
|
||||
* stride_0 * 1 + stride_2 * 1
|
||||
* stride_0 * 0 + stride_2 * 2
|
||||
* stride_0 * 1 + stride_2 * 2
|
||||
* stride_0 * 0 + stride_2 * 3
|
||||
* stride_0 * 1 + stride_2 * 3
|
||||
*
|
||||
* So we can again sum over the values at DIMENSION with the other indices fixed.
|
||||
*/
|
||||
#define TH_TENSOR_DIM_APPLY(TYPE, TENSOR, DIMENSION, CODE) \
|
||||
{ \
|
||||
TYPE *TENSOR##_data = NULL; \
|
||||
int64_t TENSOR##_stride = 0, TENSOR##_size = 0; \
|
||||
int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \
|
||||
int TH_TENSOR_DIM_APPLY_hasFinished = 0; \
|
||||
int TH_TENSOR_DIM_APPLY_i; \
|
||||
\
|
||||
if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyAll(TENSOR)) ) \
|
||||
THError("invalid dimension"); \
|
||||
\
|
||||
TENSOR##_data = THTensor_getStoragePtr(TENSOR)->data<TYPE>()+(TENSOR)->storage_offset(); \
|
||||
TENSOR##_stride = THTensor_strideLegacyNoScalars((TENSOR), DIMENSION); \
|
||||
TENSOR##_size = THTensor_sizeLegacyNoScalars(TENSOR, DIMENSION); \
|
||||
/* Counter stores the indices into the Tensor at any time */ \
|
||||
TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyAll(TENSOR))); \
|
||||
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyAll(TENSOR); TH_TENSOR_DIM_APPLY_i++) \
|
||||
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
|
||||
\
|
||||
while(!TH_TENSOR_DIM_APPLY_hasFinished) \
|
||||
{ \
|
||||
CODE \
|
||||
\
|
||||
if(THTensor_nDimensionLegacyAll(TENSOR) == 1) \
|
||||
break; \
|
||||
\
|
||||
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyAll(TENSOR); TH_TENSOR_DIM_APPLY_i++) \
|
||||
{ \
|
||||
/* Check if the index is equal to DIMENSION. We don't need to update the */ \
|
||||
/* offset if this is the case, and can consider the next index. However, */ \
|
||||
/* in the case that the DIMENSION is the last index in the Tensor, then */ \
|
||||
/* we have parsed the entire tensor and can exit */ \
|
||||
if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
|
||||
{ \
|
||||
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyAll(TENSOR)-1) \
|
||||
{ \
|
||||
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
|
||||
break; \
|
||||
} \
|
||||
continue; \
|
||||
} \
|
||||
\
|
||||
/* Bump the counter at this index, update the pointer */ \
|
||||
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \
|
||||
TENSOR##_data += THTensor_strideLegacyNoScalars(TENSOR, TH_TENSOR_DIM_APPLY_i); \
|
||||
\
|
||||
if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == THTensor_sizeLegacyNoScalars(TENSOR, TH_TENSOR_DIM_APPLY_i)) \
|
||||
{ \
|
||||
/* Handled TENSOR_size(dim) iterations for DIM_APPLY_i. If this is the last dimension, exit */ \
|
||||
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyAll(TENSOR)-1) \
|
||||
{ \
|
||||
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
|
||||
break; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Reset the counter, and the pointer to the beginning of the storage for this combination of indices */ \
|
||||
TENSOR##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR, TH_TENSOR_DIM_APPLY_i); \
|
||||
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
THFree(TH_TENSOR_DIM_APPLY_counter); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
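Aside (not part of the diff): the deleted TH_TENSOR_DIM_APPLY macros above visit every combination of indices except the chosen DIMENSION, exposing that dimension's size and stride to CODE (their comment's running example is a per-slice sum). A standalone C++ sketch of the same per-dimension reduction pattern:

// Standalone sketch of the per-dimension reduction supported by the deleted
// TH_TENSOR_DIM_APPLY macros: for each combination of the other indices,
// reduce along one chosen dimension using its stride.
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<float> sumAlongDim(const float* data, const std::vector<int64_t>& sizes,
                               const std::vector<int64_t>& strides, size_t dim) {
  std::vector<float> out;
  const size_t ndim = sizes.size();
  std::vector<int64_t> counter(ndim, 0);
  bool finished = false;
  while (!finished) {
    // Offset of the first element of this slice; the index at `dim` stays 0.
    int64_t offset = 0;
    for (size_t d = 0; d < ndim; d++) offset += counter[d] * strides[d];
    float sum = 0;
    for (int64_t i = 0; i < sizes[dim]; i++) sum += data[offset + i * strides[dim]];
    out.push_back(sum);
    // Advance the counters over every dimension except `dim`.
    int64_t d = static_cast<int64_t>(ndim) - 1;
    for (; d >= 0; d--) {
      if (static_cast<size_t>(d) == dim) continue;
      if (++counter[d] < sizes[d]) break;
      counter[d] = 0;
    }
    if (d < 0) finished = true;
  }
  return out;
}

int main() {
  // Contiguous 2x3 matrix; summing along dim 1 gives the two row sums.
  float m[6] = {1, 2, 3, 4, 5, 6};
  auto sums = sumAlongDim(m, {2, 3}, {3, 1}, 1);
  for (float s : sums) std::printf("%g ", s);  // prints: 6 15
  std::printf("\n");
}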
@@ -1,20 +0,0 @@
#include <TH/THTensor.hpp>
#include <TH/THVector.h>
#include <TH/THBlas.h>
#include <TH/THTensorDimApply.h>

// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THTensorMoreMath.cpp>
#include <TH/THGenerateAllTypes.h>

// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THTensorMoreMath.cpp>
#include <TH/THGenerateBoolType.h>

// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THTensorMoreMath.cpp>
#include <TH/THGenerateBFloat16Type.h>

// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THTensorMoreMath.cpp>
#include <TH/THGenerateHalfType.h>
@@ -1,24 +0,0 @@
#ifndef TH_VECTOR_INC
#define TH_VECTOR_INC

#include <TH/THGeneral.h>
#define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME)

/* We are going to use dynamic dispatch, and want only to generate declarations
 * of the vector functions */
#include <TH/generic/THVector.h>
#include <TH/THGenerateAllTypes.h>

#include <TH/generic/THVector.h>
#include <TH/THGenerateHalfType.h>

#include <TH/generic/THVector.h>
#include <TH/THGenerateBoolType.h>

#include <TH/generic/THVector.h>
#include <TH/THGenerateBFloat16Type.h>

#include <TH/generic/THVector.h>
#include <TH/THGenerateComplexTypes.h>

#endif // TH_VECTOR_INC
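Aside (not part of the diff): the deleted THVector.h shows the TH "generate" pattern, in which one generic header is re-included once per scalar type behind a THGenerate*Type(s).h shim that defines Real and scalar_t before each pass. A self-contained C++ sketch of the same idea; here the generic snippet is a macro rather than a re-included file, and Real is passed explicitly:

// Sketch of the per-type generation pattern behind THVector_(NAME): one
// generic definition expanded once per scalar type, yielding a family of
// functions such as THFloatVector_fill and THDoubleVector_fill.
#include <cstddef>
#include <cstdio>

#define TH_CONCAT_4_EXPAND(a, b, c, d) a##b##c##d
#define TH_CONCAT_4(a, b, c, d) TH_CONCAT_4_EXPAND(a, b, c, d)
#define THVector_(Real, NAME) TH_CONCAT_4(TH, Real, Vector_, NAME)

// "Generic" definition, parameterised by the scalar type and its type name.
#define DEFINE_VECTOR_FILL(scalar_t, Real)                        \
  void THVector_(Real, fill)(scalar_t* x, scalar_t v, size_t n) { \
    for (size_t i = 0; i < n; i++) x[i] = v;                      \
  }

DEFINE_VECTOR_FILL(float, Float)    // generates THFloatVector_fill
DEFINE_VECTOR_FILL(double, Double)  // generates THDoubleVector_fill

int main() {
  float f[3];
  double d[3];
  THFloatVector_fill(f, 1.5f, 3);
  THDoubleVector_fill(d, 2.5, 3);
  std::printf("%g %g\n", f[0], d[2]);  // prints: 1.5 2.5
}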
@@ -1,48 +0,0 @@
|
|||
#ifndef TH_GENERIC_FILE
|
||||
#define TH_GENERIC_FILE "TH/generic/THBlas.cpp"
|
||||
#else
|
||||
|
||||
#ifdef BLAS_F2C
|
||||
# define ffloat double
|
||||
#else
|
||||
# define ffloat float
|
||||
#endif
|
||||
|
||||
TH_EXTERNC void dswap_(int *n, double *x, int *incx, double *y, int *incy);
|
||||
TH_EXTERNC void sswap_(int *n, float *x, int *incx, float *y, int *incy);
|
||||
|
||||
void THBlas_(swap)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy)
|
||||
{
|
||||
if(n == 1)
|
||||
{
|
||||
incx = 1;
|
||||
incy = 1;
|
||||
}
|
||||
|
||||
#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
|
||||
if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
|
||||
{
|
||||
int i_n = (int)n;
|
||||
int i_incx = (int)incx;
|
||||
int i_incy = (int)incy;
|
||||
|
||||
#if defined(TH_REAL_IS_DOUBLE)
|
||||
dswap_(&i_n, x, &i_incx, y, &i_incy);
|
||||
#else
|
||||
sswap_(&i_n, x, &i_incx, y, &i_incy);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
{
|
||||
int64_t i;
|
||||
for(i = 0; i < n; i++)
|
||||
{
|
||||
scalar_t z = x[i*incx];
|
||||
x[i*incx] = y[i*incy];
|
||||
y[i*incy] = z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@@ -1,8 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THBlas.h"
#else

/* Level 1 */
TH_API void THBlas_(swap)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy);

#endif
@@ -8,50 +8,6 @@
|
|||
#include <ATen/NamedTensorUtils.h>
|
||||
#include <ATen/MemoryOverlap.h>
|
||||
|
||||
/**** access methods ****/
|
||||
THStorage *THTensor_(storage)(const THTensor *self)
|
||||
{
|
||||
return THTensor_getStoragePtr(self);
|
||||
}
|
||||
|
||||
ptrdiff_t THTensor_(storageOffset)(const THTensor *self)
|
||||
{
|
||||
return self->storage_offset();
|
||||
}
|
||||
|
||||
int THTensor_(nDimension)(const THTensor *self)
|
||||
{
|
||||
return THTensor_nDimension(self);
|
||||
}
|
||||
|
||||
int THTensor_(nDimensionLegacyNoScalars)(const THTensor *self)
|
||||
{
|
||||
return THTensor_nDimensionLegacyNoScalars(self);
|
||||
}
|
||||
|
||||
int THTensor_(nDimensionLegacyAll)(const THTensor *self)
|
||||
{
|
||||
return THTensor_nDimensionLegacyAll(self);
|
||||
}
|
||||
|
||||
int64_t THTensor_(size)(const THTensor *self, int dim)
|
||||
{
|
||||
THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor",
|
||||
dim, THTensor_(nDimensionLegacyNoScalars)(self));
|
||||
return self->size(dim);
|
||||
}
|
||||
|
||||
int64_t THTensor_(stride)(const THTensor *self, int dim)
|
||||
{
|
||||
THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor",
|
||||
dim, THTensor_(nDimensionLegacyNoScalars)(self));
|
||||
return self->stride(dim);
|
||||
}
|
||||
|
||||
scalar_t *THTensor_(data)(const THTensor *self) {
|
||||
return self->data<scalar_t>();
|
||||
}
|
||||
|
||||
/**** creation methods ****/
|
||||
|
||||
/* Empty init */
|
||||
|
|
@@ -69,12 +25,6 @@ THTensor *THTensor_(new)(void)
|
|||
.release();
|
||||
}
|
||||
|
||||
/* Pointer-copy init */
|
||||
THTensor *THTensor_(newWithTensor)(THTensor *tensor)
|
||||
{
|
||||
return at::native::alias(THTensor_wrap(tensor)).unsafeReleaseTensorImpl();
|
||||
}
|
||||
|
||||
THTensor *THTensor_(newWithStorage1d)(THStorage *storage, ptrdiff_t storageOffset,
|
||||
int64_t size0, int64_t stride0)
|
||||
{
|
||||
|
|
@@ -94,442 +44,14 @@ THTensor *THTensor_(newWithStorage1d)(THStorage *storage, ptrdiff_t storageOffse
|
|||
return self;
|
||||
}
|
||||
|
||||
THTensor *THTensor_(newWithSize1d)(int64_t size0)
|
||||
{
|
||||
THStorage *new_storage = THStorage_(new)();
|
||||
THTensor* self =
|
||||
c10::make_intrusive<at::TensorImpl, at::UndefinedTensorImpl>(
|
||||
c10::intrusive_ptr<at::StorageImpl>::reclaim(new_storage),
|
||||
at::DispatchKey::CPU,
|
||||
caffe2::TypeMeta::Make<scalar_t>())
|
||||
.release();
|
||||
THTensor_(setStorage)(self, new_storage, 0, {size0}, {});
|
||||
|
||||
return self;
|
||||
}
|
||||
|
||||
THTensor *THTensor_(newClone)(THTensor *self)
|
||||
{
|
||||
// already available in Aten as at::clone()
|
||||
THTensor *tensor = THTensor_(new)();
|
||||
at::Tensor tensor_wrap = THTensor_wrap(tensor);
|
||||
at::Tensor self_wrap = THTensor_wrap(self);
|
||||
tensor_wrap.resize_as_(self_wrap);
|
||||
at::native::copy_(tensor_wrap, self_wrap, false);
|
||||
return tensor;
|
||||
}
|
||||
|
||||
THTensor *THTensor_(newContiguous)(THTensor *self)
|
||||
{
|
||||
if(!THTensor_(isContiguous)(self))
|
||||
return THTensor_(newClone)(self);
|
||||
else
|
||||
{
|
||||
THTensor_(retain)(self);
|
||||
return self;
|
||||
}
|
||||
}
|
||||
|
||||
THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, int64_t sliceIndex_)
|
||||
{
|
||||
THTensor *self = THTensor_(newWithTensor)(tensor);
|
||||
THTensor_(select)(self, NULL, dimension_, sliceIndex_);
|
||||
return self;
|
||||
}
|
||||
|
||||
THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_)
|
||||
{
|
||||
THTensor *self = THTensor_(newWithTensor)(tensor);
|
||||
THTensor_(narrow)(self, NULL, dimension_, firstIndex_, size_);
|
||||
return self;
|
||||
}
|
||||
|
||||
THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_)
|
||||
{
|
||||
THTensor *self = THTensor_(newWithTensor)(tensor);
|
||||
THTensor_(transpose)(self, NULL, dimension1_, dimension2_);
|
||||
return self;
|
||||
}
|
||||
|
||||
/* Resize */
|
||||
void THTensor_(resize)(THTensor *self, at::IntArrayRef size, at::IntArrayRef stride)
|
||||
{
|
||||
return THTensor_resize(self, size, stride);
|
||||
}
|
||||
|
||||
void THTensor_(resizeAs)(THTensor *self, THTensor *src)
|
||||
{
|
||||
// already available in Aten as at::resize_as_()
|
||||
if(!THTensor_(isSameSizeAs)(self, src))
|
||||
THTensor_(resizeNd)(self, src->dim(), THTensor_getSizePtr(src), NULL);
|
||||
}
|
||||
|
||||
void THTensor_(resize0d)(THTensor *tensor)
|
||||
{
|
||||
THTensor_(resizeNd)(tensor, 0, {}, nullptr);
|
||||
}
|
||||
|
||||
void THTensor_(resize1d)(THTensor *tensor, int64_t size0)
|
||||
{
|
||||
int64_t size[1] = {size0};
|
||||
THTensor_(resizeNd)(tensor, 1, size, nullptr);
|
||||
}
|
||||
|
||||
void THTensor_(resize2d)(THTensor *tensor, int64_t size0, int64_t size1)
|
||||
{
|
||||
int64_t size[2] = {size0, size1};
|
||||
THTensor_(resizeNd)(tensor, 2, size, nullptr);
|
||||
}
|
||||
|
||||
void THTensor_(resize3d)(THTensor *tensor, int64_t size0, int64_t size1, int64_t size2)
|
||||
{
|
||||
int64_t size[3] = {size0, size1, size2};
|
||||
THTensor_(resizeNd)(tensor, 3, size, nullptr);
|
||||
}
|
||||
|
||||
void THTensor_(resize4d)(THTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3)
|
||||
{
|
||||
int64_t size[4] = {size0, size1, size2, size3};
|
||||
THTensor_(resizeNd)(self, 4, size, nullptr);
|
||||
}
|
||||
|
||||
void THTensor_(resize5d)(THTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3, int64_t size4)
|
||||
{
|
||||
int64_t size[5] = {size0, size1, size2, size3, size4};
|
||||
THTensor_(resizeNd)(self, 5, size, nullptr);
|
||||
}
|
||||
|
||||
void THTensor_(set)(THTensor *self, THTensor *src)
|
||||
{
|
||||
if(self != src)
|
||||
THTensor_(setStorage)(self,
|
||||
THTensor_getStoragePtr(src),
|
||||
src->storage_offset(),
|
||||
src->sizes(),
|
||||
src->strides());
|
||||
}
|
||||
|
||||
void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, at::IntArrayRef size_, at::IntArrayRef stride_)
|
||||
{
|
||||
THTensor_setStorage(self, storage_, storageOffset_, size_, stride_);
|
||||
}
|
||||
|
||||
void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t firstIndex, int64_t size)
|
||||
{
|
||||
if(!src)
|
||||
src = self;
|
||||
|
||||
THArgCheck( (dimension >= 0) && (dimension < src->dim()), 2, "out of range");
|
||||
THArgCheck( firstIndex >= 0, 3, "out of range");
|
||||
THArgCheck( size >= 0, 4, "out of range");
|
||||
THArgCheck(firstIndex <= src->size(dimension) - size, 4, "out of range");
|
||||
|
||||
THTensor_(set)(self, src);
|
||||
|
||||
if (firstIndex > 0) {
|
||||
self->set_storage_offset(self->storage_offset() + firstIndex*self->stride(dimension));
|
||||
}
|
||||
|
||||
self->set_size(dimension, size);
|
||||
}

void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sliceIndex)
{
  int d;

  if(!src)
    src = self;

  THArgCheck(src->dim() > 0, 1, "cannot select on a 0-dim tensor");
  THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range");
  THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 3, "out of range");

  THTensor_(set)(self, src);
  THTensor_(narrow)(self, NULL, dimension, sliceIndex, 1);

  at::DimVector newSize(static_cast<size_t>(self->dim()-1));
  at::DimVector newStride(static_cast<size_t>(self->dim()-1));
  for (d = 0; d < dimension; d++)
  {
    newSize[d] = self->size(d);
    newStride[d] = self->stride(d);
  }

  for(d = dimension; d < self->dim()-1; d++)
  {
    newSize[d] = self->size(d+1);
    newStride[d] = self->stride(d+1);
  }
  self->set_sizes_and_strides(newSize, newStride);
}

void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dimension2)
{
  int64_t z;

  if(!src)
    src = self;

  THArgCheck( (dimension1 >= 0) && (dimension1 < THTensor_nDimensionLegacyNoScalars(src)), 1, "out of range");
  THArgCheck( (dimension2 >= 0) && (dimension2 < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range");

  THTensor_(set)(self, src);

  if(dimension1 == dimension2)
    return;

  z = self->stride(dimension1);
  self->set_stride(dimension1, self->stride(dimension2));
  self->set_stride(dimension2, z);
  z = self->size(dimension1);
  self->set_size(dimension1, self->size(dimension2));
  self->set_size(dimension2, z);
}

void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension)
{
  int d;

  if(!src)
    src = self;

  THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "dimension out of range");

  THTensor_(set)(self, src);

  if(src->size(dimension) == 1)
  {
    at::DimVector newSize(static_cast<size_t>(self->dim() - 1));
    at::DimVector newStride(static_cast<size_t>(self->dim() - 1));
    for (d = 0; d < dimension; d++)
    {
      newSize[d] = self->size(d);
      newStride[d] = self->stride(d);
    }

    for(d = dimension; d < self->dim()-1; d++)
    {
      newSize[d] = self->size(d+1);
      newStride[d] = self->stride(d+1);
    }
    self->set_sizes_and_strides(newSize, newStride);
  }
}

void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension)
{
  int d;

  if(!src)
    src = self;

  THArgCheck((dimension >= 0) && (dimension <= src->dim()), 2, "dimension out of range");

  THTensor_(set)(self, src);

  at::DimVector newSize(static_cast<size_t>(/* size */ self->dim()+1));
  at::DimVector newStride(static_cast<size_t>(/* size */ self->dim()+1));

  for(d = self->dim(); d > dimension; d--)
  {
    newSize[d] = self->size(d-1);
    newStride[d] = self->stride(d-1);
  }
  if (dimension < self->dim())
  {
    newStride[dimension] = self->size(dimension) * self->stride(dimension);
  }
  else
  {
    newStride[dimension] = 1;
  }
  newSize[dimension] = 1;
  for(d = dimension - 1; d >= 0; d--)
  {
    newSize[d] = self->size(d);
    newStride[d] = self->stride(d);
  }
  self->set_sizes_and_strides(newSize, newStride);
}

int THTensor_(isTransposed)(const THTensor *self)
{
  if (THTensor_(isContiguous)(self)) {
    return 0;
  }
  int64_t max_stride = 1;
  int64_t size_max_stride = 1;
  int64_t z = 1;
  int d;
  for (d = 0; d < self->dim(); ++d) {
    if (self->stride(d) == 0 && self->size(d) != 1)
      return 0;
    if (self->stride(d) > max_stride) {
      max_stride = self->stride(d);
      size_max_stride = self->size(d);
    }
    z *= self->size(d);
  }
  if (z == max_stride * size_max_stride) {
    return 1;
  }
  return 0;
}
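
An editorial aside: the loop above flags a non-contiguous view as "transposed" when the largest stride, times the size of its dimension, spans every element (the contiguous case has already returned). A hypothetical standalone version of the same test:

#include <cstdint>
#include <vector>

inline bool looks_transposed(const std::vector<int64_t>& sizes,
                             const std::vector<int64_t>& strides) {
  int64_t max_stride = 1, size_max_stride = 1, numel = 1;
  for (size_t d = 0; d < sizes.size(); ++d) {
    if (strides[d] == 0 && sizes[d] != 1) return false;
    if (strides[d] > max_stride) {
      max_stride = strides[d];
      size_max_stride = sizes[d];
    }
    numel *= sizes[d];
  }
  return numel == max_stride * size_max_stride;
}
// e.g. a 3x4 row-major matrix seen through transpose has sizes {4, 3} and
// strides {1, 4}; 12 == 4 * 3, so the view is reported as transposed.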

int THTensor_(isContiguous)(const THTensor *self)
{
  return self->is_contiguous();
}

int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor* src)
{
  int d;
  if (self->dim() != src->dim())
    return 0;
  for(d = 0; d < self->dim(); ++d)
  {
    if(self->size(d) != src->size(d))
      return 0;
  }
  return 1;
}

ptrdiff_t THTensor_(nElement)(const THTensor *self)
{
  if(THTensor_nDimensionLegacyAll(self) == 0)
    return 0;
  else
  {
    ptrdiff_t nElement = 1;
    int d;
    for(d = 0; d < THTensor_nDimension(self); d++)
      nElement *= self->size(d);
    return nElement;
  }
}

// NB: It is INVALID to call this on an UndefinedTensorImpl
void THTensor_(retain)(THTensor *self)
{
  c10::raw::intrusive_ptr::incref(self);
}

void THTensor_(free)(THTensor *self)
{
  THTensor_free(self);
}

void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst)
{
  if(self != dst) {
    at::Tensor dst_wrap = THTensor_wrap(dst);
    at::Tensor self_wrap = THTensor_wrap(self);
    at::native::copy_(dst_wrap, self_wrap, false);
  }

  THTensor_(free)(self);
}

/*******************************************************************************/

void THTensor_(resizeNd)(THTensor *self, int nDimension, const int64_t *size, const int64_t *stride)
{
  return THTensor_resizeNd(self, nDimension, size, stride);
}

void THTensor_(set0d)(THTensor *tensor, scalar_t value)
{
  THArgCheck(THTensor_nDimension(tensor) == 0, 1, "tensor must have no dimensions");
  THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset(), value);
}

scalar_t THTensor_(get0d)(const THTensor *tensor)
{
  THArgCheck(THTensor_nDimension(tensor) == 0, 1, "tensor must have no dimensions");
  return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset());
}

void THTensor_(set1d)(THTensor *tensor, int64_t x0, scalar_t value)
{
  THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension");
  THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range");
  THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*THTensor_strideLegacyNoScalars(tensor, 0), value);
}

scalar_t THTensor_(get1d)(const THTensor *tensor, int64_t x0)
{
  THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension");
  THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range");
  return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*THTensor_strideLegacyNoScalars(tensor, 0));
}

void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, scalar_t value)
{
  THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 2, 1, "tensor must have two dimensions");
  THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range");
  THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1), value);
}

scalar_t THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1)
{
  THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 2, 1, "tensor must have two dimensions");
  THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range");
  return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1));
}

void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, scalar_t value)
{
  THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 3, 1, "tensor must have three dimensions");
  THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range");
  THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value);
}

scalar_t THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2)
{
  THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 3, 1, "tensor must have three dimensions");
  THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range");
  return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2));
}

void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, scalar_t value)
{
  THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 4, 1, "tensor must have four dimensions");
  THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range");
  THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value);
}

scalar_t THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3)
{
  THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 4, 1, "tensor must have four dimensions");
  THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range");
  return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3));
}

THDescBuff THTensor_(desc)(const THTensor *tensor) {
  const int L = TH_DESC_BUFF_LEN;
  THDescBuff buf;
  char *str = buf.str;
  int n = 0;
#define _stringify(x) #x
  n += snprintf(str, L-n, "torch." _stringify(x) "Tensor of size ");
#undef _stringify
  int i;
  for(i = 0; i < THTensor_nDimension(tensor); i++) {
    if(n >= L) break;
    n += snprintf(str+n, L-n, "%" PRId64, tensor->size(i));
    if(i < THTensor_nDimension(tensor)-1) {
      n += snprintf(str+n, L-n, "x");
    }
  }
  if(n >= L) {
    snprintf(str+L-4, 4, "...");
  }
  return buf;
}

THDescBuff THTensor_(sizeDesc)(const THTensor *tensor) {
  THDescBuff buf = _THSizeDesc(tensor->sizes().data(), tensor->sizes().size());
  return buf;
}

#endif

@@ -55,24 +55,12 @@ TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int
// This is especially likely to happen when the tensor is not contiguous. In general, if you still need the
// values, unless you are doing some size and stride tricks, do not use resize*.
TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, const int64_t *size, const int64_t *stride);
TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src);
TH_API void THTensor_(resize0d)(THTensor *tensor);
TH_API void THTensor_(resize1d)(THTensor *tensor, int64_t size0_);
TH_API void THTensor_(resize2d)(THTensor *tensor, int64_t size0_, int64_t size1_);
TH_API void THTensor_(resize3d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_);
TH_API void THTensor_(resize4d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_);
TH_API void THTensor_(resize5d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_, int64_t size4_);
// Note: these are legacy resize functions that treat sizes as size->size() == 0 and size->data<int64_t>() as being 0-terminated.

TH_API void THTensor_(set)(THTensor *self, THTensor *src);

TH_API void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension_, int64_t firstIndex_, int64_t size_);
TH_API void THTensor_(select)(THTensor *self, THTensor *src, int dimension_, int64_t sliceIndex_);
TH_API void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1_, int dimension2_);
TH_API int THTensor_(isTransposed)(const THTensor *self);

TH_API void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension_);
TH_API void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension_);

TH_API int THTensor_(isContiguous)(const THTensor *self);
TH_API int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor *src);

@@ -80,23 +68,5 @@ TH_API ptrdiff_t THTensor_(nElement)(const THTensor *self);

TH_API void THTensor_(retain)(THTensor *self);
TH_API void THTensor_(free)(THTensor *self);
TH_API void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst);

/* Slow access methods [check everything] */
TH_API void THTensor_(set0d)(THTensor *tensor, scalar_t value);
TH_API void THTensor_(set1d)(THTensor *tensor, int64_t x0, scalar_t value);
TH_API void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, scalar_t value);
TH_API void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, scalar_t value);
TH_API void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, scalar_t value);

TH_API scalar_t THTensor_(get0d)(const THTensor *tensor);
TH_API scalar_t THTensor_(get1d)(const THTensor *tensor, int64_t x0);
TH_API scalar_t THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1);
TH_API scalar_t THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2);
TH_API scalar_t THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3);

/* Debug methods */
TH_API THDescBuff THTensor_(desc)(const THTensor *tensor);
TH_API THDescBuff THTensor_(sizeDesc)(const THTensor *tensor);

#endif

@@ -11,6 +11,4 @@
TH_CPP_API void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
                                      at::IntArrayRef size_, at::IntArrayRef stride_);

TH_CPP_API void THTensor_(resize)(THTensor *self, at::IntArrayRef size, at::IntArrayRef stride);

#endif

@@ -1,369 +0,0 @@
#include <TH/THTensorApply.h>

#ifndef NAN
  #define NAN (nan(NULL))
#endif

#define HYPER_TH_OMP_OVERHEAD_THRESHOLD (at::internal::GRAIN_SIZE / 16)
#define ORDIN_TH_OMP_OVERHEAD_THRESHOLD (at::internal::GRAIN_SIZE / 4)
#define UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD (at::internal::GRAIN_SIZE / 2)
#define TH_OMP_OVERHEAD_THRESHOLD (at::internal::GRAIN_SIZE)

#define TH_CHECK_SAME_SIZE(TENSOR1, TENSOR2) \
{ \
  if (!THTensor_(isSameSizeAs)(TENSOR1, TENSOR2)) { \
    AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same size"); \
  } \
}

// Used for `scatter` and `scatterAdd`
// Assumes TENSOR1 is index
// TENSOR2 is real
// TENSOR3 is src
// Tests:
// 1. index->size(d) <= src->size(d) for all d
// 2. index->size(d) <= real->size(d) for all d != dim
#define TH_TENSOR_DIM_APPLY3_SIZE_SCATTER(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \
{ \
  int shape_check_flag = 0; \
  for (TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyAll(TENSOR2); TH_TENSOR_DIM_APPLY_i++) \
  { \
    int64_t TENSOR1##_dim_size = THTensor_sizeLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i); \
    if (TH_TENSOR_DIM_APPLY_i != DIMENSION) { \
      if (TENSOR1##_dim_size > THTensor_sizeLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i)) { \
        shape_check_flag = 1; \
        break; \
      } \
    } \
    if (TENSOR1##_dim_size > THTensor_sizeLegacyNoScalars(TENSOR3, TH_TENSOR_DIM_APPLY_i)) { \
      shape_check_flag = 1; \
      break; \
    } \
  } \
  if (shape_check_flag == 1) { \
    AT_ERROR("Expected ", #TENSOR1, " ", TENSOR1->sizes(), " to be smaller size than ", #TENSOR3, " ", TENSOR3->sizes(), " and to be smaller than ", #TENSOR2, " ", TENSOR2->sizes(), " apart from dimension ", DIMENSION); \
  } \
}

#undef th_isnan
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
#define th_isnan(val) \
(std::isnan(val))
#else
#define th_isnan(val) (0)
#endif

#undef th_isnan_break
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
#define th_isnan_break(val) \
if (std::isnan(val)) break;
#else
#define th_isnan_break(val)
#endif

#if defined(__clang__)
#define PRAGMA(P) _Pragma(#P)
#define PRAGMA_IVDEP // Noop
#define PRAGMA_SIMD // Noop
#elif defined(_MSC_VER)
#define PRAGMA(P) __pragma(P)
# if _MSC_VER < 1920
// MSVC < 2019 doesn't support loop pragmas.
#  define PRAGMA_IVDEP // Noop
#  define PRAGMA_SIMD // Noop
# else
#  define PRAGMA_IVDEP PRAGMA(loop(ivdep))
#  define PRAGMA_SIMD PRAGMA(omp simd)
# endif
#else
#define PRAGMA(P) _Pragma(#P)
#define PRAGMA_IVDEP PRAGMA(ivdep)
#define PRAGMA_SIMD PRAGMA(simd)
#endif

#define TH_TENSOR_APPLY2_PARALLEL(SIZE, CONTIG1, CONTIG2, TYPE1, TENSOR1, TYPE2, TENSOR2, CODE, THRESHOLD) \
{ \
  /* for advanced searching index*/ \
  if (CONTIG1 && CONTIG2) { \
    TYPE1 *rp = THTensor_getStoragePtr(TENSOR1)->data<TYPE1>()+TENSOR1->storage_offset(); \
    TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+TENSOR2->storage_offset(); \
    if (tp != (TYPE2*)rp) { \
      at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \
        PRAGMA_IVDEP \
        for (auto iter = begin; iter < end; iter++) { \
          TYPE2 *TENSOR2##_data = tp+iter; \
          TYPE1 *TENSOR1##_data = rp+iter; \
          CODE \
        } \
      }); \
    } else { \
      at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \
        PRAGMA_SIMD \
        for (auto iter = begin; iter < end; iter++) { \
          TYPE2* TENSOR2##_data = tp+iter; \
          TYPE1* TENSOR1##_data = rp+iter; \
          CODE \
        } \
      }); \
    } \
  } else { \
    /* The following strategy is not easy to understand.
     * 1. Collapse the dimension of the tensors in order to decrease the number of nested loops.
     * 2. Calculate the numbers of elements allocated in each thread and the line index of the first one.
     * 3. Calculate the memory offset of the first element and the indexes in each dimension of the
     *    first one.
     * 4. iterate all elements in each thread. update the indexes in each dimension of the rest.
     */ \
    int TH_TENSOR_APPLY_hasFinished = 0; \
    int64_t TH_TENSOR_dim_index = 0; \
    /*step 1*/ \
    __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, -1, 1) \
    __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, -1, 1) \
    if (0 == TH_TENSOR_APPLY_hasFinished) { \
      auto TENSOR1##_i_local = TENSOR1##_i; \
      auto TENSOR2##_i_local = TENSOR2##_i; \
      auto TENSOR1##_data_local = TENSOR1##_data; \
      auto TENSOR2##_data_local = TENSOR2##_data; \
      at::parallel_for(0, SIZE, THRESHOLD, [&](int64_t begin, int64_t end) { \
        auto TENSOR1##_i = TENSOR1##_i_local; \
        auto TENSOR2##_i = TENSOR2##_i_local; \
        auto TENSOR1##_data = TENSOR1##_data_local; \
        auto TENSOR2##_data = TENSOR2##_data_local; \
        /*step 2*/ \
        ptrdiff_t line_index_start = begin; \
        ptrdiff_t line_seg_length = (end - begin); \
        /* step 3*/ \
        __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR2); \
        __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR1); \
        TENSOR2##_data += TENSOR2##_memory_offset; \
        TENSOR1##_data += TENSOR1##_memory_offset; \
        ptrdiff_t count = 0; \
        ptrdiff_t TENSOR2##_start = TENSOR2##_counter_tmp[TENSOR2##_dim-1]; \
        ptrdiff_t TENSOR1##_start = TENSOR1##_counter_tmp[TENSOR1##_dim-1]; \
        /* step 4*/ \
        while (count < line_seg_length) { \
          for (TENSOR2##_i=TENSOR2##_start, TENSOR1##_i = TENSOR1##_start; ((count < line_seg_length) && (TENSOR2##_i < TENSOR2##_size) && (TENSOR1##_i < TENSOR1##_size)); ++TENSOR2##_i, ++TENSOR1##_i, ++count) { \
            CODE \
            TENSOR2##_data += TENSOR2##_stride; \
            TENSOR1##_data += TENSOR1##_stride; \
          } \
          if (count < line_seg_length) { \
            __TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR2); \
            __TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR1); \
          } \
        } \
        if (TENSOR1##_counter_tmp != NULL) { \
          THFree(TENSOR1##_counter_tmp); \
        } \
        if (TENSOR2##_counter_tmp != NULL) { \
          THFree(TENSOR2##_counter_tmp); \
        } \
      }); \
    } \
    if (TENSOR2##_counter != NULL) { \
      THFree(TENSOR2##_counter); \
    } \
    if (TENSOR1##_counter != NULL) { \
      THFree(TENSOR1##_counter); \
    } \
  } \
}
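
An editorial sketch (not part of the TH sources): the contiguous fast path above amounts to splitting a flat range with at::parallel_for and running the element-wise CODE on each chunk; all names below are hypothetical.

#include <ATen/Parallel.h>
#include <cstdint>

template <typename scalar_t, typename Op>
void apply2_contig_sketch(scalar_t* r, const scalar_t* t, int64_t n,
                          int64_t grain_size, Op op) {
  at::parallel_for(0, n, grain_size, [&](int64_t begin, int64_t end) {
    for (int64_t i = begin; i < end; ++i) {
      r[i] = op(t[i]);  // plays the role of CODE acting on TENSOR1/TENSOR2 data
    }
  });
}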

#define TH_TENSOR_APPLY3_PARALLEL(SIZE, CONTIG1, CONTIG2, CONTIG3, TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE, THRESHOLD) \
{ \
  /* for advanced searching index */ \
  if (CONTIG1 && CONTIG2 && CONTIG3) { \
    TYPE1 *rp = THTensor_getStoragePtr(TENSOR1)->data<TYPE1>()+TENSOR1->storage_offset(); \
    TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+TENSOR2->storage_offset(); \
    TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data<TYPE3>()+TENSOR3->storage_offset(); \
    if (tp != (TYPE2*)rp) { \
      at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \
        PRAGMA_IVDEP \
        for (auto iter = begin; iter < end; iter++) { \
          TYPE1 *TENSOR1##_data = rp+iter; \
          TYPE2 *TENSOR2##_data = tp+iter; \
          TYPE3 *TENSOR3##_data = srcp+iter; \
          CODE \
        } \
      }); \
    } else { \
      at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \
        PRAGMA_SIMD \
        for (auto iter = begin; iter < end; iter++) { \
          TYPE1 *TENSOR1##_data = rp+iter; \
          TYPE2 *TENSOR2##_data = tp+iter; \
          TYPE3 *TENSOR3##_data = srcp+iter; \
          CODE \
        } \
      }); \
    } \
  } else { \
    int TH_TENSOR_APPLY_hasFinished = 0; \
    int64_t TH_TENSOR_dim_index = 0; \
    __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, -1, 1) \
    __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, -1, 1) \
    __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, -1, 1) \
    if (0 == TH_TENSOR_APPLY_hasFinished) { \
      auto TENSOR1##_i_local = TENSOR1##_i; \
      auto TENSOR2##_i_local = TENSOR2##_i; \
      auto TENSOR3##_i_local = TENSOR3##_i; \
      auto TENSOR1##_data_local = TENSOR1##_data; \
      auto TENSOR2##_data_local = TENSOR2##_data; \
      auto TENSOR3##_data_local = TENSOR3##_data; \
      at::parallel_for(0, SIZE, THRESHOLD, [&](int64_t begin, int64_t end) { \
        auto TENSOR1##_i = TENSOR1##_i_local; \
        auto TENSOR2##_i = TENSOR2##_i_local; \
        auto TENSOR3##_i = TENSOR3##_i_local; \
        auto TENSOR1##_data = TENSOR1##_data_local; \
        auto TENSOR2##_data = TENSOR2##_data_local; \
        auto TENSOR3##_data = TENSOR3##_data_local; \
        ptrdiff_t line_index_start = begin; \
        ptrdiff_t line_seg_length = (end - begin); \
        __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR1); \
        __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR2); \
        __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR3); \
        TENSOR1##_data += TENSOR1##_memory_offset; \
        TENSOR2##_data += TENSOR2##_memory_offset; \
        TENSOR3##_data += TENSOR3##_memory_offset; \
        ptrdiff_t count = 0; \
        ptrdiff_t TENSOR1##_start = TENSOR1##_counter_tmp[TENSOR1##_dim - 1]; \
        ptrdiff_t TENSOR2##_start = TENSOR2##_counter_tmp[TENSOR2##_dim - 1]; \
        ptrdiff_t TENSOR3##_start = TENSOR3##_counter_tmp[TENSOR3##_dim - 1]; \
        while (count < line_seg_length) { \
          for (TENSOR1##_i=TENSOR1##_start, TENSOR2##_i=TENSOR2##_start,TENSOR3##_i=TENSOR3##_start; (count<line_seg_length)&&(TENSOR1##_i<TENSOR1##_size)&&(TENSOR2##_i<TENSOR2##_size)&&(TENSOR3##_i<TENSOR3##_size); ++TENSOR1##_i,++TENSOR2##_i,++TENSOR3##_i,++count) { \
            CODE \
            TENSOR1##_data += TENSOR1##_stride; \
            TENSOR2##_data += TENSOR2##_stride; \
            TENSOR3##_data += TENSOR3##_stride; \
          } \
          if (count < line_seg_length) { \
            __TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR1); \
            __TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR2); \
            __TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR3); \
          } \
        } \
        if (TENSOR1##_counter_tmp != NULL) { \
          THFree(TENSOR1##_counter_tmp); \
        } \
        if (TENSOR2##_counter_tmp != NULL) { \
          THFree(TENSOR2##_counter_tmp); \
        } \
        if (TENSOR3##_counter_tmp != NULL) { \
          THFree(TENSOR3##_counter_tmp); \
        } \
      }); \
    } \
    if (TENSOR1##_counter != NULL) { \
      THFree(TENSOR1##_counter); \
    } \
    if (TENSOR2##_counter != NULL) { \
      THFree(TENSOR2##_counter); \
    } \
    if (TENSOR3##_counter != NULL) { \
      THFree(TENSOR3##_counter); \
    } \
  } \
}

#define TH_TENSOR_APPLY_REDUCTION_SUM_PARALLEL(TYPE, TENSOR, EXPR, OUTPUT, THRESHOLD) \
{ \
  int TENSOR##Contig = THTensor_(isContiguous)(TENSOR); \
  ptrdiff_t TENSOR##Size = THTensor_(nElement)(TENSOR); \
  if (TENSOR##Contig) { \
    TYPE *rp = THTensor_getStoragePtr(TENSOR)->data<TYPE>()+TENSOR->storage_offset(); \
    OUTPUT = at::parallel_reduce(0, TENSOR##Size, (THRESHOLD * 10), (accreal)0, [&](int64_t begin, int64_t end, accreal ident)->accreal { \
      accreal r = ident; \
      for (auto iter = begin; iter < end; iter++) { \
        TYPE *TENSOR##_data = rp+iter; \
        r += (EXPR); \
      } \
      return r; \
    }, std::plus<accreal>()); \
  } else { \
    int TH_TENSOR_APPLY_hasFinished = 0; \
    int64_t TH_TENSOR_dim_index = 0; \
    __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, -1, 1); \
    if (0 == TH_TENSOR_APPLY_hasFinished) { \
      auto TENSOR##_data_local = TENSOR##_data; \
      auto TENSOR##_i_local = TENSOR##_i; \
      OUTPUT = at::parallel_reduce(0, TENSOR##Size, THRESHOLD, (accreal)0, [&](int64_t begin, int64_t end, accreal ident)->accreal { \
        auto TENSOR##_data = TENSOR##_data_local; \
        auto TENSOR##_i = TENSOR##_i_local; \
        ptrdiff_t line_index_start = begin; \
        ptrdiff_t line_seg_length = (end - begin); \
        __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR); \
        TENSOR##_data += TENSOR##_memory_offset; \
        ptrdiff_t count = 0; \
        ptrdiff_t TENSOR##_start = TENSOR##_counter_tmp[TENSOR##_dim - 1]; \
        accreal r = ident; \
        while (count < line_seg_length) { \
          for (TENSOR##_i=TENSOR##_start; (count < line_seg_length)&&(TENSOR##_i < TENSOR##_size); ++TENSOR##_i, ++count) { \
            r += (EXPR); \
            TENSOR##_data += TENSOR##_stride; \
          } \
          if (count < line_seg_length) { \
            __TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR); \
          } \
        } \
        if (TENSOR##_counter_tmp != NULL) { \
          THFree(TENSOR##_counter_tmp); \
        } \
        return r; \
      }, std::plus<accreal>()); \
    } \
    if (TENSOR##_counter != NULL) { \
      THFree(TENSOR##_counter); \
    } \
  } \
}

#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
{ \
  auto code_fn = [&](int64_t begin, int64_t end) { \
    ptrdiff_t TENSOR##_len = end - begin; \
    TYPE *TENSOR##_data = TENSOR->data<scalar_t>() + begin; \
    CODE \
  }; \
  int in_parallel = at::in_parallel_region(); \
  ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR); \
  if (!in_parallel) { \
    at::parallel_for(0, TH_TENSOR_size, TH_OMP_OVERHEAD_THRESHOLD, code_fn); \
  } else { \
    code_fn(0, TH_TENSOR_size); \
  } \
}

#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
{ \
  auto code_fn = [&](int64_t begin, int64_t end) { \
    ptrdiff_t TENSOR1##_len = end - begin; \
    TYPE1 *TENSOR1##_data = TENSOR1->data<scalar_t>() + begin; \
    TYPE2 *TENSOR2##_data = TENSOR2->data<scalar_t>() + begin; \
    CODE \
  }; \
  int in_parallel = at::in_parallel_region(); \
  ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \
  if (!in_parallel) { \
    at::parallel_for(0, TH_TENSOR_size, TH_OMP_OVERHEAD_THRESHOLD, code_fn); \
  } else { \
    code_fn(0, TH_TENSOR_size); \
  } \
}

#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
{ \
  auto code_fn = [&](int64_t begin, int64_t end) { \
    ptrdiff_t TENSOR1##_len = end - begin; \
    TYPE1 *TENSOR1##_data = TENSOR1->data<scalar_t>() + begin; \
    TYPE2 *TENSOR2##_data = TENSOR2->data<scalar_t>() + begin; \
    TYPE3 *TENSOR3##_data = TENSOR3->data<scalar_t>() + begin; \
    CODE \
  }; \
  int in_parallel = at::in_parallel_region(); \
  ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \
  if (!in_parallel) { \
    at::parallel_for(0, TH_TENSOR_size, TH_OMP_OVERHEAD_THRESHOLD, code_fn); \
  } else { \
    code_fn(0, TH_TENSOR_size); \
  } \
}

@@ -1,49 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THTensorFastGetSet.hpp"
#else

static inline scalar_t THTensor_(fastGetLegacy1dNoScalars)(THTensor *self, int64_t x0) {
  return self->unsafe_data<scalar_t>()[x0*THTensor_strideLegacyNoScalars(self, 0)];
}

static inline scalar_t THTensor_(fastGet1d)(THTensor *self, int64_t x0) {
  return self->unsafe_data<scalar_t>()[x0*self->stride(0)];
}

static inline scalar_t THTensor_(fastGet2d)(THTensor *self, int64_t x0, int64_t x1) {
  return self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)];
}

static inline scalar_t THTensor_(fastGet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2) {
  return self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)];
}

static inline scalar_t THTensor_(fastGet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3) {
  return self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)];
}

static inline scalar_t THTensor_(fastGet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4) {
  return self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)+(x4)*self->stride(4)];
}

static inline void THTensor_(fastSet1d)(THTensor *self, int64_t x0, scalar_t value) {
  self->unsafe_data<scalar_t>()[x0*self->stride(0)] = value;
}

static inline void THTensor_(fastSet2d)(THTensor *self, int64_t x0, int64_t x1, scalar_t value) {
  self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)] = value;
}

static inline void THTensor_(fastSet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, scalar_t value) {
  self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)] = value;
}

static inline void THTensor_(fastSet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, scalar_t value) {
  self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)] = value;
}

static inline void THTensor_(fastSet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4, scalar_t value) {
  self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)+(x4)*self->stride(4)] = value;
}

#endif

@@ -1,32 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THTensorMath.h"
#else

#include <ATen/core/Generator.h>

TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb);

#if !defined(TH_REAL_IS_HALF)

TH_API ptrdiff_t THTensor_(numel)(THTensor *t);

#if !defined(TH_REAL_IS_BFLOAT16)

void THTensor_(preserveReduceDimSemantics)(THTensor *r_, int in_dims, int reduce_dimension, int keepdim);

TH_API void THTensor_(take)(THTensor *tensor, THTensor *src, THLongTensor *index);
TH_API void THTensor_(put)(THTensor *tensor, THLongTensor *index, THTensor *src, int accumulate);

#if !defined(TH_REAL_IS_BOOL) /* non bool only part */

TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim);

#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)

TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, scalar_t minvalue, scalar_t maxvalue);

#endif
#endif
#endif
#endif /* !defined(TH_REAL_IS_HALF) */
#endif /* TH_GENERIC_FILE*/

@@ -1,292 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THTensorMoreMath.cpp"
#else

#include <TH/generic/THTensorApply.hpp>
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/Utils.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/WrapDimUtils.h>
#include <limits>

ptrdiff_t THTensor_(numel)(THTensor *t)
{
  return THTensor_(nElement)(t);
}

#if !defined(TH_REAL_IS_BFLOAT16) && !defined(TH_REAL_IS_HALF)

// Helper function to be used in a reduction operation.
// Due to resize semantics of outputs, if the specified output tensor r_ has
// same size as the output of the reduction operation, then any noncontiguities
// in r_ should be preserved.
// The reduction operation, however, needs to act on r_ with an extra dimension
// (the reduced dimension), so this function "resizes" r_ and preserves its
// noncontiguities if necessary.
void THTensor_(preserveReduceDimSemantics)(
    THTensor *r_, int in_dims, int reduce_dimension, int keepdim) {
  if (r_ && !keepdim &&
      THTensor_(nDimensionLegacyAll)(r_) == in_dims - 1 &&
      THTensor_(nDimensionLegacyAll)(r_) != 0) {
    THTensor_(unsqueeze1d)(r_, r_, reduce_dimension);
  }
}
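
Editorial illustration (shapes are hypothetical): reducing a {2,3} input over dimension 1 without keepdim yields a {2} result, so a caller-supplied r_ of size {2} is unsqueezed here back to {2,1}. The reduction kernel can then index r_ with the reduced dimension present while keeping whatever strides r_ already had, and the caller squeezes that dimension away again afterwards, as kthvalue below does.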

#if !defined(TH_REAL_IS_BOOL) /* non bool only part */

#define ARR(III) arr[(III)*stride]
#define IDX(III) idx[(III)*stride]

#define LONG_SWAP(AAA, BBB) swap = AAA; AAA = BBB; BBB = swap
#define REAL_SWAP(AAA, BBB) rswap = AAA; AAA = BBB; BBB = rswap

#define ARR_SWAP(III, JJJ) \
  REAL_SWAP(ARR(III), ARR(JJJ));

#define BOTH_SWAP(III, JJJ) \
  REAL_SWAP(ARR(III), ARR(JJJ)); \
  LONG_SWAP(IDX(III), IDX(JJJ))

/* Emulate NumPy behavior of putting NaNs
 * at the end of an ascending list. */
#define GT_OR_NAN(x, y) \
  ((th_isnan(x) && !(th_isnan(y))) || (x > y))

/* Implementation of the Quickselect algorithm, based on Nicolas Devillard's
   public domain implementation at http://ndevilla.free.fr/median/median/
   Adapted similarly to the above Quicksort algorithm. */
static void THTensor_(quickselect)(scalar_t *arr, int64_t *idx, int64_t k, int64_t elements, int64_t stride)
{
  int64_t P, L, R, i, j, swap;
  scalar_t rswap, piv;
  L = 0;
  R = elements-1;

  do {
    if (R <= L) /* One element only */
      return;

    if (R == L+1) { /* Two elements only */
      if (ARR(L) > ARR(R)) {
        BOTH_SWAP(L, R);
      }
      return;
    }

    /* Use median of three for pivot choice */
    P=(L+R)>>1;
    BOTH_SWAP(P, L+1);
    if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
    if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
    if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }

    i = L+1;
    j = R;
    piv = ARR(L);
    do {
      do i++; while(ARR(i) < piv);
      do j--; while(ARR(j) > piv);
      if (j < i)
        break;
      BOTH_SWAP(i, j);
    } while(1);
    BOTH_SWAP(L, j);

    /* Re-set active partition */
    if (j <= k) L=i;
    if (j >= k) R=j-1;
  } while(1);
}
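
Editorial note: for a contiguous slice the same selection can be written with the standard library; this hypothetical helper is shown only to clarify what quickselect computes (NaN ordering aside).

#include <algorithm>
#include <cstdint>
#include <vector>

// Places the k-th smallest value (0-based) at position k, like the partition above.
inline double kth_smallest(std::vector<double> v, int64_t k) {
  std::nth_element(v.begin(), v.begin() + k, v.end());
  return v[static_cast<size_t>(k)];
}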

#undef ARR
#undef IDX
#undef LONG_SWAP
#undef REAL_SWAP
#undef BOTH_SWAP

void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim)
{
  THTensor *temp_;
  THLongTensor *tempi_;
  scalar_t *temp__data;
  int64_t *tempi__data;
  int64_t t_size_dim;

  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 3, "dimension out of range");
  THArgCheck(k > 0 && k <= THTensor_sizeLegacyNoScalars(t, dimension), 2, "selected index out of range");

  int in_dims = THTensor_(nDimensionLegacyAll)(t);
  THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim);
  THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim);
  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
  dim[dimension] = 1;
  THTensor_(resize)(values_, dim, {});
  THLongTensor_resize(indices_, dim, {});

  t_size_dim = THTensor_sizeLegacyNoScalars(t, dimension);

  temp_ = THTensor_(new)();
  THTensor_(resize1d)(temp_, t_size_dim);
  temp__data = temp_->data<scalar_t>();

  tempi_ = THLongTensor_new();
  THLongTensor_resize1d(tempi_, t_size_dim);
  tempi__data = THLongTensor_data(tempi_);

  TH_TENSOR_DIM_APPLY3(scalar_t, t, scalar_t, values_, int64_t, indices_, dimension,
                       TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM,
                       int64_t i;
                       for(i = 0; i < t_size_dim; i++)
                         temp__data[i] = t_data[i*t_stride];
                       for(i = 0; i < t_size_dim; i++)
                         tempi__data[i] = i;
                       THTensor_(quickselect)(temp__data, tempi__data, k - 1, t_size_dim, 1);
                       *values__data = temp__data[k-1];
                       *indices__data = tempi__data[k-1];);

  c10::raw::intrusive_ptr::decref(temp_);
  THLongTensor_free(tempi_);
  if (!keepdim) {
    THTensor_(squeeze1d)(values_, values_, dimension);
    THLongTensor_squeeze1d(indices_, indices_, dimension);
  }
}

static void THTensor_(propagate_names_if_named_tensor_enabled)(THTensor* result, THTensor* src) {
  at::namedinference::propagate_names(result, src);
}

#define LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, THRESHOLD) \
  void THTensor_(NAME)(THTensor *r_, THTensor *t) \
  { \
    THTensor_(resizeAs)(r_, t); \
    ptrdiff_t r_Size = THTensor_(nElement)(r_); \
    int r_Contig = THTensor_(isContiguous)(r_); \
    int tContig = THTensor_(isContiguous)(t); \
    TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = CFUNC(*t_data);, THRESHOLD); \
    THTensor_(propagate_names_if_named_tensor_enabled)(r_, t); \
  }

#define LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC) \
  LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD)

#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, THRESHOLD) \
  void THTensor_(NAME)(THTensor *r_, THTensor *t) \
  { \
    THTensor_(resizeAs)(r_, t); \
    ptrdiff_t r_Size = THTensor_(nElement)(r_); \
    int r_Contig = THTensor_(isContiguous)(r_); \
    int tContig = THTensor_(isContiguous)(t); \
    if (r_Contig && tContig) { \
      TH_TENSOR_APPLY2_CONTIG(scalar_t, r_, scalar_t, t, THVector_(NAME)(r__data, t_data, r__len);); \
    } else { \
      TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = CFUNC(*t_data);, THRESHOLD); \
    } \
    THTensor_(propagate_names_if_named_tensor_enabled)(r_, t); \
  }

#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC) \
  LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD)

#define EXPAND(...) __VA_ARGS__

#define GET_4TH_ARG(ARG0, ARG1, ARG2, ARG3, ...) ARG3

#define LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE(...) \
  EXPAND(GET_4TH_ARG(__VA_ARGS__, LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS, LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS, ))

#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_CHOOSE(...) \
  EXPAND(GET_4TH_ARG(__VA_ARGS__, LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS, LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS, ))

#define LAB_IMPLEMENT_BASIC_FUNCTION(...) EXPAND(LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE(__VA_ARGS__)(__VA_ARGS__))

#define LAB_IMPLEMENT_VECTORIZED_FUNCTION(...) EXPAND(LAB_IMPLEMENT_VECTORIZED_FUNCTION_CHOOSE(__VA_ARGS__)(__VA_ARGS__))

/*
 * LAB_IMPLEMENT_BASIC_FUNCTION is a macro with optional parameters, so it can be used flexibly.
 * The macro will discard the invalid threshold if parallelization is unavailable.
 * The macro will supply a default threshold if none is passed.
 * In other words:
 * (A) If parallelization is unavailable, both usages below are valid.
 *     (1) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity, OMP_OVERHEAD_THRESHOLD) // discards the unused threshold
 *     (2) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity)
 * (B) If parallelization is available, both usages below are also valid.
 *     (1) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity, OMP_OVERHEAD_THRESHOLD)
 *     (2) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity) // uses the default threshold
 * The same applies to LAB_IMPLEMENT_VECTORIZED_FUNCTION.
 */
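
Editorial note on the dispatch above: GET_4TH_ARG always yields its fourth argument, so the number of user-supplied arguments decides which implementation macro lands in that slot.

// Hypothetical expansion trace (names are placeholders, not real TH functions):
//   LAB_IMPLEMENT_BASIC_FUNCTION(foo, myfunc)
//     -> GET_4TH_ARG(foo, myfunc, ..._3_ARGS, ..._2_ARGS, ) -> LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS
//        (which forwards with the default UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD)
//   LAB_IMPLEMENT_BASIC_FUNCTION(foo, myfunc, SOME_THRESHOLD)
//     -> GET_4TH_ARG(foo, myfunc, SOME_THRESHOLD, ..._3_ARGS, ..._2_ARGS, ) -> LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS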

LAB_IMPLEMENT_BASIC_FUNCTION(neg,-)

#if defined(TH_REAL_IS_LONG)
LAB_IMPLEMENT_BASIC_FUNCTION(abs,std::abs)
#endif /* int64_t only part */

#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_CHAR)
LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs)
#endif /* int only part */

#if defined(TH_REAL_IS_BYTE)
LAB_IMPLEMENT_BASIC_FUNCTION(abs,)
#endif /* for byte, identity due to it being unsigned */

/* floating point only now */
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)

#if defined (TH_REAL_IS_FLOAT)
#define TH_MATH_NAME(fn) fn##f
#else
#define TH_MATH_NAME(fn) fn
#endif

LAB_IMPLEMENT_BASIC_FUNCTION(abs,TH_MATH_NAME(fabs))

LAB_IMPLEMENT_BASIC_FUNCTION(cosh,TH_MATH_NAME(cosh),HYPER_TH_OMP_OVERHEAD_THRESHOLD)
LAB_IMPLEMENT_BASIC_FUNCTION(tanh,TH_MATH_NAME(tanh),HYPER_TH_OMP_OVERHEAD_THRESHOLD)

void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, scalar_t minvalue, scalar_t maxvalue)
{
  if (nbins <= 0) {
    THError("bins must be > 0");
  }
  scalar_t minval;
  scalar_t maxval;
  scalar_t *h_data;

  THTensor_(resize1d)(hist, nbins);
  THTensor_wrap(hist).zero_();
  minval = minvalue;
  maxval = maxvalue;
  if (minval == maxval)
  {
    minval = THTensor_wrap(tensor).min().item<scalar_t>();
    maxval = THTensor_wrap(tensor).max().item<scalar_t>();
  }
  if (minval == maxval)
  {
    minval = minval - 1;
    maxval = maxval + 1;
  }

  TORCH_CHECK(!(std::isinf(minval) || std::isinf(maxval) || std::isnan(minval) || std::isnan(maxval)), "range of [", minval, ", ", maxval, "] is not finite");
  TORCH_CHECK(minval < maxval, "max must be larger than min");

  h_data = hist->data<scalar_t>();

  TH_TENSOR_APPLY(scalar_t, tensor,
    if (*tensor_data >= minval && *tensor_data <= maxval) {
      const int bin = (int)((*tensor_data-minval) / (maxval-minval) * nbins);
      h_data[THMin(bin, nbins-1)] += 1;
    }
  );
}
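
A standalone sketch of the same binning rule (hypothetical, not part of the commit): values in [minval, maxval] land in bin floor((x - minval) / (maxval - minval) * nbins), clamped so that x == maxval falls into the last bin.

#include <algorithm>
#include <cstdint>
#include <vector>

inline std::vector<int64_t> histc_sketch(const std::vector<double>& xs, int64_t nbins,
                                         double minval, double maxval) {
  std::vector<int64_t> hist(static_cast<size_t>(nbins), 0);
  for (double x : xs) {
    if (x >= minval && x <= maxval) {
      int64_t bin = static_cast<int64_t>((x - minval) / (maxval - minval) * nbins);
      hist[static_cast<size_t>(std::min(bin, nbins - 1))] += 1;  // clamp the top edge
    }
  }
  return hist;
}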

#endif

#undef TH_MATH_NAME
#endif /* floating point only part */
#undef IS_NONZERO

#endif /* !defined(TH_REAL_IS_BOOL) */

#endif /* TH_GENERIC_FILE */

@@ -1,18 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THVector.h"
#else
#if !defined(TH_REAL_IS_BOOL) /* non bool only part */

TH_API void THVector_(neg)(scalar_t *y, const scalar_t *x, const ptrdiff_t n);

#endif /* non bool only part */

/* floating point only now */
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)

TH_API void THVector_(erfc)(scalar_t *y, const scalar_t *x, const ptrdiff_t n);
TH_API void THVector_(pow)(scalar_t *y, const scalar_t *x, const scalar_t c, const ptrdiff_t n);

#endif /* floating point only part */

#endif

@@ -1078,11 +1078,9 @@ aten_native_source_non_codegen_list = [
    "aten/src/ATen/native/sparse/SparseCsrTensor.cpp",
    "aten/src/ATen/native/sparse/SparseTensorMath.cpp",
    "aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp",
    "aten/src/TH/THBlas.cpp",
    "aten/src/TH/THGeneral.cpp",
    "aten/src/TH/THStorageFunctions.cpp",
    "aten/src/TH/THTensor.cpp",
    "aten/src/TH/THTensorMoreMath.cpp",
    "aten/src/ATen/native/utils/Factory.cpp",
    "aten/src/ATen/native/xnnpack/Activation.cpp",
    "aten/src/ATen/native/xnnpack/ChannelShuffle.cpp",