TH: Clean up dead code (#60655)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60655

Test Plan: Imported from OSS

Reviewed By: albanD

Differential Revision: D29371717

Pulled By: ngimel

fbshipit-source-id: faa71b1d4a15450c78e12aa917daec853057bce9
This commit is contained in:
Peter Bell 2021-06-24 19:39:36 -07:00 committed by Facebook GitHub Bot
parent 4a7d281119
commit 42c8439b6e
25 changed files with 1 addition and 2266 deletions

View File

@@ -332,11 +332,9 @@ filegroup(
filegroup(
name = "th_srcs",
srcs = [
"aten/src/TH/THBlas.cpp",
"aten/src/TH/THGeneral.cpp",
"aten/src/TH/THStorageFunctions.cpp",
"aten/src/TH/THTensor.cpp",
"aten/src/TH/THTensorMoreMath.cpp",
],
)
@@ -546,10 +544,6 @@ header_template_rule(
src = "aten/src/TH/THGeneral.h.in",
out = "aten/src/TH/THGeneral.h",
substitutions = {
"#cmakedefine USE_BLAS": "#define USE_BLAS",
"#cmakedefine USE_LAPACK": "#define USE_LAPACK",
"#cmakedefine BLAS_F2C": "/* #undef BLAS_F2C */",
"#cmakedefine BLAS_USE_CBLAS_DOT": "#define BLAS_USE_CBLAS_DOT",
},
)

View File

@@ -1,15 +1,12 @@
set(Aten_TH_AVX_extra_src)
set(hdr
THGeneral.h THHalf.h THStorage.h THStorageFunctions.h THTensor.h THTensorApply.h THBlas.h
THVector.h )
THGeneral.h THHalf.h THStorage.h THStorageFunctions.h THTensor.h)
set(ATen_TH_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/THGeneral.cpp
${CMAKE_CURRENT_SOURCE_DIR}/THStorageFunctions.cpp
${CMAKE_CURRENT_SOURCE_DIR}/THTensor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/THTensorMoreMath.cpp
${CMAKE_CURRENT_SOURCE_DIR}/THBlas.cpp
)
# Remember that PARENT_SCOPE variables are not in the current scope
set(ATen_TH_SRCS ${ATen_TH_SRCS} PARENT_SCOPE)
@@ -36,7 +33,6 @@ configure_file(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h")
install(FILES
TH.h
THBlas.h
${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h
THGenerateAllTypes.h
THGenerateBFloat16Type.h
@@ -62,17 +58,12 @@ install(FILES
THStorage.h
THStorageFunctions.h
THTensor.h
THTensorApply.h
THTensorDimApply.h
THVector.h
THHalf.h
THTensor.hpp
THStorageFunctions.hpp
DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH")
install(FILES
generic/THBlas.cpp
generic/THBlas.h
generic/THStorage.cpp
generic/THStorage.h
generic/THStorageCopy.cpp
@@ -80,8 +71,5 @@ install(FILES
generic/THTensor.cpp
generic/THTensor.h
generic/THTensor.hpp
generic/THTensorMath.h
generic/THVector.h
# See Note [TH abstraction violation]
generic/THTensorFastGetSet.hpp
DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH/generic")

View File

@@ -3,11 +3,7 @@
#include <TH/THGeneral.h>
#include <TH/THBlas.h>
#include <TH/THVector.h>
#include <TH/THStorageFunctions.h>
#include <TH/THTensor.h>
#include <TH/THTensorApply.h>
#include <TH/THTensorDimApply.h>
#endif

View File

@@ -1,13 +0,0 @@
#include <TH/THBlas.h>
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THBlas.cpp>
#include <TH/THGenerateAllTypes.h>
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THBlas.cpp>
#include <TH/THGenerateBFloat16Type.h>
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THBlas.cpp>
#include <TH/THGenerateHalfType.h>

View File

@@ -1,17 +0,0 @@
#ifndef TH_BLAS_INC
#define TH_BLAS_INC
#include <TH/THGeneral.h>
#define THBlas_(NAME) TH_CONCAT_4(TH,Real,Blas_,NAME)
#include <TH/generic/THBlas.h>
#include <TH/THGenerateAllTypes.h>
#include <TH/generic/THBlas.h>
#include <TH/THGenerateBFloat16Type.h>
#include <TH/generic/THBlas.h>
#include <TH/THGenerateHalfType.h>
#endif

View File

@@ -68,21 +68,6 @@ void _THAssertionFailed(const char *file, const int line, const char *exp, const
_THError(file, line, "Assertion `%s' failed. %s", exp, msg);
}
void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data)
{
threadErrorHandler = new_handler;
threadErrorHandlerData = data;
}
void THSetDefaultErrorHandler(THErrorHandlerFunction new_handler, void *data)
{
if (new_handler)
defaultErrorHandler = new_handler;
else
defaultErrorHandler = defaultErrorHandlerFunction;
defaultErrorHandlerData = data;
}
/* Torch Arg Checking Handling */
static void defaultArgErrorHandlerFunction(int argNumber, const char *msg, void *data)
{
@@ -125,42 +110,6 @@ void _THArgCheck(const char *file, int line, int condition, int argNumber, const
}
}
void THSetArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data)
{
threadArgErrorHandler = new_handler;
threadArgErrorHandlerData = data;
}
void THSetDefaultArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data)
{
if (new_handler)
defaultArgErrorHandler = new_handler;
else
defaultArgErrorHandler = defaultArgErrorHandlerFunction;
defaultArgErrorHandlerData = data;
}
// NOLINTNEXTLINE(modernize-use-nullptr,cppcoreguidelines-avoid-non-const-global-variables)
static __thread void (*torchGCFunction)(void *data) = NULL;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
static __thread void *torchGCData;
/* Optional hook for integrating with a garbage-collected frontend.
*
* If torch is running with a garbage-collected frontend (e.g. Lua),
* the GC isn't aware of TH-allocated memory so may not know when it
* needs to run. These hooks trigger the GC to run in two cases:
*
* (1) When a memory allocation (malloc, realloc, ...) fails
* (2) When the total TH-allocated memory hits a dynamically-adjusted
* soft maximum.
*/
void THSetGCHandler( void (*torchGCFunction_)(void *data), void *data )
{
torchGCFunction = torchGCFunction_;
torchGCData = data;
}
void* THAlloc(ptrdiff_t size)
{
if(size < 0)
@@ -169,63 +118,7 @@ void* THAlloc(ptrdiff_t size)
return c10::alloc_cpu(size);
}
void* THRealloc(void *ptr, ptrdiff_t size)
{
if(!ptr)
return(THAlloc(size));
if(size == 0)
{
THFree(ptr);
// NOLINTNEXTLINE(modernize-use-nullptr)
return NULL;
}
if(size < 0)
THError("$ Torch: invalid memory size -- maybe an overflow?");
// NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
void *newptr = realloc(ptr, size);
if(!newptr && torchGCFunction) {
torchGCFunction(torchGCData);
// NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
newptr = realloc(ptr, size);
}
if(!newptr)
THError("$ Torch: not enough memory: you tried to reallocate %dGB. Buy new RAM!", size/1073741824);
return newptr;
}
void THFree(void *ptr)
{
c10::free_cpu(ptr);
}
THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim) {
const int L = TH_DESC_BUFF_LEN;
THDescBuff buf;
char *str = buf.str;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t i;
int64_t n = 0;
n += snprintf(str, L-n, "[");
for (i = 0; i < ndim; i++) {
if (n >= L) break;
n += snprintf(str+n, L-n, "%" PRId64, size[i]);
if (i < ndim-1) {
n += snprintf(str+n, L-n, " x ");
}
}
if (n < L - 2) {
snprintf(str+n, L-n, "]");
} else {
snprintf(str+L-5, 5, "...]");
}
return buf;
}
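
The removed THRealloc/THSetGCHandler pair implemented a simple pattern for garbage-collected frontends: if an allocation fails and a GC hook is registered, trigger the hook once and retry before raising an error. A minimal standalone C++ sketch of that pattern (illustrative names, not a surviving PyTorch API):

#include <cstdlib>
#include <new>

using GCHook = void (*)(void* data);
static thread_local GCHook g_gc_hook = nullptr;  // analogous to the removed torchGCFunction
static thread_local void* g_gc_data = nullptr;

void setGCHook(GCHook hook, void* data) {        // analogous to the removed THSetGCHandler
  g_gc_hook = hook;
  g_gc_data = data;
}

void* reallocWithGCRetry(void* ptr, std::size_t size) {
  void* p = std::realloc(ptr, size);
  if (!p && g_gc_hook) {
    g_gc_hook(g_gc_data);                        // let the frontend GC release memory
    p = std::realloc(ptr, size);                 // then retry once
  }
  if (!p && size != 0) {
    throw std::bad_alloc();                      // the removed code called THError here
  }
  return p;
}

The error-handler setters removed alongside it (THSetErrorHandler, THSetDefaultErrorHandler, and their Arg variants) followed the same store-a-thread-local-callback shape.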

View File

@@ -21,11 +21,6 @@
#include <mkl_vsl.h>
#endif
#cmakedefine USE_BLAS
#cmakedefine USE_LAPACK
#cmakedefine BLAS_F2C
#cmakedefine BLAS_USE_CBLAS_DOT
# define TH_EXTERNC extern "C"
// Note(jiayq): copied from ATen/core/Macros.h. Because internal build of TH
@@ -72,26 +67,12 @@
typedef void (*THErrorHandlerFunction)(const char *msg, void *data);
typedef void (*THArgErrorHandlerFunction)(int argNumber, const char *msg, void *data);
#define TH_DESC_BUFF_LEN 64
typedef struct {
char str[TH_DESC_BUFF_LEN];
} THDescBuff;
TH_API THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim);
TH_API TH_NO_RETURN void _THError(const char *file, const int line, const char *fmt, ...);
TH_API void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...);
TH_API void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data);
TH_API void THSetDefaultErrorHandler(THErrorHandlerFunction new_handler, void *data);
TH_API void _THArgCheck(const char *file, int line, int condition, int argNumber, const char *fmt, ...);
TH_API void THSetArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data);
TH_API void THSetDefaultArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data);
TH_API void* THAlloc(ptrdiff_t size);
TH_API void* THRealloc(void *ptr, ptrdiff_t size);
TH_API void THFree(void *ptr);
TH_API void THSetGCHandler( void (*torchGCHandlerFunction)(void *data), void *data );
// this hook should only be called by custom allocator functions
TH_API void THHeapUpdate(ptrdiff_t size);
#define THError(...) _THError(__FILE__, __LINE__, __VA_ARGS__)

View File

@@ -36,42 +36,3 @@ void THTensor_setStorage(THTensor *self, THStorage *storage_, ptrdiff_t storageO
c10::raw::intrusive_ptr::incref(storage_);
THTensor_wrap(self).set_(at::Storage(c10::intrusive_ptr<at::StorageImpl>::reclaim(storage_)), storageOffset_, size_, stride_);
}
void THTensor_resize(THTensor *self, at::IntArrayRef size, at::IntArrayRef stride)
{
if (stride.data()) {
THArgCheck(stride.size() == size.size(), 3, "invalid stride");
}
#ifdef DEBUG
THAssert(size.size() <= INT_MAX);
#endif
THTensor_resizeNd(self, size.size(), size.data(), stride.data());
}
void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, const int64_t *stride)
{
TORCH_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative");
at::IntArrayRef sizes(size, nDimension);
at::optional<at::IntArrayRef> strides;
if (stride) {
strides = at::IntArrayRef(stride, nDimension);
}
at::native::resize_impl_cpu_(self, sizes, strides);
}
// NB: Steals ownership of storage
void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) {
// Caffe2 might have tensors whose storages are null, but we
// don't allow it in PyTorch.
AT_ASSERT(storage);
// We used to allow this, but this breaks device caching.
// Let's put an actual error message for this one.
TORCH_CHECK(tensor->storage().device() == storage->device(),
"Attempted to set the storage of a tensor on device \"", tensor->storage().device(),
"\" to a storage on different device \"", storage->device(),
"\". This is no longer allowed; the devices must match.");
tensor->set_storage_keep_dtype(
at::Storage(c10::intrusive_ptr<THStorage>::reclaim(storage)));
}

View File

@@ -2,7 +2,6 @@
#define TH_TENSOR_INC
#include <TH/THStorageFunctions.h>
#include <TH/THTensorApply.h>
#define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME)
@@ -21,20 +20,4 @@
#include <TH/generic/THTensor.h>
#include <TH/THGenerateBFloat16Type.h>
/* maths */
#include <TH/generic/THTensorMath.h>
#include <TH/THGenerateAllTypes.h>
#include <TH/generic/THTensorMath.h>
#include <TH/THGenerateBoolType.h>
#include <TH/generic/THTensorMath.h>
#include <TH/THGenerateHalfType.h>
#include <TH/generic/THTensorMath.h>
#include <TH/THGenerateBFloat16Type.h>
#include <TH/generic/THTensorMath.h>
#include <TH/THGenerateComplexTypes.h>
#endif

View File

@@ -82,14 +82,6 @@ inline int64_t THTensor_sizeLegacyNoScalars(const THTensor *self, int dim)
return self->dim() == 0 ? 1 : self->size(dim);
}
#include <TH/generic/THTensorFastGetSet.hpp>
#include <TH/THGenerateAllTypes.h>
#include <TH/generic/THTensorFastGetSet.hpp>
#include <TH/THGenerateComplexTypes.h>
#include <TH/generic/THTensorFastGetSet.hpp>
#include <TH/THGenerateBFloat16Type.h>
inline std::vector<int64_t> THTensor_sizesLegacyNoScalars(const THTensor *self) {
if (self->dim() == 0) {
@@ -98,20 +90,7 @@ inline std::vector<int64_t> THTensor_sizesLegacyNoScalars(const THTensor *self)
return self->sizes().vec();
}
}
inline std::vector<int64_t> THTensor_stridesLegacyNoScalars(const THTensor *self) {
if (self->dim() == 0) {
return {1};
} else {
return self->strides().vec();
}
}
// NB: Steals ownership of storage
TH_API void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage);
TH_API void THTensor_free(THTensor *self);
TH_API void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, const int64_t *stride);
TH_CPP_API void THTensor_resize(THTensor *self, at::IntArrayRef size, at::IntArrayRef stride);
TH_CPP_API void THTensor_setStorage(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, at::IntArrayRef size_, at::IntArrayRef stride_);

View File

@@ -1,309 +0,0 @@
#ifndef TH_TENSOR_APPLY_INC
#define TH_TENSOR_APPLY_INC
#include <ATen/Parallel.h>
/*
* The basic strategy for apply is as follows:
*
* 1. Starting with the outermost index, loop until we reach a dimension where the
* data is no longer contiguous, i.e. the stride at that dimension is not equal to
* the size of the tensor defined by the outer dimensions. Let's call this outer
* (contiguous) tensor A. Note that if the Tensor is contiguous, then A is equal
* to the entire Tensor. Let's call the inner tensor B.
*
* 2. We loop through the indices in B, starting at its outermost dimension. For
* example, if B is a 2x2 matrix, then we do:
*
* B[0][0]
* B[0][1]
* B[1][0]
* B[1][1]
*
* We set the offset into the underlying storage as (storageOffset + stride_B * index_B),
* i.e. basically we compute the offset into the storage as we would normally for a
* Tensor. But because we are guaranteed the subsequent data is contiguous in memory, we
* can simply loop for sizeof(A) iterations and perform the operation, without having to
* follow the order described by the strides of A.
*
* 3. As an optimization, we merge dimensions of A that are contiguous in memory. For
* example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, then the first two
* dimensions can be merged for the purposes of APPLY, reducing the number of nested
* loops.
*/
#define __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, ALLOW_CONTIGUOUS) \
TYPE *TENSOR##_data = NULL; \
int64_t *TENSOR##_counter = NULL, *TENSOR##_sizes = NULL, *TENSOR##_strides = NULL, *TENSOR##_dimOffset = NULL; \
int64_t TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i, TENSOR##_n; \
int TENSOR##_contiguous = ALLOW_CONTIGUOUS && DIM < 0; \
TENSOR##_n = 1; \
for(TENSOR##_i = 0; TENSOR##_i < TENSOR->dim(); TENSOR##_i++) \
TENSOR##_n *= TENSOR->size(TENSOR##_i); \
\
if(TENSOR->is_empty()) \
TH_TENSOR_APPLY_hasFinished = 1; \
else \
{ \
TENSOR##_data = THTensor_getStoragePtr(TENSOR)->data<TYPE>()+TENSOR->storage_offset(); \
TENSOR##_size = 1; \
TENSOR##_stride = 1; \
for(TENSOR##_i = THTensor_nDimensionLegacyAll(TENSOR)-1; TENSOR##_i >= 0; TENSOR##_i--) { \
if(THTensor_sizeLegacyNoScalars(TENSOR, TENSOR##_i) != 1) { \
if(THTensor_strideLegacyNoScalars(TENSOR, TENSOR##_i) == TENSOR##_size && TENSOR##_i != DIM) \
TENSOR##_size *= THTensor_sizeLegacyNoScalars(TENSOR, TENSOR##_i); \
else{ \
TENSOR##_contiguous = 0; \
break; \
} \
} \
} \
if (!TENSOR##_contiguous) { \
/* Find the dimension of contiguous sections */ \
TENSOR##_dim = 1; \
for(TENSOR##_i = THTensor_nDimensionLegacyAll(TENSOR)-2; TENSOR##_i >= 0; TENSOR##_i--) \
{ \
if(TENSOR->stride(TENSOR##_i) != TENSOR->stride(TENSOR##_i+1) * TENSOR->size(TENSOR##_i+1) || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \
TENSOR##_dim++; \
} \
/* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \
TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*(3*TENSOR##_dim)); \
TENSOR##_sizes = TENSOR##_counter + TENSOR##_dim; \
TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \
TH_TENSOR_dim_index = TENSOR##_dim-1; \
TENSOR##_dimOffset = (DIM == THTensor_nDimensionLegacyAll(TENSOR)-1) ? &TENSOR##_i : &TENSOR##_counter[DIM]; \
TENSOR##_sizes[TH_TENSOR_dim_index] = THTensor_sizeLegacyNoScalars(TENSOR, THTensor_nDimensionLegacyAll(TENSOR)-1); \
TENSOR##_strides[TH_TENSOR_dim_index] = THTensor_strideLegacyNoScalars(TENSOR, THTensor_nDimensionLegacyAll(TENSOR)-1); \
/* TENSOR##_counter tracks where we are in the storage. The offset into the */ \
/* storage is given by storage_offset + (i * j), where i is the stride */ \
/* vector and j is tensor_counter vector. This sets the starting position for the loop. */ \
for(TENSOR##_i = TENSOR##_dim-1; TENSOR##_i >= 0; --TENSOR##_i) { \
TENSOR##_counter[TENSOR##_i] = 0; \
} \
for(TENSOR##_i = THTensor_nDimensionLegacyAll(TENSOR)-2; TENSOR##_i >= 0; --TENSOR##_i) { \
if (TENSOR->stride(TENSOR##_i) == TENSOR->stride(TENSOR##_i+1) * TENSOR->size(TENSOR##_i+1) && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \
TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR##_i) * TENSOR##_sizes[TH_TENSOR_dim_index]; \
if (DIM != THTensor_nDimensionLegacyAll(TENSOR)-1 && TENSOR##_i < DIM) \
TENSOR##_dimOffset--; \
} else { \
--TH_TENSOR_dim_index; \
TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR##_i); \
TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride(TENSOR##_i); \
} \
} \
/* Size of the inner most section */ \
TENSOR##_size = TENSOR##_sizes[TENSOR##_dim-1]; \
/* Stride of the inner most section */ \
TENSOR##_stride = TENSOR##_strides[TENSOR##_dim-1]; \
} \
else{\
TENSOR##_dim = 1;\
TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*3);\
TENSOR##_sizes = TENSOR##_counter + 1;\
TENSOR##_strides = TENSOR##_counter + 2;\
TENSOR##_sizes[0] = TENSOR##_n;\
TENSOR##_strides[0] = 1;\
TENSOR##_size = TENSOR##_sizes[0];\
TENSOR##_stride = TENSOR##_strides[0];\
}\
} \
TENSOR##_i = 0;
#define __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, ALWAYS_UPDATE) \
if(TENSOR##_i == TENSOR##_size || ALWAYS_UPDATE) \
{ \
if(TENSOR##_contiguous) \
break; \
\
if(TENSOR##_dim == 1) \
break; \
\
/* Reset pointer to beginning of loop */ \
TENSOR##_data -= TENSOR##_size*TENSOR##_stride; \
for(TENSOR##_i = TENSOR##_dim-2; TENSOR##_i >= 0; TENSOR##_i--) \
{ \
TENSOR##_counter[TENSOR##_i]++; \
/* Jump ahread by the stride of this dimension */ \
TENSOR##_data += TENSOR##_strides[TENSOR##_i]; \
\
if(TENSOR##_counter[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]) \
{ \
if(TENSOR##_i == 0) \
{ \
TH_TENSOR_APPLY_hasFinished = 1; \
break; \
} \
else \
{ \
/* Reset the pointer to the beginning of the chunk defined by this dimension */ \
TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR##_strides[TENSOR##_i]; \
TENSOR##_counter[TENSOR##_i] = 0; \
} \
} \
else \
break; \
} \
TENSOR##_i = 0; \
} \
#define TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIM, CODE) \
{ \
int TH_TENSOR_APPLY_hasFinished = 0; \
int64_t TH_TENSOR_dim_index = 0; \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, DIM, 1) \
\
int elements_equal = 1; \
if(TENSOR1##_n != TENSOR2##_n) { \
elements_equal = 0; \
} \
else if(TENSOR1##_n != TENSOR3##_n) { \
elements_equal = 0; \
} \
if (elements_equal == 0) { \
AT_ERROR("inconsistent tensor size, expected ", \
#TENSOR1, " ", TENSOR1->sizes(), ", ", \
#TENSOR2, " ", TENSOR2->sizes(), " and ", \
#TENSOR3, " ", TENSOR3->sizes(), " to have the same " \
"number of elements, but got ", TENSOR1##_n, ", ", \
TENSOR2##_n, " and ", TENSOR3##_n, " elements respectively"); \
} \
\
while(!TH_TENSOR_APPLY_hasFinished) \
{ \
/* Loop through the inner most region of the Tensor */ \
for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 et pas TENSOR##_dim! */ \
{ \
CODE \
} \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR3, 0) \
} \
if(TENSOR1##_counter != NULL) \
THFree(TENSOR1##_counter); \
if(TENSOR2##_counter != NULL) \
THFree(TENSOR2##_counter); \
if(TENSOR3##_counter != NULL) \
THFree(TENSOR3##_counter); \
}
#define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, -1, CODE)
#define TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, DIM, CODE) \
{ \
int TH_TENSOR_APPLY_hasFinished = 0; \
int64_t TH_TENSOR_dim_index = 0; \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \
\
if(TENSOR1##_n != TENSOR2##_n) { \
AT_ERROR("inconsistent tensor size, expected ", \
#TENSOR1, " ", TENSOR1->sizes(), " and ", \
#TENSOR2, " ", TENSOR2->sizes(), \
" to have the same number of elements, but got ", \
TENSOR1##_n, " and ", TENSOR2##_n, " elements respectively"); \
} \
while(!TH_TENSOR_APPLY_hasFinished) \
{ \
/* Loop through the inner most region of the Tensor */ \
for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! */ \
{ \
CODE \
} \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \
} \
if(TENSOR1##_counter != NULL) \
THFree(TENSOR1##_counter); \
if(TENSOR2##_counter != NULL) \
THFree(TENSOR2##_counter); \
}
#define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, -1, CODE)
#define TH_TENSOR_APPLY_D(TYPE, TENSOR, DIM, CODE) \
{ \
int TH_TENSOR_APPLY_hasFinished = 0; \
int64_t TH_TENSOR_dim_index = 0; \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, 0) \
\
while(!TH_TENSOR_APPLY_hasFinished) \
{ \
/* Loop through the inner most region of the Tensor */ \
for(; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 et pas TENSOR##_dim! */ \
{ \
CODE \
} \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, 1) \
} \
THFree(TENSOR##_counter); \
}
#define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \
TH_TENSOR_APPLY_D(TYPE, TENSOR, -1, CODE)
/*
* Calcuate the memory offset of an element in a tensor. The strategy is below:
*
* 1. convert the line index(the index of the element) to the indexs(coordinates) in the tensor.
* It can hinted by a classical problem: Getting each individual digit from a whole integer(Decimal base).
* A N-digit decimal base number could be view as a N-dimension tensor and the sizes of the tensor are 10.
* So the value the whole integer is the line index. And the digits could be viewed as the indexes in
* different dimensions.
*
* 2. convert the indexs(coordinates) in the tensor to the memory offset.
*
* You can get the detailes in the for-statement iterations.
*
* The macro is only used in the first element in each thread. For the rest, the memory offset could update
* according to info of the tensor in order to get better performance. So we should also record the each
* indexs in coresponding dimension of first element.
* The recorded info is stored in the TENSOR##_counter_tmp.
*
*/
#define __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR) \
int64_t *TENSOR##_counter_tmp = (int64_t*)THAlloc(sizeof(int64_t) * TENSOR##_dim); \
ptrdiff_t TENSOR##_memory_offset = 0; \
ptrdiff_t TENSOR##_quot = line_index_start; \
for (TENSOR##_i = TENSOR##_dim-1; TENSOR##_i>=0; --TENSOR##_i) { \
TENSOR##_counter_tmp[TENSOR##_i] = TENSOR##_quot%TENSOR##_sizes[TENSOR##_i]; \
TENSOR##_quot /= TENSOR##_sizes[TENSOR##_i]; \
TENSOR##_memory_offset += TENSOR##_counter_tmp[TENSOR##_i] * TENSOR##_strides[TENSOR##_i]; \
}
/*
* The macro update the indexes in each dimension of the elements except for the first one allocated in
* each thread.
* For a tensor, if the index of some dimension reaches the size of the corresponding dimension. It will carry and clear.
* If the index of next high dimension does do, the index of next high dimension should carry and clear, too.
*
* The momery offset calculatation is a little confusing. If current index carries, the current index is set to 0. So
* the offset should decrease by size*stride of the last dimension. Then the index next high dimension increases by 1. So
* the offset should increase by stride of next high dimension.
*/
#define __TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR) \
if(TENSOR##_i == TENSOR##_size && TENSOR##_dim > 1){ /*reaches the edge*/ \
int TENSOR##_carry_coord = 1; /*set carry flag to true*/ \
TENSOR##_start = 0; /*the current index be cleared to 0*/\
TENSOR##_data -= TENSOR##_size * TENSOR##_stride; /*the momery offset reset to the first one in current dimension */\
for(TENSOR##_i = TENSOR##_dim - 2; (TENSOR##_i >= 0) && (TENSOR##_carry_coord); TENSOR##_i--){ \
TENSOR##_counter_tmp[TENSOR##_i]++; /*the index of next high dimension update*/ \
TENSOR##_data += TENSOR##_strides[TENSOR##_i]; /*memory offset increase by stride of next high dimension*/\
if(TENSOR##_counter_tmp[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]){ /*The next high dimension also carry, continue
to clear and carry*/ \
TENSOR##_data -= TENSOR##_sizes[TENSOR##_i] * TENSOR##_strides[TENSOR##_i]; \
TENSOR##_counter_tmp[TENSOR##_i] = 0; \
} else { \
TENSOR##_carry_coord = 0; \
} \
} \
} else { \
TENSOR##_start = TENSOR##_i; \
}
#endif
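
The long comment at the top of the removed THTensorApply.h explains the apply strategy in prose: split the tensor into an outer, strided part and an inner, contiguous block, then iterate the block with a flat loop. A minimal standalone C++ sketch of that strategy (illustrative names and a plain function instead of the removed macros):

#include <cstdint>
#include <vector>

// Visit every element of a strided tensor, collapsing the trailing
// contiguous dimensions into one flat run (the "inner tensor B" from the
// removed comment) and walking the remaining outer dimensions with an
// odometer-style counter.
template <typename T, typename Fn>
void apply_strided(T* data,
                   const std::vector<int64_t>& sizes,
                   const std::vector<int64_t>& strides,
                   Fn fn) {
  const int64_t ndim = static_cast<int64_t>(sizes.size());
  for (int64_t s : sizes)
    if (s == 0) return;  // empty tensor: nothing to visit

  // 1. How many trailing dims form one contiguous block?
  int64_t block = 1;
  int64_t outer = ndim - 1;
  for (; outer >= 0; --outer) {
    if (strides[outer] != block) break;
    block *= sizes[outer];
  }

  // 2. Odometer over the outer dims; flat loop over the contiguous block.
  std::vector<int64_t> counter(outer + 1, 0);
  while (true) {
    int64_t offset = 0;
    for (int64_t d = 0; d <= outer; ++d) offset += counter[d] * strides[d];
    for (int64_t i = 0; i < block; ++i) fn(data[offset + i]);

    int64_t d = outer;
    for (; d >= 0; --d) {            // carry into the next outer dimension
      if (++counter[d] < sizes[d]) break;
      counter[d] = 0;
    }
    if (d < 0) break;                // all outer indices wrapped around: done
  }
}

The removed macros additionally merge adjacent outer dimensions that happen to be contiguous (step 3 of the comment), which this sketch leaves out.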

View File

@@ -1,329 +0,0 @@
#ifndef TH_TENSOR_DIM_APPLY_INC
#define TH_TENSOR_DIM_APPLY_INC
// This is an example of SIZE_CHECK argument passable to TH_TENSOR_DIM_APPLY3.
// The TENSOR1, TENSOR2, TENSOR3, DIMENSION will be expanded the same way as
// TH_TENSOR_DIM_APPLY3.
// Specifically, this check ensures that TENSOR1, TENSOR2, TENSOR3 have same
// size except for DIMENSION.
#define TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \
{ \
int shape_check_flag = 0; \
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
{ \
if (TH_TENSOR_DIM_APPLY_i == DIMENSION) \
continue; \
if (TENSOR1->size(TH_TENSOR_DIM_APPLY_i) != TENSOR2->size(TH_TENSOR_DIM_APPLY_i)) { \
shape_check_flag = 1; \
break; \
} \
if(TENSOR1->size(TH_TENSOR_DIM_APPLY_i) != TENSOR3->size(TH_TENSOR_DIM_APPLY_i)) { \
shape_check_flag = 1; \
break; \
} \
} \
if (shape_check_flag == 1) { \
AT_ERROR("Expected ", #TENSOR1, " ", TENSOR1->sizes(), ", ", #TENSOR2, " ", TENSOR2->sizes(), " and ", #TENSOR3, " ", TENSOR3->sizes(), " to have the same size apart from dimension ", DIMENSION); \
} \
}
#define TH_TENSOR_DIM_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIMENSION, SIZE_CHECK, CODE) \
{ \
TYPE1 *TENSOR1##_data = NULL; \
TH_UNUSED int64_t TENSOR1##_stride = 0, TENSOR1##_size = 0; \
TYPE2 *TENSOR2##_data = NULL; \
TH_UNUSED int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \
TYPE3 *TENSOR3##_data = NULL; \
TH_UNUSED int64_t TENSOR3##_stride = 0, TENSOR3##_size = 0; \
int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \
int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \
int TH_TENSOR_DIM_APPLY_i; \
\
if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \
THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyNoScalars(TENSOR1)); \
int same_dims = 1; \
if( THTensor_nDimensionLegacyNoScalars(TENSOR1) != THTensor_nDimensionLegacyNoScalars(TENSOR2) ) { \
same_dims = 0; \
} \
if( THTensor_nDimensionLegacyNoScalars(TENSOR1) != THTensor_nDimensionLegacyNoScalars(TENSOR3) ) { \
same_dims = 0; \
} \
if (same_dims == 0) { \
AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), ", ", #TENSOR2, " ", TENSOR2->sizes(), " and ", #TENSOR3, " ",TENSOR3->sizes() , " to have the same number of dimensions"); \
} \
SIZE_CHECK(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \
\
if (TH_TENSOR_DIM_APPLY_hasFinished) { \
return; \
} \
TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
\
TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data<TYPE1>()+(TENSOR1)->storage_offset(); \
TENSOR1##_stride = THTensor_strideLegacyNoScalars((TENSOR1), DIMENSION); \
TENSOR1##_size = THTensor_sizeLegacyNoScalars((TENSOR1), DIMENSION); \
\
TENSOR2##_data = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+(TENSOR2)->storage_offset(); \
TENSOR2##_stride = THTensor_strideLegacyNoScalars((TENSOR2), DIMENSION); \
TENSOR2##_size = THTensor_sizeLegacyNoScalars((TENSOR2), DIMENSION); \
\
TENSOR3##_data = THTensor_getStoragePtr(TENSOR3)->data<TYPE3>()+(TENSOR3)->storage_offset(); \
TENSOR3##_stride = THTensor_strideLegacyNoScalars((TENSOR3), DIMENSION); \
TENSOR3##_size = THTensor_sizeLegacyNoScalars((TENSOR3), DIMENSION); \
\
while(!TH_TENSOR_DIM_APPLY_hasFinished) \
{ \
CODE \
\
if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \
break; \
\
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
{ \
if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
{ \
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \
{ \
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
break; \
} \
continue; \
} \
\
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \
TENSOR1##_data += THTensor_strideLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i); \
TENSOR2##_data += THTensor_strideLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i); \
TENSOR3##_data += THTensor_strideLegacyNoScalars(TENSOR3, TH_TENSOR_DIM_APPLY_i); \
\
if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == THTensor_sizeLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i)) \
{ \
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \
{ \
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
break; \
} \
else \
{ \
TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i); \
TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i); \
TENSOR3##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR3, TH_TENSOR_DIM_APPLY_i); \
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
} \
} \
else \
break; \
} \
} \
THFree(TH_TENSOR_DIM_APPLY_counter); \
}
/**
* Similar to DIM_APPLY(...) but we maintain two sets of pointers: one for the first tensor
* and one for the second. The two tensors must have the same shape, other than at the
* specified DIMENSION. This function makes it easy to store the output from reducing the
* TENSOR at index. For example, in the sum example described below, we could instead do:
*
* int64_t i = 0;
* TYPE1 sum;
*
* for (i = 0; i < TENSOR1##_size; ++i) {
* sum += TENSOR1##_data[i * TENSOR1##_stride]
* }
* *TENSOR2##_data = (TYPE2) sum;
*
* In particular, we guarantee that the offset into TENSOR2 will be what you would get if
* you applied all of the index values used to generate the offset into TENSOR1.
*/
#define TH_TENSOR_DIM_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, DIMENSION, CODE) \
{ \
TYPE1 *TENSOR1##_data = NULL; \
TH_UNUSED int64_t TENSOR1##_stride = 0, TENSOR1##_size = 0; \
TYPE2 *TENSOR2##_data = NULL; \
TH_UNUSED int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \
int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \
int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \
int TH_TENSOR_DIM_APPLY_i; \
\
if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \
THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyAll(TENSOR1)); \
if( THTensor_nDimensionLegacyNoScalars(TENSOR1) != THTensor_nDimensionLegacyNoScalars(TENSOR2)) { \
AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same number of dimensions"); \
} \
TH_UNUSED int shape_check_flag = 0; \
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
{ \
if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
continue; \
if(THTensor_sizeLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i) != THTensor_sizeLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i)) { \
AT_ERROR("Expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same size in dimension ", DIMENSION); \
} \
} \
\
if (TH_TENSOR_DIM_APPLY_hasFinished) { \
return; \
} \
TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
\
TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data<TYPE1>()+(TENSOR1)->storage_offset(); \
TENSOR1##_stride = THTensor_strideLegacyNoScalars((TENSOR1), DIMENSION); \
TENSOR1##_size = THTensor_sizeLegacyNoScalars(TENSOR1, DIMENSION); \
\
TENSOR2##_data = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+(TENSOR2)->storage_offset(); \
TENSOR2##_stride = THTensor_strideLegacyNoScalars((TENSOR2), DIMENSION); \
TENSOR2##_size = THTensor_sizeLegacyNoScalars(TENSOR2, DIMENSION); \
\
while(!TH_TENSOR_DIM_APPLY_hasFinished) \
{ \
CODE \
\
if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \
break; \
\
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \
{ \
if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
{ \
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \
{ \
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
break; \
} \
continue; \
} \
\
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \
TENSOR1##_data += THTensor_strideLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i); \
TENSOR2##_data += THTensor_strideLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i); \
\
if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == THTensor_sizeLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i)) \
{ \
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \
{ \
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
break; \
} \
else \
{ \
TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i); \
TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i); \
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
} \
} \
else \
break; \
} \
} \
THFree(TH_TENSOR_DIM_APPLY_counter); \
}
/**
* The basic idea for DIM_APPLY: Given a TENSOR and a DIMENSION, provide access to the data stored
* at all sets of dimension values other than DIMENSION, such that we can get all the values at those
* fixed indices for the various values at DIMENSION.
*
* Suppose we have a 2x3x4 Tensor A, and we have DIMENSION=2. Then we will hit CODE (2x3) times, and the
* pointer into storage will be at:
*
* A[0][0]
* A[0][1]
* A[0][2]
* A[1][0]
* A[1][1]
* A[1][2]
*
* And at each point, we can access the data for each of the four elements of the Tensor via
* TENSOR##_stride. So for example, if we wanted to sum the elements there, we could do:
*
* int64_t i = 0;
* TYPE sum;
* for (i = 0; i < TENSOR##_size; i++) {
* sum += TENSOR##_data[i * TENSOR##_stride]
* }
*
* Note that we don't have to have DIMENSION be the last tensor. If we have DIMENSION=1, then we will hit the
* code (2x4) times, with pointer into the storage at:
*
* offset +
* stride_0 * 0 + stride_2 * 0
* stride_0 * 1 + stride_2 * 0
* stride_0 * 0 + stride_2 * 1
* stride_0 * 1 + stride_2 * 1
* stride_0 * 0 + stride_2 * 2
* stride_0 * 1 + stride_2 * 2
* stride_0 * 0 + stride_2 * 3
* stride_0 * 1 + stride_2 * 3
*
* So we can again sum over the values at DIMENSION with the other indices fixed.
*/
#define TH_TENSOR_DIM_APPLY(TYPE, TENSOR, DIMENSION, CODE) \
{ \
TYPE *TENSOR##_data = NULL; \
int64_t TENSOR##_stride = 0, TENSOR##_size = 0; \
int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \
int TH_TENSOR_DIM_APPLY_hasFinished = 0; \
int TH_TENSOR_DIM_APPLY_i; \
\
if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyAll(TENSOR)) ) \
THError("invalid dimension"); \
\
TENSOR##_data = THTensor_getStoragePtr(TENSOR)->data<TYPE>()+(TENSOR)->storage_offset(); \
TENSOR##_stride = THTensor_strideLegacyNoScalars((TENSOR), DIMENSION); \
TENSOR##_size = THTensor_sizeLegacyNoScalars(TENSOR, DIMENSION); \
/* Counter stores the indices into the Tensor at any time */ \
TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyAll(TENSOR))); \
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyAll(TENSOR); TH_TENSOR_DIM_APPLY_i++) \
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
\
while(!TH_TENSOR_DIM_APPLY_hasFinished) \
{ \
CODE \
\
if(THTensor_nDimensionLegacyAll(TENSOR) == 1) \
break; \
\
for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyAll(TENSOR); TH_TENSOR_DIM_APPLY_i++) \
{ \
/* Check if the index is equal to DIMENSION. We don't need to update the */ \
/* offset if this is the case, and can consider the next index. However, */ \
/* in the case that the DIMENSION is the last index in the Tensor, then */ \
/* we have parsed the entire tensor and can exit */ \
if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
{ \
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyAll(TENSOR)-1) \
{ \
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
break; \
} \
continue; \
} \
\
/* Bump the counter at this index, update the pointer */ \
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \
TENSOR##_data += THTensor_strideLegacyNoScalars(TENSOR, TH_TENSOR_DIM_APPLY_i); \
\
if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == THTensor_sizeLegacyNoScalars(TENSOR, TH_TENSOR_DIM_APPLY_i)) \
{ \
/* Handled TENSOR_size(dim) iterations for DIM_APPLY_i. If this is the last dimension, exit */ \
if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyAll(TENSOR)-1) \
{ \
TH_TENSOR_DIM_APPLY_hasFinished = 1; \
break; \
} \
else \
{ \
/* Reset the counter, and the pointer to the beginning of the storage for this combination of indices */ \
TENSOR##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*THTensor_strideLegacyNoScalars(TENSOR, TH_TENSOR_DIM_APPLY_i); \
TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
} \
} \
else \
break; \
} \
} \
THFree(TH_TENSOR_DIM_APPLY_counter); \
}
#endif
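
The doc comments in the removed THTensorDimApply.h describe the DIM_APPLY contract: for every combination of indices in the dimensions other than DIMENSION, expose a base pointer plus the size and stride along DIMENSION so the CODE block can, for example, reduce along it. A standalone C++ sketch of that contract, using the sum example from the removed comment (illustrative names, single element type, no error checking):

#include <cstdint>
#include <vector>

// Sum `in` along `dim` into `out`, where `out` has size 1 along `dim` and
// matches `in` in every other dimension (as the removed macro required).
void sum_along_dim(const float* in, float* out,
                   const std::vector<int64_t>& sizes,
                   const std::vector<int64_t>& in_strides,
                   const std::vector<int64_t>& out_strides,
                   int64_t dim) {
  const int64_t ndim = static_cast<int64_t>(sizes.size());
  for (int64_t s : sizes)
    if (s == 0) return;  // keep the sketch simple for empty tensors

  std::vector<int64_t> counter(ndim, 0);  // indices of all dims except `dim`
  while (true) {
    // Base offsets for the current combination of non-`dim` indices.
    int64_t in_off = 0, out_off = 0;
    for (int64_t d = 0; d < ndim; ++d) {
      if (d == dim) continue;
      in_off += counter[d] * in_strides[d];
      out_off += counter[d] * out_strides[d];
    }

    // This plays the role of the CODE block: reduce along `dim`.
    float sum = 0.0f;
    for (int64_t i = 0; i < sizes[dim]; ++i)
      sum += in[in_off + i * in_strides[dim]];
    out[out_off] = sum;

    // Advance the counter over the non-`dim` dimensions.
    int64_t d = ndim - 1;
    for (; d >= 0; --d) {
      if (d == dim) continue;
      if (++counter[d] < sizes[d]) break;
      counter[d] = 0;
    }
    if (d < 0) break;
  }
}

TH_TENSOR_DIM_APPLY3 extended the same walk to a third tensor whose sizes matched outside DIMENSION.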

View File

@@ -1,20 +0,0 @@
#include <TH/THTensor.hpp>
#include <TH/THVector.h>
#include <TH/THBlas.h>
#include <TH/THTensorDimApply.h>
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THTensorMoreMath.cpp>
#include <TH/THGenerateAllTypes.h>
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THTensorMoreMath.cpp>
#include <TH/THGenerateBoolType.h>
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THTensorMoreMath.cpp>
#include <TH/THGenerateBFloat16Type.h>
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <TH/generic/THTensorMoreMath.cpp>
#include <TH/THGenerateHalfType.h>

View File

@@ -1,24 +0,0 @@
#ifndef TH_VECTOR_INC
#define TH_VECTOR_INC
#include <TH/THGeneral.h>
#define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME)
/* We are going to use dynamic dispatch, and want only to generate declarations
* of the vector functions */
#include <TH/generic/THVector.h>
#include <TH/THGenerateAllTypes.h>
#include <TH/generic/THVector.h>
#include <TH/THGenerateHalfType.h>
#include <TH/generic/THVector.h>
#include <TH/THGenerateBoolType.h>
#include <TH/generic/THVector.h>
#include <TH/THGenerateBFloat16Type.h>
#include <TH/generic/THVector.h>
#include <TH/THGenerateComplexTypes.h>
#endif // TH_VECTOR_INC

View File

@@ -1,48 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THBlas.cpp"
#else
#ifdef BLAS_F2C
# define ffloat double
#else
# define ffloat float
#endif
TH_EXTERNC void dswap_(int *n, double *x, int *incx, double *y, int *incy);
TH_EXTERNC void sswap_(int *n, float *x, int *incx, float *y, int *incy);
void THBlas_(swap)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy)
{
if(n == 1)
{
incx = 1;
incy = 1;
}
#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
{
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
#if defined(TH_REAL_IS_DOUBLE)
dswap_(&i_n, x, &i_incx, y, &i_incy);
#else
sswap_(&i_n, x, &i_incx, y, &i_incy);
#endif
return;
}
#endif
{
int64_t i;
for(i = 0; i < n; i++)
{
scalar_t z = x[i*incx];
x[i*incx] = y[i*incy];
y[i*incy] = z;
}
}
}
#endif
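
The removed THBlas_(swap) shows the pattern TH used for its BLAS wrappers: call the Fortran routine when n and the increments fit in an int, otherwise fall back to a scalar loop. The same pattern expressed against the standard CBLAS interface (a sketch that assumes a CBLAS-providing BLAS is linked; not an API introduced by this commit):

#include <cblas.h>
#include <climits>
#include <cstdint>

void swap_double(int64_t n, double* x, int64_t incx, double* y, int64_t incy) {
  if (n <= INT_MAX && incx <= INT_MAX && incy <= INT_MAX) {
    cblas_dswap(static_cast<int>(n), x, static_cast<int>(incx),
                y, static_cast<int>(incy));
    return;
  }
  // Fallback mirrors the removed scalar loop for sizes/strides beyond int range.
  for (int64_t i = 0; i < n; i++) {
    double z = x[i * incx];
    x[i * incx] = y[i * incy];
    y[i * incy] = z;
  }
}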

View File

@@ -1,8 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THBlas.h"
#else
/* Level 1 */
TH_API void THBlas_(swap)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy);
#endif

View File

@@ -8,50 +8,6 @@
#include <ATen/NamedTensorUtils.h>
#include <ATen/MemoryOverlap.h>
/**** access methods ****/
THStorage *THTensor_(storage)(const THTensor *self)
{
return THTensor_getStoragePtr(self);
}
ptrdiff_t THTensor_(storageOffset)(const THTensor *self)
{
return self->storage_offset();
}
int THTensor_(nDimension)(const THTensor *self)
{
return THTensor_nDimension(self);
}
int THTensor_(nDimensionLegacyNoScalars)(const THTensor *self)
{
return THTensor_nDimensionLegacyNoScalars(self);
}
int THTensor_(nDimensionLegacyAll)(const THTensor *self)
{
return THTensor_nDimensionLegacyAll(self);
}
int64_t THTensor_(size)(const THTensor *self, int dim)
{
THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor",
dim, THTensor_(nDimensionLegacyNoScalars)(self));
return self->size(dim);
}
int64_t THTensor_(stride)(const THTensor *self, int dim)
{
THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor",
dim, THTensor_(nDimensionLegacyNoScalars)(self));
return self->stride(dim);
}
scalar_t *THTensor_(data)(const THTensor *self) {
return self->data<scalar_t>();
}
/**** creation methods ****/
/* Empty init */
@@ -69,12 +25,6 @@ THTensor *THTensor_(new)(void)
.release();
}
/* Pointer-copy init */
THTensor *THTensor_(newWithTensor)(THTensor *tensor)
{
return at::native::alias(THTensor_wrap(tensor)).unsafeReleaseTensorImpl();
}
THTensor *THTensor_(newWithStorage1d)(THStorage *storage, ptrdiff_t storageOffset,
int64_t size0, int64_t stride0)
{
@@ -94,442 +44,14 @@ THTensor *THTensor_(newWithStorage1d)(THStorage *storage, ptrdiff_t storageOffse
return self;
}
THTensor *THTensor_(newWithSize1d)(int64_t size0)
{
THStorage *new_storage = THStorage_(new)();
THTensor* self =
c10::make_intrusive<at::TensorImpl, at::UndefinedTensorImpl>(
c10::intrusive_ptr<at::StorageImpl>::reclaim(new_storage),
at::DispatchKey::CPU,
caffe2::TypeMeta::Make<scalar_t>())
.release();
THTensor_(setStorage)(self, new_storage, 0, {size0}, {});
return self;
}
THTensor *THTensor_(newClone)(THTensor *self)
{
// already available in Aten as at::clone()
THTensor *tensor = THTensor_(new)();
at::Tensor tensor_wrap = THTensor_wrap(tensor);
at::Tensor self_wrap = THTensor_wrap(self);
tensor_wrap.resize_as_(self_wrap);
at::native::copy_(tensor_wrap, self_wrap, false);
return tensor;
}
THTensor *THTensor_(newContiguous)(THTensor *self)
{
if(!THTensor_(isContiguous)(self))
return THTensor_(newClone)(self);
else
{
THTensor_(retain)(self);
return self;
}
}
THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, int64_t sliceIndex_)
{
THTensor *self = THTensor_(newWithTensor)(tensor);
THTensor_(select)(self, NULL, dimension_, sliceIndex_);
return self;
}
THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_)
{
THTensor *self = THTensor_(newWithTensor)(tensor);
THTensor_(narrow)(self, NULL, dimension_, firstIndex_, size_);
return self;
}
THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_)
{
THTensor *self = THTensor_(newWithTensor)(tensor);
THTensor_(transpose)(self, NULL, dimension1_, dimension2_);
return self;
}
/* Resize */
void THTensor_(resize)(THTensor *self, at::IntArrayRef size, at::IntArrayRef stride)
{
return THTensor_resize(self, size, stride);
}
void THTensor_(resizeAs)(THTensor *self, THTensor *src)
{
// already available in Aten as at::resize_as_()
if(!THTensor_(isSameSizeAs)(self, src))
THTensor_(resizeNd)(self, src->dim(), THTensor_getSizePtr(src), NULL);
}
void THTensor_(resize0d)(THTensor *tensor)
{
THTensor_(resizeNd)(tensor, 0, {}, nullptr);
}
void THTensor_(resize1d)(THTensor *tensor, int64_t size0)
{
int64_t size[1] = {size0};
THTensor_(resizeNd)(tensor, 1, size, nullptr);
}
void THTensor_(resize2d)(THTensor *tensor, int64_t size0, int64_t size1)
{
int64_t size[2] = {size0, size1};
THTensor_(resizeNd)(tensor, 2, size, nullptr);
}
void THTensor_(resize3d)(THTensor *tensor, int64_t size0, int64_t size1, int64_t size2)
{
int64_t size[3] = {size0, size1, size2};
THTensor_(resizeNd)(tensor, 3, size, nullptr);
}
void THTensor_(resize4d)(THTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3)
{
int64_t size[4] = {size0, size1, size2, size3};
THTensor_(resizeNd)(self, 4, size, nullptr);
}
void THTensor_(resize5d)(THTensor *self, int64_t size0, int64_t size1, int64_t size2, int64_t size3, int64_t size4)
{
int64_t size[5] = {size0, size1, size2, size3, size4};
THTensor_(resizeNd)(self, 5, size, nullptr);
}
void THTensor_(set)(THTensor *self, THTensor *src)
{
if(self != src)
THTensor_(setStorage)(self,
THTensor_getStoragePtr(src),
src->storage_offset(),
src->sizes(),
src->strides());
}
void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, at::IntArrayRef size_, at::IntArrayRef stride_)
{
THTensor_setStorage(self, storage_, storageOffset_, size_, stride_);
}
void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t firstIndex, int64_t size)
{
if(!src)
src = self;
THArgCheck( (dimension >= 0) && (dimension < src->dim()), 2, "out of range");
THArgCheck( firstIndex >= 0, 3, "out of range");
THArgCheck( size >= 0, 4, "out of range");
THArgCheck(firstIndex <= src->size(dimension) - size, 4, "out of range");
THTensor_(set)(self, src);
if (firstIndex > 0) {
self->set_storage_offset(self->storage_offset() + firstIndex*self->stride(dimension));
}
self->set_size(dimension, size);
}
void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sliceIndex)
{
int d;
if(!src)
src = self;
THArgCheck(src->dim() > 0, 1, "cannot select on a 0-dim tensor");
THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range");
THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 3, "out of range");
THTensor_(set)(self, src);
THTensor_(narrow)(self, NULL, dimension, sliceIndex, 1);
at::DimVector newSize(static_cast<size_t>(self->dim()-1));
at::DimVector newStride(static_cast<size_t>(self->dim()-1));
for (d = 0; d < dimension; d++)
{
newSize[d] = self->size(d);
newStride[d] = self->stride(d);
}
for(d = dimension; d < self->dim()-1; d++)
{
newSize[d] = self->size(d+1);
newStride[d] = self->stride(d+1);
}
self->set_sizes_and_strides(newSize, newStride);
}
void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dimension2)
{
int64_t z;
if(!src)
src = self;
THArgCheck( (dimension1 >= 0) && (dimension1 < THTensor_nDimensionLegacyNoScalars(src)), 1, "out of range");
THArgCheck( (dimension2 >= 0) && (dimension2 < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range");
THTensor_(set)(self, src);
if(dimension1 == dimension2)
return;
z = self->stride(dimension1);
self->set_stride(dimension1, self->stride(dimension2));
self->set_stride(dimension2, z);
z = self->size(dimension1);
self->set_size(dimension1, self->size(dimension2));
self->set_size(dimension2, z);
}
void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension)
{
int d;
if(!src)
src = self;
THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "dimension out of range");
THTensor_(set)(self, src);
if(src->size(dimension) == 1)
{
at::DimVector newSize(static_cast<size_t>(self->dim() - 1));
at::DimVector newStride(static_cast<size_t>(self->dim() - 1));
for (d = 0; d < dimension; d++)
{
newSize[d] = self->size(d);
newStride[d] = self->stride(d);
}
for(d = dimension; d < self->dim()-1; d++)
{
newSize[d] = self->size(d+1);
newStride[d] = self->stride(d+1);
}
self->set_sizes_and_strides(newSize, newStride);
}
}
void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension)
{
int d;
if(!src)
src = self;
THArgCheck((dimension >= 0) && (dimension <= src->dim()), 2, "dimension out of range");
THTensor_(set)(self, src);
at::DimVector newSize(static_cast<size_t>(/* size */ self->dim()+1));
at::DimVector newStride(static_cast<size_t>(/* size */ self->dim()+1));
for(d = self->dim(); d > dimension; d--)
{
newSize[d] = self->size(d-1);
newStride[d] = self->stride(d-1);
}
if (dimension < self->dim())
{
newStride[dimension] = self->size(dimension) * self->stride(dimension);
}
else
{
newStride[dimension] = 1;
}
newSize[dimension] = 1;
for(d = dimension - 1; d >= 0; d--)
{
newSize[d] = self->size(d);
newStride[d] = self->stride(d);
}
self->set_sizes_and_strides(newSize, newStride);
}
int THTensor_(isTransposed)(const THTensor *self)
{
if (THTensor_(isContiguous)(self)) {
return 0;
}
int64_t max_stride = 1;
int64_t size_max_stride = 1;
int64_t z = 1;
int d;
for (d = 0; d < self->dim(); ++d) {
if (self->stride(d) == 0 && self->size(d) != 1)
return 0;
if (self->stride(d) > max_stride) {
max_stride = self->stride(d);
size_max_stride = self->size(d);
}
z *= self->size(d);
}
if (z == max_stride * size_max_stride) {
return 1;
}
return 0;
}
int THTensor_(isContiguous)(const THTensor *self)
{
return self->is_contiguous();
}
int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor* src)
{
int d;
if (self->dim() != src->dim())
return 0;
for(d = 0; d < self->dim(); ++d)
{
if(self->size(d) != src->size(d))
return 0;
}
return 1;
}
ptrdiff_t THTensor_(nElement)(const THTensor *self)
{
if(THTensor_nDimensionLegacyAll(self) == 0)
return 0;
else
{
ptrdiff_t nElement = 1;
int d;
for(d = 0; d < THTensor_nDimension(self); d++)
nElement *= self->size(d);
return nElement;
}
}
// NB: It is INVALID to call this on an UndefinedTensorImpl
void THTensor_(retain)(THTensor *self)
{
c10::raw::intrusive_ptr::incref(self);
}
void THTensor_(free)(THTensor *self)
{
THTensor_free(self);
}
void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst)
{
if(self != dst) {
at::Tensor dst_wrap = THTensor_wrap(dst);
at::Tensor self_wrap = THTensor_wrap(self);
at::native::copy_(dst_wrap, self_wrap, false);
}
THTensor_(free)(self);
}
/*******************************************************************************/
void THTensor_(resizeNd)(THTensor *self, int nDimension, const int64_t *size, const int64_t *stride)
{
return THTensor_resizeNd(self, nDimension, size, stride);
}
void THTensor_(set0d)(THTensor *tensor, scalar_t value)
{
THArgCheck(THTensor_nDimension(tensor) == 0, 1, "tensor must have no dimensions");
THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset(), value);
}
scalar_t THTensor_(get0d)(const THTensor *tensor)
{
THArgCheck(THTensor_nDimension(tensor) == 0, 1, "tensor must have no dimensions");
return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset());
}
void THTensor_(set1d)(THTensor *tensor, int64_t x0, scalar_t value)
{
THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension");
THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range");
THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*THTensor_strideLegacyNoScalars(tensor, 0), value);
}
scalar_t THTensor_(get1d)(const THTensor *tensor, int64_t x0)
{
THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension");
THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range");
return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*THTensor_strideLegacyNoScalars(tensor, 0));
}
void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, scalar_t value)
{
THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 2, 1, "tensor must have two dimensions");
THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range");
THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1), value);
}
scalar_t THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1)
{
THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 2, 1, "tensor must have two dimensions");
THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range");
return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1));
}
void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, scalar_t value)
{
THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 3, 1, "tensor must have three dimensions");
THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range");
THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value);
}
scalar_t THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2)
{
THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 3, 1, "tensor must have three dimensions");
THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range");
return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2));
}
void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, scalar_t value)
{
THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 4, 1, "tensor must have four dimensions");
THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range");
THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value);
}
scalar_t THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3)
{
THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 4, 1, "tensor must have four dimensions");
THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range");
return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3));
}
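/* A minimal standalone sketch (not part of the TH sources) of the indexing
   arithmetic shared by the slow get/set accessors above: an N-d element lives at
   storage_offset + sum over d of x_d * stride_d in the underlying storage.
   element_offset is an illustrative name, not a TH API. */
#include <cstdint>
#include <vector>

static inline int64_t element_offset(int64_t storage_offset,
                                     const std::vector<int64_t>& index,
                                     const std::vector<int64_t>& strides) {
  int64_t offset = storage_offset;
  for (size_t d = 0; d < index.size(); ++d) {
    offset += index[d] * strides[d];  // same term as x0*stride(0), x1*stride(1), ...
  }
  return offset;
}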
THDescBuff THTensor_(desc)(const THTensor *tensor) {
const int L = TH_DESC_BUFF_LEN;
THDescBuff buf;
char *str = buf.str;
int n = 0;
#define _stringify(x) #x
n += snprintf(str, L-n, "torch." _stringify(x) "Tensor of size ");
#undef _stringify
int i;
for(i = 0; i < THTensor_nDimension(tensor); i++) {
if(n >= L) break;
n += snprintf(str+n, L-n, "%" PRId64, tensor->size(i));
if(i < THTensor_nDimension(tensor)-1) {
n += snprintf(str+n, L-n, "x");
}
}
if(n >= L) {
snprintf(str+L-4, 4, "...");
}
return buf;
}
THDescBuff THTensor_(sizeDesc)(const THTensor *tensor) {
THDescBuff buf = _THSizeDesc(tensor->sizes().data(), tensor->sizes().size());
return buf;
}
#endif

View File

@ -55,24 +55,12 @@ TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int
// This is especially likely to happen when the tensor is not contiguous. In general, if you still need the
// values, unless you are doing some size and stride tricks, do not use resize*.
TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, const int64_t *size, const int64_t *stride);
TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src);
TH_API void THTensor_(resize0d)(THTensor *tensor);
TH_API void THTensor_(resize1d)(THTensor *tensor, int64_t size0_);
TH_API void THTensor_(resize2d)(THTensor *tensor, int64_t size0_, int64_t size1_);
TH_API void THTensor_(resize3d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_);
TH_API void THTensor_(resize4d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_);
TH_API void THTensor_(resize5d)(THTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_, int64_t size4_);
// Note: these are legacy resize functions that treat sizes as size->size() == 0 and size->data<int64_t>() as being 0-terminated.
TH_API void THTensor_(set)(THTensor *self, THTensor *src);
TH_API void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension_, int64_t firstIndex_, int64_t size_);
TH_API void THTensor_(select)(THTensor *self, THTensor *src, int dimension_, int64_t sliceIndex_);
TH_API void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1_, int dimension2_);
TH_API int THTensor_(isTransposed)(const THTensor *self);
TH_API void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension_);
TH_API void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension_);
TH_API int THTensor_(isContiguous)(const THTensor *self);
TH_API int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor *src);
@ -80,23 +68,5 @@ TH_API ptrdiff_t THTensor_(nElement)(const THTensor *self);
TH_API void THTensor_(retain)(THTensor *self);
TH_API void THTensor_(free)(THTensor *self);
TH_API void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst);
/* Slow access methods [check everything] */
TH_API void THTensor_(set0d)(THTensor *tensor, scalar_t value);
TH_API void THTensor_(set1d)(THTensor *tensor, int64_t x0, scalar_t value);
TH_API void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, scalar_t value);
TH_API void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, scalar_t value);
TH_API void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, scalar_t value);
TH_API scalar_t THTensor_(get0d)(const THTensor *tensor);
TH_API scalar_t THTensor_(get1d)(const THTensor *tensor, int64_t x0);
TH_API scalar_t THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1);
TH_API scalar_t THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2);
TH_API scalar_t THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3);
/* Debug methods */
TH_API THDescBuff THTensor_(desc)(const THTensor *tensor);
TH_API THDescBuff THTensor_(sizeDesc)(const THTensor *tensor);
#endif

View File

@ -11,6 +11,4 @@
TH_CPP_API void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
at::IntArrayRef size_, at::IntArrayRef stride_);
TH_CPP_API void THTensor_(resize)(THTensor *self, at::IntArrayRef size, at::IntArrayRef stride);
#endif

View File

@ -1,369 +0,0 @@
#include <TH/THTensorApply.h>
#ifndef NAN
#define NAN (nan(NULL))
#endif
#define HYPER_TH_OMP_OVERHEAD_THRESHOLD (at::internal::GRAIN_SIZE / 16)
#define ORDIN_TH_OMP_OVERHEAD_THRESHOLD (at::internal::GRAIN_SIZE / 4)
#define UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD (at::internal::GRAIN_SIZE / 2)
#define TH_OMP_OVERHEAD_THRESHOLD (at::internal::GRAIN_SIZE)
#define TH_CHECK_SAME_SIZE(TENSOR1, TENSOR2) \
{ \
if (!THTensor_(isSameSizeAs)(TENSOR1, TENSOR2)) { \
AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same size"); \
} \
}
// Used for `scatter` and `scatterAdd`
// Assumes TENSOR1 is index
// TENSOR2 is real
// TENSOR3 is src
// Tests:
// 1. index->size(d) <= src->size(d) for all d
// 2. index->size(d) <= real->size(d) for all d != dim
#define TH_TENSOR_DIM_APPLY3_SIZE_SCATTER(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \
{ \
int shape_check_flag = 0; \
for (TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyAll(TENSOR2); TH_TENSOR_DIM_APPLY_i++) \
{ \
int64_t TENSOR1##_dim_size = THTensor_sizeLegacyNoScalars(TENSOR1, TH_TENSOR_DIM_APPLY_i); \
if (TH_TENSOR_DIM_APPLY_i != DIMENSION) { \
if (TENSOR1##_dim_size > THTensor_sizeLegacyNoScalars(TENSOR2, TH_TENSOR_DIM_APPLY_i)) { \
shape_check_flag = 1; \
break; \
} \
} \
if (TENSOR1##_dim_size > THTensor_sizeLegacyNoScalars(TENSOR3, TH_TENSOR_DIM_APPLY_i)) { \
shape_check_flag = 1; \
break; \
} \
} \
if (shape_check_flag == 1) { \
AT_ERROR("Expected ", #TENSOR1, " ", TENSOR1->sizes(), " to be smaller size than ", #TENSOR3, " ", TENSOR3->sizes(), " and to be smaller than ", #TENSOR2, " ", TENSOR2->sizes(), " apart from dimension ", DIMENSION); \
} \
}
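/* A minimal standalone sketch (not part of the TH sources) of the two shape checks
   the macro above performs for scatter/scatterAdd, assuming plain std::vector<int64_t>
   shapes. check_scatter_shapes is an illustrative name, not a TH API.
     1. index.size(d) <= src.size(d)  for every d
     2. index.size(d) <= real.size(d) for every d != dim */
#include <cstdint>
#include <stdexcept>
#include <vector>

static void check_scatter_shapes(const std::vector<int64_t>& index,
                                 const std::vector<int64_t>& real,
                                 const std::vector<int64_t>& src,
                                 size_t dim) {
  for (size_t d = 0; d < real.size(); ++d) {
    if (d != dim && index[d] > real[d]) {
      throw std::runtime_error("index must not be larger than the destination outside of dim");
    }
    if (index[d] > src[d]) {
      throw std::runtime_error("index must not be larger than src in any dimension");
    }
  }
}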
#undef th_isnan
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
#define th_isnan(val) \
(std::isnan(val))
#else
#define th_isnan(val) (0)
#endif
#undef th_isnan_break
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
#define th_isnan_break(val) \
if (std::isnan(val)) break;
#else
#define th_isnan_break(val)
#endif
#if defined(__clang__)
#define PRAGMA(P) _Pragma(#P)
#define PRAGMA_IVDEP // Noop
#define PRAGMA_SIMD // Noop
#elif defined(_MSC_VER)
#define PRAGMA(P) __pragma(P)
# if _MSC_VER < 1920
// MSVC < 2019 doesn't support loop pragmas.
# define PRAGMA_IVDEP // Noop
# define PRAGMA_SIMD // Noop
# else
# define PRAGMA_IVDEP PRAGMA(loop(ivdep))
# define PRAGMA_SIMD PRAGMA(omp simd)
# endif
#else
#define PRAGMA(P) _Pragma(#P)
#define PRAGMA_IVDEP PRAGMA(ivdep)
#define PRAGMA_SIMD PRAGMA(simd)
#endif
#define TH_TENSOR_APPLY2_PARALLEL(SIZE, CONTIG1, CONTIG2, TYPE1, TENSOR1, TYPE2, TENSOR2, CODE, THRESHOLD) \
{ \
/* for advanced searching index*/ \
if (CONTIG1 && CONTIG2) { \
TYPE1 *rp = THTensor_getStoragePtr(TENSOR1)->data<TYPE1>()+TENSOR1->storage_offset(); \
TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+TENSOR2->storage_offset(); \
if (tp != (TYPE2*)rp) { \
at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \
PRAGMA_IVDEP \
for (auto iter = begin; iter < end; iter++) { \
TYPE2 *TENSOR2##_data = tp+iter; \
TYPE1 *TENSOR1##_data = rp+iter; \
CODE \
} \
}); \
} else { \
at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \
PRAGMA_SIMD \
for (auto iter = begin; iter < end; iter++) { \
TYPE2* TENSOR2##_data = tp+iter; \
TYPE1* TENSOR1##_data = rp+iter; \
CODE \
} \
}); \
} \
} else { \
/* The following strategy is not obvious, so here is an outline:
* 1. Collapse the tensors' dimensions to reduce the number of nested loops.
* 2. Compute the number of elements assigned to each thread and the linear index of its first element.
* 3. Compute the memory offset of that first element and its index in each dimension.
* 4. Iterate over the thread's elements, updating the per-dimension indexes along the way.
* (A standalone sketch of step 3 follows after this macro.)
*/ \
int TH_TENSOR_APPLY_hasFinished = 0; \
int64_t TH_TENSOR_dim_index = 0; \
/*step 1*/ \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, -1, 1) \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, -1, 1) \
if (0 == TH_TENSOR_APPLY_hasFinished) { \
auto TENSOR1##_i_local = TENSOR1##_i; \
auto TENSOR2##_i_local = TENSOR2##_i; \
auto TENSOR1##_data_local = TENSOR1##_data; \
auto TENSOR2##_data_local = TENSOR2##_data; \
at::parallel_for(0, SIZE, THRESHOLD, [&](int64_t begin, int64_t end) { \
auto TENSOR1##_i = TENSOR1##_i_local; \
auto TENSOR2##_i = TENSOR2##_i_local; \
auto TENSOR1##_data = TENSOR1##_data_local; \
auto TENSOR2##_data = TENSOR2##_data_local; \
/*step 2*/ \
ptrdiff_t line_index_start = begin; \
ptrdiff_t line_seg_length = (end - begin); \
/* step 3*/ \
__TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR2); \
__TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR1); \
TENSOR2##_data += TENSOR2##_memory_offset; \
TENSOR1##_data += TENSOR1##_memory_offset; \
ptrdiff_t count = 0; \
ptrdiff_t TENSOR2##_start = TENSOR2##_counter_tmp[TENSOR2##_dim-1]; \
ptrdiff_t TENSOR1##_start = TENSOR1##_counter_tmp[TENSOR1##_dim-1]; \
/* step 4*/ \
while (count < line_seg_length) { \
for (TENSOR2##_i=TENSOR2##_start, TENSOR1##_i = TENSOR1##_start; ((count < line_seg_length) && (TENSOR2##_i < TENSOR2##_size) && (TENSOR1##_i < TENSOR1##_size)); ++TENSOR2##_i, ++TENSOR1##_i, ++count) { \
CODE \
TENSOR2##_data += TENSOR2##_stride; \
TENSOR1##_data += TENSOR1##_stride; \
} \
if (count < line_seg_length) { \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR2); \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR1); \
} \
} \
if (TENSOR1##_counter_tmp != NULL) { \
THFree(TENSOR1##_counter_tmp); \
} \
if (TENSOR2##_counter_tmp != NULL) { \
THFree(TENSOR2##_counter_tmp); \
} \
}); \
} \
if (TENSOR2##_counter != NULL) { \
THFree(TENSOR2##_counter); \
} \
if (TENSOR1##_counter != NULL) { \
THFree(TENSOR1##_counter); \
} \
} \
}
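/* A minimal standalone sketch (not part of the TH sources) of step 3 above:
   convert the linear index of the first element a thread owns into a memory
   offset plus per-dimension counters, in the spirit of
   __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET. linear_to_offset is an illustrative name. */
#include <cstdint>
#include <vector>

static int64_t linear_to_offset(int64_t linear_index,
                                const std::vector<int64_t>& sizes,
                                const std::vector<int64_t>& strides,
                                std::vector<int64_t>& counters) {
  counters.assign(sizes.size(), 0);
  int64_t offset = 0;
  // Peel off the index in each dimension, innermost first.
  for (int64_t d = static_cast<int64_t>(sizes.size()) - 1; d >= 0; --d) {
    counters[d] = linear_index % sizes[d];
    offset += counters[d] * strides[d];
    linear_index /= sizes[d];
  }
  return offset;
}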
#define TH_TENSOR_APPLY3_PARALLEL(SIZE, CONTIG1, CONTIG2, CONTIG3, TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE, THRESHOLD) \
{ \
/* for advanced searching index */ \
if (CONTIG1 && CONTIG2 && CONTIG3) { \
TYPE1 *rp = THTensor_getStoragePtr(TENSOR1)->data<TYPE1>()+TENSOR1->storage_offset(); \
TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data<TYPE2>()+TENSOR2->storage_offset(); \
TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data<TYPE3>()+TENSOR3->storage_offset(); \
if (tp != (TYPE2*)rp) { \
at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \
PRAGMA_IVDEP \
for (auto iter = begin; iter < end; iter++) { \
TYPE1 *TENSOR1##_data = rp+iter; \
TYPE2 *TENSOR2##_data = tp+iter; \
TYPE3 *TENSOR3##_data = srcp+iter; \
CODE \
} \
}); \
} else { \
at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \
PRAGMA_SIMD \
for (auto iter = begin; iter < end; iter++) { \
TYPE1 *TENSOR1##_data = rp+iter; \
TYPE2 *TENSOR2##_data = tp+iter; \
TYPE3 *TENSOR3##_data = srcp+iter; \
CODE \
} \
}); \
} \
} else { \
int TH_TENSOR_APPLY_hasFinished = 0; \
int64_t TH_TENSOR_dim_index = 0; \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, -1, 1) \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, -1, 1) \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, -1, 1) \
if (0 == TH_TENSOR_APPLY_hasFinished) { \
auto TENSOR1##_i_local = TENSOR1##_i; \
auto TENSOR2##_i_local = TENSOR2##_i; \
auto TENSOR3##_i_local = TENSOR3##_i; \
auto TENSOR1##_data_local = TENSOR1##_data; \
auto TENSOR2##_data_local = TENSOR2##_data; \
auto TENSOR3##_data_local = TENSOR3##_data; \
at::parallel_for(0, SIZE, THRESHOLD, [&](int64_t begin, int64_t end) { \
auto TENSOR1##_i = TENSOR1##_i_local; \
auto TENSOR2##_i = TENSOR2##_i_local; \
auto TENSOR3##_i = TENSOR3##_i_local; \
auto TENSOR1##_data = TENSOR1##_data_local; \
auto TENSOR2##_data = TENSOR2##_data_local; \
auto TENSOR3##_data = TENSOR3##_data_local; \
ptrdiff_t line_index_start = begin; \
ptrdiff_t line_seg_length = (end - begin); \
__TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR1); \
__TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR2); \
__TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR3); \
TENSOR1##_data += TENSOR1##_memory_offset; \
TENSOR2##_data += TENSOR2##_memory_offset; \
TENSOR3##_data += TENSOR3##_memory_offset; \
ptrdiff_t count = 0; \
ptrdiff_t TENSOR1##_start = TENSOR1##_counter_tmp[TENSOR1##_dim - 1]; \
ptrdiff_t TENSOR2##_start = TENSOR2##_counter_tmp[TENSOR2##_dim - 1]; \
ptrdiff_t TENSOR3##_start = TENSOR3##_counter_tmp[TENSOR3##_dim - 1]; \
while (count < line_seg_length) { \
for (TENSOR1##_i=TENSOR1##_start, TENSOR2##_i=TENSOR2##_start,TENSOR3##_i=TENSOR3##_start; (count<line_seg_length)&&(TENSOR1##_i<TENSOR1##_size)&&(TENSOR2##_i<TENSOR2##_size)&&(TENSOR3##_i<TENSOR3##_size); ++TENSOR1##_i,++TENSOR2##_i,++TENSOR3##_i,++count) { \
CODE \
TENSOR1##_data += TENSOR1##_stride; \
TENSOR2##_data += TENSOR2##_stride; \
TENSOR3##_data += TENSOR3##_stride; \
} \
if (count < line_seg_length) { \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR1); \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR2); \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR3); \
} \
} \
if (TENSOR1##_counter_tmp != NULL) { \
THFree(TENSOR1##_counter_tmp); \
} \
if (TENSOR2##_counter_tmp != NULL) { \
THFree(TENSOR2##_counter_tmp); \
} \
if (TENSOR3##_counter_tmp != NULL) { \
THFree(TENSOR3##_counter_tmp); \
} \
}); \
} \
if (TENSOR1##_counter != NULL) { \
THFree(TENSOR1##_counter); \
} \
if (TENSOR2##_counter != NULL) { \
THFree(TENSOR2##_counter); \
} \
if (TENSOR3##_counter != NULL) { \
THFree(TENSOR3##_counter); \
} \
} \
}
#define TH_TENSOR_APPLY_REDUCTION_SUM_PARALLEL(TYPE, TENSOR, EXPR, OUTPUT, THRESHOLD) \
{ \
int TENSOR##Contig = THTensor_(isContiguous)(TENSOR); \
ptrdiff_t TENSOR##Size = THTensor_(nElement)(TENSOR); \
if (TENSOR##Contig) { \
TYPE *rp = THTensor_getStoragePtr(TENSOR)->data<TYPE>()+TENSOR->storage_offset(); \
OUTPUT = at::parallel_reduce(0, TENSOR##Size, (THRESHOLD * 10), (accreal)0, [&](int64_t begin, int64_t end, accreal ident)->accreal { \
accreal r = ident; \
for (auto iter = begin; iter < end; iter++) { \
TYPE *TENSOR##_data = rp+iter; \
r += (EXPR); \
} \
return r; \
}, std::plus<accreal>()); \
} else { \
int TH_TENSOR_APPLY_hasFinished = 0; \
int64_t TH_TENSOR_dim_index = 0; \
__TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, -1, 1); \
if (0 == TH_TENSOR_APPLY_hasFinished) { \
auto TENSOR##_data_local = TENSOR##_data; \
auto TENSOR##_i_local = TENSOR##_i; \
OUTPUT = at::parallel_reduce(0, TENSOR##Size, THRESHOLD, (accreal)0, [&](int64_t begin, int64_t end, accreal ident)->accreal { \
auto TENSOR##_data = TENSOR##_data_local; \
auto TENSOR##_i = TENSOR##_i_local; \
ptrdiff_t line_index_start = begin; \
ptrdiff_t line_seg_length = (end - begin); \
__TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR); \
TENSOR##_data += TENSOR##_memory_offset; \
ptrdiff_t count = 0; \
ptrdiff_t TENSOR##_start = TENSOR##_counter_tmp[TENSOR##_dim - 1]; \
accreal r = ident; \
while (count < line_seg_length) { \
for (TENSOR##_i=TENSOR##_start; (count < line_seg_length)&&(TENSOR##_i < TENSOR##_size); ++TENSOR##_i, ++count) { \
r += (EXPR); \
TENSOR##_data += TENSOR##_stride; \
} \
if (count < line_seg_length) { \
__TH_TENSOR_APPLYX_UPDATE_COUNTERS_PARALLEL(TENSOR); \
} \
} \
if (TENSOR##_counter_tmp != NULL) { \
THFree(TENSOR##_counter_tmp); \
} \
return r; \
}, std::plus<accreal>()); \
} \
if (TENSOR##_counter != NULL) { \
THFree(TENSOR##_counter); \
} \
} \
}
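/* A minimal standalone sketch (not part of the TH sources) of the contiguous branch
   above: summing a flat buffer with at::parallel_reduce, where each worker reduces
   its own [begin, end) slice and the partial sums are combined with std::plus.
   sum_contiguous is an illustrative name, not a TH/ATen API. */
#include <ATen/Parallel.h>
#include <cstdint>
#include <functional>

static double sum_contiguous(const float* data, int64_t n, int64_t grain_size) {
  return at::parallel_reduce(
      0, n, grain_size, /*ident=*/0.0,
      [data](int64_t begin, int64_t end, double partial) -> double {
        for (int64_t i = begin; i < end; ++i) {
          partial += data[i];  // per-thread partial sum over the assigned slice
        }
        return partial;
      },
      std::plus<double>());
}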
#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \
{ \
auto code_fn = [&](int64_t begin, int64_t end) { \
ptrdiff_t TENSOR##_len = end - begin; \
TYPE *TENSOR##_data = TENSOR->data<scalar_t>() + begin; \
CODE \
}; \
int in_parallel = at::in_parallel_region(); \
ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR); \
if (!in_parallel) { \
at::parallel_for(0, TH_TENSOR_size, TH_OMP_OVERHEAD_THRESHOLD, code_fn); \
} else { \
code_fn(0, TH_TENSOR_size); \
} \
}
#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
{ \
auto code_fn = [&](int64_t begin, int64_t end) { \
ptrdiff_t TENSOR1##_len = end - begin; \
TYPE1 *TENSOR1##_data = TENSOR1->data<scalar_t>() + begin; \
TYPE2 *TENSOR2##_data = TENSOR2->data<scalar_t>() + begin; \
CODE \
}; \
int in_parallel = at::in_parallel_region(); \
ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \
if (!in_parallel) { \
at::parallel_for(0, TH_TENSOR_size, TH_OMP_OVERHEAD_THRESHOLD, code_fn); \
} else { \
code_fn(0, TH_TENSOR_size); \
} \
}
#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
{ \
auto code_fn = [&](int64_t begin, int64_t end) { \
ptrdiff_t TENSOR1##_len = end - begin; \
TYPE1 *TENSOR1##_data = TENSOR1->data<scalar_t>() + begin; \
TYPE2 *TENSOR2##_data = TENSOR2->data<scalar_t>() + begin; \
TYPE3 *TENSOR3##_data = TENSOR3->data<scalar_t>() + begin; \
CODE \
}; \
int in_parallel = at::in_parallel_region(); \
ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \
if (!in_parallel) { \
at::parallel_for(0, TH_TENSOR_size, TH_OMP_OVERHEAD_THRESHOLD, code_fn); \
} else { \
code_fn(0, TH_TENSOR_size); \
} \
}
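/* A minimal standalone sketch (not part of the TH sources) of the dispatch the
   CONTIG macros above perform: parallelize over a contiguous buffer with
   at::parallel_for unless we are already inside a parallel region, in which case
   run the body serially to avoid nested parallelism. apply_contig is an
   illustrative name, not a TH API. */
#include <ATen/Parallel.h>
#include <cstdint>

template <typename scalar_t, typename Op>
static void apply_contig(scalar_t* data, int64_t n, int64_t grain_size, Op op) {
  auto body = [&](int64_t begin, int64_t end) {
    for (int64_t i = begin; i < end; ++i) {
      data[i] = op(data[i]);  // element-wise update of the contiguous slice
    }
  };
  if (!at::in_parallel_region()) {
    at::parallel_for(0, n, grain_size, body);
  } else {
    body(0, n);  // already inside a parallel region: process the whole range here
  }
}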

View File

@ -1,49 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THTensorFastGetSet.hpp"
#else
static inline scalar_t THTensor_(fastGetLegacy1dNoScalars)(THTensor *self, int64_t x0) {
return self->unsafe_data<scalar_t>()[x0*THTensor_strideLegacyNoScalars(self, 0)];
}
static inline scalar_t THTensor_(fastGet1d)(THTensor *self, int64_t x0) {
return self->unsafe_data<scalar_t>()[x0*self->stride(0)];
}
static inline scalar_t THTensor_(fastGet2d)(THTensor *self, int64_t x0, int64_t x1) {
return self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)];
}
static inline scalar_t THTensor_(fastGet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2) {
return self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)];
}
static inline scalar_t THTensor_(fastGet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3) {
return self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)];
}
static inline scalar_t THTensor_(fastGet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4) {
return self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)+(x4)*self->stride(4)];
}
static inline void THTensor_(fastSet1d)(THTensor *self, int64_t x0, scalar_t value) {
self->unsafe_data<scalar_t>()[x0*self->stride(0)] = value;
}
static inline void THTensor_(fastSet2d)(THTensor *self, int64_t x0, int64_t x1, scalar_t value) {
self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)] = value;
}
static inline void THTensor_(fastSet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, scalar_t value) {
self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)] = value;
}
static inline void THTensor_(fastSet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, scalar_t value) {
self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)] = value;
}
static inline void THTensor_(fastSet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4, scalar_t value) {
self->unsafe_data<scalar_t>()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)+(x4)*self->stride(4)] = value;
}
#endif

View File

@ -1,32 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THTensorMath.h"
#else
#include <ATen/core/Generator.h>
TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb);
#if !defined(TH_REAL_IS_HALF)
TH_API ptrdiff_t THTensor_(numel)(THTensor *t);
#if !defined(TH_REAL_IS_BFLOAT16)
void THTensor_(preserveReduceDimSemantics)(THTensor *r_, int in_dims, int reduce_dimension, int keepdim);
TH_API void THTensor_(take)(THTensor *tensor, THTensor *src, THLongTensor *index);
TH_API void THTensor_(put)(THTensor *tensor, THLongTensor *index, THTensor *src, int accumulate);
#if !defined(TH_REAL_IS_BOOL) /* non bool only part */
TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim);
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, scalar_t minvalue, scalar_t maxvalue);
#endif
#endif
#endif
#endif /* !defined(TH_REAL_IS_HALF) */
#endif /* TH_GENERIC_FILE*/

View File

@ -1,292 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THTensorMoreMath.cpp"
#else
#include <TH/generic/THTensorApply.hpp>
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/Utils.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/WrapDimUtils.h>
#include <limits>
ptrdiff_t THTensor_(numel)(THTensor *t)
{
return THTensor_(nElement)(t);
}
#if !defined(TH_REAL_IS_BFLOAT16) && !defined(TH_REAL_IS_HALF)
// Helper function to be used in a reduction operation.
// Due to resize semantics of outputs, if the specified output tensor r_ has
// the same size as the output of the reduction operation, then any noncontiguities
// in r_ should be preserved.
// The reduction operation, however, needs to act on r_ with an extra dimension
// (the reduced dimension), so this function "resizes" r_ and preserves its
// noncontiguities if necessary.
void THTensor_(preserveReduceDimSemantics)(
THTensor *r_, int in_dims, int reduce_dimension, int keepdim) {
if (r_ && !keepdim &&
THTensor_(nDimensionLegacyAll)(r_) == in_dims - 1 &&
THTensor_(nDimensionLegacyAll)(r_) != 0) {
THTensor_(unsqueeze1d)(r_, r_, reduce_dimension);
}
}
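/* A minimal standalone sketch (not part of the TH sources) of the shape rule the
   helper above preserves: reducing dimension dim either keeps it as size 1
   (keepdim) or drops it entirely. reduced_shape is an illustrative name. */
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<int64_t> reduced_shape(std::vector<int64_t> shape,
                                          size_t dim, bool keepdim) {
  if (keepdim) {
    shape[dim] = 1;                                            // e.g. {4, 5, 6}, dim = 1 -> {4, 1, 6}
  } else {
    shape.erase(shape.begin() + static_cast<std::ptrdiff_t>(dim));  // e.g. {4, 5, 6}, dim = 1 -> {4, 6}
  }
  return shape;
}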
#if !defined(TH_REAL_IS_BOOL) /* non bool only part */
#define ARR(III) arr[(III)*stride]
#define IDX(III) idx[(III)*stride]
#define LONG_SWAP(AAA, BBB) swap = AAA; AAA = BBB; BBB = swap
#define REAL_SWAP(AAA, BBB) rswap = AAA; AAA = BBB; BBB = rswap
#define ARR_SWAP(III, JJJ) \
REAL_SWAP(ARR(III), ARR(JJJ));
#define BOTH_SWAP(III, JJJ) \
REAL_SWAP(ARR(III), ARR(JJJ)); \
LONG_SWAP(IDX(III), IDX(JJJ))
/* Emulate NumPy behavior of putting NaNs
* at the end of an ascending list. */
#define GT_OR_NAN(x, y) \
((th_isnan(x) && !(th_isnan(y))) || (x > y))
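/* A minimal standalone sketch (not part of the TH sources) of the ordering
   GT_OR_NAN encodes: x ranks after y if x is NaN (and y is not) or if x > y,
   which is what pushes NaNs to the end of an ascending list. nan_last_greater
   is an illustrative name. */
#include <cmath>

static inline bool nan_last_greater(double x, double y) {
  return (std::isnan(x) && !std::isnan(y)) || (x > y);
}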
/* Implementation of the Quickselect algorithm, based on Nicolas Devillard's
public domain implementation at http://ndevilla.free.fr/median/median/
Adapted similarly to the above Quicksort algorithm. */
static void THTensor_(quickselect)(scalar_t *arr, int64_t *idx, int64_t k, int64_t elements, int64_t stride)
{
int64_t P, L, R, i, j, swap;
scalar_t rswap, piv;
L = 0;
R = elements-1;
do {
if (R <= L) /* One element only */
return;
if (R == L+1) { /* Two elements only */
if (ARR(L) > ARR(R)) {
BOTH_SWAP(L, R);
}
return;
}
/* Use median of three for pivot choice */
P=(L+R)>>1;
BOTH_SWAP(P, L+1);
if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
i = L+1;
j = R;
piv = ARR(L);
do {
do i++; while(ARR(i) < piv);
do j--; while(ARR(j) > piv);
if (j < i)
break;
BOTH_SWAP(i, j);
} while(1);
BOTH_SWAP(L, j);
/* Re-set active partition */
if (j <= k) L=i;
if (j >= k) R=j-1;
} while(1);
}
#undef ARR
#undef IDX
#undef LONG_SWAP
#undef REAL_SWAP
#undef BOTH_SWAP
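/* A minimal standalone sketch (not part of the TH sources) of the selection problem
   quickselect solves above: place the k-th smallest element (0-based) at position k.
   Unlike the TH version it does not carry an index array along and does not
   reproduce the NaN-last ordering of GT_OR_NAN; std::nth_element is used purely
   as an illustration. kth_smallest is an illustrative name. */
#include <algorithm>
#include <cstddef>
#include <vector>

static float kth_smallest(std::vector<float> values, std::size_t k) {
  std::nth_element(values.begin(), values.begin() + static_cast<std::ptrdiff_t>(k), values.end());
  return values[k];
}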
void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim)
{
THTensor *temp_;
THLongTensor *tempi_;
scalar_t *temp__data;
int64_t *tempi__data;
int64_t t_size_dim;
THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 3, "dimension out of range");
THArgCheck(k > 0 && k <= THTensor_sizeLegacyNoScalars(t, dimension), 2, "selected index out of range");
int in_dims = THTensor_(nDimensionLegacyAll)(t);
THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim);
THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim);
std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
dim[dimension] = 1;
THTensor_(resize)(values_, dim, {});
THLongTensor_resize(indices_, dim, {});
t_size_dim = THTensor_sizeLegacyNoScalars(t, dimension);
temp_ = THTensor_(new)();
THTensor_(resize1d)(temp_, t_size_dim);
temp__data = temp_->data<scalar_t>();
tempi_ = THLongTensor_new();
THLongTensor_resize1d(tempi_, t_size_dim);
tempi__data = THLongTensor_data(tempi_);
TH_TENSOR_DIM_APPLY3(scalar_t, t, scalar_t, values_, int64_t, indices_, dimension,
TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM,
int64_t i;
for(i = 0; i < t_size_dim; i++)
temp__data[i] = t_data[i*t_stride];
for(i = 0; i < t_size_dim; i++)
tempi__data[i] = i;
THTensor_(quickselect)(temp__data, tempi__data, k - 1, t_size_dim, 1);
*values__data = temp__data[k-1];
*indices__data = tempi__data[k-1];);
c10::raw::intrusive_ptr::decref(temp_);
THLongTensor_free(tempi_);
if (!keepdim) {
THTensor_(squeeze1d)(values_, values_, dimension);
THLongTensor_squeeze1d(indices_, indices_, dimension);
}
}
static void THTensor_(propagate_names_if_named_tensor_enabled)(THTensor* result, THTensor* src) {
at::namedinference::propagate_names(result, src);
}
#define LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, THRESHOLD) \
void THTensor_(NAME)(THTensor *r_, THTensor *t) \
{ \
THTensor_(resizeAs)(r_, t); \
ptrdiff_t r_Size = THTensor_(nElement)(r_); \
int r_Contig = THTensor_(isContiguous)(r_); \
int tContig = THTensor_(isContiguous)(t); \
TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = CFUNC(*t_data);, THRESHOLD); \
THTensor_(propagate_names_if_named_tensor_enabled)(r_, t); \
}
#define LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC) \
LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD)
#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, THRESHOLD) \
void THTensor_(NAME)(THTensor *r_, THTensor *t) \
{ \
THTensor_(resizeAs)(r_, t); \
ptrdiff_t r_Size = THTensor_(nElement)(r_); \
int r_Contig = THTensor_(isContiguous)(r_); \
int tContig = THTensor_(isContiguous)(t); \
if (r_Contig && tContig) { \
TH_TENSOR_APPLY2_CONTIG(scalar_t, r_, scalar_t, t, THVector_(NAME)(r__data, t_data, r__len);); \
} else { \
TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = CFUNC(*t_data);, THRESHOLD); \
} \
THTensor_(propagate_names_if_named_tensor_enabled)(r_, t); \
}
#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC) \
LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD)
#define EXPAND(...) __VA_ARGS__
#define GET_4TH_ARG(ARG0, ARG1, ARG2, ARG3, ...) ARG3
#define LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE(...) \
EXPAND(GET_4TH_ARG(__VA_ARGS__, LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS, LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS, ))
#define LAB_IMPLEMENT_VECTORIZED_FUNCTION_CHOOSE(...) \
EXPAND(GET_4TH_ARG(__VA_ARGS__, LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS, LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS, ))
#define LAB_IMPLEMENT_BASIC_FUNCTION(...) EXPAND(LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE(__VA_ARGS__)(__VA_ARGS__))
#define LAB_IMPLEMENT_VECTORIZED_FUNCTION(...) EXPAND(LAB_IMPLEMENT_VECTORIZED_FUNCTION_CHOOSE(__VA_ARGS__)(__VA_ARGS__))
/*
* LAB_IMPLEMENT_BASIC_FUNCTION is a macro with an optional threshold parameter, so it can be used flexibly.
* The macro discards an invalid threshold when parallelization is unavailable,
* and supplies a default threshold when none is passed.
* In other words:
* (A) If parallelization is unavailable, both usages below are valid:
*     (1) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity, OMP_OVERHEAD_THRESHOLD) // the invalid threshold is discarded
*     (2) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity)
* (B) If parallelization is available, both usages below are also valid:
*     (1) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity, OMP_OVERHEAD_THRESHOLD)
*     (2) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity) // the default threshold is used
* The same applies to LAB_IMPLEMENT_VECTORIZED_FUNCTION.
* (A standalone sketch of the underlying argument-counting trick follows below.)
*/
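/* A minimal standalone sketch (not part of the TH sources) of the argument-counting
   trick used by EXPAND/GET_4TH_ARG above: appending the candidate implementations
   after the caller's arguments makes the 4th argument select the 3-argument form
   when a threshold is passed and the 2-argument form otherwise. All MY_* names
   are illustrative. */
#define MY_EXPAND(...) __VA_ARGS__
#define MY_GET_4TH_ARG(A0, A1, A2, A3, ...) A3
#define MY_IMPL_3(NAME, VALUE, THRESHOLD) \
  static const int NAME = (VALUE) + (THRESHOLD);
#define MY_IMPL_2(NAME, VALUE) MY_IMPL_3(NAME, VALUE, 0)
#define MY_DEFINE(...) \
  MY_EXPAND(MY_GET_4TH_ARG(__VA_ARGS__, MY_IMPL_3, MY_IMPL_2, )(__VA_ARGS__))

MY_DEFINE(two_args, 1)       /* expands to MY_IMPL_3(two_args, 1, 0) via MY_IMPL_2 */
MY_DEFINE(three_args, 1, 10) /* expands to MY_IMPL_3(three_args, 1, 10) */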
LAB_IMPLEMENT_BASIC_FUNCTION(neg,-)
#if defined(TH_REAL_IS_LONG)
LAB_IMPLEMENT_BASIC_FUNCTION(abs,std::abs)
#endif /* int64_t only part */
#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_CHAR)
LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs)
#endif /* int only part */
#if defined(TH_REAL_IS_BYTE)
LAB_IMPLEMENT_BASIC_FUNCTION(abs,)
#endif /* for byte, identity due to it being unsigned */
/* floating point only now */
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
#if defined (TH_REAL_IS_FLOAT)
#define TH_MATH_NAME(fn) fn##f
#else
#define TH_MATH_NAME(fn) fn
#endif
LAB_IMPLEMENT_BASIC_FUNCTION(abs,TH_MATH_NAME(fabs))
LAB_IMPLEMENT_BASIC_FUNCTION(cosh,TH_MATH_NAME(cosh),HYPER_TH_OMP_OVERHEAD_THRESHOLD)
LAB_IMPLEMENT_BASIC_FUNCTION(tanh,TH_MATH_NAME(tanh),HYPER_TH_OMP_OVERHEAD_THRESHOLD)
void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, scalar_t minvalue, scalar_t maxvalue)
{
if (nbins <= 0) {
THError("bins must be > 0");
}
scalar_t minval;
scalar_t maxval;
scalar_t *h_data;
THTensor_(resize1d)(hist, nbins);
THTensor_wrap(hist).zero_();
minval = minvalue;
maxval = maxvalue;
if (minval == maxval)
{
minval = THTensor_wrap(tensor).min().item<scalar_t>();
maxval = THTensor_wrap(tensor).max().item<scalar_t>();
}
if (minval == maxval)
{
minval = minval - 1;
maxval = maxval + 1;
}
TORCH_CHECK(!(std::isinf(minval) || std::isinf(maxval) || std::isnan(minval) || std::isnan(maxval)), "range of [", minval, ", ", maxval, "] is not finite");
TORCH_CHECK(minval < maxval, "max must be larger than min");
h_data = hist->data<scalar_t>();
TH_TENSOR_APPLY(scalar_t, tensor,
if (*tensor_data >= minval && *tensor_data <= maxval) {
const int bin = (int)((*tensor_data-minval) / (maxval-minval) * nbins);
h_data[THMin(bin, nbins-1)] += 1;
}
);
}
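/* A minimal standalone sketch (not part of the TH sources) of the binning rule
   histc applies above: a value inside [minval, maxval] is mapped linearly onto
   nbins buckets, with the top edge clamped into the last bin as THMin(bin, nbins-1)
   does. histogram_bin is an illustrative name. */
#include <algorithm>
#include <cstdint>

static inline int64_t histogram_bin(double v, double minval, double maxval, int64_t nbins) {
  int64_t bin = static_cast<int64_t>((v - minval) / (maxval - minval) * nbins);
  return std::min(bin, nbins - 1);
}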
#endif
#undef TH_MATH_NAME
#endif /* floating point only part */
#undef IS_NONZERO
#endif /* !defined(TH_REAL_IS_BOOL) */
#endif /* TH_GENERIC_FILE */

View File

@ -1,18 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THVector.h"
#else
#if !defined(TH_REAL_IS_BOOL) /* non bool only part */
TH_API void THVector_(neg)(scalar_t *y, const scalar_t *x, const ptrdiff_t n);
#endif /* non bool only part */
/* floating point only now */
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
TH_API void THVector_(erfc)(scalar_t *y, const scalar_t *x, const ptrdiff_t n);
TH_API void THVector_(pow)(scalar_t *y, const scalar_t *x, const scalar_t c, const ptrdiff_t n);
#endif /* floating point only part */
#endif

View File

@ -1078,11 +1078,9 @@ aten_native_source_non_codegen_list = [
"aten/src/ATen/native/sparse/SparseCsrTensor.cpp",
"aten/src/ATen/native/sparse/SparseTensorMath.cpp",
"aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp",
"aten/src/TH/THBlas.cpp",
"aten/src/TH/THGeneral.cpp",
"aten/src/TH/THStorageFunctions.cpp",
"aten/src/TH/THTensor.cpp",
"aten/src/TH/THTensorMoreMath.cpp",
"aten/src/ATen/native/utils/Factory.cpp",
"aten/src/ATen/native/xnnpack/Activation.cpp",
"aten/src/ATen/native/xnnpack/ChannelShuffle.cpp",