Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00
Remove many caffe2::TIndex and replace them with int64_t (#11943)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11943

See title

Reviewed By: ezyang

Differential Revision: D9992645

fbshipit-source-id: e8f80d6ea762971513e5e8072975ceea53e1f11a
This commit is contained in:
parent 5d0f1c3c8f
commit a6630e25af
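The change below is intentionally mechanical: at the time of this commit, caffe2's TIndex was, to the best of my knowledge, a plain alias for int64_t, so every replaced declaration keeps the same type and ABI and only the spelling changes. A minimal sketch of that idea follows; the alias shown here is an assumption for illustration, not a quote of any caffe2 header.

#include <cstdint>
#include <type_traits>
#include <vector>

// Assumed historical alias (illustrative only): caffe2 declared something like
//   typedef int64_t TIndex;
// so the two vector types below are one and the same type.
typedef int64_t TIndex;

static_assert(std::is_same<TIndex, int64_t>::value,
              "the alias and int64_t are the same type, so the rewrite is a pure spelling change");

int main() {
  std::vector<TIndex> old_style{1, 2, 3, 4};
  std::vector<int64_t> new_style{1, 2, 3, 4};
  // Identical types: assignment compiles with no conversion.
  new_style = old_style;
  return 0;
}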
@@ -139,7 +139,7 @@ BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
 
 static void BM_CudaPointerAffinity(benchmark::State& state) {
 CAFFE2_SKIP_IF_NO_GPU;
-Tensor tensor(vector<TIndex>{1, 2, 3, 4}, CUDA);
+Tensor tensor(vector<int64_t>{1, 2, 3, 4}, CUDA);
 float* ptr = tensor.mutable_data<float>();
 while (state.KeepRunning()) {
 volatile int id = GetGPUIDForPointer(ptr);

@@ -144,7 +144,7 @@ private:
 }
 template <typename T>
 void assignToValue(Tensor* dst, T v) {
-dst->Resize(std::vector<TIndex>());
+dst->Resize(std::vector<int64_t>());
 math::Set(1, v, dst->template mutable_data<T>(), &context_);
 }
 int findImplementation(const OperatorDef& operator_def) {

@@ -75,7 +75,7 @@ class AllgatherOp final : public Operator<Context> {
 auto comm_size =
 OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0)->size;
 const auto dims =
-std::vector<TIndex>(1, (InputSize() - 1) * Input(1).size() * comm_size);
+std::vector<int64_t>(1, (InputSize() - 1) * Input(1).size() * comm_size);
 Output(0)->Resize(dims);
 
 // Store which inputs/outputs this instance initialized with
@@ -269,7 +269,7 @@ void NCCL<T>::AllGather(const NCCLExecution& ex) {
 ex,
 [n](const NCCLElement& ctx) {
 CAFFE_ENFORCE_NE(ctx.src, ctx.dst);
-std::vector<TIndex> dims;
+std::vector<int64_t> dims;
 dims.reserve(ctx.src->ndim() + 1);
 dims.push_back(n);
 for (auto d : ctx.src->dims()) {
@@ -307,7 +307,7 @@ void NCCL<T>::ReduceScatter(const NCCLExecution& ex) {
 [](const NCCLElement& ctx) {
 CAFFE_ENFORCE_NE(ctx.src, ctx.dst);
 const auto& srcDims = ctx.src->dims();
-std::vector<TIndex> dstDims(srcDims.begin() + 1, srcDims.end());
+std::vector<int64_t> dstDims(srcDims.begin() + 1, srcDims.end());
 ctx.dst->Resize(dstDims);
 ctx.dst->template mutable_data<T>();
 },

@@ -15,7 +15,7 @@ namespace {
 // Otherwise, return the product of CHW dimensions
 int64_t CheckDims(
 const nvinfer1::Dims& nv_dims,
-const std::vector<TIndex>& c2_dims) {
+const std::vector<int64_t>& c2_dims) {
 if (nv_dims.nbDims + 1 != c2_dims.size()) {
 CAFFE_THROW(
 "Mismatched dimensions between TRT input (",
@@ -115,7 +115,7 @@ TensorRTOp::TensorRTOp(const OperatorDef& operator_def, Workspace* ws)
 const std::string key = MakeString("output_size_hint_", output_idx);
 auto output_size_hint = OperatorBase::GetRepeatedArgument<int>(key);
 if (!output_size_hint.empty()) {
-std::vector<TIndex> dims;
+std::vector<int64_t> dims;
 for (const auto v : output_size_hint) {
 dims.push_back(v);
 }
@@ -130,17 +130,17 @@ TensorRTOp::TensorRTOp(const OperatorDef& operator_def, Workspace* ws)
 
 void TensorRTOp::MaybeAdjustOutputShape(
 int output_idx,
-std::vector<TIndex>* dims) {
+std::vector<int64_t>* dims) {
 const auto it = output_size_hints_.find(output_idx);
 if (it != output_size_hints_.end()) {
 const auto& dims_hint = it->second;
 auto total_trt = std::accumulate(
-dims->begin(), dims->end(), (TIndex)(1), std::multiplies<TIndex>());
+dims->begin(), dims->end(), (int64_t)(1), std::multiplies<int64_t>());
 auto total_c2 = std::accumulate(
 dims_hint.begin(),
 dims_hint.end(),
-(TIndex)(1),
-std::multiplies<TIndex>());
+(int64_t)(1),
+std::multiplies<int64_t>());
 CAFFE_ENFORCE_EQ(
 total_trt,
 total_c2,
@@ -204,7 +204,7 @@ bool TensorRTOp::RunOnDevice() {
 } else {
 // output, we need to allocate the output tensor at first batch run
 auto* output_tensor = Output(output_idx);
-std::vector<TIndex> tensor_dims;
+std::vector<int64_t> tensor_dims;
 tensor_dims.push_back(N);
 int64_t chw = 1;
 for (int i = 0; i < dims.nbDims; ++i) {
@@ -17,13 +17,13 @@ class TensorRTOp final : public Operator<CUDAContext> {
 virtual ~TensorRTOp() noexcept {}
 
 private:
-void MaybeAdjustOutputShape(int output_idx, std::vector<TIndex>* dims);
+void MaybeAdjustOutputShape(int output_idx, std::vector<int64_t>* dims);
 
 tensorrt::TrtLogger logger_;
 int max_batch_size_;
 std::vector<nvinfer1::Dims> nv_dims_;
 std::vector<bool> is_input_;
-std::unordered_map<int, std::vector<TIndex>> output_size_hints_;
+std::unordered_map<int, std::vector<int64_t>> output_size_hints_;
 std::shared_ptr<nvinfer1::ICudaEngine> trt_engine_{nullptr};
 std::shared_ptr<nvinfer1::IExecutionContext> trt_executor_{nullptr};
 bool batch_warning_issued_{false};

@@ -139,7 +139,7 @@ void TensorSerializer::SerializeWithChunkSize(
 // Serialize whole vector. If vector is empty, it's shape still needs to be
 // serialized in empty proto
 for (size_t chunkBegin = 0;
-chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
+chunkBegin < std::max(tensor.size(), static_cast<int64_t>(1));
 chunkBegin += chunk_size) {
 VLOG(2) << "Starting a chunk at " << chunkBegin;
 #ifndef __ANDROID__
@@ -374,8 +374,8 @@ void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) {
 tensor->GetStaticContext()->CreateContext(proto.device_detail());
 auto context = uniq_ptr.get();
 context->SwitchToDevice(0);
-vector<TIndex> dims;
-for (const TIndex d : proto.dims()) {
+vector<int64_t> dims;
+for (const int64_t d : proto.dims()) {
 dims.push_back(d);
 }
 tensor->Resize(dims);
@@ -557,9 +557,9 @@ TEST(TensorTest, TensorNonFundamentalTypeClone) {
 
 TEST(TensorTest, Tensor64BitDimension) {
 // Initialize a large tensor.
-TIndex large_number =
+int64_t large_number =
 static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
-Tensor tensor(vector<TIndex>{large_number}, CPU);
+Tensor tensor(vector<int64_t>{large_number}, CPU);
 EXPECT_EQ(tensor.ndim(), 1);
 EXPECT_EQ(tensor.dim(0), large_number);
 EXPECT_EQ(tensor.size(), large_number);
@@ -589,9 +589,9 @@ TEST(TensorTest, Tensor64BitDimension) {
 }
 
 TEST(TensorDeathTest, CannotCastDownLargeDims) {
-TIndex large_number =
+int64_t large_number =
 static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
-Tensor tensor(vector<TIndex>{large_number}, CPU);
+Tensor tensor(vector<int64_t>{large_number}, CPU);
 EXPECT_EQ(tensor.ndim(), 1);
 EXPECT_EQ(tensor.dim(0), large_number);
 ASSERT_THROW(tensor.dim32(0), EnforceNotMet);
@@ -694,7 +694,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
 }
 
 TEST(TensorTest, Half) {
-const TIndex kSize = 3000000;
+const int64_t kSize = 3000000;
 Blob blob;
 TensorCPU* tensor = blob.GetMutableTensor(CPU);
 tensor->Resize(kSize);

@@ -145,7 +145,7 @@ using EnforceNotMet = at::Error;
 * functions to caffe2::enforce_detail namespace. For example:
 *
 * namespace caffe2 { namespace enforce_detail {
-* inline EnforceFailMessage IsVector(const vector<TIndex>& shape) {
+* inline EnforceFailMessage IsVector(const vector<int64_t>& shape) {
 * if (shape.size() == 1) { return EnforceOK(); }
 * return MakeString("Shape ", shape, " is not a vector");
 * }
@@ -581,7 +581,7 @@ TensorShapes InferBlobShapesAndTypesFromWorkspace(
 }
 
 TensorShapes InferBlobShapesAndTypesFromMap(
-const CaffeMap<std::string, std::vector<TIndex>>& blob_dimensions,
+const CaffeMap<std::string, std::vector<int64_t>>& blob_dimensions,
 const vector<NetDef*>& nets) {
 CaffeMap<string, TensorShape> blob_desc;
 // Populate shapes from known blobs
@@ -597,7 +597,7 @@ TensorShapes InferBlobShapesAndTypesFromMap(
 }
 
 TensorShapes InferBlobShapesAndTypesFromMap(
-const CaffeMap<std::string, std::vector<TIndex>>& blob_dimensions,
+const CaffeMap<std::string, std::vector<int64_t>>& blob_dimensions,
 const CaffeMap<std::string, TensorProto_DataType>& blob_types,
 const vector<NetDef*>& nets) {
 CaffeMap<string, TensorShape> blob_desc;

@@ -700,7 +700,7 @@ struct DispatchHelper<FixedValues<FirstVal, Values...>, ExtraArgs...> {
 template <typename... ExtraArgs>
 struct DispatchHelper<FixedValues<>, ExtraArgs...> {
 template <typename Op>
-static bool call(Op* op, TIndex /*size*/) {
+static bool call(Op* op, int64_t /*size*/) {
 return op->template DoRunWithValue<ExtraArgs..., -1>();
 }
 };
@@ -973,11 +973,11 @@ CAFFE2_API TensorShapes InferBlobShapesAndTypesFromWorkspace(
 const vector<NetDef*>& nets);
 
 CAFFE2_API TensorShapes InferBlobShapesAndTypesFromMap(
-const CaffeMap<std::string, std::vector<TIndex>>& blob_dimensions,
+const CaffeMap<std::string, std::vector<int64_t>>& blob_dimensions,
 const vector<NetDef*>& nets);
 
 CAFFE2_API TensorShapes InferBlobShapesAndTypesFromMap(
-const CaffeMap<std::string, std::vector<TIndex>>& blob_dimensions,
+const CaffeMap<std::string, std::vector<int64_t>>& blob_dimensions,
 const CaffeMap<std::string, TensorProto_DataType>& blob_types,
 const vector<NetDef*>& nets);
 
@@ -331,7 +331,7 @@ int OpSchema::CalculateOutput(int num_input) const {
 }
 
 static void SparseLengthsFillerHelper(
-const std::vector<std::vector<TIndex>>& shapes,
+const std::vector<std::vector<int64_t>>& shapes,
 size_t value_index,
 size_t length_index,
 std::vector<TensorFiller>* fillers) {
@@ -341,7 +341,7 @@ static void SparseLengthsFillerHelper(
 }
 
 static void SparseSegmentsFillerHelper(
-const std::vector<std::vector<TIndex>>& shapes,
+const std::vector<std::vector<int64_t>>& shapes,
 size_t value_index,
 size_t segment_index,
 std::vector<TensorFiller>* fillers) {
@@ -364,7 +364,7 @@ OpSchema& OpSchema::ValueKeyLengthInputFillers(
 size_t key_index,
 size_t length_index) {
 filler_supplier_ = [this, value_index, key_index, length_index](
-const std::vector<std::vector<TIndex>>& shapes) {
+const std::vector<std::vector<int64_t>>& shapes) {
 auto fillers = SupplyDenseFillers(shapes);
 // fill in the length (value_index is used to get the correct shape)
 SparseLengthsFillerHelper(shapes, key_index, length_index, &fillers);
@@ -383,7 +383,7 @@ OpSchema& OpSchema::ValueLengthInputFillers(
 size_t value_index,
 size_t length_index) {
 filler_supplier_ = [this, value_index, length_index](
-const std::vector<std::vector<TIndex>>& shapes) {
+const std::vector<std::vector<int64_t>>& shapes) {
 auto fillers = SupplyDenseFillers(shapes);
 // fill in the length (value_index is used to get the correct shape)
 SparseLengthsFillerHelper(shapes, value_index, length_index, &fillers);
@@ -394,7 +394,7 @@ OpSchema& OpSchema::ValueLengthInputFillers(
 
 OpSchema& OpSchema::DisallowInputFillers() {
 filler_supplier_ =
-[this](const std::vector<std::vector<TIndex>>& /* unused */) {
+[this](const std::vector<std::vector<int64_t>>& /* unused */) {
 throw std::invalid_argument(type_ + " does not have input fillers");
 return std::vector<TensorFiller>();
 };
@@ -402,12 +402,12 @@ OpSchema& OpSchema::DisallowInputFillers() {
 }
 
 std::vector<TensorFiller> OpSchema::InputFillers(
-const std::vector<std::vector<TIndex>>& shapes) const {
+const std::vector<std::vector<int64_t>>& shapes) const {
 return filler_supplier_(shapes);
 }
 
 std::vector<TensorFiller> OpSchema::SupplyDenseFillers(
-const std::vector<std::vector<TIndex>>& shapes) {
+const std::vector<std::vector<int64_t>>& shapes) {
 std::vector<TensorFiller> fillers;
 for (const auto& shape : shapes) {
 fillers.emplace_back(shape);
@@ -383,11 +383,11 @@ class CAFFE2_API OpSchema {
 OpSchema& DisallowInputFillers();
 
 std::vector<TensorFiller> InputFillers(
-const std::vector<std::vector<TIndex>>& shapes) const;
+const std::vector<std::vector<int64_t>>& shapes) const;
 
 private:
 std::vector<TensorFiller> SupplyDenseFillers(
-const std::vector<std::vector<TIndex>>& shapes);
+const std::vector<std::vector<int64_t>>& shapes);
 
 private:
 string type_;
@@ -438,9 +438,9 @@ class CAFFE2_API OpSchema {
 };
 
 std::function<std::vector<TensorFiller>(
-const std::vector<std::vector<TIndex>>&)>
+const std::vector<std::vector<int64_t>>&)>
 filler_supplier_ =
-[this](const std::vector<std::vector<TIndex>>& shapes) {
+[this](const std::vector<std::vector<int64_t>>& shapes) {
 return SupplyDenseFillers(shapes);
 };
 };
@@ -508,8 +508,8 @@ inline TensorShape CreateTensorShape(
 }
 
 // Helper function
-inline vector<TIndex> GetDimsVector(const TensorShape& shape) {
-vector<TIndex> dims;
+inline vector<int64_t> GetDimsVector(const TensorShape& shape) {
+vector<int64_t> dims;
 for (auto d : shape.dims()) {
 dims.push_back(d);
 }
@@ -212,8 +212,8 @@ class CAFFE2_EXPORT QTensor {
 /**
 * Return product of all dimensions starting from K.
 */
-inline TIndex size_from_dim(int k) const {
-TIndex r = 1;
+inline int64_t size_from_dim(int k) const {
+int64_t r = 1;
 for (int i = k; i < dims_.size(); ++i) {
 r *= dims_[i];
 }
@@ -223,9 +223,9 @@ class CAFFE2_EXPORT QTensor {
 /**
 * Product of all dims up to.
 */
-inline TIndex size_to_dim(int k) const {
+inline int64_t size_to_dim(int k) const {
 CAFFE_ENFORCE(k < dims_.size());
-TIndex r = 1;
+int64_t r = 1;
 for (int i = 0; i < k; ++i) {
 r *= dims_[i];
 }

@@ -77,7 +77,7 @@ void RegisterTypeCallFunction(TypeIdentifier id, TypeCall c) {
 
 int GetGPUIDForPointer(const void* ptr);
 
-vector<TIndex> GetTensorInfo(
+vector<int64_t> GetTensorInfo(
 const void* c,
 size_t* capacity,
 DeviceOption* device) {
@@ -59,7 +59,7 @@ class CAFFE2_API Tensor final {
 * Note that the actual data allocation is not going to be carried out until
 * the first time mutable_data() is called.
 */
-explicit Tensor(const vector<TIndex>& dims, DeviceType type)
+explicit Tensor(const vector<int64_t>& dims, DeviceType type)
 : Tensor(Storage(type)) {
 // TODO: here, we create a Storage
 // and immediately discard it in Resize() since
@@ -96,7 +96,7 @@ class CAFFE2_API Tensor final {
 */
 template <typename T>
 Tensor(
-const vector<TIndex>& dims,
+const vector<int64_t>& dims,
 const vector<T>& values,
 BaseContext* context)
 : Tensor(Storage(context->device_type(), TypeMeta::Make<T>())) {
@@ -115,7 +115,7 @@ class CAFFE2_API Tensor final {
 typename = typename std::enable_if<std::is_scalar<T>::value>::type>
 Tensor(const T& value, BaseContext* context)
 : Tensor(Storage(context->device_type(), TypeMeta::Make<T>())) {
-Resize(std::vector<TIndex>{});
+Resize(std::vector<int64_t>{});
 context->CopyItemsFromCPU(
 storage().dtype(), size(), &value, mutable_data<T>());
 }
@@ -142,15 +142,15 @@ class CAFFE2_API Tensor final {
 impl_.get()->CopyFrom(*src.impl_.get(), context);
 }
 
-void ExtendTo(TIndex num, float growthPct, BaseContext* context) const {
+void ExtendTo(int64_t num, float growthPct, BaseContext* context) const {
 impl_.get()->ExtendTo(num, growthPct, context);
 }
 
-void Extend(TIndex num, float growthPct, BaseContext* context) const {
+void Extend(int64_t num, float growthPct, BaseContext* context) const {
 impl_.get()->Extend(num, growthPct, context);
 }
 
-void ShrinkTo(TIndex outer_dim) const {
+void ShrinkTo(int64_t outer_dim) const {
 impl_.get()->ShrinkTo(outer_dim);
 }
 
@@ -168,7 +168,7 @@ class CAFFE2_API Tensor final {
 impl_.get()->ResizeLike(*src_tensor.impl_.get());
 }
 
-inline void Reshape(const vector<TIndex>& dims) const {
+inline void Reshape(const vector<int64_t>& dims) const {
 impl_.get()->Reshape(dims);
 }
 
@@ -250,7 +250,7 @@ class CAFFE2_API Tensor final {
 return impl_.get()->ndim();
 }
 
-inline TIndex size() const {
+inline int64_t size() const {
 return impl_.get()->size();
 }
 
@@ -266,19 +266,19 @@ class CAFFE2_API Tensor final {
 return impl_.get()->capacity_nbytes();
 }
 
-inline const vector<TIndex>& dims() const {
+inline const vector<int64_t>& dims() const {
 return impl_.get()->dims();
 }
 
-inline TIndex size_from_dim(int k) const {
+inline int64_t size_from_dim(int k) const {
 return impl_.get()->size_from_dim(k);
 }
 
-inline TIndex size_to_dim(int k) const {
+inline int64_t size_to_dim(int k) const {
 return impl_.get()->size_to_dim(k);
 }
 
-inline TIndex size_between_dim(int k, int l) const {
+inline int64_t size_between_dim(int k, int l) const {
 return impl_.get()->size_between_dim(k, l);
 }
 
@@ -311,7 +311,7 @@ class CAFFE2_API Tensor final {
 return impl_.get()->dim32(i);
 }
 
-inline TIndex dim(const int i) const {
+inline int64_t dim(const int i) const {
 return impl_.get()->dim(i);
 }
 
@@ -337,7 +337,7 @@ TypeCall GetTypeCallFunction(TypeIdentifier id);
 void RegisterTypeCallFunction(TypeIdentifier id, TypeCall c);
 
 // Shape call registry
-typedef vector<TIndex> (*TensorInfoCall)(
+typedef vector<int64_t> (*TensorInfoCall)(
 const void*,
 size_t* capacity,
 DeviceOption* device);
@@ -377,7 +377,7 @@ void TensorPrinter::Print(const Tensor& tensor) {
 std::stringstream values_stream;
 // One most likely doesn't want to print int64-number of items for visual
 // inspection, so we cast down to int here.
-int total_count = static_cast<int>(std::min(tensor.size(), TIndex(limit_)));
+int total_count = static_cast<int>(std::min(tensor.size(), int64_t(limit_)));
 const T* tensor_data = tensor.template data<T>();
 for (int i = 0; i < total_count - 1; ++i) {
 values_stream << tensor_data[i] << ",";
@@ -26,17 +26,17 @@ namespace caffe2 {
 class DeviceOption;
 
 /**
-* A utility function to convert vector<int> to vector<TIndex>.
+* A utility function to convert vector<int> to vector<int64_t>.
 */
-inline std::vector<TIndex> ToVectorTIndex(const std::vector<int>& src) {
-return std::vector<TIndex>(src.begin(), src.end());
+inline std::vector<int64_t> ToVectorint64_t(const std::vector<int>& src) {
+return std::vector<int64_t>(src.begin(), src.end());
 }
 
 /**
 * Return product of all dimensions starting from k
 */
-inline TIndex size_from_dim_(int k, const std::vector<TIndex>& dims) {
-TIndex r = 1;
+inline int64_t size_from_dim_(int k, const std::vector<int64_t>& dims) {
+int64_t r = 1;
 for (size_t i = k; i < dims.size(); ++i) {
 r *= dims[i];
 }
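The hunk above is one of the few places where the change is more than a type substitution: the helper itself is renamed from ToVectorTIndex to ToVectorint64_t, so its call sites (for example the Reshape(const std::vector<int>&) overload later in this diff) have to be renamed as well. A hedged, self-contained sketch of the renamed helper and an illustrative call site (the main() usage here is mine, not from the diff):

#include <cstdint>
#include <vector>

// The renamed helper as shown in the hunk above: widen vector<int> into vector<int64_t>.
inline std::vector<int64_t> ToVectorint64_t(const std::vector<int>& src) {
  return std::vector<int64_t>(src.begin(), src.end());
}

int main() {
  std::vector<int> dims32{2, 3, 4};
  // Call sites that previously spelled ToVectorTIndex(dims32) must now use the new name,
  // e.g. Reshape(ToVectorint64_t(dims)) in a later hunk of this diff.
  std::vector<int64_t> dims64 = ToVectorint64_t(dims32);
  return dims64.size() == 3 ? 0 : 1;
}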
@@ -44,9 +44,9 @@ inline TIndex size_from_dim_(int k, const std::vector<TIndex>& dims) {
 }
 
 // Product of all dims up to k (not including dims[k])
-inline TIndex size_to_dim_(int k, const std::vector<TIndex>& dims) {
+inline int64_t size_to_dim_(int k, const std::vector<int64_t>& dims) {
 CAFFE_ENFORCE((unsigned)k <= dims.size());
-TIndex r = 1;
+int64_t r = 1;
 for (int i = 0; i < k; ++i) {
 r *= dims[i];
 }
@@ -54,9 +54,9 @@ inline TIndex size_to_dim_(int k, const std::vector<TIndex>& dims) {
 }
 
 // Product of all dims between k and l (not including dims[k] and dims[l])
-inline TIndex size_between_dim_(int k, int l, const std::vector<TIndex>& dims) {
+inline int64_t size_between_dim_(int k, int l, const std::vector<int64_t>& dims) {
 CAFFE_ENFORCE((unsigned)l < dims.size());
-TIndex r = 1;
+int64_t r = 1;
 if (k < l) {
 for (int i = k + 1; i < l; ++i) {
 r *= dims[i];
@@ -191,7 +191,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 * @brief Extend the outer-most dimension of this tensor
 * to dimension of `num`.
 */
-void ExtendTo(TIndex num, float growthPct, at::BaseContext* context) {
+void ExtendTo(int64_t num, float growthPct, at::BaseContext* context) {
 CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1);
 CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0);
 CAFFE_ENFORCE(context != nullptr, "Context must be provided.");
@@ -207,7 +207,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 * growthPct. This ensures that Extend runs on an amortized O(1) time
 * complexity.
 */
-void Extend(TIndex num, float growthPct, at::BaseContext* context) {
+void Extend(int64_t num, float growthPct, at::BaseContext* context) {
 CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1);
 CAFFE_ENFORCE_GE_WITH_CALLER(
 num, 0, "`num` must be non-negative for Extend");
@@ -223,8 +223,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 auto newNumel = std::accumulate(
 newDims.begin(),
 newDims.end(),
-static_cast<TIndex>(1),
-std::multiplies<TIndex>());
+static_cast<int64_t>(1),
+std::multiplies<int64_t>());
 if (newNumel * storage_.itemsize() <= storage_.capacity()) {
 dims_ = newDims;
 numel_ = newNumel;
@@ -253,7 +253,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 * This method guarantees that no re-allocations are carried out, which means
 * that the extra capacity after the end of the shurnk tensor is maintained.
 */
-void ShrinkTo(TIndex outer_dim) {
+void ShrinkTo(int64_t outer_dim) {
 CAFFE_ENFORCE_WITH_CALLER(
 is_contiguous_,
 "Right now ShrinkTo is only supported on contiguous Tensor.");
@@ -268,8 +268,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 numel_ = std::accumulate(
 dims_.begin(),
 dims_.end(),
-static_cast<TIndex>(1),
-std::multiplies<TIndex>());
+static_cast<int64_t>(1),
+std::multiplies<int64_t>());
 }
 
 /**
@@ -292,8 +292,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 auto newNumel = std::accumulate(
 newCapacity.begin(),
 newCapacity.end(),
-static_cast<TIndex>(1),
-std::multiplies<TIndex>());
+static_cast<int64_t>(1),
+std::multiplies<int64_t>());
 if (newNumel * storage_.itemsize() <= storage_.capacity()) {
 return;
 }
@@ -365,11 +365,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 * Resizes the tensor without touching underlying storage.
 * This requires the total size of the tensor to remains constant.
 */
-inline void Reshape(const std::vector<TIndex>& dims) {
+inline void Reshape(const std::vector<int64_t>& dims) {
 CAFFE_ENFORCE_WITH_CALLER(
 is_contiguous_,
 "Right now Reshape is only supported for contiguous Tensor.");
-TIndex new_size = 1;
+int64_t new_size = 1;
 for (auto d : dims) {
 CAFFE_ENFORCE_GE_WITH_CALLER(d, 0);
 new_size *= d;
@@ -387,7 +387,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 }
 
 inline void Reshape(const std::vector<int>& dims) {
-Reshape(ToVectorTIndex(dims));
+Reshape(ToVectorint64_t(dims));
 }
 
 /**
@@ -674,7 +674,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 /**
 * Returns the size (i.e. the number of items) of the tensor.
 */
-inline TIndex size() const {
+inline int64_t size() const {
 return numel_;
 }
 /**
@@ -701,19 +701,19 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 /**
 * Returns the dimensions of the tensor as a vector.
 */
-inline const std::vector<TIndex>& dims() const {
+inline const std::vector<int64_t>& dims() const {
 return dims_;
 }
 
-inline TIndex size_from_dim(int k) const {
+inline int64_t size_from_dim(int k) const {
 return size_from_dim_(k, dims_);
 }
 
-inline TIndex size_to_dim(int k) const {
+inline int64_t size_to_dim(int k) const {
 return size_to_dim_(k, dims_);
 }
 
-inline TIndex size_between_dim(int k, int l) const {
+inline int64_t size_between_dim(int k, int l) const {
 return size_between_dim_(k, l, dims_);
 }
 
@@ -772,7 +772,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 /**
 * Returns the i-th dimension of the tensor in int.
 *
-* This function returns an int value instead of TIndex, which depending on
+* This function returns an int value instead of int64_t, which depending on
 * the typedef could be int64. If you want int64 dim values, make sure you
 * call dim() instead.
 */
@@ -790,7 +790,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 * must be between 0 (inclusive) and the number of dimensions, otherwise
 * this function will produce a fatal message.
 */
-inline TIndex dim(const int i) const {
+inline int64_t dim(const int i) const {
 #ifndef NDEBUG
 CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit");
 CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index");
@@ -818,9 +818,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 
 protected:
 // TODO: change to DimVector
-std::vector<TIndex> dims_; // sizes_
+std::vector<int64_t> dims_; // sizes_
 at::DimVector strides_;
-TIndex numel_ = -1; // numel_
+int64_t numel_ = -1; // numel_
 bool is_contiguous_ = true;
 // we decide to keep reserved_ and it will
 // live in Tensor after the split
@@ -838,7 +838,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 bool SetDims(const std::vector<T>& src) {
 auto old_numel = numel_;
 dims_.resize(src.size());
-TIndex new_numel = 1;
+int64_t new_numel = 1;
 for (size_t i = 0; i < src.size(); ++i) {
 new_numel *= src[i];
 dims_[i] = src[i];
@@ -859,7 +859,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 // TODO(jiayq): maybe rewrite the following functions with initializer list.
 // NVCC does not play well with initializer lists last time, but worth
 // another shot.
-bool SetDims(const TIndex d0) {
+bool SetDims(const int64_t d0) {
 auto old_numel = numel_;
 dims_.resize(1);
 dims_[0] = d0;
@@ -868,7 +868,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 return numel_ != old_numel;
 }
 
-bool SetDims(const TIndex d0, const TIndex d1) {
+bool SetDims(const int64_t d0, const int64_t d1) {
 auto old_numel = numel_;
 dims_.resize(2);
 dims_[0] = d0;
@@ -878,7 +878,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 return numel_ != old_numel;
 }
 
-bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) {
+bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) {
 auto old_numel = numel_;
 dims_.resize(3);
 dims_[0] = d0;
@@ -890,7 +890,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 }
 
 bool
-SetDims(const TIndex d0, const TIndex d1, const TIndex d2, const TIndex d3) {
+SetDims(const int64_t d0, const int64_t d1, const int64_t d2, const int64_t d3) {
 auto old_numel = numel_;
 dims_.resize(4);
 dims_[0] = d0;
@@ -232,7 +232,7 @@ class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {
 
 private:
 MaxPoolRTCFunction func_;
-vector<TIndex> input_dims_;
+vector<int64_t> input_dims_;
 };
 
 class MaxPoolGradientRTCOp final : public ConvPoolOpBase<CUDAContext> {
@@ -285,7 +285,7 @@ class MaxPoolGradientRTCOp final : public ConvPoolOpBase<CUDAContext> {
 
 private:
 MaxPoolGradientRTCFunction func_;
-vector<TIndex> input_dims_;
+vector<int64_t> input_dims_;
 };
 
 namespace {

@@ -29,8 +29,8 @@ namespace caffe2 {
 using Shape = std::array<int, N>;
 
 template<int N>
-const std::vector<TIndex>& shape(Shape<N> vs) {
-static thread_local std::vector<TIndex> cache;
+const std::vector<int64_t>& shape(Shape<N> vs) {
+static thread_local std::vector<int64_t> cache;
 cache.resize(vs.size());
 for (auto i = 0; i < vs.size(); ++i) {
 cache[i] = vs[i];
@@ -38,11 +38,11 @@ namespace caffe2 {
 return cache;
 }
 
-inline const std::vector<TIndex>& shape(int i) {
+inline const std::vector<int64_t>& shape(int i) {
 return shape<1>(Shape<1>({i}));
 }
 
-inline const std::vector<TIndex>& shape(int i, int j) {
+inline const std::vector<int64_t>& shape(int i, int j) {
 return shape<2>(Shape<2>({i, j}));
 }
 
@@ -177,7 +177,7 @@ namespace caffe2 {
 Y->template mutable_data<T>(), &context_);
 if (OutputSize() == 2){
 auto* Comp_rate = Output(1);
-Comp_rate->Resize(vector<TIndex>());
+Comp_rate->Resize(vector<int64_t>());
 T* comp_data = Comp_rate->template mutable_data<T>();
 math::Sum<T, Context>(
 Mask.size(), Mask.template data<T>(), comp_data, &context_);
@@ -262,7 +262,7 @@ namespace caffe2 {
 0, dW->template mutable_data<T>(),
 &context_);
 
-comp_r_buf_.Resize(vector<TIndex>());
+comp_r_buf_.Resize(vector<int64_t>());
 T* comp_data = comp_r_buf_.template mutable_data<T>();
 math::Sum<T, Context>(
 Mask.size(), Mask.template data<T>(), comp_data, &context_);

@@ -32,8 +32,8 @@ template<int N>
 using Shape = std::array<int, N>;
 
 template<int N>
-const std::vector<TIndex>& shape(Shape<N> vs) {
-static thread_local std::vector<TIndex> cache;
+const std::vector<int64_t>& shape(Shape<N> vs) {
+static thread_local std::vector<int64_t> cache;
 cache.resize(vs.size());
 for (auto i = 0; i < vs.size(); ++i) {
 cache[i] = vs[i];
@@ -41,11 +41,11 @@ const std::vector<TIndex>& shape(Shape<N> vs) {
 return cache;
 }
 
-inline const std::vector<TIndex>& shape(int i) {
+inline const std::vector<int64_t>& shape(int i) {
 return shape<1>(Shape<1>({i}));
 }
 
-inline const std::vector<TIndex>& shape(int i, int j) {
+inline const std::vector<int64_t>& shape(int i, int j) {
 return shape<2>(Shape<2>({i, j}));
 }
 
@@ -37,9 +37,9 @@ class FunHashOp : public Operator<Context> {
 FunHashOp(const OperatorDef& operator_def, Workspace* ws)
 : Operator<Context>(operator_def, ws),
 num_outputs_(
-OperatorBase::GetSingleArgument<TIndex>("num_outputs", -1)),
+OperatorBase::GetSingleArgument<int64_t>("num_outputs", -1)),
 num_segments_(
-OperatorBase::GetSingleArgument<TIndex>("num_segments", -1)),
+OperatorBase::GetSingleArgument<int64_t>("num_segments", -1)),
 seed_(OperatorBase::GetSingleArgument<uint64_t>("seed", 0)) {
 CAFFE_ENFORCE(
 OperatorBase::HasArgument("num_outputs"),
@@ -54,7 +54,7 @@ class FunHashOp : public Operator<Context> {
 const auto& seg = Input(2);
 const auto& weight = Input(3);
 
-TIndex num_alpha = 1;
+int64_t num_alpha = 1;
 if (adaptive_) {
 const auto& alpha = Input(4);
 num_alpha = alpha.dim(0);
@@ -62,12 +62,12 @@ class FunHashOp : public Operator<Context> {
 
 const auto* seg_data = seg.template data<int>();
 
-TIndex num_weight = weight.dim(0);
-TIndex num_nz_ent = seg.dim(0);
+int64_t num_weight = weight.dim(0);
+int64_t num_nz_ent = seg.dim(0);
 
-TIndex n_segments = num_segments_;
+int64_t n_segments = num_segments_;
 if (num_segments_ == -1) {
-for (TIndex i = 0; i < num_nz_ent; ++i) {
+for (int64_t i = 0; i < num_nz_ent; ++i) {
 if (seg_data[i] > n_segments) {
 n_segments = seg_data[i];
 }
@@ -85,16 +85,16 @@ class FunHashOp : public Operator<Context> {
 const auto* weight_data = weight.template data<T>();
 const auto* alpha_data = adaptive_ ? Input(4).template data<T>() : 0;
 const auto* val_data = val.template data<T>();
-const auto* key_data = key.template data<TIndex>();
+const auto* key_data = key.template data<int64_t>();
 
-for (TIndex j = 0; j < num_nz_ent; ++j) {
-TIndex cur_seg = seg_data[j];
-TIndex cur_key = key_data[j];
+for (int64_t j = 0; j < num_nz_ent; ++j) {
+int64_t cur_seg = seg_data[j];
+int64_t cur_key = key_data[j];
 T cur_val = val_data[j];
-TIndex output_stride = cur_seg * num_outputs_;
-for (TIndex i = 0; i < num_outputs_; ++i) {
+int64_t output_stride = cur_seg * num_outputs_;
+for (int64_t i = 0; i < num_outputs_; ++i) {
 T sum = 0;
-for (TIndex k = 0; k < num_alpha; ++k) {
+for (int64_t k = 0; k < num_alpha; ++k) {
 uint64_t hash;
 // The hash function takes as input four integers:
 // 1. feature index
@@ -108,7 +108,7 @@ class FunHashOp : public Operator<Context> {
 
 hash_data[3] = INDEX_MAGIC;
 hash = XXH64(hash_data.data(), hash_data.size(), seed_);
-TIndex index = hash % num_weight;
+int64_t index = hash % num_weight;
 
 T cur_weight = weight_data[index];
 #ifdef USE_SIGN
@@ -133,8 +133,8 @@ class FunHashOp : public Operator<Context> {
 }
 
 protected:
-TIndex num_outputs_;
-TIndex num_segments_;
+int64_t num_outputs_;
+int64_t num_segments_;
 uint64_t seed_;
 std::array<uint64_t, 4> hash_data;
 bool adaptive_;
@@ -147,7 +147,7 @@ class FunHashGradientOp : public Operator<Context> {
 FunHashGradientOp(const OperatorDef& operator_def, Workspace* ws)
 : Operator<Context>(operator_def, ws),
 num_outputs_(
-OperatorBase::GetSingleArgument<TIndex>("num_outputs", -1)),
+OperatorBase::GetSingleArgument<int64_t>("num_outputs", -1)),
 seed_(OperatorBase::GetSingleArgument<uint64_t>("seed", 0)) {
 adaptive_ = (InputSize() == 6);
 }
@@ -159,7 +159,7 @@ class FunHashGradientOp : public Operator<Context> {
 const auto& seg = Input(3);
 const auto& weight = Input(4);
 
-TIndex num_alpha = 1;
+int64_t num_alpha = 1;
 T* grad_alpha_data = 0;
 
 if (adaptive_) {
@@ -173,8 +173,8 @@ class FunHashGradientOp : public Operator<Context> {
 
 const auto* seg_data = seg.template data<int>();
 
-TIndex num_weight = weight.dim(0);
-TIndex num_nz_ent = seg.dim(0);
+int64_t num_weight = weight.dim(0);
+int64_t num_nz_ent = seg.dim(0);
 
 auto* grad_weight = Output(0);
 grad_weight->ResizeLike(weight);
@@ -184,18 +184,18 @@ class FunHashGradientOp : public Operator<Context> {
 const auto* weight_data = weight.template data<T>();
 const auto* alpha_data = adaptive_ ? Input(5).template data<T>() : 0;
 const auto* val_data = val.template data<T>();
-const auto* key_data = key.template data<TIndex>();
+const auto* key_data = key.template data<int64_t>();
 
 memset(grad_weight_data, 0, sizeof(T) * num_weight);
 
-for (TIndex j = 0; j < num_nz_ent; ++j) {
-TIndex cur_seg = seg_data[j];
-TIndex cur_key = key_data[j];
+for (int64_t j = 0; j < num_nz_ent; ++j) {
+int64_t cur_seg = seg_data[j];
+int64_t cur_key = key_data[j];
 T cur_val = val_data[j];
-TIndex grad_out_stride = cur_seg * num_outputs_;
-for (TIndex i = 0; i < num_outputs_; ++i) {
+int64_t grad_out_stride = cur_seg * num_outputs_;
+for (int64_t i = 0; i < num_outputs_; ++i) {
 T grad_out_scale = grad_out_data[grad_out_stride + i] * cur_val;
-for (TIndex k = 0; k < num_alpha; ++k) {
+for (int64_t k = 0; k < num_alpha; ++k) {
 uint64_t hash;
 hash_data[0] = cur_key;
 hash_data[1] = i;
@@ -203,7 +203,7 @@ class FunHashGradientOp : public Operator<Context> {
 
 hash_data[3] = INDEX_MAGIC;
 hash = XXH64(hash_data.data(), hash_data.size(), seed_);
-TIndex index = hash % num_weight;
+int64_t index = hash % num_weight;
 
 T cur_grad_out_scale = grad_out_scale;
 #ifdef USE_SIGN
@@ -227,7 +227,7 @@ class FunHashGradientOp : public Operator<Context> {
 }
 
 protected:
-TIndex num_outputs_;
+int64_t num_outputs_;
 uint64_t seed_;
 std::array<uint64_t, 4> hash_data;
 bool adaptive_;
@@ -36,9 +36,9 @@ class SparseFunHashOp : public Operator<Context> {
 SparseFunHashOp(const OperatorDef& operator_def, Workspace* ws)
 : Operator<Context>(operator_def, ws),
 num_outputs_(
-OperatorBase::GetSingleArgument<TIndex>("num_outputs", -1)),
+OperatorBase::GetSingleArgument<int64_t>("num_outputs", -1)),
 num_segments_(
-OperatorBase::GetSingleArgument<TIndex>("num_segments", -1)),
+OperatorBase::GetSingleArgument<int64_t>("num_segments", -1)),
 seed_(OperatorBase::GetSingleArgument<uint64_t>("seed", 0)) {
 CAFFE_ENFORCE(
 OperatorBase::HasArgument("num_outputs"),
@@ -53,7 +53,7 @@ class SparseFunHashOp : public Operator<Context> {
 const auto& seg = Input(2);
 const auto& weight = Input(3);
 
-TIndex num_alpha = 1;
+int64_t num_alpha = 1;
 if (adaptive_) {
 const auto& alpha = Input(4);
 num_alpha = alpha.dim(0);
@@ -61,12 +61,12 @@ class SparseFunHashOp : public Operator<Context> {
 
 const auto* seg_data = seg.template data<int>();
 
-TIndex num_weight = weight.dim(0);
-TIndex num_nz_ent = seg.dim(0);
+int64_t num_weight = weight.dim(0);
+int64_t num_nz_ent = seg.dim(0);
 
-TIndex n_segments = num_segments_;
+int64_t n_segments = num_segments_;
 if (num_segments_ == -1) {
-for (TIndex i = 0; i < num_nz_ent; ++i) {
+for (int64_t i = 0; i < num_nz_ent; ++i) {
 if (seg_data[i] > n_segments) {
 n_segments = seg_data[i];
 }
@@ -84,16 +84,16 @@ class SparseFunHashOp : public Operator<Context> {
 const auto* weight_data = weight.template data<T>();
 const auto* alpha_data = adaptive_ ? Input(4).template data<T>() : 0;
 const auto* val_data = val.template data<T>();
-const auto* key_data = key.template data<TIndex>();
+const auto* key_data = key.template data<int64_t>();
 
-for (TIndex j = 0; j < num_nz_ent; ++j) {
-TIndex cur_seg = seg_data[j];
-TIndex cur_key = key_data[j];
+for (int64_t j = 0; j < num_nz_ent; ++j) {
+int64_t cur_seg = seg_data[j];
+int64_t cur_key = key_data[j];
 T cur_val = val_data[j];
-TIndex output_stride = cur_seg * num_outputs_;
-for (TIndex i = 0; i < num_outputs_; ++i) {
+int64_t output_stride = cur_seg * num_outputs_;
+for (int64_t i = 0; i < num_outputs_; ++i) {
 T sum = 0;
-for (TIndex k = 0; k < num_alpha; ++k) {
+for (int64_t k = 0; k < num_alpha; ++k) {
 // The hash function takes as input three integers:
 // 1. feature index
 // 2. output index
@@ -108,13 +108,13 @@ class SparseFunHashOp : public Operator<Context> {
 
 #ifdef USE_SIGN
 // Use the least significant bit for sign, the rest for weights.
-TIndex index = (hash >> 1) % num_weight;
+int64_t index = (hash >> 1) % num_weight;
 T cur_weight = weight_data[index];
 if (hash & 1) {
 cur_weight = -cur_weight;
 }
 #else
-TIndex index = hash % num_weight;
+int64_t index = hash % num_weight;
 T cur_weight = weight_data[index];
 #endif
 
@@ -132,8 +132,8 @@ class SparseFunHashOp : public Operator<Context> {
 }
 
 protected:
-TIndex num_outputs_;
-TIndex num_segments_;
+int64_t num_outputs_;
+int64_t num_segments_;
 uint64_t seed_;
 std::array<uint64_t, 4> hash_data;
 bool adaptive_;
@@ -146,7 +146,7 @@ class SparseFunHashGradientOp : public Operator<Context> {
 SparseFunHashGradientOp(const OperatorDef& operator_def, Workspace* ws)
 : Operator<Context>(operator_def, ws),
 num_outputs_(
-OperatorBase::GetSingleArgument<TIndex>("num_outputs", -1)),
+OperatorBase::GetSingleArgument<int64_t>("num_outputs", -1)),
 seed_(OperatorBase::GetSingleArgument<uint64_t>("seed", 0)) {
 adaptive_ = (InputSize() == 6);
 }
@@ -158,7 +158,7 @@ class SparseFunHashGradientOp : public Operator<Context> {
 const auto& seg = Input(3);
 const auto& weight = Input(4);
 
-TIndex num_alpha = 1;
+int64_t num_alpha = 1;
 T* grad_alpha_data = 0;
 
 if (adaptive_) {
@@ -172,10 +172,10 @@ class SparseFunHashGradientOp : public Operator<Context> {
 
 const auto* seg_data = seg.template data<int>();
 
-TIndex num_weight = weight.dim(0);
-TIndex num_nz_ent = seg.dim(0);
+int64_t num_weight = weight.dim(0);
+int64_t num_nz_ent = seg.dim(0);
 
-TIndex grad_weight_size = num_nz_ent * num_outputs_ * num_alpha;
+int64_t grad_weight_size = num_nz_ent * num_outputs_ * num_alpha;
 auto* grad_weight_val = Output(0);
 grad_weight_val->Resize(grad_weight_size);
 T* grad_weight_val_data = grad_weight_val->template mutable_data<T>();
@@ -183,23 +183,23 @@ class SparseFunHashGradientOp : public Operator<Context> {
 auto* grad_weight_ind = Output(1);
 grad_weight_ind->Resize(grad_weight_size);
 auto* grad_weight_ind_data =
-grad_weight_ind->template mutable_data<TIndex>();
+grad_weight_ind->template mutable_data<int64_t>();
 
 const auto* grad_out_data = grad_out.template data<T>();
 const auto* weight_data = weight.template data<T>();
 const auto* alpha_data = adaptive_ ? Input(5).template data<T>() : 0;
 const auto* val_data = val.template data<T>();
-const auto* key_data = key.template data<TIndex>();
+const auto* key_data = key.template data<int64_t>();
 
-TIndex w_ind = 0;
-for (TIndex j = 0; j < num_nz_ent; ++j) {
-TIndex cur_seg = seg_data[j];
-TIndex cur_key = key_data[j];
+int64_t w_ind = 0;
+for (int64_t j = 0; j < num_nz_ent; ++j) {
+int64_t cur_seg = seg_data[j];
+int64_t cur_key = key_data[j];
 T cur_val = val_data[j];
-TIndex grad_out_stride = cur_seg * num_outputs_;
-for (TIndex i = 0; i < num_outputs_; ++i) {
+int64_t grad_out_stride = cur_seg * num_outputs_;
+for (int64_t i = 0; i < num_outputs_; ++i) {
 T grad_out_scale = grad_out_data[grad_out_stride + i] * cur_val;
-for (TIndex k = 0; k < num_alpha; ++k) {
+for (int64_t k = 0; k < num_alpha; ++k) {
 hash_data[0] = cur_key;
 hash_data[1] = i;
 hash_data[2] = k;
@@ -209,12 +209,12 @@ class SparseFunHashGradientOp : public Operator<Context> {
 
 T cur_grad_out_scale = grad_out_scale;
 #ifdef USE_SIGN
-TIndex index = (hash >> 1) % num_weight;
+int64_t index = (hash >> 1) % num_weight;
 if (hash & 1) {
 cur_grad_out_scale = -cur_grad_out_scale;
 }
 #else
-TIndex index = hash % num_weight;
+int64_t index = hash % num_weight;
 #endif
 
 if (adaptive_) {
@@ -232,7 +232,7 @@ class SparseFunHashGradientOp : public Operator<Context> {
 }
 
 protected:
-TIndex num_outputs_;
+int64_t num_outputs_;
 uint64_t seed_;
 std::array<uint64_t, 4> hash_data;
 bool adaptive_;
@@ -36,10 +36,10 @@ class SparseMatrixReshapeOp : public Operator<Context> {
 OperatorBase::HasArgument("new_shape"),
 "Argument `new_shape` is missing.");
 
-vector<TIndex> old_shape =
-OperatorBase::GetRepeatedArgument<TIndex>("old_shape");
-vector<TIndex> new_shape =
-OperatorBase::GetRepeatedArgument<TIndex>("new_shape");
+vector<int64_t> old_shape =
+OperatorBase::GetRepeatedArgument<int64_t>("old_shape");
+vector<int64_t> new_shape =
+OperatorBase::GetRepeatedArgument<int64_t>("new_shape");
 
 CAFFE_ENFORCE(
 old_shape.size() == 2,
@@ -63,7 +63,7 @@ class SparseMatrixReshapeOp : public Operator<Context> {
 old_shape[0] > 0,
 "The first dimension in `old_shape` must be positive.");
 
-TIndex matrix_size = old_shape[0] * old_shape[1];
+int64_t matrix_size = old_shape[0] * old_shape[1];
 
 if (new_shape[0] == -1) {
 CAFFE_ENFORCE(
@@ -106,14 +106,14 @@ class SparseMatrixReshapeOp : public Operator<Context> {
 new_col->Resize(nnz);
 new_row->Resize(nnz);
 
-const auto* old_col_data = old_col.template data<TIndex>();
+const auto* old_col_data = old_col.template data<int64_t>();
 const auto* old_row_data = old_row.template data<int>();
 
-auto* new_col_data = new_col->template mutable_data<TIndex>();
+auto* new_col_data = new_col->template mutable_data<int64_t>();
 auto* new_row_data = new_row->template mutable_data<int>();
 
 for (int i = 0; i < nnz; ++i) {
-TIndex offset = old_row_data[i] * old_stride_ + old_col_data[i];
+int64_t offset = old_row_data[i] * old_stride_ + old_col_data[i];
 new_row_data[i] = offset / new_stride_;
 new_col_data[i] = offset % new_stride_;
 }
@@ -122,8 +122,8 @@ class SparseMatrixReshapeOp : public Operator<Context> {
 }
 
 private:
-TIndex old_stride_;
-TIndex new_stride_;
+int64_t old_stride_;
+int64_t new_stride_;
 };
 
 } // namespace caffe2
@@ -29,9 +29,9 @@ class TTContractionOp final : public Operator<Context> {
 USE_OPERATOR_CONTEXT_FUNCTIONS;
 TTContractionOp(const OperatorDef& operator_def, Workspace* ws)
 : Operator<Context>(operator_def, ws),
-K_(OperatorBase::GetSingleArgument<TIndex>("K", 0)),
-M_(OperatorBase::GetSingleArgument<TIndex>("M", 0)),
-N_(OperatorBase::GetSingleArgument<TIndex>("N", 0)) {
+K_(OperatorBase::GetSingleArgument<int64_t>("K", 0)),
+M_(OperatorBase::GetSingleArgument<int64_t>("M", 0)),
+N_(OperatorBase::GetSingleArgument<int64_t>("N", 0)) {
 CAFFE_ENFORCE(OperatorBase::HasArgument("K"), "Argument `K` is missing.");
 CAFFE_ENFORCE(OperatorBase::HasArgument("M"), "Argument `M` is missing.");
 CAFFE_ENFORCE(OperatorBase::HasArgument("N"), "Argument `N` is missing.");
@@ -44,8 +44,8 @@ class TTContractionOp final : public Operator<Context> {
 
 CAFFE_ENFORCE(A.ndim() == 2, A.ndim());
 
-TIndex A_size = A.size_from_dim(0);
-TIndex B_size = B.size_from_dim(0);
+int64_t A_size = A.size_from_dim(0);
+int64_t B_size = B.size_from_dim(0);
 
 CAFFE_ENFORCE(
 K_ * M_ == A_size,
@@ -55,19 +55,19 @@ class TTContractionOp final : public Operator<Context> {
 B_size % (K_ * N_) == 0,
 "Argument `K` and `N` do not agree with the size of B.");
 
-TIndex D_ = B_size / (K_ * N_);
+int64_t D_ = B_size / (K_ * N_);
 
-TIndex C_size = D_ * M_ * N_;
-C->Resize(vector<TIndex>{C_size});
+int64_t C_size = D_ * M_ * N_;
+C->Resize(vector<int64_t>{C_size});
 
-TIndex B_stride = K_ * N_;
-TIndex C_stride = M_ * N_;
+int64_t B_stride = K_ * N_;
+int64_t C_stride = M_ * N_;
 
 const T* A_data = A.template data<T>();
 const T* B_data = B.template data<T>();
 T* C_data = C->template mutable_data<T>();
 
-for (TIndex B_index = 0; B_index < B_size; B_index += B_stride) {
+for (int64_t B_index = 0; B_index < B_size; B_index += B_stride) {
 math::Gemm<T, Context, Engine>(
 CblasTrans,
 CblasNoTrans,
@@ -84,9 +84,9 @@ class TTContractionOp final : public Operator<Context> {
 }
 
 protected:
-TIndex K_;
-TIndex M_;
-TIndex N_;
+int64_t K_;
+int64_t M_;
+int64_t N_;
 };
 
 template <typename T, class Context, class Engine = DefaultEngine>
@@ -95,9 +95,9 @@ class TTContractionGradientOp final : public Operator<Context> {
 USE_OPERATOR_CONTEXT_FUNCTIONS;
 TTContractionGradientOp(const OperatorDef& operator_def, Workspace* ws)
 : Operator<Context>(operator_def, ws),
-K_(OperatorBase::GetSingleArgument<TIndex>("K", 0)),
-M_(OperatorBase::GetSingleArgument<TIndex>("M", 0)),
-N_(OperatorBase::GetSingleArgument<TIndex>("N", 0)) {}
+K_(OperatorBase::GetSingleArgument<int64_t>("K", 0)),
+M_(OperatorBase::GetSingleArgument<int64_t>("M", 0)),
+N_(OperatorBase::GetSingleArgument<int64_t>("N", 0)) {}
 
 bool RunOnDevice() override {
 const auto& G = Input(0);
@@ -106,16 +106,16 @@ class TTContractionGradientOp final : public Operator<Context> {
 auto* dA = Output(0);
 auto* dB = Output(1);
 
-TIndex G_size = G.size_from_dim(0);
-TIndex D_ = G_size / (M_ * N_);
+int64_t G_size = G.size_from_dim(0);
+int64_t D_ = G_size / (M_ * N_);
 
-TIndex dB_size = D_ * K_ * N_;
+int64_t dB_size = D_ * K_ * N_;
 
 dA->Resize(A.dims());
 dB->Resize(B.dims());
 
-TIndex B_stride = K_ * N_;
-TIndex G_stride = M_ * N_;
+int64_t B_stride = K_ * N_;
+int64_t G_stride = M_ * N_;
 
 const T* G_data = G.template data<T>();
 const T* A_data = A.template data<T>();
@@ -125,7 +125,7 @@ class TTContractionGradientOp final : public Operator<Context> {
 T* dB_data = dB->template mutable_data<T>();
 
 const T* G_ptr = G_data;
-for (TIndex B_index = 0; B_index < dB_size; B_index += B_stride) {
+for (int64_t B_index = 0; B_index < dB_size; B_index += B_stride) {
 math::Gemm<T, Context, Engine>(
 CblasNoTrans,
 CblasTrans,
@@ -139,7 +139,7 @@ class TTContractionGradientOp final : public Operator<Context> {
 }
 
 G_ptr = G_data;
-for (TIndex B_index = 0; B_index < dB_size; B_index += B_stride) {
+for (int64_t B_index = 0; B_index < dB_size; B_index += B_stride) {
 math::Gemm<T, Context, Engine>(
 CblasNoTrans,
 CblasNoTrans,
@@ -156,9 +156,9 @@ class TTContractionGradientOp final : public Operator<Context> {
 }
 
 protected:
-TIndex K_;
-TIndex M_;
-TIndex N_;
+int64_t K_;
+int64_t M_;
+int64_t N_;
 };
 
 } // namespace caffe2
@@ -29,7 +29,7 @@ class TTPadOp final : public Operator<Context> {
 USE_OPERATOR_CONTEXT_FUNCTIONS;
 TTPadOp(const OperatorDef& operator_def, Workspace* ws)
 : Operator<Context>(operator_def, ws),
-scale_(OperatorBase::GetSingleArgument<TIndex>("scale", 0)) {
+scale_(OperatorBase::GetSingleArgument<int64_t>("scale", 0)) {
 CAFFE_ENFORCE(
 OperatorBase::HasArgument("scale"), "Argument `scale` is missing.");
 }
@@ -46,16 +46,16 @@ class TTPadOp final : public Operator<Context> {
 
 auto* X_orig_dim0 = Output(1);
 X_orig_dim0->Resize(1);
-*X_orig_dim0->template mutable_data<TIndex>() = X_dim0;
+*X_orig_dim0->template mutable_data<int64_t>() = X_dim0;
 
 if (X_dim0 % scale_ != 0) {
-TIndex padded_dim0 = (X_dim0 / scale_ + 1) * scale_;
+int64_t padded_dim0 = (X_dim0 / scale_ + 1) * scale_;
 auto dim0_diff = padded_dim0 - X_dim0;
 // set growthPct to the upper bound percentage: (100 * scale_ / X_dim0)
 X_pad->Extend(dim0_diff, 100 * scale_ / X_dim0, &context_);
 
 auto* X_pad_data = X_pad->template mutable_data<T>();
-TIndex X_size = X_dim0 * X_dim1;
+int64_t X_size = X_dim0 * X_dim1;
 memset(X_pad_data + X_size, 0, dim0_diff * X_dim1 * sizeof(T));
 }
 
@@ -63,7 +63,7 @@ class TTPadOp final : public Operator<Context> {
 }
 
 protected:
-TIndex scale_;
+int64_t scale_;
 };
 
 template <typename T, class Context, class Engine = DefaultEngine>
@@ -78,7 +78,7 @@ class TTPadGradientOp final : public Operator<Context> {
 auto* output = Output(0);
 CAFFE_ENFORCE(&G == output);
 
-auto old_dim0 = *Input(1).template data<TIndex>();
+auto old_dim0 = *Input(1).template data<int64_t>();
 auto new_dim0 = G.dim(0);
 auto dim1 = G.dim(1);
 
@@ -43,7 +43,7 @@ class IDEEPConcatOp final : public IDEEPOperator {
 }
 
 auto axis_vdata = ideep::concat::compute(inputs, axis_, add_axis_, *output);
-axis_info->Resize(vector<TIndex>(1, InputSize()));
+axis_info->Resize(vector<int64_t>(1, InputSize()));
 int* axis_data = axis_info->template mutable_data<int>();
 for (int i = 0; i < axis_vdata.size(); i++) {
 axis_data[i] = axis_vdata[i];

@@ -39,7 +39,7 @@ class IDEEPConvPoolOpBase : public ConvPoolOpBase<IDEEPContext> {
 ideep::tensor::dims output_dims;
 
 auto input_dims = input.get_dims();
-vector<TIndex> input_Tdims (input_dims.begin(), input_dims.end());
+vector<int64_t> input_Tdims (input_dims.begin(), input_dims.end());
 InferOutputSize(
 input_Tdims,
 output_channel,

@@ -35,7 +35,7 @@ class IDEEPSqueezeOp final : public IDEEPOperator {
 (dims_.back() + 1),
 " dimensions.");
 const auto& ideep_dims = X.get_dims();
-vector<TIndex> dims(ideep_dims.begin(), ideep_dims.end());
+vector<int64_t> dims(ideep_dims.begin(), ideep_dims.end());
 const auto& new_dims = SqueezeOp<IDEEPContext>::ComputeDims(dims, dims_);
 itensor::dims new_dims_ideep(new_dims.begin(), new_dims.end());
 if (&X != Y) {
@ -372,14 +372,14 @@ ImageInputOp<Context>::ImageInputOp(
|
|||
randgen_per_thread_.emplace_back(meta_randgen());
|
||||
}
|
||||
prefetched_image_.Resize(
|
||||
TIndex(batch_size_),
|
||||
TIndex(crop_),
|
||||
TIndex(crop_),
|
||||
TIndex(color_ ? 3 : 1));
|
||||
int64_t(batch_size_),
|
||||
int64_t(crop_),
|
||||
int64_t(crop_),
|
||||
int64_t(color_ ? 3 : 1));
|
||||
if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) {
|
||||
prefetched_label_.Resize(TIndex(batch_size_), TIndex(num_labels_));
|
||||
prefetched_label_.Resize(int64_t(batch_size_), int64_t(num_labels_));
|
||||
} else {
|
||||
prefetched_label_.Resize(vector<TIndex>(1, batch_size_));
|
||||
prefetched_label_.Resize(vector<int64_t>(1, batch_size_));
|
||||
}
|
||||
|
||||
for (int i = 0; i < additional_output_sizes.size(); ++i) {
|
||||
|
|
@ -387,7 +387,7 @@ ImageInputOp<Context>::ImageInputOp(
|
|||
Context::GetDeviceType());
|
||||
prefetched_additional_outputs_.emplace_back(CPU);
|
||||
prefetched_additional_outputs_[i].Resize(
|
||||
TIndex(batch_size_), TIndex(additional_output_sizes[i]));
|
||||
int64_t(batch_size_), int64_t(additional_output_sizes[i]));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -23,10 +23,10 @@ TEST(MKLDNNTest, SimpleConvolutionTest) {
|
|||
int pads[2] = {0, 0};
|
||||
|
||||
// Creating Input and output tensors
|
||||
Tensor X(vector<TIndex>{16, 8, 32, 32}, CPU);
|
||||
Tensor W(vector<TIndex>{64, 8, 3, 3}, CPU);
|
||||
Tensor b(vector<TIndex>{64}, CPU);
|
||||
Tensor Y(vector<TIndex>{16, 64, 30, 30}, CPU);
|
||||
Tensor X(vector<int64_t>{16, 8, 32, 32}, CPU);
|
||||
Tensor W(vector<int64_t>{64, 8, 3, 3}, CPU);
|
||||
Tensor b(vector<int64_t>{64}, CPU);
|
||||
Tensor Y(vector<int64_t>{16, 64, 30, 30}, CPU);
|
||||
|
||||
float* data = X.mutable_data<float>();
|
||||
for (int i = 0; i < X.size(); ++i) {
|
||||
|
|
@ -91,7 +91,7 @@ TEST(MKLDNNTest, MKLMemoryCopyTest) {
|
|||
// the buffer size being empty for both - former in dnnAllocateBuffer and
|
||||
// the latter in dnnConversionExecute (likely due to some difference in
|
||||
// layout?). Test both cases.
|
||||
vector<vector<TIndex>> dims_list{{10, 3, 20, 20}, {0}, {0, 10}};
|
||||
vector<vector<int64_t>> dims_list{{10, 3, 20, 20}, {0}, {0, 10}};
|
||||
for (const auto& dims : dims_list) {
|
||||
auto X_cpu_in = caffe2::make_unique<Tensor>(dims, CPU);
|
||||
CPUContext ctx;
|
||||
|
|
|
|||
|
|
@ -84,8 +84,8 @@ class MKLMemoryDeserializer : public BlobDeserializerBase {
|
|||
"MKLMemory only supports either float or double formats.");
|
||||
CAFFE_ENFORCE(
|
||||
!proto.has_segment(), "MKLMemory does not support segment right now.");
|
||||
vector<TIndex> dims;
|
||||
for (const TIndex d : proto.dims()) {
|
||||
vector<int64_t> dims;
|
||||
for (const int64_t d : proto.dims()) {
|
||||
dims.push_back(d);
|
||||
}
|
||||
// TODO: right now, every time we do a deserializer we create a new MKL
|
||||
|
|
|
|||
|
|
@ -96,7 +96,7 @@ class MKLConcatOp final : public MKLOperator<T> {
|
|||
|
||||
private:
|
||||
int axis_;
|
||||
vector<TIndex> cached_output_dims_;
|
||||
vector<int64_t> cached_output_dims_;
|
||||
};
|
||||
|
||||
} // namespace mkl
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
|
|||
math::Set<T, CPUContext>(
|
||||
M, 0.0, cpu_zero_bias.template mutable_data<float>(), &ctx);
|
||||
|
||||
zero_bias_.reset(new MKLMemory<T>(std::vector<TIndex>{M}));
|
||||
zero_bias_.reset(new MKLMemory<T>(std::vector<int64_t>{M}));
|
||||
zero_bias_->CopyFrom(cpu_zero_bias);
|
||||
}
|
||||
const auto& bias = InputSize() == 2
|
||||
|
|
@ -130,11 +130,11 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
|
|||
if (group_ > 1) {
|
||||
// Explicitly reformat the buffer.
|
||||
MKLMemory<float> group_filter(
|
||||
std::vector<TIndex>{TIndex(group_),
|
||||
TIndex(filter.dim32(0) / group_),
|
||||
TIndex(filter.dim32(1)),
|
||||
TIndex(filter.dim32(2)),
|
||||
TIndex(filter.dim32(3))},
|
||||
std::vector<int64_t>{int64_t(group_),
|
||||
int64_t(filter.dim32(0) / group_),
|
||||
int64_t(filter.dim32(1)),
|
||||
int64_t(filter.dim32(2)),
|
||||
int64_t(filter.dim32(3))},
|
||||
nullptr,
|
||||
dnnResourceFilter,
|
||||
/*share_memory_if_possible=*/true);
|
||||
|
|
@ -168,8 +168,8 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
|
|||
// Input: X, W, b
|
||||
// Output: Y
|
||||
std::unique_ptr<MKLMemory<T>> zero_bias_;
|
||||
vector<TIndex> cached_input_dims_;
|
||||
vector<TIndex> cached_filter_dims_;
|
||||
vector<int64_t> cached_input_dims_;
|
||||
vector<int64_t> cached_filter_dims_;
|
||||
PrimitiveWrapper<T> primitive_;
|
||||
LayoutWrapper<T> input_layout_;
|
||||
LayoutWrapper<T> filter_layout_;
|
||||
|
|
|
|||
|
|
@ -106,8 +106,8 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
|
|||
private:
|
||||
// Input: X, W, b
|
||||
// Output: Y
|
||||
vector<TIndex> cached_input_dims_;
|
||||
vector<TIndex> cached_filter_dims_;
|
||||
vector<int64_t> cached_input_dims_;
|
||||
vector<int64_t> cached_filter_dims_;
|
||||
PrimitiveWrapper<T> primitive_;
|
||||
unique_ptr<MKLMemory<T>> X_wrapper_ = nullptr;
|
||||
unique_ptr<MKLMemory<T>> filter_wrapper_ = nullptr;
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ class MKLSumOp final : public MKLOperator<T> {
|
|||
|
||||
private:
|
||||
std::vector<float> coefficients_;
|
||||
vector<TIndex> cached_input_dims_;
|
||||
vector<int64_t> cached_input_dims_;
|
||||
vector<std::shared_ptr<void>> input_views_;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -90,8 +90,8 @@ class MKLFullyConnectedOp final : public MKLOperator<T> {
|
|||
// Input: X, W, b
|
||||
// Output: Y
|
||||
size_t axis_{1};
|
||||
vector<TIndex> cached_input_dims_;
|
||||
vector<TIndex> cached_filter_dims_;
|
||||
vector<int64_t> cached_input_dims_;
|
||||
vector<int64_t> cached_filter_dims_;
|
||||
PrimitiveWrapper<T> primitive_;
|
||||
LayoutWrapper<T> input_layout_;
|
||||
LayoutWrapper<T> filter_layout_;
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ class MKLLRNOp final : public LRNOpBase<T, MKLContext> {
|
|||
bool RunOnDeviceWithOrderNHWC() override;
|
||||
|
||||
private:
|
||||
vector<TIndex> cached_input_dims_;
|
||||
vector<int64_t> cached_input_dims_;
|
||||
LayoutWrapper<T> workspace_layout_;
|
||||
std::unique_ptr<MKLWorkspace<T>> workspace_buffer_;
|
||||
PrimitiveWrapper<T> primitive_;
|
||||
|
|
|
|||
|
|
@ -141,7 +141,7 @@ class PackedFCOp final : public Operator<CPUContext> {
|
|||
}
|
||||
size_t axis_{1};
|
||||
uint32_t hash_{0};
|
||||
vector<TIndex> Y_shape_cache_;
|
||||
vector<int64_t> Y_shape_cache_;
|
||||
Tensor bias_multiplier_{CPU};
|
||||
std::unique_ptr<MKLPackedMatrix> local_packed_matrix_;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -41,8 +41,8 @@ class MKLPoolOp final : public ConvPoolOpBase<MKLContext> {
|
|||
// Input: X
|
||||
// Output: Y
|
||||
private:
|
||||
vector<TIndex> cached_input_dims_;
|
||||
// vector<TIndex> cached_avgpool_input_dims_;
|
||||
vector<int64_t> cached_input_dims_;
|
||||
// vector<int64_t> cached_avgpool_input_dims_;
|
||||
LayoutWrapper<T> workspace_layout_;
|
||||
std::unique_ptr<MKLWorkspace<T>> workspace_buffer_;
|
||||
PrimitiveWrapper<T> primitive_;
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ class MKLReluOp : public MKLOperator<T> {
|
|||
}
|
||||
|
||||
private:
|
||||
vector<TIndex> cached_input_dims_;
|
||||
vector<int64_t> cached_input_dims_;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
|
|
|
|||
|
|
@ -146,7 +146,7 @@ class MKLBNOp final : public Operator<MKLContext> {
|
|||
const StorageOrder order_;
|
||||
const int num_batches_;
|
||||
|
||||
vector<TIndex> cached_input_dims_;
|
||||
vector<int64_t> cached_input_dims_;
|
||||
LayoutWrapper<T> scale_bias_layout_;
|
||||
LayoutWrapper<T> saved_mean_layout_;
|
||||
LayoutWrapper<T> saved_var_layout_;
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ class MKLSqueezeOp final : public MKLOperator<T> {
|
|||
|
||||
private:
|
||||
vector<int> dims_;
|
||||
vector<TIndex> cached_input_dims_;
|
||||
vector<int64_t> cached_input_dims_;
|
||||
};
|
||||
|
||||
} // namespace mkl
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ CAFFE_KNOWN_TYPE(mkl::MKLMemory<float>);
|
|||
CAFFE_KNOWN_TYPE(mkl::MKLMemory<double>);
|
||||
|
||||
template <typename T>
|
||||
static vector<TIndex> GetMKLTensorInfo(
|
||||
static vector<int64_t> GetMKLTensorInfo(
|
||||
const void* c,
|
||||
size_t* capacity,
|
||||
DeviceOption* device) {
|
||||
|
|
|
|||
|
|
@ -5,8 +5,8 @@
|
|||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
#include "caffe2/core/flags.h" // for TIndex
|
||||
#include "caffe2/core/tensor.h" // for TIndex
|
||||
#include "caffe2/core/flags.h" // for int64_t
|
||||
#include "caffe2/core/tensor.h" // for int64_t
|
||||
#include "caffe2/mkl/utils/mkl_dnn_cppwrapper.h"
|
||||
|
||||
// A global boolean variable that controls the behavior when we call View() on
|
||||
|
|
@ -270,7 +270,7 @@ class MKLMemory {
|
|||
"Reshape is not allowed for custom layouts. "
|
||||
"Convert to plain layout before invoking Reshape().");
|
||||
|
||||
TIndex new_size = 1;
|
||||
int64_t new_size = 1;
|
||||
for (auto i = 0; i < dims.size(); ++i) {
|
||||
CAFFE_ENFORCE_GE_WITH_CALLER(dims[i], 0);
|
||||
new_size *= dims[i];
|
||||
|
|
@ -279,7 +279,7 @@ class MKLMemory {
|
|||
new_size == size_,
|
||||
"New size and old size are not equal. Reshape is not possible.");
|
||||
|
||||
vector<TIndex> new_dims(dims.size());
|
||||
vector<int64_t> new_dims(dims.size());
|
||||
vector<size_t> size(dims.size());
|
||||
vector<size_t> strides(dims.size());
|
||||
for (int i = 0; i < dims.size(); ++i) {
|
||||
|
|
@ -456,7 +456,7 @@ class MKLMemory {
|
|||
return buffer_.get();
|
||||
}
|
||||
|
||||
inline const vector<TIndex>& dims() const {
|
||||
inline const vector<int64_t>& dims() const {
|
||||
return dims_;
|
||||
}
|
||||
|
||||
|
|
@ -470,7 +470,7 @@ class MKLMemory {
|
|||
/**
|
||||
* Returns the size (i.e., the number of items) in the buffer.
|
||||
*/
|
||||
inline TIndex size() const {
|
||||
inline int64_t size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
|
|
@ -479,7 +479,7 @@ class MKLMemory {
|
|||
* must be between 0 (inclusive) and the number of dimensions, otherwise
|
||||
* this function will produce a fatal message.
|
||||
*/
|
||||
inline TIndex dim(const int i) const {
|
||||
inline int64_t dim(const int i) const {
|
||||
return dims_.at(i);
|
||||
}
|
||||
|
||||
|
|
@ -545,9 +545,9 @@ class MKLMemory {
|
|||
mutable std::mutex buffer_lock_;
|
||||
// The dimensions in the same order as Caffe2 does. This is used to
|
||||
// interface with C2.
|
||||
vector<TIndex> dims_;
|
||||
vector<int64_t> dims_;
|
||||
// Number of items in the buffer.
|
||||
TIndex size_ = -1;
|
||||
int64_t size_ = -1;
|
||||
// The user dnn layout.
|
||||
LayoutWrapper<T> user_layout_;
|
||||
// The internal dnn layout.
|
||||
|
|
|
|||
|
|
@ -97,7 +97,7 @@ class MKLOperator : public OperatorBase {
|
|||
// The primitive used in the operator.
|
||||
PrimitiveWrapper<T> primitive_;
|
||||
// Size cache for all the input sizes.
|
||||
vector<vector<TIndex>> input_size_cache_;
|
||||
vector<vector<int64_t>> input_size_cache_;
|
||||
// An internal MKLMemory buffer. This is usually handy when we have a
|
||||
// single output from the operator. If your operator has multiple outputs
|
||||
// then you should allocate your own buffer.
|
||||
|
|
|
|||
|
|
@ -249,7 +249,7 @@ public:
|
|||
|
||||
const int32_t ndim() const { return dims_.size(); }
|
||||
|
||||
vector<TIndex> dims() const { return dims_; }
|
||||
vector<int64_t> dims() const { return dims_; }
|
||||
|
||||
const int32_t dim32(const int index) const { return dims_.at(index); }
|
||||
|
||||
|
|
@ -283,7 +283,7 @@ private:
|
|||
bool SetDims(const vector<TI> &src) {
|
||||
auto old_size = size_;
|
||||
dims_.resize(src.size());
|
||||
TIndex new_size = 1;
|
||||
int64_t new_size = 1;
|
||||
for (unsigned int i = 0; i < src.size(); ++i) {
|
||||
new_size *= src[i];
|
||||
dims_[i] = src[i];
|
||||
|
|
@ -299,7 +299,7 @@ private:
|
|||
return size_ > old_size;
|
||||
}
|
||||
|
||||
bool SetDims(const TIndex d0) {
|
||||
bool SetDims(const int64_t d0) {
|
||||
auto old_size = size_;
|
||||
dims_.resize(1);
|
||||
dims_[0] = d0;
|
||||
|
|
@ -307,7 +307,7 @@ private:
|
|||
return size_ > old_size;
|
||||
}
|
||||
|
||||
bool SetDims(const TIndex d0, const TIndex d1) {
|
||||
bool SetDims(const int64_t d0, const int64_t d1) {
|
||||
auto old_size = size_;
|
||||
dims_.resize(2);
|
||||
dims_[0] = d0;
|
||||
|
|
@ -316,7 +316,7 @@ private:
|
|||
return size_ > old_size;
|
||||
}
|
||||
|
||||
bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) {
|
||||
bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) {
|
||||
auto old_size = size_;
|
||||
dims_.resize(3);
|
||||
dims_[0] = d0;
|
||||
|
|
@ -326,8 +326,8 @@ private:
|
|||
return size_ > old_size;
|
||||
}
|
||||
|
||||
bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2,
|
||||
const TIndex d3) {
|
||||
bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2,
|
||||
const int64_t d3) {
|
||||
auto old_size = size_;
|
||||
dims_.resize(4);
|
||||
dims_[0] = d0;
|
||||
|
|
@ -338,8 +338,8 @@ private:
|
|||
return size_ > old_size;
|
||||
}
|
||||
|
||||
vector<TIndex> dims_;
|
||||
TIndex size_ = -1;
|
||||
vector<int64_t> dims_;
|
||||
int64_t size_ = -1;
|
||||
arm_compute::TensorShape shape_;
|
||||
unique_ptr<arm_compute::GCTensor> tensor_;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ bool GLFullyConnectedOp<T>::RunOnDevice() {
|
|||
CAFFE_ENFORCE_EQ(1, B_->ndim());
|
||||
CAFFE_ENFORCE_EQ(N, B_->dim32(0));
|
||||
|
||||
vector<TIndex> output_dims = {M, N};
|
||||
vector<int64_t> output_dims = {M, N};
|
||||
GLTensor<T> *Y =
|
||||
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
|
||||
if (first_run_) {
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ bool GLAveragePoolOp<DataType>::RunOnDeviceWithOrderNCHW() {
|
|||
int height = X_->dim32(2);
|
||||
int width = X_->dim32(3);
|
||||
|
||||
vector<TIndex> output_dims = {N, channels, 1, 1};
|
||||
vector<int64_t> output_dims = {N, channels, 1, 1};
|
||||
if (!global_pooling_) {
|
||||
output_dims[2] = (height + pad_t() + pad_b() - kernel_h()) / stride_h() + 1;
|
||||
output_dims[3] = (width + pad_l() + pad_r() - kernel_w()) / stride_w() + 1;
|
||||
|
|
@ -116,7 +116,7 @@ template <> bool GLMaxPoolOp<DataType>::RunOnDeviceWithOrderNCHW() {
|
|||
int height = X_->dim32(2);
|
||||
int width = X_->dim32(3);
|
||||
|
||||
vector<TIndex> output_dims = {N, channels, 1, 1};
|
||||
vector<int64_t> output_dims = {N, channels, 1, 1};
|
||||
if (!global_pooling_) {
|
||||
output_dims[2] = (height + pad_t() + pad_b() - kernel_h()) / stride_h() + 1;
|
||||
output_dims[3] = (width + pad_l() + pad_r() - kernel_w()) / stride_w() + 1;
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ bool GLResizeNearestOp<T>::RunOnDevice() {
|
|||
|
||||
GLTensor<T> *Y =
|
||||
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
|
||||
vector<TIndex> output_dims = {N, C, H * height_scale_, W * width_scale_};
|
||||
vector<int64_t> output_dims = {N, C, H * height_scale_, W * width_scale_};
|
||||
|
||||
if (first_run_) {
|
||||
Y->Resize(output_dims);
|
||||
|
|
|
|||
|
|
@ -329,7 +329,7 @@ class CopyToMPSCNNOp final : public Operator<CPUContext> {
|
|||
for (auto i = 0; i < Inputs().size(); ++i) {
|
||||
const auto& X = Input(i);
|
||||
CAFFE_ENFORCE(X.ndim() > 0 && X.ndim() <= 4);
|
||||
std::vector<TIndex> XDims = {1, 1, 1, 1};
|
||||
std::vector<int64_t> XDims = {1, 1, 1, 1};
|
||||
XDims.assign(X.dims().begin(), X.dims().end());
|
||||
|
||||
caffe2::Timer t;
|
||||
|
|
@ -2259,15 +2259,15 @@ class MPSCNNGenerateProposalsCPPOp final : public Operator<CPUContext> {
|
|||
|
||||
// bbox_deltas: (num_images, A * 4, H, W)
|
||||
CAFFE_ENFORCE_EQ(
|
||||
bbox_deltas.dims(), (vector<TIndex>{num_images, 4 * A, height, width}));
|
||||
bbox_deltas.dims(), (vector<int64_t>{num_images, 4 * A, height, width}));
|
||||
|
||||
// im_info_tensor: (num_images, 3), format [height, width, scale; ...]
|
||||
CAFFE_ENFORCE_EQ(im_info_tensor.dims(), (vector<TIndex>{num_images, 3}));
|
||||
CAFFE_ENFORCE_EQ(im_info_tensor.dims(), (vector<int64_t>{num_images, 3}));
|
||||
CAFFE_ENFORCE(
|
||||
im_info_tensor.template IsType<float>(), im_info_tensor.meta().name());
|
||||
|
||||
// anchors: (A, 4)
|
||||
CAFFE_ENFORCE_EQ(anchors.dims(), (vector<TIndex>{A, 4}));
|
||||
CAFFE_ENFORCE_EQ(anchors.dims(), (vector<int64_t>{A, 4}));
|
||||
CAFFE_ENFORCE(anchors.template IsType<float>(), anchors.meta().name());
|
||||
// Broadcast the anchors to all pixels
|
||||
auto all_anchors_vec =
|
||||
|
|
|
|||
|
|
@ -640,7 +640,7 @@ void testMPSCNN() {
|
|||
CAFFE_ENFORCE_EQ(t1.ndim(), 2);
|
||||
CAFFE_ENFORCE(t2.dim32(2) == 1 && t2.dim32(3) == 1);
|
||||
const_cast<TensorCPU&>(t2).Reshape(
|
||||
std::vector<TIndex>{TIndex(batchSize), TIndex(COut)});
|
||||
std::vector<int64_t>{int64_t(batchSize), int64_t(COut)});
|
||||
// Note dims do not match, as Metal leaves a 1x1 spatial
|
||||
// dimension.
|
||||
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ namespace caffe2 {
|
|||
|
||||
namespace {
|
||||
|
||||
void AddNoiseInput(const vector<TIndex>& shape, const string& name, Workspace* ws) {
|
||||
void AddNoiseInput(const vector<int64_t>& shape, const string& name, Workspace* ws) {
|
||||
DeviceOption option;
|
||||
CPUContext context(option);
|
||||
Blob* blob = ws->CreateBlob(name);
|
||||
|
|
@ -58,7 +58,7 @@ void compareMaxPooling(int N,
|
|||
def1.add_arg()->CopyFrom(MakeArgument("pad_b", padB));
|
||||
def1.add_arg()->CopyFrom(MakeArgument("pad_r", padR));
|
||||
|
||||
AddNoiseInput(vector<TIndex>{N, C, H, W}, "X", &ws);
|
||||
AddNoiseInput(vector<int64_t>{N, C, H, W}, "X", &ws);
|
||||
|
||||
unique_ptr<OperatorBase> op1(CreateOperator(def1, &ws));
|
||||
EXPECT_NE(nullptr, op1.get());
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ namespace caffe2 {
|
|||
|
||||
namespace {
|
||||
|
||||
void AddNoiseInput(const vector<TIndex>& shape, const string& name, Workspace* ws) {
|
||||
void AddNoiseInput(const vector<int64_t>& shape, const string& name, Workspace* ws) {
|
||||
DeviceOption option;
|
||||
CPUContext context(option);
|
||||
Blob* blob = ws->CreateBlob(name);
|
||||
|
|
@ -44,7 +44,7 @@ void compareResizeNeareast(int N,
|
|||
def1.add_arg()->CopyFrom(MakeArgument("width_scale", wscale));
|
||||
def1.add_arg()->CopyFrom(MakeArgument("height_scale", hscale));
|
||||
|
||||
AddNoiseInput(vector<TIndex>{N, C, H, W}, "X", &ws);
|
||||
AddNoiseInput(vector<int64_t>{N, C, H, W}, "X", &ws);
|
||||
|
||||
unique_ptr<OperatorBase> op1(CreateOperator(def1, &ws));
|
||||
EXPECT_NE(nullptr, op1.get());
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
#include <vector>
|
||||
|
||||
void AddNoiseInput(const std::vector<caffe2::TIndex>& shape,
|
||||
void AddNoiseInput(const std::vector<int64_t>& shape,
|
||||
const std::string& name,
|
||||
caffe2::Workspace* ws) {
|
||||
caffe2::CPUContext context;
|
||||
|
|
@ -60,13 +60,13 @@ double BenchOp(const std::string& typ,
|
|||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_r", 0));
|
||||
def1.add_arg()->CopyFrom(caffe2::MakeArgument("convolution_transform_strategy", std::string("PRECOMPUTE")));
|
||||
|
||||
AddNoiseInput(std::vector<caffe2::TIndex>{1, inputC, inH, inW}, "X", ws);
|
||||
AddNoiseInput(std::vector<int64_t>{1, inputC, inH, inW}, "X", ws);
|
||||
if (transposed) {
|
||||
AddNoiseInput(std::vector<caffe2::TIndex>{inputC, outputC, kH, kW}, "W", ws);
|
||||
AddNoiseInput(std::vector<int64_t>{inputC, outputC, kH, kW}, "W", ws);
|
||||
} else {
|
||||
AddNoiseInput(std::vector<caffe2::TIndex>{outputC, inputC, kH, kW}, "W", ws);
|
||||
AddNoiseInput(std::vector<int64_t>{outputC, inputC, kH, kW}, "W", ws);
|
||||
}
|
||||
AddNoiseInput(std::vector<caffe2::TIndex>{outputC}, "B", ws);
|
||||
AddNoiseInput(std::vector<int64_t>{outputC}, "B", ws);
|
||||
|
||||
std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(def1, ws));
|
||||
|
||||
|
|
@ -131,19 +131,19 @@ static double BenchGLConvolution(int input_channels,
|
|||
}
|
||||
|
||||
AddNoiseInput(
|
||||
std::vector<caffe2::TIndex>{1, input_channels, input_height, input_width}, "X_cpu", ws);
|
||||
std::vector<int64_t>{1, input_channels, input_height, input_width}, "X_cpu", ws);
|
||||
if (transposed) {
|
||||
AddNoiseInput(
|
||||
std::vector<caffe2::TIndex>{input_channels, output_channels, kernel_height, kernel_width},
|
||||
std::vector<int64_t>{input_channels, output_channels, kernel_height, kernel_width},
|
||||
"W",
|
||||
ws);
|
||||
} else {
|
||||
AddNoiseInput(
|
||||
std::vector<caffe2::TIndex>{output_channels, input_channels, kernel_height, kernel_width},
|
||||
std::vector<int64_t>{output_channels, input_channels, kernel_height, kernel_width},
|
||||
"W",
|
||||
ws);
|
||||
}
|
||||
AddNoiseInput(std::vector<caffe2::TIndex>{output_channels}, "b", ws);
|
||||
AddNoiseInput(std::vector<int64_t>{output_channels}, "b", ws);
|
||||
|
||||
caffe2::NetDef netdef;
|
||||
{
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@
|
|||
|
||||
namespace caffe2 {
|
||||
|
||||
void AddConstInput(const vector<TIndex>& shape,
|
||||
void AddConstInput(const vector<int64_t>& shape,
|
||||
const float value,
|
||||
const string& name,
|
||||
Workspace* ws) {
|
||||
|
|
@ -50,7 +50,7 @@ void AddConstInput(const vector<TIndex>& shape,
|
|||
&context);
|
||||
}
|
||||
|
||||
void AddNoiseInput(const vector<TIndex>& shape,
|
||||
void AddNoiseInput(const vector<int64_t>& shape,
|
||||
const string& name,
|
||||
Workspace* ws) {
|
||||
DeviceOption option;
|
||||
|
|
@ -72,7 +72,7 @@ float snpe_run(int iters, Workspace& ws) {
|
|||
const int W = 227;
|
||||
const int C = 3;
|
||||
|
||||
POPULATE_DATA("X_snpe", (caffe2::vector<caffe2::TIndex>{H, W, C}), hwc);
|
||||
POPULATE_DATA("X_snpe", (caffe2::vector<int64_t>{H, W, C}), hwc);
|
||||
|
||||
OperatorDef def;
|
||||
def.set_name("snpe_test");
|
||||
|
|
@ -108,7 +108,7 @@ float caffe2_run(int iters, Workspace& ws) {
|
|||
ReadProtoFromBinaryFile("/data/local/tmp/squeeze_init_net.pb", &init_net);
|
||||
ReadProtoFromBinaryFile("/data/local/tmp/squeeze_predict_net.pb", &predict_net);
|
||||
ws.RunNetOnce(init_net);
|
||||
POPULATE_DATA("data", (caffe2::vector<caffe2::TIndex>{N, C, H, W}), chw);
|
||||
POPULATE_DATA("data", (caffe2::vector<int64_t>{N, C, H, W}), chw);
|
||||
predict_net.set_name("SqueezeNet");
|
||||
ws.CreateNet(predict_net);
|
||||
|
||||
|
|
|
|||
|
|
@ -538,7 +538,7 @@ void run2b1bConvIm2ColGEMM(QConvState* state,
|
|||
CAFFE_ENFORCE_EQ(Y->dim32(0), divRoundUp(X.dim32(0) * OH * OW, kGEMMTileSize) * kGEMMTileSize);
|
||||
CAFFE_ENFORCE_EQ(Y->dim32(1), OC);
|
||||
Y->ShrinkTo(X.dim32(0) * OH * OW);
|
||||
Y->Reshape(std::vector<TIndex>{{TIndex(X.dim(0)), TIndex(OH), TIndex(OW), TIndex(OC)}});
|
||||
Y->Reshape(std::vector<int64_t>{{int64_t(X.dim(0)), int64_t(OH), int64_t(OW), int64_t(OC)}});
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ int randInt(int a, int b) {
|
|||
return std::uniform_int_distribution<int>(a, b)(gen);
|
||||
}
|
||||
|
||||
TensorCPU genTensor11(std::vector<TIndex> shape) {
|
||||
TensorCPU genTensor11(std::vector<int64_t> shape) {
|
||||
Tensor r(CPU);
|
||||
r.Resize(shape);
|
||||
|
||||
|
|
@ -76,7 +76,7 @@ TensorCPU genTensor11(std::vector<TIndex> shape) {
|
|||
return r;
|
||||
}
|
||||
|
||||
TensorCPU genTensorUniform11(std::vector<TIndex> shape) {
|
||||
TensorCPU genTensorUniform11(std::vector<int64_t> shape) {
|
||||
Tensor r(CPU);
|
||||
r.Resize(shape);
|
||||
|
||||
|
|
@ -90,7 +90,7 @@ TensorCPU genTensorUniform11(std::vector<TIndex> shape) {
|
|||
return r;
|
||||
}
|
||||
|
||||
TensorCPU genTensor0123(std::vector<TIndex> shape) {
|
||||
TensorCPU genTensor0123(std::vector<int64_t> shape) {
|
||||
Tensor r(CPU);
|
||||
r.Resize(shape);
|
||||
|
||||
|
|
@ -171,7 +171,7 @@ inline void qgemmNT(int M, int N, int K, const uint8_t* A, const uint8_t* B, flo
|
|||
}
|
||||
}
|
||||
|
||||
void gemmTest(TIndex M, TIndex N, TIndex K) {
|
||||
void gemmTest(int64_t M, int64_t N, int64_t K) {
|
||||
auto X = genTensor11({M, K});
|
||||
auto W = genTensor11({N, K});
|
||||
Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ class MPIAllgatherOp final : public Operator<Context> {
|
|||
MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
|
||||
auto& input = Input(1);
|
||||
auto* output = Output(0);
|
||||
vector<TIndex> output_dims = input.dims();
|
||||
vector<int64_t> output_dims = input.dims();
|
||||
output_dims[0] *= OperatorBase::Input<MPICommonWorldWrapper>(0).size();
|
||||
output->Resize(output_dims);
|
||||
MPI_CHECK(MPI_Allgather(
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ bool AccuracyOp<float, CPUContext>::RunOnDevice() {
|
|||
int D = X.dim32(1);
|
||||
CAFFE_ENFORCE_EQ(label.ndim(), 1);
|
||||
CAFFE_ENFORCE_EQ(label.dim32(0), N);
|
||||
Y->Resize(vector<TIndex>());
|
||||
Y->Resize(vector<int64_t>());
|
||||
const auto* Xdata = X.data<float>();
|
||||
const auto* labelData = label.data<int>();
|
||||
const int top_k = top_k_;
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
|
|||
int D = X.dim32(1);
|
||||
CAFFE_ENFORCE_EQ(label.ndim(), 1);
|
||||
CAFFE_ENFORCE_EQ(label.dim32(0), N);
|
||||
Y->Resize(vector<TIndex>());
|
||||
Y->Resize(vector<int64_t>());
|
||||
float* Ydata = Y->template mutable_data<float>();
|
||||
math::Set<float, CUDAContext>(1, 0, Ydata, &context_);
|
||||
AccuracyKernel<<<
|
||||
|
|
|
|||
|
|
@ -15,14 +15,14 @@ void ComputeArgImpl(
|
|||
const int n,
|
||||
const Compare& comp,
|
||||
const T* X,
|
||||
TIndex* Y,
|
||||
int64_t* Y,
|
||||
Context* context) {
|
||||
math::Set<TIndex, Context>(prev_size * next_size, TIndex(0), Y, context);
|
||||
math::Set<int64_t, Context>(prev_size * next_size, int64_t(0), Y, context);
|
||||
for (int i = 0; i < prev_size; ++i) {
|
||||
const T* cur_X = X + i * n * next_size + next_size;
|
||||
for (int k = 1; k < n; ++k) {
|
||||
for (int j = 0; j < next_size; ++j) {
|
||||
TIndex* cur_Y = Y + i * next_size + j;
|
||||
int64_t* cur_Y = Y + i * next_size + j;
|
||||
if (comp(*cur_X, X[i * n * next_size + *cur_Y * next_size + j])) {
|
||||
*cur_Y = k;
|
||||
}
|
||||
|
|
@ -41,7 +41,7 @@ bool ArgMaxReducer<CPUContext>::operator()(
|
|||
const int next_size,
|
||||
const int n,
|
||||
const T* X,
|
||||
TIndex* Y,
|
||||
int64_t* Y,
|
||||
CPUContext* context) const {
|
||||
ComputeArgImpl(prev_size, next_size, n, std::greater<T>(), X, Y, context);
|
||||
return true;
|
||||
|
|
@ -54,7 +54,7 @@ bool ArgMinReducer<CPUContext>::operator()(
|
|||
const int next_size,
|
||||
const int n,
|
||||
const T* X,
|
||||
TIndex* Y,
|
||||
int64_t* Y,
|
||||
CPUContext* context) const {
|
||||
ComputeArgImpl(prev_size, next_size, n, std::less<T>(), X, Y, context);
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ __global__ void ComputeArgCUDAKernel(
|
|||
const Reducer reducer,
|
||||
const T init,
|
||||
const T* X,
|
||||
TIndex* Y) {
|
||||
int64_t* Y) {
|
||||
__shared__ typename BlockReduce<int, T>::TempStorage temp_storage;
|
||||
const int d = stride.d();
|
||||
for (int idx = blockIdx.x; idx < outer_size; idx += gridDim.x) {
|
||||
|
|
@ -41,7 +41,7 @@ __global__ void ComputeArgCUDAKernel(
|
|||
}
|
||||
kv = BlockReduce<int, T>(temp_storage).Reduce(kv, reducer);
|
||||
if (threadIdx.x == 0) {
|
||||
Y[idx] = static_cast<TIndex>(kv.key);
|
||||
Y[idx] = static_cast<int64_t>(kv.key);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
|
@ -56,7 +56,7 @@ bool ArgMaxReducer<CUDAContext>::operator()(
|
|||
const int next_size,
|
||||
const int n,
|
||||
const T* X,
|
||||
TIndex* Y,
|
||||
int64_t* Y,
|
||||
CUDAContext* context) const {
|
||||
const int outer_size = prev_size * next_size;
|
||||
const FixedDivisor<int> stride(next_size);
|
||||
|
|
@ -82,7 +82,7 @@ bool ArgMinReducer<CUDAContext>::operator()(
|
|||
const int next_size,
|
||||
const int n,
|
||||
const T* X,
|
||||
TIndex* Y,
|
||||
int64_t* Y,
|
||||
CUDAContext* context) const {
|
||||
const int outer_size = prev_size * next_size;
|
||||
const FixedDivisor<int> stride(next_size);
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ class ArgOp final : public Operator<Context> {
|
|||
next_size,
|
||||
n,
|
||||
X.template data<T>(),
|
||||
Y->template mutable_data<TIndex>(),
|
||||
Y->template mutable_data<int64_t>(),
|
||||
&context_);
|
||||
}
|
||||
|
||||
|
|
@ -78,7 +78,7 @@ struct ArgMaxReducer {
|
|||
const int next_size,
|
||||
const int n,
|
||||
const T* X,
|
||||
TIndex* Y,
|
||||
int64_t* Y,
|
||||
Context* context) const;
|
||||
};
|
||||
|
||||
|
|
@ -90,7 +90,7 @@ struct ArgMinReducer {
|
|||
const int next_size,
|
||||
const int n,
|
||||
const T* X,
|
||||
TIndex* Y,
|
||||
int64_t* Y,
|
||||
Context* context) const;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ class AssertOp final : public Operator<Context> {
|
|||
cmp_tensor_.CopyFrom(Input(0));
|
||||
auto* cmp_data = cmp_tensor_.template data<T>();
|
||||
|
||||
for (TIndex i = 0; i < cmp_tensor_.size(); ++i) {
|
||||
for (int64_t i = 0; i < cmp_tensor_.size(); ++i) {
|
||||
CAFFE_ENFORCE((bool)cmp_data[i], [&]() {
|
||||
std::stringstream ss;
|
||||
ss << "Assert failed for element " << i
|
||||
|
|
|
|||
|
|
@ -29,8 +29,8 @@ class AtomicFetchAddOp final : public Operator<CPUContext> {
|
|||
auto& b = Input(2);
|
||||
auto* c = Output(0);
|
||||
auto* d = Output(1);
|
||||
c->Resize(std::vector<TIndex>());
|
||||
d->Resize(std::vector<TIndex>());
|
||||
c->Resize(std::vector<int64_t>());
|
||||
d->Resize(std::vector<int64_t>());
|
||||
auto* aPtr = a.data<int32_t>();
|
||||
auto* bPtr = b.data<int32_t>();
|
||||
auto* cPtr = c->template mutable_data<int32_t>();
|
||||
|
|
|
|||
|
|
@ -105,7 +105,7 @@ bool BatchBoxCoxOp<CPUContext>::DoRunWithType() {
|
|||
zeros_.clear();
|
||||
nonzeros_.reserve(D);
|
||||
zeros_.reserve(D);
|
||||
for (TIndex j = 0; j < D; j++) {
|
||||
for (int64_t j = 0; j < D; j++) {
|
||||
if (lambda1_ptr[j] == 0) {
|
||||
zeros_.push_back(j);
|
||||
} else {
|
||||
|
|
@ -121,7 +121,7 @@ bool BatchBoxCoxOp<CPUContext>::DoRunWithType() {
|
|||
// rows by replicating the input parameters K times. Then finish row-by-row.
|
||||
TypedCachedBuffers<T>& b = GetBuffers<T>();
|
||||
if (nonzeros_.size() == D) {
|
||||
TIndex i = 0;
|
||||
int64_t i = 0;
|
||||
if (K > 1) {
|
||||
TileArrayIntoVector(lambda1_ptr, D, K, &b.lambda1_);
|
||||
TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_);
|
||||
|
|
@ -142,7 +142,7 @@ bool BatchBoxCoxOp<CPUContext>::DoRunWithType() {
|
|||
D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
|
||||
}
|
||||
} else if (zeros_.size() == D) {
|
||||
TIndex i = 0;
|
||||
int64_t i = 0;
|
||||
if (K > 1) {
|
||||
TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_z_);
|
||||
DCHECK_EQ(K * D, b.lambda2_z_.size());
|
||||
|
|
@ -169,7 +169,7 @@ bool BatchBoxCoxOp<CPUContext>::DoRunWithType() {
|
|||
PackV(nonzeros_.size(), lambda2_ptr, nonzeros_.data(), b.lambda2_.data());
|
||||
PackV(zeros_.size(), lambda2_ptr, zeros_.data(), b.lambda2_z_.data());
|
||||
|
||||
TIndex i = 0;
|
||||
int64_t i = 0;
|
||||
b.accumulator_.resize(std::max(nonzeros_.size(), zeros_.size()));
|
||||
if (K > 1) {
|
||||
// Truncate to original size, and re-tile with offsets this time.
|
||||
|
|
@ -219,15 +219,15 @@ bool BatchBoxCoxOp<CPUContext>::DoRunWithType() {
|
|||
template <>
|
||||
template <typename T>
|
||||
void BatchBoxCoxOp<CPUContext>::BoxCoxNaive(
|
||||
TIndex N,
|
||||
TIndex D,
|
||||
int64_t N,
|
||||
int64_t D,
|
||||
const T* data_ptr,
|
||||
const T* lambda1_ptr,
|
||||
const T* lambda2_ptr,
|
||||
T k_eps,
|
||||
T* output_ptr) {
|
||||
for (TIndex i = 0; i < N; i++) {
|
||||
for (TIndex j = 0; j < D; j++, data_ptr++, output_ptr++) {
|
||||
for (int64_t i = 0; i < N; i++) {
|
||||
for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
|
||||
T lambda1_v = lambda1_ptr[j];
|
||||
T lambda2_v = lambda2_ptr[j];
|
||||
T tmp = std::max(*data_ptr + lambda2_v, k_eps);
|
||||
|
|
@ -245,18 +245,18 @@ void BatchBoxCoxOp<CPUContext>::BoxCoxNaive(
|
|||
template <>
|
||||
template <typename T>
|
||||
void BatchBoxCoxOp<CPUContext>::BoxCoxNonzeroLambda(
|
||||
TIndex D,
|
||||
int64_t D,
|
||||
const T* data_ptr,
|
||||
const T* lambda1,
|
||||
const T* lambda2,
|
||||
T k_eps,
|
||||
T* out) {
|
||||
caffe2::math::Add(D, data_ptr, lambda2, out, &context_);
|
||||
for (TIndex j = 0; j < D; j++) {
|
||||
for (int64_t j = 0; j < D; j++) {
|
||||
out[j] = std::max(out[j], k_eps);
|
||||
}
|
||||
Pow(D, out, lambda1, out);
|
||||
for (TIndex j = 0; j < D; j++) {
|
||||
for (int64_t j = 0; j < D; j++) {
|
||||
out[j] -= 1.0;
|
||||
}
|
||||
caffe2::math::Div(D, out, lambda1, out, &context_);
|
||||
|
|
@ -265,13 +265,13 @@ void BatchBoxCoxOp<CPUContext>::BoxCoxNonzeroLambda(
|
|||
template <>
|
||||
template <typename T>
|
||||
void BatchBoxCoxOp<CPUContext>::BoxCoxZeroLambda(
|
||||
TIndex D,
|
||||
int64_t D,
|
||||
const T* data_ptr,
|
||||
const T* lambda2,
|
||||
T k_eps,
|
||||
T* output_ptr) {
|
||||
caffe2::math::Add(D, data_ptr, lambda2, output_ptr, &context_);
|
||||
for (TIndex j = 0; j < D; j++) {
|
||||
for (int64_t j = 0; j < D; j++) {
|
||||
output_ptr[j] = std::max(output_ptr[j], k_eps);
|
||||
}
|
||||
caffe2::math::Log(D, output_ptr, output_ptr, &context_);
|
||||
|
|
|
|||
|
|
@ -27,8 +27,8 @@ class BatchBoxCoxOp final : public Operator<Context> {
|
|||
protected:
|
||||
template <typename T>
|
||||
void BoxCoxNaive(
|
||||
TIndex N,
|
||||
TIndex D,
|
||||
int64_t N,
|
||||
int64_t D,
|
||||
const T* data_ptr,
|
||||
const T* lambda1_ptr,
|
||||
const T* lambda2_ptr,
|
||||
|
|
@ -38,7 +38,7 @@ class BatchBoxCoxOp final : public Operator<Context> {
|
|||
#ifdef CAFFE2_USE_MKL
|
||||
template <typename T>
|
||||
void BoxCoxNonzeroLambda(
|
||||
TIndex D,
|
||||
int64_t D,
|
||||
const T* data_ptr,
|
||||
const T* lambda1,
|
||||
const T* lambda2,
|
||||
|
|
@ -47,7 +47,7 @@ class BatchBoxCoxOp final : public Operator<Context> {
|
|||
|
||||
template <typename T>
|
||||
void BoxCoxZeroLambda(
|
||||
TIndex D,
|
||||
int64_t D,
|
||||
const T* data_ptr,
|
||||
const T* lambda2,
|
||||
T k_eps,
|
||||
|
|
|
|||
|
|
@ -26,21 +26,21 @@ bool BatchBucketizeOp<CPUContext>::RunOnDevice() {
|
|||
auto feature_dim = feature.dim(1);
|
||||
auto output_dim = indices.size();
|
||||
|
||||
TIndex length_sum = 0;
|
||||
for (TIndex i = 0; i < lengths.size(); i++) {
|
||||
int64_t length_sum = 0;
|
||||
for (int64_t i = 0; i < lengths.size(); i++) {
|
||||
CAFFE_ENFORCE_GE(feature_dim, indices_data[i]);
|
||||
length_sum += lengths_data[i];
|
||||
}
|
||||
CAFFE_ENFORCE_EQ(length_sum, boundaries.size());
|
||||
|
||||
TIndex lower_bound = 0;
|
||||
int64_t lower_bound = 0;
|
||||
output->Resize(batch_size, output_dim);
|
||||
auto* output_data = output->template mutable_data<int32_t>();
|
||||
|
||||
for (TIndex i = 0; i < batch_size; i++) {
|
||||
for (int64_t i = 0; i < batch_size; i++) {
|
||||
lower_bound = 0;
|
||||
for (TIndex j = 0; j < output_dim; j++) {
|
||||
for (TIndex k = 0; k <= lengths_data[j]; k++) {
|
||||
for (int64_t j = 0; j < output_dim; j++) {
|
||||
for (int64_t k = 0; k <= lengths_data[j]; k++) {
|
||||
if (k == lengths_data[j] ||
|
||||
feature_data[i * feature_dim + indices_data[j]] <=
|
||||
boundaries_data[lower_bound + k]) {
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ bool BatchGatherOp<CUDAContext>::DoRunWithType() {
|
|||
auto& indices = Input(INDICES);
|
||||
auto* output = Output(0);
|
||||
|
||||
vector<TIndex> shape;
|
||||
vector<int64_t> shape;
|
||||
shape.push_back(data.dim(0));
|
||||
shape.insert(shape.end(), indices.dims().begin(), indices.dims().end());
|
||||
shape.insert(shape.end(), data.dims().begin() + 2, data.dims().end());
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ class BatchGatherOp final : public Operator<Context> {
|
|||
|
||||
CAFFE_ENFORCE_GE(data.ndim(), 2, "DATA should be at least 2-D");
|
||||
|
||||
vector<TIndex> shape;
|
||||
vector<int64_t> shape;
|
||||
shape.push_back(data.dim(0));
|
||||
shape.insert(shape.end(), indices.dims().begin(), indices.dims().end());
|
||||
shape.insert(shape.end(), data.dims().begin() + 2, data.dims().end());
|
||||
|
|
|
|||
|
|
@ -27,16 +27,16 @@ vector<TensorShape> TensorInferenceForBatchMatMul(
|
|||
b_dim1 = in[1].dims(ndim - 1);
|
||||
}
|
||||
|
||||
auto output_dims = vector<TIndex>{in[0].dims().begin(), in[0].dims().end()};
|
||||
auto output_dims = vector<int64_t>{in[0].dims().begin(), in[0].dims().end()};
|
||||
output_dims[ndim - 2] = a_dim0;
|
||||
output_dims[ndim - 1] = b_dim1;
|
||||
|
||||
return vector<TensorShape>{
|
||||
CreateTensorShape(vector<TIndex>{output_dims}, in[0].data_type())};
|
||||
CreateTensorShape(vector<int64_t>{output_dims}, in[0].data_type())};
|
||||
} else {
|
||||
auto ndims_A = in[0].dims_size();
|
||||
auto ndims_B = in[1].dims_size();
|
||||
std::vector<TIndex> dims_A(ndims_A), dims_B(ndims_B);
|
||||
std::vector<int64_t> dims_A(ndims_A), dims_B(ndims_B);
|
||||
for (int i = 0; i < ndims_A; ++i) {
|
||||
dims_A[i] = in[0].dims(i);
|
||||
}
|
||||
|
|
@ -66,7 +66,7 @@ vector<TensorShape> TensorInferenceForBatchMatMul(
|
|||
N = dims_B[ndims_B - 1];
|
||||
}
|
||||
|
||||
std::vector<TIndex> new_dims;
|
||||
std::vector<int64_t> new_dims;
|
||||
if (ndims_A >= ndims_B) {
|
||||
new_dims.assign(dims_A.begin(), dims_A.end() - 2);
|
||||
} else {
|
||||
|
|
@ -82,7 +82,7 @@ vector<TensorShape> TensorInferenceForBatchMatMul(
|
|||
new_dims.push_back(1);
|
||||
}
|
||||
return vector<TensorShape>{
|
||||
CreateTensorShape(vector<TIndex>{new_dims}, in[0].data_type())};
|
||||
CreateTensorShape(vector<int64_t>{new_dims}, in[0].data_type())};
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -175,7 +175,7 @@ class BatchMatMulOp final : public Operator<Context> {
|
|||
// Calculate output tensor shapes [B..., (M), (N)]
|
||||
// Batch dimensions will be broadcasted out to those of the longer tensor
|
||||
// A or B. Either M or N are optional if A or B, respectively are 1-D.
|
||||
std::vector<TIndex> new_dims;
|
||||
std::vector<int64_t> new_dims;
|
||||
if (ndims_A >= ndims_B) {
|
||||
new_dims.assign(dims_A.begin(), dims_A.end() - 2);
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ class BatchMatMulOpGPUTest : public testing::Test {
|
|||
}
|
||||
|
||||
void AddConstInput(
|
||||
const std::vector<TIndex>& dims,
|
||||
const std::vector<int64_t>& dims,
|
||||
const float value,
|
||||
const string& name) {
|
||||
Blob* blob = ws_.CreateBlob(name);
|
||||
|
|
@ -39,7 +39,7 @@ class BatchMatMulOpGPUTest : public testing::Test {
|
|||
cuda_context_.get());
|
||||
}
|
||||
|
||||
void VerifyOutput(const std::vector<TIndex>& dims, const float value) const {
|
||||
void VerifyOutput(const std::vector<int64_t>& dims, const float value) const {
|
||||
const Blob* Y_blob = ws_.GetBlob("Y");
|
||||
ASSERT_NE(nullptr, Y_blob);
|
||||
const auto& Y = Y_blob->Get<Tensor>();
|
||||
|
|
@ -64,12 +64,12 @@ TEST_F(BatchMatMulOpGPUTest, BatchMatMulOpGPUNormalTest) {
|
|||
if (!HasCudaGPU()) {
|
||||
return;
|
||||
}
|
||||
AddConstInput(std::vector<TIndex>{3, 5, 10}, 1.0f, "A");
|
||||
AddConstInput(std::vector<TIndex>{3, 10, 6}, 1.0f, "B");
|
||||
AddConstInput(std::vector<int64_t>{3, 5, 10}, 1.0f, "A");
|
||||
AddConstInput(std::vector<int64_t>{3, 10, 6}, 1.0f, "B");
|
||||
std::unique_ptr<OperatorBase> op(CreateOperator(def_, &ws_));
|
||||
ASSERT_NE(nullptr, op);
|
||||
ASSERT_TRUE(op->Run());
|
||||
VerifyOutput(std::vector<TIndex>{3, 5, 6}, 10.0f);
|
||||
VerifyOutput(std::vector<int64_t>{3, 5, 6}, 10.0f);
|
||||
}
|
||||
|
||||
TEST_F(BatchMatMulOpGPUTest, BatchMatMulOpGPUBroadcastTest) {
|
||||
|
|
@ -79,12 +79,12 @@ TEST_F(BatchMatMulOpGPUTest, BatchMatMulOpGPUBroadcastTest) {
|
|||
auto* arg = def_.add_arg();
|
||||
arg->set_name("broadcast");
|
||||
arg->set_i(1);
|
||||
AddConstInput(std::vector<TIndex>{3, 5, 10}, 1.0f, "A");
|
||||
AddConstInput(std::vector<TIndex>{2, 3, 10, 6}, 1.0f, "B");
|
||||
AddConstInput(std::vector<int64_t>{3, 5, 10}, 1.0f, "A");
|
||||
AddConstInput(std::vector<int64_t>{2, 3, 10, 6}, 1.0f, "B");
|
||||
std::unique_ptr<OperatorBase> op(CreateOperator(def_, &ws_));
|
||||
ASSERT_NE(nullptr, op);
|
||||
ASSERT_TRUE(op->Run());
|
||||
VerifyOutput(std::vector<TIndex>{2, 3, 5, 6}, 10.0f);
|
||||
VerifyOutput(std::vector<int64_t>{2, 3, 5, 6}, 10.0f);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ class BatchMatMulOpTest : public testing::Test {
|
|||
}
|
||||
|
||||
void AddConstInput(
|
||||
const std::vector<TIndex>& dims,
|
||||
const std::vector<int64_t>& dims,
|
||||
const float value,
|
||||
const string& name) {
|
||||
Blob* blob = ws_.CreateBlob(name);
|
||||
|
|
@ -33,7 +33,7 @@ class BatchMatMulOpTest : public testing::Test {
|
|||
cpu_context_.get());
|
||||
}
|
||||
|
||||
void VerifyOutput(const std::vector<TIndex>& dims, const float value) const {
|
||||
void VerifyOutput(const std::vector<int64_t>& dims, const float value) const {
|
||||
const Blob* Y_blob = ws_.GetBlob("Y");
|
||||
ASSERT_NE(nullptr, Y_blob);
|
||||
const auto& Y = Y_blob->Get<TensorCPU>();
|
||||
|
|
@ -54,24 +54,24 @@ class BatchMatMulOpTest : public testing::Test {
|
|||
};
|
||||
|
||||
TEST_F(BatchMatMulOpTest, BatchMatMulOpNormalTest) {
|
||||
AddConstInput(std::vector<TIndex>{3, 5, 10}, 1.0f, "A");
|
||||
AddConstInput(std::vector<TIndex>{3, 10, 6}, 1.0f, "B");
|
||||
AddConstInput(std::vector<int64_t>{3, 5, 10}, 1.0f, "A");
|
||||
AddConstInput(std::vector<int64_t>{3, 10, 6}, 1.0f, "B");
|
||||
std::unique_ptr<OperatorBase> op(CreateOperator(def_, &ws_));
|
||||
ASSERT_NE(nullptr, op);
|
||||
ASSERT_TRUE(op->Run());
|
||||
VerifyOutput(std::vector<TIndex>{3, 5, 6}, 10.0f);
|
||||
VerifyOutput(std::vector<int64_t>{3, 5, 6}, 10.0f);
|
||||
}
|
||||
|
||||
TEST_F(BatchMatMulOpTest, BatchMatMulOpBroadcastTest) {
|
||||
auto* arg = def_.add_arg();
|
||||
arg->set_name("broadcast");
|
||||
arg->set_i(1);
|
||||
AddConstInput(std::vector<TIndex>{3, 5, 10}, 1.0f, "A");
|
||||
AddConstInput(std::vector<TIndex>{2, 3, 10, 6}, 1.0f, "B");
|
||||
AddConstInput(std::vector<int64_t>{3, 5, 10}, 1.0f, "A");
|
||||
AddConstInput(std::vector<int64_t>{2, 3, 10, 6}, 1.0f, "B");
|
||||
std::unique_ptr<OperatorBase> op(CreateOperator(def_, &ws_));
|
||||
ASSERT_NE(nullptr, op);
|
||||
ASSERT_TRUE(op->Run());
|
||||
VerifyOutput(std::vector<TIndex>{2, 3, 5, 6}, 10.0f);
|
||||
VerifyOutput(std::vector<int64_t>{2, 3, 5, 6}, 10.0f);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
|
|
|||
|
|
@ -14,15 +14,15 @@ bool BatchSparseToDenseOp<T, Context>::RunOnDevice() {
|
|||
CAFFE_ENFORCE_EQ(lengths.ndim(), 1);
|
||||
CAFFE_ENFORCE_EQ(indices.ndim(), 1);
|
||||
|
||||
const TIndex* lengths_data = lengths.template data<TIndex>();
|
||||
const TIndex* indices_data = indices.template data<TIndex>();
|
||||
const int64_t* lengths_data = lengths.template data<int64_t>();
|
||||
const int64_t* indices_data = indices.template data<int64_t>();
|
||||
const T* values_data = values.template data<T>();
|
||||
TIndex batch_size = lengths.size();
|
||||
TIndex lengths_sum = 0;
|
||||
math::Sum<TIndex, Context>(batch_size, lengths_data, &lengths_sum, &context_);
|
||||
int64_t batch_size = lengths.size();
|
||||
int64_t lengths_sum = 0;
|
||||
math::Sum<int64_t, Context>(batch_size, lengths_data, &lengths_sum, &context_);
|
||||
CAFFE_ENFORCE_EQ(lengths_sum, indices.size());
|
||||
|
||||
vector<TIndex> output_shape = {batch_size};
|
||||
vector<int64_t> output_shape = {batch_size};
|
||||
if (InputSize() == 4) {
|
||||
auto& shaper = Input(3);
|
||||
CAFFE_ENFORCE_EQ(shaper.ndim(), 2);
|
||||
|
|
@ -42,9 +42,9 @@ bool BatchSparseToDenseOp<T, Context>::RunOnDevice() {
|
|||
math::Set(
|
||||
output->size(), static_cast<T>(default_value_), output_data, &context_);
|
||||
|
||||
TIndex k = 0;
|
||||
for (TIndex i = 0; i < batch_size; ++i) {
|
||||
for (TIndex j = 0; j < lengths_data[i]; ++j) {
|
||||
int64_t k = 0;
|
||||
for (int64_t i = 0; i < batch_size; ++i) {
|
||||
for (int64_t j = 0; j < lengths_data[i]; ++j) {
|
||||
CAFFE_ENFORCE(
|
||||
indices_data[k] < dense_last_dim_,
|
||||
"An indice (",
|
||||
|
|
@ -69,24 +69,24 @@ bool BatchDenseToSparseOp<T, Context>::RunOnDevice() {
|
|||
CAFFE_ENFORCE_EQ(lengths.ndim(), 1);
|
||||
CAFFE_ENFORCE_EQ(indices.ndim(), 1);
|
||||
CAFFE_ENFORCE_EQ(dense.ndim(), 2);
|
||||
const TIndex* lengths_data = lengths.template data<TIndex>();
|
||||
const TIndex* indices_data = indices.template data<TIndex>();
|
||||
const int64_t* lengths_data = lengths.template data<int64_t>();
|
||||
const int64_t* indices_data = indices.template data<int64_t>();
|
||||
const T* dense_data = dense.template data<T>();
|
||||
|
||||
TIndex batch_size = lengths.size();
|
||||
TIndex lengths_sum = 0;
|
||||
math::Sum<TIndex, Context>(batch_size, lengths_data, &lengths_sum, &context_);
|
||||
int64_t batch_size = lengths.size();
|
||||
int64_t lengths_sum = 0;
|
||||
math::Sum<int64_t, Context>(batch_size, lengths_data, &lengths_sum, &context_);
|
||||
CAFFE_ENFORCE_EQ(lengths_sum, indices.size());
|
||||
|
||||
CAFFE_ENFORCE_EQ(batch_size, dense.dim(0));
|
||||
dense_last_dim_ = dense.dim(1);
|
||||
vector<TIndex> output_shape = indices.dims();
|
||||
vector<int64_t> output_shape = indices.dims();
|
||||
output->Resize(output_shape);
|
||||
T* output_data = output->template mutable_data<T>();
|
||||
|
||||
TIndex k = 0;
|
||||
for (TIndex i = 0; i < batch_size; ++i) {
|
||||
for (TIndex j = 0; j < lengths_data[i]; ++j) {
|
||||
int64_t k = 0;
|
||||
for (int64_t i = 0; i < batch_size; ++i) {
|
||||
for (int64_t j = 0; j < lengths_data[i]; ++j) {
|
||||
CAFFE_ENFORCE(
|
||||
indices_data[k] < dense.dim(1),
|
||||
"An indice (",
|
||||
|
|
|
|||
|
|
@ -15,12 +15,12 @@ class BatchSparseToDenseOp : public Operator<Context> {
|
|||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
BatchSparseToDenseOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
OP_SINGLE_ARG(TIndex, "dense_last_dim", dense_last_dim_, -1),
|
||||
OP_SINGLE_ARG(int64_t, "dense_last_dim", dense_last_dim_, -1),
|
||||
OP_SINGLE_ARG(T, "default_value", default_value_, static_cast<T>(0)) {}
|
||||
bool RunOnDevice() override;
|
||||
|
||||
private:
|
||||
TIndex dense_last_dim_;
|
||||
int64_t dense_last_dim_;
|
||||
T default_value_;
|
||||
INPUT_TAGS(LENGTHS, INDICES, VALUES);
|
||||
};
|
||||
|
|
@ -34,7 +34,7 @@ class BatchDenseToSparseOp : public Operator<Context> {
|
|||
bool RunOnDevice() override;
|
||||
|
||||
private:
|
||||
TIndex dense_last_dim_;
|
||||
int64_t dense_last_dim_;
|
||||
INPUT_TAGS(LENGTHS, INDICES, DENSE);
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -138,7 +138,7 @@ bool BBoxTransformOp<float, CPUContext>::RunOnDevice() {
|
|||
}
|
||||
}
|
||||
|
||||
CAFFE_ENFORCE_EQ(iminfo_in.dims(), (vector<TIndex>{batch_size, 3}));
|
||||
CAFFE_ENFORCE_EQ(iminfo_in.dims(), (vector<int64_t>{batch_size, 3}));
|
||||
Eigen::Map<const ERArrXXf> iminfo(
|
||||
iminfo_in.data<float>(), iminfo_in.dim(0), iminfo_in.dim(1));
|
||||
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ bool BooleanMaskOp<CPUContext>::RunOnDevice() {
|
|||
++numOutputs;
|
||||
}
|
||||
}
|
||||
std::vector<TIndex> outShape;
|
||||
std::vector<int64_t> outShape;
|
||||
outShape.push_back(numOutputs);
|
||||
outShape.insert(outShape.end(), data.dims().begin() + 1, data.dims().end());
|
||||
dataOut->Resize(outShape);
|
||||
|
|
@ -81,11 +81,11 @@ bool BooleanMaskOp<CPUContext>::RunOnDevice() {
|
|||
const auto innerSize = data.size_from_dim(1);
|
||||
const auto innerSizeBytes = innerSize * data.meta().itemsize();
|
||||
|
||||
TIndex lastStart = -1;
|
||||
int64_t lastStart = -1;
|
||||
const auto* inPtr = (char*)data.raw_data();
|
||||
TIndex outStart = 0;
|
||||
int64_t outStart = 0;
|
||||
|
||||
for (TIndex i = 0;; ++i) {
|
||||
for (int64_t i = 0;; ++i) {
|
||||
// mask was true and either a) became false, or b) sequence finished
|
||||
if (lastStart != -1 && ((i >= outerSize) || !maskPtr[i])) {
|
||||
const auto* src = inPtr + lastStart * innerSizeBytes;
|
||||
|
|
|
|||
|
|
@ -7,15 +7,15 @@ namespace caffe2 {
|
|||
|
||||
namespace {
|
||||
__global__ void BooleanMaskCopyKernel(
|
||||
const TIndex numOfOutput,
|
||||
const TIndex numBytes,
|
||||
const TIndex* indices,
|
||||
const int64_t numOfOutput,
|
||||
const int64_t numBytes,
|
||||
const int64_t* indices,
|
||||
const uint8_t* src,
|
||||
uint8_t* dest) {
|
||||
for (TIndex i = blockIdx.x; i < numOfOutput; i += gridDim.x) {
|
||||
for (int64_t i = blockIdx.x; i < numOfOutput; i += gridDim.x) {
|
||||
const auto srcBase = indices[i] * numBytes;
|
||||
const auto destBase = i * numBytes;
|
||||
for (TIndex j = threadIdx.x; j < numBytes; j += blockDim.x) {
|
||||
for (int64_t j = threadIdx.x; j < numBytes; j += blockDim.x) {
|
||||
dest[destBase + j] = src[srcBase + j];
|
||||
}
|
||||
}
|
||||
|
|
@ -40,7 +40,7 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
|
|||
const auto* maskData = mask.data<bool>();
|
||||
const auto outerSize = mask.dims()[0];
|
||||
indices_.Resize(outerSize);
|
||||
auto* indicesData = indices_.mutable_data<TIndex>();
|
||||
auto* indicesData = indices_.mutable_data<int64_t>();
|
||||
|
||||
size_t numBytes = 0;
|
||||
cub::CountingInputIterator<int> itr(0);
|
||||
|
|
@ -50,16 +50,16 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
|
|||
itr,
|
||||
maskData,
|
||||
indicesData,
|
||||
static_cast<TIndex*>(nullptr),
|
||||
static_cast<int64_t*>(nullptr),
|
||||
outerSize,
|
||||
context_.cuda_stream());
|
||||
|
||||
auto numTIndex =
|
||||
static_cast<TIndex>((numBytes + sizeof(TIndex) - 1) / sizeof(TIndex));
|
||||
// allocate one more TIndex at the end of scratch for storing numOfOutput
|
||||
scratch_.Resize(numTIndex + 1);
|
||||
auto* scratchData = scratch_.mutable_data<TIndex>();
|
||||
auto* numOfOutputData = scratchData + numTIndex;
|
||||
auto numint64_t =
|
||||
static_cast<int64_t>((numBytes + sizeof(int64_t) - 1) / sizeof(int64_t));
|
||||
// allocate one more int64_t at the end of scratch for storing numOfOutput
|
||||
scratch_.Resize(numint64_t + 1);
|
||||
auto* scratchData = scratch_.mutable_data<int64_t>();
|
||||
auto* numOfOutputData = scratchData + numint64_t;
|
||||
|
||||
cub::DeviceSelect::Flagged(
|
||||
static_cast<void*>(scratchData),
|
||||
|
|
@ -72,11 +72,11 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
|
|||
context_.cuda_stream());
|
||||
|
||||
// Copy numOfOutput from gpu to cpu
|
||||
TIndex numOfOutput;
|
||||
int64_t numOfOutput;
|
||||
context_.CopyToCPU(1, numOfOutputData, &numOfOutput);
|
||||
|
||||
indices_.Resize(numOfOutput);
|
||||
std::vector<TIndex> dims = src.dims();
|
||||
std::vector<int64_t> dims = src.dims();
|
||||
dims[0] = numOfOutput;
|
||||
dest->Resize(dims);
|
||||
auto* destData = (uint8_t*)dest->raw_mutable_data(src.meta());
|
||||
|
|
@ -84,12 +84,12 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
|
|||
if (OutputSize() == 2) {
|
||||
auto* indicesOut = Output(1);
|
||||
indicesOut->Resize(numOfOutput);
|
||||
indicesOut->template mutable_data<TIndex>();
|
||||
indicesOut->template mutable_data<int64_t>();
|
||||
}
|
||||
|
||||
if (numOfOutput > 0) {
|
||||
BooleanMaskCopyKernel<<<
|
||||
min(numOfOutput, static_cast<TIndex>(CAFFE_MAXIMUM_NUM_BLOCKS)),
|
||||
min(numOfOutput, static_cast<int64_t>(CAFFE_MAXIMUM_NUM_BLOCKS)),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
|
|
|
|||
|
|
@ -18,10 +18,10 @@ static void AddScalarInput(
|
|||
Blob* blob = ws->CreateBlob(name);
|
||||
auto* tensor = blob->GetMutableTensor(CPU);
|
||||
if (!isEmpty) {
|
||||
tensor->Resize(vector<TIndex>{1});
|
||||
tensor->Resize(vector<int64_t>{1});
|
||||
*(tensor->template mutable_data<DataT>()) = value;
|
||||
} else {
|
||||
tensor->Resize(vector<TIndex>{0});
|
||||
tensor->Resize(vector<int64_t>{0});
|
||||
tensor->template mutable_data<DataT>();
|
||||
}
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ bool CastOp<CPUContext>::DoRunWithType() {
|
|||
const auto* data = input.template data<SrcType>();
|
||||
auto* out = output->template mutable_data<DstType>();
|
||||
auto N = input.size();
|
||||
for (TIndex i = 0; i < N; ++i) {
|
||||
for (int64_t i = 0; i < N; ++i) {
|
||||
out[i] = static_cast<DstType>(data[i]);
|
||||
}
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ class CastOp : public Operator<Context> {
|
|||
const auto* data = input.template data<SrcType>();
|
||||
auto* out = output->template mutable_data<DstType>();
|
||||
auto N = input.size();
|
||||
for (TIndex i = 0; i < N; ++i) {
|
||||
for (int64_t i = 0; i < N; ++i) {
|
||||
out[i] = static_cast<DstType>(data[i]);
|
||||
}
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -161,7 +161,7 @@ bool SplitOp<Context>::RunOnDevice() {
|
|||
input_channels,
|
||||
"Sum of split dimensions do not match: should be ",
|
||||
input_channels);
|
||||
vector<TIndex> output_dims(input.dims());
|
||||
vector<int64_t> output_dims(input.dims());
|
||||
int before = 1, after = 1;
|
||||
for (int i = 0; i < canonical_axis; ++i) {
|
||||
before *= input.dim32(i);
|
||||
|
|
@ -215,7 +215,7 @@ bool SplitByLengthsOp<Context>::RunOnDevice() {
|
|||
input_channels,
|
||||
"Sum of split dimensions do not match: should be ",
|
||||
input_channels);
|
||||
vector<TIndex> output_dims(input.dims());
|
||||
vector<int64_t> output_dims(input.dims());
|
||||
int before = input.size_to_dim(canonical_axis);
|
||||
int after = input.size_from_dim(canonical_axis + 1);
|
||||
size_t input_offset = 0;
|
||||
|
|
@ -245,7 +245,7 @@ template <class Context>
|
|||
bool ConcatOp<Context>::RunOnDevice() {
|
||||
auto* output = Output(0);
|
||||
Tensor* split = this->template Output<Tensor>(1, CPU);
|
||||
split->Resize(vector<TIndex>(1, InputSize()));
|
||||
split->Resize(vector<int64_t>(1, InputSize()));
|
||||
int* axis_data = split->template mutable_data<int>();
|
||||
auto& input_zero = Input(0);
|
||||
int adj_size = input_zero.ndim() + (add_axis_ ? 1 : 0);
|
||||
|
|
@ -263,7 +263,7 @@ bool ConcatOp<Context>::RunOnDevice() {
|
|||
}
|
||||
|
||||
int before = 1, after = 1;
|
||||
vector<TIndex> output_dims(input_zero.dims());
|
||||
vector<int64_t> output_dims(input_zero.dims());
|
||||
for (int i = 0; i < input_zero.ndim(); ++i) {
|
||||
if (i == canonical_axis && !add_axis_) {
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ bool ConditionalOp<CPUContext>::RunOnDevice() {
|
|||
// perform conditional op along first dimension
|
||||
const auto* ptrT = (char*)dataT.raw_data();
|
||||
const auto* ptrF = (char*)dataF.raw_data();
|
||||
for (TIndex i = 0; i < condition.size(); i++) {
|
||||
for (int64_t i = 0; i < condition.size(); i++) {
|
||||
auto* dst = outPtr + i * innerSizeBytes;
|
||||
if (condPtr[i]) {
|
||||
context_.CopyItemsSameDevice(
|
||||
|
|
|
|||
|
|
@ -16,8 +16,8 @@ class AlgorithmsCache {
|
|||
// combination of tensor dimensions & compute data type.
|
||||
//
|
||||
TAlgorithm getAlgorithm(
|
||||
const std::vector<TIndex>& tensorDimensions1,
|
||||
const std::vector<TIndex>& tensorDimensions2,
|
||||
const std::vector<int64_t>& tensorDimensions1,
|
||||
const std::vector<int64_t>& tensorDimensions2,
|
||||
int algorithmFlags, // Differentiate between algorithms with different
|
||||
// parameters in a generic way
|
||||
std::function<TAlgorithm()> generatingFunc);
|
||||
|
|
@ -28,14 +28,14 @@ class AlgorithmsCache {
|
|||
|
||||
template <typename TAlgorithm>
|
||||
TAlgorithm AlgorithmsCache<TAlgorithm>::getAlgorithm(
|
||||
const std::vector<TIndex>& tensorDimensions1,
|
||||
const std::vector<TIndex>& tensorDimensions2,
|
||||
const std::vector<int64_t>& tensorDimensions1,
|
||||
const std::vector<int64_t>& tensorDimensions2,
|
||||
int algorithmFlags,
|
||||
std::function<TAlgorithm()> generatingFunc) {
|
||||
int64_t seed = 0;
|
||||
// Hash all of the inputs, which we wiill then use to try and look up
|
||||
// a previously discovered algorithm, or fall back to generating a new one.
|
||||
std::hash<TIndex> hashFn;
|
||||
std::hash<int64_t> hashFn;
|
||||
for (const auto num : tensorDimensions1) {
|
||||
// Copied from boost::hash_combine.
|
||||
// Adding 1 to differentiate between first and second vector.
|
||||
|
|
|
|||
|
|
@ -12,11 +12,11 @@ namespace caffe2 {
TEST(AlgorithmsCacheTest, CachesCorrectly) {
AlgorithmsCache<int> cache;
int result = cache.getAlgorithm(
std::vector<TIndex>(1), std::vector<TIndex>(1), 0, []() { return 5; });
std::vector<int64_t>(1), std::vector<int64_t>(1), 0, []() { return 5; });
EXPECT_EQ(result, 5);

int res2 = cache.getAlgorithm(
std::vector<TIndex>(1), std::vector<TIndex>(1), 0, []() { return 10; });
std::vector<int64_t>(1), std::vector<int64_t>(1), 0, []() { return 10; });

EXPECT_EQ(res2, 5);
}

@ -24,11 +24,11 @@ TEST(AlgorithmsCacheTest, CachesCorrectly) {
TEST(AlgorithmsCacheTest, KeysDifferIfOneVectorIsEmpty) {
AlgorithmsCache<int> cache;
int result = cache.getAlgorithm(
std::vector<TIndex>(1, 10), std::vector<TIndex>(), 0, []() { return 5; });
std::vector<int64_t>(1, 10), std::vector<int64_t>(), 0, []() { return 5; });
EXPECT_EQ(result, 5);

int res2 = cache.getAlgorithm(
std::vector<TIndex>(), std::vector<TIndex>(1, 10), 0, []() {
std::vector<int64_t>(), std::vector<int64_t>(1, 10), 0, []() {
return 10;
});

@ -38,20 +38,20 @@ TEST(AlgorithmsCacheTest, KeysDifferIfOneVectorIsEmpty) {
TEST(AlgorithmsCacheTest, KeysDifferIfFlagsAreDifferent) {
AlgorithmsCache<int> cache;
int result = cache.getAlgorithm(
std::vector<TIndex>{2, 3, 4}, std::vector<TIndex>{5, 6}, 123, []() {
std::vector<int64_t>{2, 3, 4}, std::vector<int64_t>{5, 6}, 123, []() {
return 5;
});
EXPECT_EQ(result, 5);

int res2 = cache.getAlgorithm(
std::vector<TIndex>{2, 3, 4}, std::vector<TIndex>{5, 6}, 456, []() {
std::vector<int64_t>{2, 3, 4}, std::vector<int64_t>{5, 6}, 456, []() {
return 10;
});

EXPECT_EQ(res2, 10);

int res3 = cache.getAlgorithm(
std::vector<TIndex>{2, 3, 4}, std::vector<TIndex>{5, 6}, 456, []() {
std::vector<int64_t>{2, 3, 4}, std::vector<int64_t>{5, 6}, 456, []() {
return 15;
});

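For context on the AlgorithmsCache hunks above: the cache keys an algorithm on a single int64_t seed hashed from both dimension vectors plus the flags, which is why the tests expect the cached value back whenever dims and flags match and a fresh value otherwise. A condensed sketch of that keying, assuming an unordered_map store and the boost::hash_combine recipe the comment refers to (class and member names here are illustrative):

#include <cstdint>
#include <functional>
#include <unordered_map>
#include <vector>

template <typename TAlgorithm>
class DimsKeyedCache {
 public:
  TAlgorithm getAlgorithm(
      const std::vector<int64_t>& dims1,
      const std::vector<int64_t>& dims2,
      int flags,
      std::function<TAlgorithm()> generatingFunc) {
    int64_t seed = 0;
    std::hash<int64_t> hashFn;
    // boost::hash_combine; the +1 / +2 / +3 offsets keep the two vectors and
    // the flags from hashing identically when they hold the same numbers.
    for (const auto num : dims1) {
      seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
    }
    for (const auto num : dims2) {
      seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 2;
    }
    seed ^= hashFn(flags) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 3;

    auto it = cache_.find(seed);
    if (it != cache_.end()) {
      return it->second; // previously discovered algorithm
    }
    TAlgorithm value = generatingFunc(); // fall back to generating a new one
    cache_[seed] = value;
    return value;
  }

 private:
  std::unordered_map<int64_t, TAlgorithm> cache_;
};
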
@ -411,8 +411,8 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
}
}

vector<TIndex> cudnn_input_dims_;
vector<TIndex> cudnn_filter_dims_;
vector<int64_t> cudnn_input_dims_;
vector<int64_t> cudnn_filter_dims_;

CuDNNWrapper cudnn_wrapper_;
cudnnTensorDescriptor_t bottom_desc_;

@ -42,10 +42,10 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
Eigen::array<TIndex, 4> kernel_shuffles
{ {TIndex(2), TIndex(3), TIndex(1), TIndex(0)} };
Eigen::array<TIndex, 4> input_shuffles
{ {TIndex(0), TIndex(2), TIndex(3), TIndex(1)} };
Eigen::array<int64_t, 4> kernel_shuffles
{ {int64_t(2), int64_t(3), int64_t(1), int64_t(0)} };
Eigen::array<int64_t, 4> input_shuffles
{ {int64_t(0), int64_t(2), int64_t(3), int64_t(1)} };

Eigen::Tensor<T, 4, Eigen::RowMajor> filter_tensor =
Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(

@ -109,14 +109,14 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
// It seems that the bias broadcast is still slower so let's do the
// following for now.
EigenArrayMap<T> Y_arr(
Y_tensor.data(), static_cast<TIndex>(M), Y->size() / M);
Y_tensor.data(), static_cast<int64_t>(M), Y->size() / M);
ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), M);
Y_arr = Y_arr.colwise() + bias_arr;
}

// Do a last transpose.
Eigen::array<TIndex, 4> output_shuffles
{ {TIndex(0), TIndex(3), TIndex(1), TIndex(2) } };
Eigen::array<int64_t, 4> output_shuffles
{ {int64_t(0), int64_t(3), int64_t(1), int64_t(2) } };

Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
Y->template mutable_data<T>(), N, M, Y->dim32(2), Y->dim32(3)) =

@ -204,7 +204,7 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
// It seems that the bias broadcast is still slower so let's do the
// following for now.
EigenArrayMap<T> Y_arr(
Y->template mutable_data<T>(), static_cast<TIndex>(M), Y->size() / M);
Y->template mutable_data<T>(), static_cast<int64_t>(M), Y->size() / M);
ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), M);
Y_arr = Y_arr.colwise() + bias_arr;
}

@ -240,7 +240,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
}
auto f = [&](Tensor* col_buffer) {
col_buffer->Resize(
vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C});
vector<int64_t>{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C});
T* col_buffer_data = col_buffer->template mutable_data<T>();
// Im2Col, followed by gemm.
for (int image_id = 0; image_id < N; ++image_id) {

@ -504,7 +504,7 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
dbias->Resize(M);
if (bias_multiplier_.size() != output_image_size) {
// If the helper bias multiplier is not M, reshape and fill it with one.
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
bias_multiplier_.Resize(vector<int64_t>(1, output_image_size));
math::Set<T, Context>(
output_image_size,
static_cast<T>(1),

@ -689,7 +689,7 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
math::Set<T, Context>(dbias->size(), 0, dbias_data, &context_);
if (bias_multiplier_.size() != output_image_size) {
// If the helper bias multiplier is not M, reshape and fill it with one.
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
bias_multiplier_.Resize(vector<int64_t>(1, output_image_size));
math::Set<T, Context>(
output_image_size,
static_cast<T>(1),

@ -246,7 +246,7 @@ class ConvPoolOpBase : public Operator<Context> {
// Helper function that is also called from OperatorSchema. Modified
// kernel parameters and output output_dims and channel_first.
static inline void InferOutputSize(
vector<TIndex> input_dims,
vector<int64_t> input_dims,
int /*output_channel*/,
StorageOrder order,
bool global_pooling,

@ -259,7 +259,7 @@ class ConvPoolOpBase : public Operator<Context> {
vector<int>& pads,
bool& channel_first) {
channel_first = false; // initialized to suppress compiler warning.
vector<TIndex> dims;
vector<int64_t> dims;
switch (order) {
case StorageOrder::NHWC:
channel_first = false;

@ -358,7 +358,7 @@ class ConvPoolOpBase : public Operator<Context> {
if (bias_multiplier_->size() != size) {
// If the helper bias multiplier is not image size, reshape and fill it
// with one.
bias_multiplier_->Resize(std::vector<TIndex>{size});
bias_multiplier_->Resize(std::vector<int64_t>{size});
math::Set<T, Context>(
size,
static_cast<T>(1),

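Several conv hunks above resize a bias_multiplier buffer to output_image_size and fill it with ones. That buffer exists so bias broadcasting (and, in the gradient ops, bias reduction) can be phrased as a GEMM against a ones vector instead of an explicit loop. A small sketch of the equivalent arithmetic without the math library, assuming NCHW layout and free-standing names:

#include <cstdint>
#include <vector>

// Adds bias[m] to every output pixel of channel m, which is what the rank-1
// update Y += bias * bias_multiplier^T performed via math::Gemm amounts to.
void AddBiasNCHW(
    const std::vector<float>& bias, // M output channels
    int64_t output_image_size, // H_out * W_out
    std::vector<float>& Y) { // M * output_image_size values, CHW layout
  const std::vector<float> bias_multiplier(output_image_size, 1.0f);
  for (size_t m = 0; m < bias.size(); ++m) {
    for (int64_t p = 0; p < output_image_size; ++p) {
      Y[m * output_image_size + p] += bias[m] * bias_multiplier[p];
    }
  }
}
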
@ -64,8 +64,8 @@ class CudnnConvTransposeOpBase : public ConvTransposeUnpoolBase<CUDAContext> {
}

protected:
vector<TIndex> cudnn_input_dims_;
vector<TIndex> cudnn_filter_dims_;
vector<int64_t> cudnn_input_dims_;
vector<int64_t> cudnn_filter_dims_;

CuDNNWrapper cudnn_wrapper_;
cudnnTensorDescriptor_t bottom_desc_;

@ -45,7 +45,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
bias.dim32(0) == C,
"bias dimension must be equal to output channel number");
if (bias_multiplier_.size() != output_image_size) {
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
bias_multiplier_.Resize(vector<int64_t>(1, output_image_size));
T* bm_data = bias_multiplier_.template mutable_data<T>();
math::Set<T, Context>(
output_image_size,

@ -61,7 +61,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {

auto f = [&](Tensor* col_buffer) {
col_buffer->Resize(
vector<TIndex>{C, this->kernel_h(), this->kernel_w(), H, W});
vector<int64_t>{C, this->kernel_h(), this->kernel_w(), H, W});
T* col_buffer_data = col_buffer->template mutable_data<T>();
for (auto image_id = 0; image_id < N; ++image_id) {
// Weight term

@ -167,7 +167,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNHWC() {
bias.dim32(0) == C,
"bias dimension must be equal to output channel number");
if (bias_multiplier_.size() != output_image_size) {
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
bias_multiplier_.Resize(vector<int64_t>(1, output_image_size));
T* bm_data = bias_multiplier_.template mutable_data<T>();
math::Set<T, Context>(
output_image_size,

@ -182,7 +182,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNHWC() {

auto f = [&](Tensor* /*col_buffer*/) {
col_buffer_.Resize(
vector<TIndex>{H, W, this->kernel_h(), this->kernel_w(), C});
vector<int64_t>{H, W, this->kernel_h(), this->kernel_w(), C});
T* col_buffer_data = col_buffer_.template mutable_data<T>();
for (auto image_id = 0; image_id < N; ++image_id) {
// Weight term

@ -270,7 +270,7 @@ bool ConvTransposeGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
const int output_image_size = dY.dim32(2) * dY.dim32(3);
// The col buffer is stored in CHW order as well
col_buffer_.Resize(
vector<TIndex>{C, this->kernel_h(), this->kernel_w(), H, W});
vector<int64_t>{C, this->kernel_h(), this->kernel_w(), H, W});
if (!no_bias_) {
auto* dbias = Output(BIAS_OR_INPUT_GRAD);
dbias->Resize(C);

@ -422,7 +422,7 @@ bool ConvTransposeGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
const int output_image_size = dY.dim32(1) * dY.dim32(2);
// The col buffer is stored in HWC order as well
col_buffer_.Resize(
vector<TIndex>{H, W, this->kernel_h(), this->kernel_w(), C});
vector<int64_t>{H, W, this->kernel_h(), this->kernel_w(), C});
if (!no_bias_) {
auto* dbias = Output(BIAS_OR_INPUT_GRAD);
dbias->Resize(C);

@ -10,7 +10,7 @@

namespace caffe2 {

void AddConstInput(const vector<TIndex>& shape,
void AddConstInput(const vector<int64_t>& shape,
const float value,
const string& name,
Workspace* ws) {

@ -23,7 +23,7 @@ void AddConstInput(const vector<TIndex>& shape,
tensor->size(), value, tensor->template mutable_data<float>(), &context);
}

void AddNoiseInput(const vector<TIndex>& shape,
void AddNoiseInput(const vector<int64_t>& shape,
const string& name,
Workspace* ws) {
DeviceOption option;

@ -81,9 +81,9 @@ void compare(int N, int inputC, int H, int W,
def1.add_arg()->CopyFrom(MakeArgument("adj_h", adjH));
def1.add_arg()->CopyFrom(MakeArgument("adj_w", adjW));

AddNoiseInput(vector<TIndex>{N, inputC, H, W}, "X", &ws);
AddNoiseInput(vector<TIndex>{inputC, outputC, kernelH, kernelW}, "W", &ws);
AddNoiseInput(vector<TIndex>{outputC}, "B", &ws);
AddNoiseInput(vector<int64_t>{N, inputC, H, W}, "X", &ws);
AddNoiseInput(vector<int64_t>{inputC, outputC, kernelH, kernelW}, "W", &ws);
AddNoiseInput(vector<int64_t>{outputC}, "B", &ws);

unique_ptr<OperatorBase> op1(CreateOperator(def1, &ws));
EXPECT_NE(nullptr, op1.get());

@ -80,9 +80,9 @@ bool SigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {

auto* out = Output(0);
if (logits.ndim() == 0) {
out->Resize(std::vector<TIndex>{});
out->Resize(std::vector<int64_t>{});
} else {
std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
std::vector<int64_t> dims(logits.dims().begin(), logits.dims().end() - 1);
out->Resize(dims);
}
auto* out_ptr = out->template mutable_data<float>();

@ -162,9 +162,9 @@ bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {

auto* out = Output(0);
if (logits.ndim() == 0) {
out->Resize(std::vector<TIndex>{});
out->Resize(std::vector<int64_t>{});
} else {
std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
std::vector<int64_t> dims(logits.dims().begin(), logits.dims().end() - 1);
out->Resize(dims);
}
auto* out_ptr = out->template mutable_data<float>();

@ -260,11 +260,11 @@ bool MakeTwoClassOp<float, CPUContext>::RunOnDevice() {
auto* Y = Output(0);
auto shape = X.dims();
shape.push_back(2);
TIndex N = X.size();
int64_t N = X.size();
Y->Resize(shape);
const auto* Xdata = X.data<float>();
auto* Ydata = Y->template mutable_data<float>();
for (TIndex i = 0; i < N; ++i) {
for (int64_t i = 0; i < N; ++i) {
DCHECK_GE(Xdata[i], 0.0);
DCHECK_LE(Xdata[i], 1.0);
Ydata[i * 2] = 1.0 - Xdata[i];

@ -284,9 +284,9 @@ bool MakeTwoClassGradientOp<float, CPUContext>::RunOnDevice() {
dX->Resize(shape);
const float* dYdata = dY.data<float>();
float* dXdata = dX->template mutable_data<float>();
TIndex N = dX->size();
int64_t N = dX->size();
// use eigen?
for (TIndex i = 0; i < N; ++i) {
for (int64_t i = 0; i < N; ++i) {
dXdata[i] = dYdata[i * 2 + 1] - dYdata[i * 2];
}
return true;

@ -308,7 +308,7 @@ bool CrossEntropyOp<float, CPUContext>::RunOnDevice() {
CAFFE_ENFORCE(
(label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == D));
CAFFE_ENFORCE_EQ(label.dim32(0), N);
Y->Resize(vector<TIndex>{N});
Y->Resize(vector<int64_t>{N});
const float* Xdata = X.data<float>();
const float* labelData = label.data<float>();
auto* Ydata = Y->template mutable_data<float>();

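The MakeTwoClass hunks above only retype the loop index; the operator itself expands a probability p into the two-class pair (1 - p, p), and its gradient collapses back to the difference of the two incoming gradients. A standalone sketch of that forward/backward pair (function names are assumptions):

#include <cstdint>

void MakeTwoClassForward(const float* X, int64_t N, float* Y) {
  for (int64_t i = 0; i < N; ++i) {
    Y[i * 2] = 1.0f - X[i]; // probability of class 0
    Y[i * 2 + 1] = X[i]; // probability of class 1
  }
}

void MakeTwoClassBackward(const float* dY, int64_t N, float* dX) {
  for (int64_t i = 0; i < N; ++i) {
    // dL/dX = -dL/dY0 + dL/dY1, since Y0 = 1 - X and Y1 = X.
    dX[i] = dY[i * 2 + 1] - dY[i * 2];
  }
}
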
@ -42,7 +42,7 @@ bool LabelCrossEntropyOp<float, CUDAContext>::RunOnDevice() {
CAFFE_ENFORCE(
(label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == 1));
CAFFE_ENFORCE_EQ(label.dim32(0), N);
Y->Resize(vector<TIndex>(size_t(1), N));
Y->Resize(vector<int64_t>(size_t(1), N));
LabelCrossEntropyKernel<<<
CAFFE_GET_BLOCKS(N),
CAFFE_CUDA_NUM_THREADS,

@ -250,9 +250,9 @@ bool SigmoidCrossEntropyWithLogitsOp<float, CUDAContext>::RunOnDevice() {

auto* out = Output(0);
if (logits.ndim() == 0) {
out->Resize(std::vector<TIndex>{});
out->Resize(std::vector<int64_t>{});
} else {
std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
std::vector<int64_t> dims(logits.dims().begin(), logits.dims().end() - 1);
out->Resize(dims);
}
auto* out_ptr = out->template mutable_data<float>();

@ -372,9 +372,9 @@ bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CUDAContext>::

auto* out = Output(0);
if (logits.ndim() == 0) {
out->Resize(std::vector<TIndex>{});
out->Resize(std::vector<int64_t>{});
} else {
std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
std::vector<int64_t> dims(logits.dims().begin(), logits.dims().end() - 1);
out->Resize(dims);
}
auto* out_ptr = out->template mutable_data<float>();

@ -32,7 +32,7 @@ bool CTCBeamSearchDecoderOp<CPUContext>::RunOnDevice() {
(InputSize() == 2) ? Input(SEQ_LEN).data<int>() : nullptr;

vector<int32_t> values_cache;
output_len->Resize(vector<TIndex>{batch_size});
output_len->Resize(vector<int64_t>{batch_size});
int* output_len_data = output_len->mutable_data<int>();

for (int32_t i = 0; i < batch_size; ++i) {

@ -121,7 +121,7 @@ bool CTCBeamSearchDecoderOp<CPUContext>::RunOnDevice() {
}

int32_t cache_size = values_cache.size();
values->Resize(vector<TIndex>{cache_size});
values->Resize(vector<int64_t>{cache_size});
int* values_data = values->mutable_data<int>();
for (int i = 0; i < values_cache.size(); ++i) {
values_data[i] = values_cache.at(i);

@ -32,7 +32,7 @@ bool CTCGreedyDecoderOp<CPUContext>::RunOnDevice() {
(InputSize() == 2) ? Input(SEQ_LEN).data<int>() : nullptr;

vector<int> values_cach;
output_len->Resize(vector<TIndex>{batch_size});
output_len->Resize(vector<int64_t>{batch_size});
int* output_len_data = output_len->template mutable_data<int>();

for (int32_t i = 0; i < batch_size; ++i) {

@ -54,7 +54,7 @@ bool CTCGreedyDecoderOp<CPUContext>::RunOnDevice() {
}

int32_t values_cach_size = values_cach.size();
values->Resize(vector<TIndex>{values_cach_size});
values->Resize(vector<int64_t>{values_cach_size});
int* values_data = values->mutable_data<int>();
for (int i = 0; i < values_cach.size(); ++i) {
values_data[i] = values_cach.at(i);

@ -155,7 +155,7 @@ void TreeWalker::advance() {
cursor_.it.advance(lengths_, cursor_.offsets, sizes_, limits_, 1);
}

std::vector<TIndex> TreeWalker::fieldDim(int fieldId) const {
std::vector<int64_t> TreeWalker::fieldDim(int fieldId) const {
auto tensorDim = input(fieldId).dims();
tensorDim[0] = sizes_[lengthIdx(fieldId)];
return tensorDim;

@ -355,7 +355,7 @@ class UnPackRecordsOp : public Operator<CPUContext> {
auto numTensors = OutputSize();

// Precompute the output sizes to avoid resizing
std::vector<std::vector<TIndex>> outputDims(numTensors);
std::vector<std::vector<int64_t>> outputDims(numTensors);
std::vector<const TypeMeta*> metas(numTensors);

CAFFE_ENFORCE(

@ -414,7 +414,7 @@ class UnPackRecordsOp : public Operator<CPUContext> {

private:
void getShapeAndMetaFromInput(
std::vector<std::vector<TIndex>>& outputDims,
std::vector<std::vector<int64_t>>& outputDims,
std::vector<const TypeMeta*>& metas) {
const auto* inputs = Input(0).template data<SharedTensorVectorPtr>();

@ -434,7 +434,7 @@ class UnPackRecordsOp : public Operator<CPUContext> {
}

void getShapeAndMetaFromPrototypeBlobs(
std::vector<std::vector<TIndex>>& outputDims,
std::vector<std::vector<int64_t>>& outputDims,
std::vector<const TypeMeta*>& metas) {
const auto numTensors = fields_.size();
CAFFE_ENFORCE_EQ(numTensors, InputSize() - 1);

@ -501,7 +501,7 @@ class ReadNextBatchOp : public Operator<CPUContext> {
}
}
// gather data
std::vector<TIndex> outDim;
std::vector<int64_t> outDim;
for (int i = 0; i < cursor->it.fields().size(); ++i) {
auto lengthIdx = cursor->it.fields()[i].lengthFieldId + 1;
auto size = sizes[lengthIdx];

@ -676,7 +676,7 @@ class ReadRandomBatchOp : public Operator<CPUContext> {
auto idxvec = idxblob.template data<int64_t>();
auto& offsetdim = offsetsmat.dims();
// gather data
std::vector<TIndex> outDim;
std::vector<int64_t> outDim;
int64_t idx;
{
std::lock_guard<std::mutex> lock(cursor->mutex_);

@ -883,7 +883,7 @@ class ConcatTensorVectorOp final : public Operator<Context> {
auto* tensor = Output(TENSOR);
CAFFE_ENFORCE(!tensorVector->empty());

vector<TIndex> outputDims(tensorVector->at(0).dims());
vector<int64_t> outputDims(tensorVector->at(0).dims());
CAFFE_ENFORCE(outputDims.size() > 0);
for (int i = 1; i < tensorVector->size(); i++) {
// the tensor shapes are the same except for the first dimension

@ -895,7 +895,7 @@ class ConcatTensorVectorOp final : public Operator<Context> {
}

tensor->Resize(outputDims);
TIndex offset = 0;
int64_t offset = 0;
auto* dst = (char*)tensor->raw_mutable_data(tensorVector->at(0).meta());

for (const auto& t : *tensorVector) {

@ -123,7 +123,7 @@ class TreeWalker {
return prevOffsets_[lengthIdx(fieldId)];
}

std::vector<TIndex> fieldDim(int fieldId) const;
std::vector<int64_t> fieldDim(int fieldId) const;

void* fieldPtr(int fieldId) const;

@ -134,12 +134,12 @@ class TreeWalker {
Field(TreeWalker& walker, int fieldId)
: walker_(walker), fieldId_(fieldId) {}

inline std::vector<TIndex> dim() const {
inline std::vector<int64_t> dim() const {
return walker_.fieldDim(fieldId_);
}

inline TIndex size() const {
TIndex size = 1;
inline int64_t size() const {
int64_t size = 1;
for (const auto d : dim()) {
size *= d;
}

@ -67,8 +67,8 @@

namespace caffe2 {

typedef TIndex index_t;
typedef std::vector<TIndex> TShape;
typedef int64_t index_t;
typedef std::vector<int64_t> TShape;

template <typename DType>
__device__ DType deformable_im2col_bilinear(

@ -304,8 +304,8 @@ template <typename DType, typename Context>
void DeformConvOpBase<DType, Context>::DeformableIm2col(
const DType* data_im,
const DType* data_offset,
const std::vector<TIndex>& im_shape,
const std::vector<TIndex>& col_shape,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
DType* data_col) {
CHECK_LT(2, CAFFE_CUDA_NUM_THREADS);
CAFFE_ENFORCE_EQ(pad_t(), pad_b());

@ -430,8 +430,8 @@ template <typename DType, typename Context>
void DeformConvOpBase<DType, Context>::DeformableCol2im(
const DType* data_col,
const DType* data_offset,
const std::vector<TIndex>& im_shape,
const std::vector<TIndex>& col_shape,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
DType* grad_im) {
CAFFE_ENFORCE_EQ(pad_t(), pad_b());
CAFFE_ENFORCE_EQ(pad_l(), pad_r());

@ -577,8 +577,8 @@ void DeformConvOpBase<DType, Context>::DeformableCol2imCoord(
const DType* data_col,
const DType* data_im,
const DType* data_offset,
const std::vector<TIndex>& im_shape,
const std::vector<TIndex>& col_shape,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
DType* grad_offset) {
CAFFE_ENFORCE_EQ(pad_t(), pad_b());
CAFFE_ENFORCE_EQ(pad_l(), pad_r());

Some files were not shown because too many files have changed in this diff.