mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)

commit 9ec0a2aef4 (parent 290d20b094)
fbshipit-source-id: ba600fcd2b5cefc7621357bdeb05e24cea02e5af

.gitattributes (vendored): 5 lines deleted
@@ -1,5 +0,0 @@
-# Set the default behavior, in case people don't have core.autocrlf set.
-* text=auto
-
-# BASH scripts shouldn't be converted since they may need to be used by Docker
-*.sh text eol=lf
@@ -41,13 +41,10 @@ std::tuple<Tensor,Tensor> adaptive_max_pool1d(const Tensor & self, IntList outpu
   return std::make_tuple(output.squeeze(2), indices.squeeze(2));
 }
 
-std::tuple<Tensor, Tensor> max_pool1d_with_indices(
-    const Tensor& self,
-    IntList kernel_size,
-    IntList stride,
-    IntList padding,
-    IntList dilation,
-    bool ceil_mode) {
+std::tuple<Tensor,Tensor> max_pool1d(
+    const Tensor & self, IntList kernel_size, IntList stride, IntList padding,
+    IntList dilation, bool ceil_mode) {
   if (stride.empty()) {
     stride = kernel_size;
   }

@@ -58,7 +55,7 @@ std::tuple<Tensor, Tensor> max_pool1d_with_indices(
   check1d("max_pool1d", "dilation", dilation);
 
   Tensor output, indices;
-  std::tie(output, indices) = at::max_pool2d_with_indices(
+  std::tie(output, indices) = at::max_pool2d(
       self.unsqueeze(2),
       {1, kernel_size[0]},
       {1, stride[0]},

@@ -94,41 +91,5 @@ Tensor avg_pool1d(
 
   return output.squeeze(2);
 }
 
-Tensor max_pool1d(
-    const Tensor& self,
-    IntList kernel_size,
-    IntList stride,
-    IntList padding,
-    IntList dilation,
-    bool ceil_mode) {
-  auto output_and_indices = at::max_pool1d_with_indices(
-      self, kernel_size, stride, padding, dilation, ceil_mode);
-  return std::get<0>(output_and_indices);
-}
-
-Tensor max_pool2d(
-    const Tensor& self,
-    IntList kernel_size,
-    IntList stride,
-    IntList padding,
-    IntList dilation,
-    bool ceil_mode) {
-  auto output_and_indices = at::max_pool2d_with_indices(
-      self, kernel_size, stride, padding, dilation, ceil_mode);
-  return std::get<0>(output_and_indices);
-}
-
-Tensor max_pool3d(
-    const Tensor& self,
-    IntList kernel_size,
-    IntList stride,
-    IntList padding,
-    IntList dilation,
-    bool ceil_mode) {
-  auto output_and_indices = at::max_pool3d_with_indices(
-      self, kernel_size, stride, padding, dilation, ceil_mode);
-  return std::get<0>(output_and_indices);
-}
-
 } // namespace native
 } // namespace at
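Taken together, the three hunks above make max_pool1d itself return both the pooled values and the argmax indices again, and drop the value-only max_pool1d/2d/3d wrappers over the _with_indices variants. A minimal caller sketch against this revision; the at::CPU(at::kFloat) factory call is an era-appropriate assumption, not taken from the diff:

#include <ATen/ATen.h>
#include <tuple>

int main() {
  // 1 batch, 4 channels, length 8; CPU float tensor via the old Type API.
  at::Tensor input = at::CPU(at::kFloat).ones({1, 4, 8});
  at::Tensor output, indices;
  // After this commit, max_pool1d returns (values, indices) directly.
  std::tie(output, indices) = at::max_pool1d(
      input, /*kernel_size=*/{2}, /*stride=*/{2}, /*padding=*/{0},
      /*dilation=*/{1}, /*ceil_mode=*/false);
  return output.size(2) == 4 ? 0 : 1;  // length 8 pooled at stride 2 -> 4
}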
@@ -817,16 +817,7 @@
 
 - func: max_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor
 
-- func: max_pool1d_with_indices(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> (Tensor, Tensor)
-  variants: function
-
-- func: max_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor
-  variants: function
-
-- func: max_pool2d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor
-  variants: function
-
-- func: max_pool3d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor
+- func: max_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> (Tensor, Tensor)
   variants: function
 
 # FIXME: These could be combined as optional<ScalarType> but for https://github.com/pytorch/pytorch/issues/6593.
@@ -149,12 +149,12 @@
   scalar_check:
     output: 'false'
 
-- name: max_pool2d_with_indices(Tensor self, IntList[2] kernel_size, IntList[2] stride={}, IntList[2] padding=0, IntList[2] dilation=1, bool ceil_mode=false)
+- name: max_pool2d(Tensor self, IntList[2] kernel_size, IntList[2] stride={}, IntList[2] padding=0, IntList[2] dilation=1, bool ceil_mode=false)
   cname: SpatialDilatedMaxPooling
   default_init:
     stride: kernel_size
 
-- name: max_pool3d_with_indices(Tensor self, IntList[3] kernel_size, IntList[3] stride={}, IntList[3] padding=0, IntList[3] dilation=1, bool ceil_mode=false)
+- name: max_pool3d(Tensor self, IntList[3] kernel_size, IntList[3] stride={}, IntList[3] padding=0, IntList[3] dilation=1, bool ceil_mode=false)
   cname: VolumetricDilatedMaxPooling
   default_init:
     stride: kernel_size
@@ -521,7 +521,7 @@ TEST(TensorTest, TensorNonFundamentalType) {
   }
 }
 
-TEST(TensorTest, TensorNonFundamentalTypeCopy) {
+TEST(TensorTest, TensorNonFundamentalTypeClone) {
   TensorCPU tensor(vector<int>{2, 3, 4});
   std::string* ptr = tensor.mutable_data<std::string>();
   EXPECT_TRUE(ptr != nullptr);

@@ -529,11 +529,20 @@ TEST(TensorTest, TensorNonFundamentalTypeCopy) {
     EXPECT_TRUE(ptr[i] == "");
     ptr[i] = "filled";
   }
-  TensorCPU dst_tensor(tensor);
+  TensorCPU dst_tensor = tensor.Clone();
   const std::string* dst_ptr = dst_tensor.data<std::string>();
   for (int i = 0; i < dst_tensor.size(); ++i) {
     EXPECT_TRUE(dst_ptr[i] == "filled");
   }
+  // Change the original tensor
+  for (int i = 0; i < tensor.size(); ++i) {
+    EXPECT_TRUE(ptr[i] == "filled");
+    ptr[i] = "changed";
+  }
+  // Confirm that the cloned tensor is not affected
+  for (int i = 0; i < dst_tensor.size(); ++i) {
+    EXPECT_TRUE(dst_ptr[i] == "filled");
+  }
 }
 
 TEST(TensorTest, Tensor64BitDimension) {

@@ -1060,5 +1069,47 @@ TEST(BlobTest, CastingMessage) {
   }
 }
 
+TEST(TensorConstruction, UnitializedCopyTest) {
+  CPUContext context;
+  TensorCPU x;
+  TensorCPU y(x, &context);
+  TensorCPU z = x.Clone();
+  // should be uninitialized
+  EXPECT_EQ(x.size(), -1);
+  EXPECT_EQ(y.size(), -1);
+  LOG(INFO) << "z.size()" << z.size();
+  EXPECT_EQ(z.size(), -1);
+}
+
+TEST(TensorConstruction, CopyConstructorTest) {
+  CPUContext context;
+
+  TensorCPU x;
+  x.Resize(5);
+  x.mutable_data<float>()[0] = 1;
+  TensorCPU y = x.Clone();
+  TensorCPU z(x, &context);
+  TensorCPU w;
+
+  EXPECT_EQ(*x.data<float>(), 1);
+  EXPECT_EQ(*y.data<float>(), 1);
+  EXPECT_EQ(*z.data<float>(), 1);
+  x.mutable_data<float>()[0] = 5;
+  EXPECT_EQ(*x.data<float>(), 5);
+  EXPECT_EQ(*y.data<float>(), 1);
+  EXPECT_EQ(*z.data<float>(), 1);
+}
+
+TEST(TensorConstruction, MoveConstructorTest) {
+  CPUContext context;
+
+  TensorCPU x;
+  x.Resize(5);
+  x.mutable_data<float>()[0] = 1;
+  TensorCPU y = std::move(x);
+
+  EXPECT_EQ(*y.data<float>(), 1);
+}
+
 } // namespace
 } // namespace caffe2
@@ -12,9 +12,9 @@
 #include <cuda.h>
 #endif
 
-#if (!defined(__CUDACC__) || CUDA_VERSION > 9000 ) && !defined(CAFFE2_USE_MINIMAL_GOOGLE_GLOG)
+#if !defined(__CUDACC__) && !defined(CAFFE2_USE_MINIMAL_GOOGLE_GLOG)
 #include <glog/stl_logging.h>
-#else // (!defined(__CUDACC__) || CUDA_VERSION > 9000 ) && !defined(CAFFE2_USE_MINIMAL_GOOGLE_GLOG)
+#else // !defined(__CUDACC__) && !defined(CAFFE2_USE_MINIMAL_GOOGLE_GLOG)
 
 // here, we need to register a fake overload for vector/string - here,
 // we just ignore the entries in the logs.
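The guard previously admitted nvcc builds with CUDA newer than 9.0; it now excludes nvcc outright, so <glog/stl_logging.h> is only ever seen by host compilers. For reference, what that header enables on the host side (a hedged sketch, requires a glog build):

#include <glog/logging.h>
#include <glog/stl_logging.h>  // adds operator<< overloads for STL containers
#include <vector>

int main(int argc, char** argv) {
  (void)argc;
  google::InitGoogleLogging(argv[0]);
  std::vector<int> v{1, 2, 3};
  LOG(INFO) << v;  // stl_logging prints the elements, e.g. "1 2 3"
  return 0;
}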
@@ -555,13 +555,32 @@ class GivenTensorFill : public NeuralNetOperator {
 
 class Concat : public NeuralNetOperator {
  public:
-  Concat() : NeuralNetOperator(NNKind::Concat) {}
+  Concat(int axis = -1, bool addAxis = false)
+      : NeuralNetOperator(NNKind::Concat), Axis(axis), AddAxis(addAxis) {}
 
   ~Concat() {}
 
   NOMNIGRAPH_DEFINE_NN_RTTI(Concat);
 
+  int getAxis() const {
+    return Axis;
+  }
+
+  bool getAddAxis() const {
+    return AddAxis;
+  }
+
+  void setAxis(int axis) {
+    Axis = axis;
+  }
+
+  void setAddAxis(bool addAxis) {
+    AddAxis = addAxis;
+  }
+
  private:
+  int Axis;
+  bool AddAxis;
 };
 
 class Softmax : public NeuralNetOperator {
@@ -908,3 +927,68 @@ class Int8MaxPoolRelu : public NeuralNetOperator {
 
  private:
 };
 
+class BatchMatMul : public NeuralNetOperator {
+ public:
+  BatchMatMul(bool transA = false, bool transB = true, bool broadcast = false)
+      : NeuralNetOperator(NNKind::BatchMatMul),
+        TransA(transA),
+        TransB(transB),
+        Broadcast(broadcast) {}
+
+  ~BatchMatMul() {}
+
+  NOMNIGRAPH_DEFINE_NN_RTTI(BatchMatMul);
+
+  bool getTransA() const {
+    return TransA;
+  }
+
+  bool getTransB() const {
+    return TransB;
+  }
+
+  bool getBroadcast() const {
+    return Broadcast;
+  }
+
+  void setTransA(bool transA) {
+    TransA = transA;
+  }
+
+  void setTransB(bool transB) {
+    TransB = transB;
+  }
+
+  void setBroadcast(bool broadcast) {
+    Broadcast = broadcast;
+  }
+
+ private:
+  bool TransA;
+  bool TransB;
+  bool Broadcast;
+};
+
+class BatchGather : public NeuralNetOperator {
+ public:
+  BatchGather() : NeuralNetOperator(NNKind::BatchGather) {}
+
+  ~BatchGather() {}
+
+  NOMNIGRAPH_DEFINE_NN_RTTI(BatchGather);
+
+ private:
+};
+
+class ConcatBatchMatMulBatchGatherOp : public NeuralNetOperator {
+ public:
+  ConcatBatchMatMulBatchGatherOp()
+      : NeuralNetOperator(NNKind::ConcatBatchMatMulBatchGatherOp) {}
+
+  ~ConcatBatchMatMulBatchGatherOp() {}
+
+  NOMNIGRAPH_DEFINE_NN_RTTI(ConcatBatchMatMulBatchGatherOp);
+
+ private:
+};
@@ -5,4 +5,5 @@ Relu, Conv, ConvRelu, ConvTranspose, AveragePool, AveragePoolRelu, MaxPool,
 Int8Conv, Int8ConvTranspose, Int8FC, Int8MaxPool, Int8Relu,
 Int8GivenTensorFill, Int8Concat, Int8Softmax, Int8ChannelShuffle, Int8Sum,
 Int8Add, Int8Reshape, Int8Flatten, Int8ConvRelu, Int8SumRelu,
-Int8AveragePoolRelu, Int8MaxPoolRelu
+Int8AveragePoolRelu, Int8MaxPoolRelu, BatchMatMul, BatchGather,
+ConcatBatchMatMulBatchGatherOp
@@ -84,3 +84,9 @@ case NNKind::Int8AveragePoolRelu:
     return "Int8AveragePoolRelu";
   case NNKind::Int8MaxPoolRelu:
     return "Int8MaxPoolRelu";
+  case NNKind::BatchMatMul:
+    return "BatchMatMul";
+  case NNKind::BatchGather:
+    return "BatchGather";
+  case NNKind::ConcatBatchMatMulBatchGatherOp:
+    return "ConcatBatchMatMulBatchGatherOp";
@@ -14,6 +14,24 @@
 #include <functional>
 #include <list>
 
+// These #defines are useful when writing passes as they collapse
+//
+//   if (!cond) {
+//     continue; // or break; or return;
+//   }
+//
+// into a single line without negation
+
+#define NOM_REQUIRE_OR_(_cond, _expr) \
+  if (!(_cond)) {                     \
+    _expr;                            \
+  }
+
+#define NOM_REQUIRE_OR_CONT(_cond) NOM_REQUIRE_OR_(_cond, continue)
+#define NOM_REQUIRE_OR_BREAK(_cond) NOM_REQUIRE_OR_(_cond, break)
+#define NOM_REQUIRE_OR_RET_NULL(_cond) NOM_REQUIRE_OR_(_cond, return nullptr)
+#define NOM_REQUIRE_OR_RET(_cond) NOM_REQUIRE_OR_(_cond, return )
+
 // Implements accessors for a generic type T. If the type is not
 // specified (i.e., void template type) then the partial specification
 // gives an empty type.
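A runnable illustration of what the new macros buy; only the macro definitions below are taken from the hunk above, the rest is a standalone sketch:

#include <cstdio>
#include <vector>

#define NOM_REQUIRE_OR_(_cond, _expr) \
  if (!(_cond)) {                     \
    _expr;                            \
  }
#define NOM_REQUIRE_OR_CONT(_cond) NOM_REQUIRE_OR_(_cond, continue)

int main() {
  std::vector<int> values{1, -2, 3, -4};
  for (int v : values) {
    // Expands to: if (!(v > 0)) { continue; }
    NOM_REQUIRE_OR_CONT(v > 0);
    std::printf("%d\n", v);  // only positive entries reach this line
  }
  return 0;
}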
@@ -55,6 +55,8 @@ BatchNormalization
 FC
 GivenTensorFill
 Concat
+- Axis : int : -1
+- AddAxis : bool : false
 Softmax
 ChannelShuffle
 Add
@@ -84,3 +86,10 @@ Int8ConvRelu : ConvRelu
 Int8SumRelu : SumRelu
 Int8AveragePoolRelu : AveragePoolRelu
 Int8MaxPoolRelu : MaxPoolRelu
+
+BatchMatMul
+- TransA : bool : false
+- TransB : bool : true
+- Broadcast : bool : false
+BatchGather
+ConcatBatchMatMulBatchGatherOp
@@ -124,7 +124,7 @@ struct WorkspaceIdInjector {
   void InjectWorkspaceId(Workspace* workspace) {
     if (workspace->HasBlob(NODE_ID)) {
       Blob* node_id_blob = workspace->GetBlob(NODE_ID);
-      TensorCPU node_id_tensor = node_id_blob->template Get<TensorCPU>();
+      const TensorCPU& node_id_tensor = node_id_blob->template Get<TensorCPU>();
       int node_id = node_id_tensor.template data<int32_t>()[0];
       CAFFE_ENFORCE(
           seq_ < (1 << 16),
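This const-reference binding (the same fix recurs in the elementwise logic tests further down) is forced by the copy-constructor deletion later in this diff: Blob::Get<T>() returns a const reference, and copy-constructing a TensorCPU from it no longer compiles. A standalone sketch of the pattern, with hypothetical stand-in types rather than the caffe2 API:

#include <cassert>

struct Payload {
  int value = 7;
  Payload() = default;
  Payload(const Payload&) = delete;  // mirrors the deleted Tensor copy ctor
};

struct Blob {
  Payload payload;
  const Payload& Get() const { return payload; }  // const ref, like Blob::Get<T>()
};

int main() {
  Blob blob;
  // Payload p(blob.Get());    // would not compile: copy constructor deleted
  const auto& p = blob.Get();  // binds a reference instead; no copy made
  assert(p.value == 7);
  return 0;
}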
@@ -168,6 +168,15 @@ class Tensor {
       return;
     }
     meta_ = src.meta();
+    if (src.size() == -1) {
+      dims_.clear();
+      size_ = -1;
+      data_.reset();
+      shares_data_ = false;
+      capacity_ = 0;
+      reserved_ = false;
+      return;
+    }
     Resize(src.dims());
     if (size() > 0) {
       if (meta_.copy()) {
@@ -681,6 +690,21 @@ class Tensor {
     return dims_[i];
   }
 
+  Tensor Clone() const {
+    Tensor x;
+    x.CopyFrom(*this);
+    return x;
+  }
+
+  Tensor(Tensor<Context>&& src) noexcept {
+    swap(src);
+  }
+
+  /**
+   * @brief Delete the copy constructor and use Clone explicitly
+   */
+  Tensor(const Tensor<Context>& src) = delete;
+
  protected:
   vector<TIndex> dims_;
   TIndex size_ = -1;
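The shape of the new ownership contract, reduced to a standalone sketch; this illustrates the pattern only and is not the real caffe2::Tensor:

#include <utility>
#include <vector>

struct Buf {
  std::vector<float> data;

  Buf() = default;
  Buf(const Buf&) = delete;                  // copies must be explicit
  Buf(Buf&& src) noexcept { std::swap(data, src.data); }

  Buf Clone() const {
    Buf x;
    x.data = data;                           // deep copy of the payload
    return x;                                // returned via move, no copy ctor needed
  }
};

int main() {
  Buf a;
  a.data = {1.0f, 2.0f};
  Buf b = a.Clone();                         // ok: explicit deep copy
  Buf c = std::move(a);                      // ok: cheap move, like Tensor(Tensor&&)
  // Buf d = b;                              // would not compile: copy is deleted
  return (b.data.size() == 2 && c.data.size() == 2) ? 0 : 1;
}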
@@ -27,7 +27,9 @@ class CaffeTypeId final : public c10::guts::IdWrapper<CaffeTypeId, uint16_t> {
 public:
   static CaffeTypeId createTypeId();
 
-  friend std::ostream& operator<<(std::ostream& stream, CaffeTypeId typeId);
+  friend std::ostream& operator<<(std::ostream& stream, CaffeTypeId typeId) {
+    return stream << typeId.underlyingId();
+  }
   friend bool operator<(CaffeTypeId lhs, CaffeTypeId rhs);
 
   // TODO Can we get rid of uninitialized?

@@ -39,10 +41,6 @@ private:
   constexpr explicit CaffeTypeId(uint16_t id): IdWrapper(id) {}
 };
 
-inline std::ostream& operator<<(std::ostream& stream, CaffeTypeId typeId) {
-  return stream << typeId.underlyingId();
-}
-
 // Allow usage in std::map / std::set
 // TODO Disallow this and rather use std::unordered_map/set everywhere
 inline bool operator<(CaffeTypeId lhs, CaffeTypeId rhs) {
@@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/experiments/operators/fully_connected_op_decomposition.h"
 
 namespace caffe2 {

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #ifndef CAFFE2_OPERATORS_FULLY_CONNECTED_OP_DECOMPOSITION_H_
 #define CAFFE2_OPERATORS_FULLY_CONNECTED_OP_DECOMPOSITION_H_
 

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/experiments/operators/fully_connected_op_decomposition.h"
 

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #include "caffe2/experiments/operators/fully_connected_op_prune.h"
 
 namespace caffe2 {

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #ifndef CAFFE2_OPERATORS_FULLY_CONNECTED_OP_PRUNE_H_
 #define CAFFE2_OPERATORS_FULLY_CONNECTED_OP_PRUNE_H_
 

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #include "caffe2/experiments/operators/fully_connected_op_sparse.h"
 
 namespace caffe2 {

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #ifndef CAFFE2_OPERATORS_FULLY_CONNECTED_OP_SPARSE_H_
 #define CAFFE2_OPERATORS_FULLY_CONNECTED_OP_SPARSE_H_
 

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #include "caffe2/experiments/operators/funhash_op.h"
 
 namespace caffe2 {

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #ifndef CAFFE2_OPERATORS_FUNHASH_OP_H_
 #define CAFFE2_OPERATORS_FUNHASH_OP_H_
 

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #include "caffe2/experiments/operators/sparse_funhash_op.h"
 
 namespace caffe2 {

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #ifndef CAFFE2_OPERATORS_SPARSE_FUNHASH_OP_H_
 #define CAFFE2_OPERATORS_SPARSE_FUNHASH_OP_H_
 

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #include "caffe2/experiments/operators/sparse_matrix_reshape_op.h"
 
 namespace caffe2 {

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #ifndef CAFFE2_OPERATORS_SPARSE_MATRIX_RESHAPE_H_
 #define CAFFE2_OPERATORS_SPARSE_MATRIX_RESHAPE_H_
 

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #include "caffe2/experiments/operators/tt_contraction_op.h"
 
 namespace caffe2 {

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #ifndef CAFFE2_OPERATORS_TT_CONTRACTION_OP_H_
 #define CAFFE2_OPERATORS_TT_CONTRACTION_OP_H_
 

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/experiments/operators/tt_contraction_op.h"
 

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #include "caffe2/experiments/operators/tt_pad_op.h"
 
 namespace caffe2 {

@@ -1,3 +1,19 @@
+/* [Apache 2.0 license header identical to the one above] */
+
 #ifndef CAFFE2_OPERATORS_TT_PAD_OP_H_
 #define CAFFE2_OPERATORS_TT_PAD_OP_H_
 

@@ -1,3 +1,18 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
 ## @package SparseTransformer
 # Module caffe2.experiments.python.SparseTransformer
 from __future__ import absolute_import

@@ -1,3 +1,18 @@
+# [Apache 2.0 license header identical to the one above]
+
 ## @package convnet_benchmarks
 # Module caffe2.experiments.python.convnet_benchmarks
 from __future__ import absolute_import

@@ -1,3 +1,18 @@
+# [Apache 2.0 license header identical to the one above]
+
 ## @package device_reduce_sum_bench
 # Module caffe2.experiments.python.device_reduce_sum_bench
 from __future__ import absolute_import

@@ -1,3 +1,18 @@
+# [Apache 2.0 license header identical to the one above]
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -1,3 +1,18 @@
+# [Apache 2.0 license header identical to the one above]
+
 ## @package net_construct_bench
 # Module caffe2.experiments.python.net_construct_bench
 from __future__ import absolute_import

@@ -1,3 +1,18 @@
+# [Apache 2.0 license header identical to the one above]
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -1,3 +1,18 @@
+# [Apache 2.0 license header identical to the one above]
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -1,3 +1,18 @@
+# [Apache 2.0 license header identical to the one above]
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -1,3 +1,18 @@
+# [Apache 2.0 license header identical to the one above]
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -1,15 +0,0 @@
-# Android makefile
-# Build this using ndk as
-# ndk-build NDK_PROJECT_PATH=. APP_BUILD_SCRIPT=Android.mk
-#
-
-LOCAL_PATH := $(call my-dir)
-
-include $(CLEAR_VARS)
-LOCAL_MODULE := libOpenCL
-LOCAL_C_INCLUDES := $(LOCAL_PATH)/include/
-LOCAL_SRC_FILES := src/libopencl.c
-LOCAL_CFLAGS = -fPIC -O2
-
-include $(BUILD_STATIC_LIBRARY)
-
@@ -286,7 +286,8 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
 #endif
   };
   if (b) {
-    state->bias = caffe2::make_unique<TensorCPU>(*b);
+    CPUContext context;
+    state->bias = caffe2::make_unique<TensorCPU>(*b, &context);
   }
   return state;
 }
@@ -78,7 +78,7 @@ Y: [0.3005476 1.551666 1.3591481 0.39191285 0.21866608]
 </details>
 
 )DOC")
-    .Input(0, "X", "*(type: Tensor<float\>)* Input tensor.")
+    .Input(0, "X", "*(type: Tensor<float\\>)* Input tensor.")
     .Output(
         0,
         "Y",
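For context on this one-character fix: the .Input(...) call sits outside the raw )DOC" string, so it takes an ordinary C++ string literal, where a bare \> is an invalid escape sequence; \\> is required to emit a literal backslash before the angle bracket. A minimal demonstration:

#include <cstdio>

int main() {
  // Prints: *(type: Tensor<float\>)* Input tensor.
  std::printf("%s\n", "*(type: Tensor<float\\>)* Input tensor.");
  return 0;
}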
@@ -59,7 +59,7 @@ void elementwiseAnd() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Z");
   EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Z = blob->Get<caffe2::Tensor<Context>>();
   EXPECT_EQ(Z.size(), N);
   std::vector<bool> result{true, false, false, false};
   for (size_t i = 0; i < Z.size(); ++i) {

@@ -79,7 +79,7 @@ void elementwiseAnd() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Z");
   EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Z = blob->Get<caffe2::Tensor<Context>>();
   EXPECT_EQ(Z.size(), M * N);
   std::vector<bool> result{
       true, false, false, false, true, false, false, false};

@@ -105,7 +105,7 @@ void elementwiseOr() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Z");
   EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Z = blob->Get<caffe2::Tensor<Context>>();
   EXPECT_EQ(Z.size(), N);
   std::vector<bool> result{true, true, true, false};
   for (size_t i = 0; i < Z.size(); ++i) {

@@ -125,7 +125,7 @@ void elementwiseOr() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Z");
   EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Z = blob->Get<caffe2::Tensor<Context>>();
   EXPECT_EQ(Z.size(), M * N);
   std::vector<bool> result{true, true, true, false, true, true, true, false};
   for (size_t i = 0; i < Z.size(); ++i) {

@@ -150,7 +150,7 @@ void elementwiseXor() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Z");
   EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Z = blob->Get<caffe2::Tensor<Context>>();
   EXPECT_EQ(Z.size(), N);
   std::vector<bool> result{false, true, true, false};
   for (size_t i = 0; i < Z.size(); ++i) {

@@ -170,7 +170,7 @@ void elementwiseXor() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Z");
   EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Z = blob->Get<caffe2::Tensor<Context>>();
   EXPECT_EQ(Z.size(), M * N);
   std::vector<bool> result{
       false, true, true, false, false, true, true, false};

@@ -195,7 +195,7 @@ void elementwiseNot() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Y");
   EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Y(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Y = blob->Get<caffe2::Tensor<Context>>();
   EXPECT_EQ(Y.size(), N);
   std::vector<bool> result{false, true};
   for (size_t i = 0; i < Y.size(); ++i) {

@@ -217,7 +217,7 @@ void elementwiseEQ() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Z");
   EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Z = blob->Get<caffe2::Tensor<Context>>();
   EXPECT_EQ(Z.size(), N);
   std::vector<bool> result{false, true, false, true};
   for (size_t i = 0; i < Z.size(); ++i) {

@@ -234,7 +234,7 @@ void elementwiseEQ() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Z");
   EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Z = blob->Get<caffe2::Tensor<Context>>();
   EXPECT_EQ(Z.size(), N);
   std::vector<bool> result{true, true, false, false};
   for (size_t i = 0; i < Z.size(); ++i) {

@@ -253,7 +253,7 @@ void elementwiseEQ() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Z");
   EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Z = blob->Get<caffe2::Tensor<Context>>();
   EXPECT_EQ(Z.size(), M * N);
   std::vector<bool> result{
       true, false, false, true, false, true, true, false};
@@ -296,13 +296,14 @@ bool SumElementsGradientOp<T, Context>::RunOnDevice()
 #endif
 {
   auto& X = Input(0);
-  TensorCPU sum_grad = TensorCPU(Input(1));
+  const auto& sum_grad = Input(1);
   auto* dX = Output(0);
   dX->ResizeLike(X);
   DCHECK_EQ(sum_grad.size(), 1);
   math::Set<T, Context>(
       dX->size(),
-      static_cast<T>(sum_grad.data<T>()[0] * (average_ ? 1.0 / X.size() : 1)),
+      static_cast<T>(
+          sum_grad.template data<T>()[0] * (average_ ? 1.0 / X.size() : 1)),
       dX->template mutable_data<T>(),
       &context_);
   return true;
@@ -385,7 +385,7 @@ lengths_out: [5]
     .Output(
         0,
         "data_out",
-        "*(type: Tensor)* Padded data tensor ($T<N + 2*padding\_width, "
+        "*(type: Tensor)* Padded data tensor ($T<N + 2*padding\\_width, "
         "D_1, ..., D_n>$).")
     .Output(
         1,

@@ -483,7 +483,7 @@ lengths_out_rm: [3]
         0,
         "data_out",
         "*(type: Tensor)* Padded data tensor "
-        "($T<N + 2*padding\_width, D_1, ..., D_n>$).")
+        "($T<N + 2*padding\\_width, D_1, ..., D_n>$).")
     .Output(
         1,
         "lengths_out",
@@ -128,6 +128,49 @@ convertToNeuralNetOperator(caffe2::OperatorDef* op) {
     nnOp = util::make_unique<repr::BatchNormalization>();
   }
 
+  if (op->type() == "Concat") {
+    nnOp = util::make_unique<repr::Concat>();
+    auto c = dyn_cast<repr::Concat>(nnOp.get());
+    if (argMap.count("axis")) {
+      CAFFE_ENFORCE(argMap["axis"].has_i(), "Invalid axis argument");
+      int axis = static_cast<int>(argMap["axis"].i());
+      c->setAxis(axis);
+    }
+    if (argMap.count("add_axis")) {
+      CAFFE_ENFORCE(argMap["add_axis"].has_i(), "Invalid add_axis argument");
+      int add_axis = static_cast<int>(argMap["add_axis"].i());
+      c->setAddAxis(!!add_axis);
+    }
+  }
+
+  if (op->type() == "Flatten") {
+    nnOp = util::make_unique<repr::Flatten>();
+  }
+
+  if (op->type() == "BatchGather") {
+    nnOp = util::make_unique<repr::BatchGather>();
+  }
+
+  if (op->type() == "BatchMatMul") {
+    nnOp = util::make_unique<repr::BatchMatMul>();
+    auto c = dyn_cast<repr::BatchMatMul>(nnOp.get());
+    if (argMap.count("trans_a")) {
+      CAFFE_ENFORCE(argMap["trans_a"].has_i(), "Invalid trans_a argument");
+      int trans_a = static_cast<int>(argMap["trans_a"].i());
+      c->setTransA(!!trans_a);
+    }
+    if (argMap.count("trans_b")) {
+      CAFFE_ENFORCE(argMap["trans_b"].has_i(), "Invalid trans_b argument");
+      int trans_b = static_cast<int>(argMap["trans_b"].i());
+      c->setTransB(!!trans_b);
+    }
+    if (argMap.count("broadcast")) {
+      CAFFE_ENFORCE(argMap["broadcast"].has_i(), "Invalid broadcast argument");
+      int broadcast = static_cast<int>(argMap["broadcast"].i());
+      c->setBroadcast(!!broadcast);
+    }
+  }
+
   if (!nnOp) {
     nnOp = util::make_unique<repr::GenericOperator>(op->type());
   }
@@ -25,7 +25,7 @@ class OptimizationPass {
  public:
   OptimizationPass(NNModule* nn) : nn_(nn) {}
   virtual void run() = 0;
-  virtual ~OptimizationPass() = 0;
+  virtual ~OptimizationPass() {}
 
  protected:
   NNModule* nn_;
@@ -34,6 +34,7 @@ class OptimizationPass {
 class WorkspaceOptimizationPass : public OptimizationPass {
  public:
   WorkspaceOptimizationPass(NNModule* nn, Workspace* ws) : OptimizationPass(nn), ws_(ws) {}
+  virtual ~WorkspaceOptimizationPass() {}
 
  protected:
   Workspace* ws_;
@@ -42,26 +43,28 @@ class WorkspaceOptimizationPass : public OptimizationPass {
 CAFFE_DECLARE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationPass, NNModule*, Workspace*);
 #define REGISTER_WS_OPT_PASS(clsname) \
   CAFFE_REGISTER_CLASS(WorkspaceOptimizationPassRegistry, clsname, clsname)
 #define REGISTER_WS_OPT_PASS_FROM_FUNC(passname, funcname) \
   class passname : public WorkspaceOptimizationPass { \
    public: \
     using WorkspaceOptimizationPass::WorkspaceOptimizationPass; \
     void run() override { \
       funcname(nn_, ws_); \
     } \
-  };
+  }; \
+  REGISTER_WS_OPT_PASS(passname);
 
 CAFFE_DECLARE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*);
 #define REGISTER_OPT_PASS(clsname) \
   CAFFE_REGISTER_CLASS(OptimizationPassRegistry, clsname, clsname)
 #define REGISTER_OPT_PASS_FROM_FUNC(passname, funcname) \
   class passname : public OptimizationPass { \
    public: \
     using OptimizationPass::OptimizationPass; \
     void run() override { \
       funcname(nn_); \
     } \
-  };
+  }; \
+  REGISTER_OPT_PASS(passname);
 
 } // namespace caffe2
@@ -91,6 +91,7 @@ class LayerModelHelper(model_helper.ModelHelper):
         # additional (hard-coded) diagnose_options to report based on the model
         # TODO(xlwang): it's a hack!
         self.ad_hoc_diagnose_blobs_and_operations = []
+        self.ad_hoc_plot_blobs = []
 
     def clear_output_schema(self):
         self._output_schema = None
@@ -105,6 +106,11 @@ class LayerModelHelper(model_helper.ModelHelper):
             (name, value)
         )
 
+    def add_ad_hoc_plot_blob(self, blob, dtype=None):
+        dtype = dtype or (np.float, (1, ))
+        self.add_metric_field(str(blob), schema.Scalar(dtype, blob))
+        self.ad_hoc_plot_blobs.append(blob)
+
     @staticmethod
     def _get_global_constant_initializer_op(
        blob_name, array=None, dtype=None, initializer=None
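For orientation, a minimal usage sketch of the new plotting hook (illustrative only; `model`, `train_net`, and `loss_blob` are hypothetical stand-ins, not part of the diff):

    from caffe2.python import workspace

    # Register a blob so it is tracked for ad hoc plotting; per the diff this
    # also adds a metric field for the blob via add_metric_field().
    model.add_ad_hoc_plot_blob(loss_blob)

    workspace.RunNetOnce(train_net)
    # Every registered blob can then be fetched back for plotting/diagnosis.
    values = [workspace.FetchBlob(b) for b in model.ad_hoc_plot_blobs]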
@@ -22,7 +22,9 @@ class AdaptiveWeight(ModelLayer):
         optimizer=None,
         weights=None,
         enable_diagnose=False,
-        estimation_method=None,
+        estimation_method="log_std",
+        pos_optim_method="log_barrier",
+        reg_lambda=0.1,
         **kwargs
     ):
         super(AdaptiveWeight, self).__init__(model, name, input_record, **kwargs)
@@ -38,20 +40,23 @@ class AdaptiveWeight(ModelLayer):
             weights = [1. / self.num for _ in range(self.num)]
         assert min(weights) > 0, "initial weights must be positive"
         self.weights = np.array(weights).astype(np.float32)
-        self.estimation_method = estimation_method
-        if self.estimation_method is not None:
-            self.estimation_method_type = infer_thrift_union_selection(
-                estimation_method
-            ).lower()
-            self.estimation_method_value = estimation_method.value
-        else:
-            self.estimation_method_type = "log_std"
-            self.estimation_method_value = None
+        self.estimation_method = str(estimation_method).lower()
+        # used in the positivity-constrained parameterization, i.e. when the
+        # estimation method is inv_var, with the optimization method being
+        # either log barrier or gradient projection
+        self.pos_optim_method = str(pos_optim_method).lower()
+        self.reg_lambda = float(reg_lambda)
         self.enable_diagnose = enable_diagnose
-        self.init_func = getattr(self, self.estimation_method_type + "_init")
-        self.weight_func = getattr(self, self.estimation_method_type + "_weight")
-        self.reg_func = getattr(self, self.estimation_method_type + "_reg")
+        self.init_func = getattr(self, self.estimation_method + "_init")
+        self.weight_func = getattr(self, self.estimation_method + "_weight")
+        self.reg_func = getattr(self, self.estimation_method + "_reg")
         self.init_func()
+        if self.enable_diagnose:
+            self.weight_i = [
+                self.get_next_blob_reference("adaptive_weight_%d" % i)
+                for i in range(self.num)
+            ]
+            for i in range(self.num):
+                self.model.add_ad_hoc_plot_blob(self.weight_i[i])
 
     def concat_data(self, net):
         reshaped = [net.NextScopedBlob("reshaped_data_%d" % i) for i in range(self.num)]
@@ -110,15 +115,15 @@ class AdaptiveWeight(ModelLayer):
             "GivenTensorFill",
             {"values": values, "dtype": core.DataType.FLOAT},
         )
-        pos_optim_method = self.estimation_method_value.pos_optim_method.getType()
-        pos_optim_option = self.estimation_method_value.pos_optim_method.value
-        if pos_optim_method == "LOG_BARRIER":
-            regularizer = LogBarrier(float(reg_lambda=pos_optim_option.reg_lambda))
-        elif pos_optim_method == "POS_GRAD_PROJ":
+        if self.pos_optim_method == "log_barrier":
+            regularizer = LogBarrier(reg_lambda=self.reg_lambda)
+        elif self.pos_optim_method == "pos_grad_proj":
             regularizer = BoundedGradientProjection(lb=0, left_open=True)
         else:
             raise TypeError(
-                "unknown positivity optimization method: {}".format(pos_optim_method)
+                "unknown positivity optimization method: {}".format(
+                    self.pos_optim_method
+                )
             )
         self.k = self.create_param(
             param_name="k",
@@ -136,7 +141,7 @@ class AdaptiveWeight(ModelLayer):
         net.Log(self.k, log_k)
         net.Scale(log_k, reg, scale=-0.5)
 
-    def add_ops(self, net):
+    def _add_ops_impl(self, net, enable_diagnose):
         x = self.concat_data(net)
         weight = net.NextScopedBlob("weight")
         reg = net.NextScopedBlob("reg")
@@ -147,21 +152,9 @@ class AdaptiveWeight(ModelLayer):
         net.Mul([weight, x], weighted_x)
         net.Add([weighted_x, reg], weighted_x_add_reg)
         net.SumElements(weighted_x_add_reg, self.output_schema())
-        if self.enable_diagnose:
+        if enable_diagnose:
             for i in range(self.num):
-                weight_i = net.NextScopedBlob("weight_%d" % i)
-                net.Slice(weight, weight_i, starts=[i], ends=[i + 1])
+                net.Slice(weight, self.weight_i[i], starts=[i], ends=[i + 1])
 
-
-def infer_thrift_union_selection(ttype_union):
-    # TODO(xlwang): this is a hack way to infer the type str of a thrift union
-    # struct
-    assert ttype_union.isUnion(), "type {} is not a thrift union".format(
-        type(ttype_union)
-    )
-    field = ttype_union.field
-    for attr in dir(ttype_union):
-        v = getattr(ttype_union, attr)
-        if isinstance(v, int) and attr != "field" and v == field:
-            return attr
-    raise ValueError("Fail to infer the thrift union type")
+    def add_ops(self, net):
+        self._add_ops_impl(net, self.enable_diagnose)
@@ -1809,25 +1809,50 @@ class TestLayers(LayersTestCase):
     @given(
         num=st.integers(min_value=10, max_value=100),
         feed_weight=st.booleans(),
+        use_inv_var_parameterization=st.booleans(),
+        use_log_barrier=st.booleans(),
+        enable_diagnose=st.booleans(),
         **hu.gcs
     )
-    def testAdaptiveWeight(self, num, feed_weight, gc, dc):
+    def testAdaptiveWeight(
+        self, num, feed_weight, use_inv_var_parameterization, use_log_barrier,
+        enable_diagnose, gc, dc
+    ):
         input_record = self.new_record(schema.RawTuple(num))
         data = np.random.random(num)
         schema.FeedRecord(
-            input_record,
-            [np.array(x).astype(np.float32) for x in data]
+            input_record, [np.array(x).astype(np.float32) for x in data]
         )
         weights = np.random.random(num) if feed_weight else None
-        result = self.model.AdaptiveWeight(input_record, weights=weights)
+        result = self.model.AdaptiveWeight(
+            input_record,
+            weights=weights,
+            estimation_method=(
+                'inv_var' if use_inv_var_parameterization else 'log_std'
+            ),
+            pos_optim_method=(
+                'log_barrier' if use_log_barrier else 'pos_grad_proj'
+            ),
+            enable_diagnose=enable_diagnose
+        )
         train_init_net, train_net = self.get_training_nets(True)
         workspace.RunNetOnce(train_init_net)
         workspace.RunNetOnce(train_net)
         result = workspace.FetchBlob(result())
         if not feed_weight:
-            weights = 1. / num
+            weights = np.array([1. / num for _ in range(num)])
         expected = np.sum(weights * data + 0.5 * np.log(1. / 2. / weights))
         npt.assert_allclose(expected, result, atol=1e-4, rtol=1e-4)
+        if enable_diagnose:
+            assert len(self.model.ad_hoc_plot_blobs) == num
+            reconst_weights_from_ad_hoc = np.array(
+                [workspace.FetchBlob(b) for b in self.model.ad_hoc_plot_blobs]
+            ).flatten()
+            npt.assert_allclose(
+                reconst_weights_from_ad_hoc, weights, atol=1e-4, rtol=1e-4
+            )
+        else:
+            assert len(self.model.ad_hoc_plot_blobs) == 0
 
     @given(num=st.integers(min_value=10, max_value=100), **hu.gcs)
     def testConstantWeight(self, num, gc, dc):
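An aside on the expected value asserted above, under an uncertainty-weighting reading of the layer (our inference from the test; the diff itself does not state it). Writing the learned weight as w_i = 1/(2*sigma_i^2), the summand checked by the test is exactly the usual per-task uncertainty loss:

    \sum_i \Bigl( w_i\,x_i + \tfrac{1}{2}\log\tfrac{1}{2 w_i} \Bigr)
        \;=\; \sum_i \Bigl( \frac{x_i}{2\sigma_i^2} + \log\sigma_i \Bigr),
    \qquad w_i = \frac{1}{2\sigma_i^2}

With the uniform initial weights w_i = 1/num fed above, this reduces to the `expected` expression in the test.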
@@ -21,6 +21,7 @@
 #include "caffe2/opt/fusion.h"
 #include "caffe2/opt/mobile.h"
 #include "caffe2/opt/optimize_ideep.h"
+#include "caffe2/opt/passes.h"
 #include "caffe2/opt/sink.h"
 #include "caffe2/utils/cpuid.h"
 #include "caffe2/utils/string_utils.h"
@@ -1481,6 +1482,45 @@ void addGlobalMethods(py::module& m) {
   CAFFE2_CPU_FEATURE_SUPPORT(avx2);
 
 #undef CAFFE2_CPU_FEATURE_SUPPORT
+  m.def("transform_exists", [](const std::string& transform_name) {
+    return OptimizationPassRegistry()->Has(transform_name);
+  });
+  m.def("workspace_transform_exists", [](const std::string& transform_name) {
+    return WorkspaceOptimizationPassRegistry()->Has(transform_name);
+  });
+  m.def("run_transform", [](const std::string& transform_name, py::bytes def) {
+    caffe2::NetDef proto;
+    CAFFE_ENFORCE(ParseProtoFromLargeString(def.cast<std::string>(), &proto));
+    auto nn = caffe2::convertToNNModule(proto);
+    auto pass = OptimizationPassRegistry()->Create(transform_name, &nn);
+
+    CAFFE_ENFORCE(pass, "Pass doesn't exist: ", transform_name);
+    pass->run();
+
+    auto new_proto = caffe2::convertToCaffe2Proto(nn, proto);
+    std::string out;
+    new_proto.SerializeToString(&out);
+    return py::bytes(out);
+  });
+  m.def(
+      "run_workspace_transform",
+      [](const std::string& transform_name, py::bytes def) {
+        CAFFE_ENFORCE(gWorkspace);
+        caffe2::NetDef proto;
+        CAFFE_ENFORCE(
+            ParseProtoFromLargeString(def.cast<std::string>(), &proto));
+        auto nn = caffe2::convertToNNModule(proto);
+        auto pass = WorkspaceOptimizationPassRegistry()->Create(
+            transform_name, &nn, gWorkspace);
+
+        CAFFE_ENFORCE(pass, "Pass doesn't exist: ", transform_name);
+        pass->run();
+
+        auto new_proto = caffe2::convertToCaffe2Proto(nn, proto);
+        std::string out;
+        new_proto.SerializeToString(&out);
+        return py::bytes(out);
+      });
+
   // Transformations are exposed as functions here and wrapped
   // into a python interface in transformations.py
@@ -32,7 +32,7 @@ namespace python {
 class Int8TensorFetcher : public BlobFetcherBase {
  public:
   pybind11::object Fetch(const Blob& blob) override {
-    const caffe2::int8::Int8TensorCPU src =
+    const caffe2::int8::Int8TensorCPU& src =
         blob.template Get<caffe2::int8::Int8TensorCPU>();
     const int numpy_type = CaffeToNumpyType(src.t.meta());
     CAFFE_ENFORCE(numpy_type != -1, "Int8Tensor contains unknown type data");
@@ -21,10 +21,23 @@ from __future__ import unicode_literals
 import caffe2.python._import_c_extension as C
 
 
-def addNNPACK(net):
-    net.Proto().ParseFromString(
-        C.transform_addNNPACK(net.Proto().SerializeToString())
-    )
+class Transformer(object):
+    def __init__(self):
+        pass
+
+    @classmethod
+    def runTransform(cls, transform_name, net):
+        pb = net.Proto().SerializeToString()
+        if C.transform_exists(transform_name):
+            output = C.run_transform(transform_name, pb)
+        elif C.workspace_transform_exists(transform_name):
+            output = C.run_workspace_transform(transform_name, pb)
+        else:
+            raise AttributeError('Transformation {} not found.'.format(transform_name))
+        net.Proto().ParseFromString(output)
+
+    def __getattr__(self, transform_name):
+        return lambda net: self.runTransform(transform_name, net)
 
 
 def fuseNNPACKConvRelu(net):
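A quick usage sketch of the new interface (illustrative; it mirrors the call patterns exercised by the tests below, where `AddNNPACK` is one of the registered transform names):

    from caffe2.python import core
    from caffe2.python.transformations import Transformer

    transformer = Transformer()
    net = core.Net("net")
    net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
    net.Relu(["Y"], ["Y2"])

    # __getattr__ turns any attribute access into a transform lookup, so this
    # dispatches to C.run_transform("AddNNPACK", ...) or the workspace variant
    # and rewrites net's proto in place.
    transformer.AddNNPACK(net)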
@@ -22,14 +22,11 @@ from hypothesis import given
 import hypothesis.strategies as st
 import numpy as np
 
-from caffe2.python.transformations import (
-    addNNPACK,
-    fuseNNPACKConvRelu,
-    fuseConvBN,
-    sinkMaxPool,
-)
+from caffe2.python.transformations import Transformer
 from caffe2.python import core, workspace, test_util
 
+transformer = Transformer()
+
 
 def str_compare(a, b, encoding="utf8"):
     if isinstance(a, bytes):
@@ -40,26 +37,21 @@ def str_compare(a, b, encoding="utf8"):
 
 
 class TestTransformations(test_util.TestCase):
-    def test_addNNPACK(self):
+    def test_transformer_AddNNPACK(self):
         net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
         net.Relu(["Y"], ["Y2"])
-        addNNPACK(net)
+        transformer.AddNNPACK(net)
         assert str_compare(net.Proto().op[0].engine, "NNPACK")
 
-    def test_fuseNNPACKConvRelu(self):
+    def test_transformer_FuseNNPACKConvRelu(self):
         net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
         net.Relu(["Y"], ["Y2"])
-        addNNPACK(net)  # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
         assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 1)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 1
         has_activation_arg = False
         for arg in net.Proto().op[0].arg:
             if str_compare(arg.name, "activation"):
@@ -69,31 +61,27 @@ class TestTransformations(test_util.TestCase):
 
     def test_noFuseNNPACKConvRelu(self):
         net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
         net.Relu(["Y"], ["Y2"])
         net.Relu(["Y"], ["Y3"])
-        addNNPACK(net)  # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
         assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 3)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 3
         has_activation_arg = False
         for arg in net.Proto().op[0].arg:
             if str_compare(arg.name, "activation") and str_compare(arg.s, "Relu"):
                 has_activation_arg = True
         assert not has_activation_arg
 
-    def test_fuseNNPACKConvReluNoInplace(self):
+    def test_transformer_FuseNNPACKConvReluNoInplace(self):
         net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
         net.Relu(["Y"], ["X"])
-        addNNPACK(net)  # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
         assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 1)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 1
         has_activation_arg = False
         for arg in net.Proto().op[0].arg:
             if str_compare(arg.name, "activation"):
@@ -102,16 +90,14 @@ class TestTransformations(test_util.TestCase):
         assert has_activation_arg
         assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0]
 
-    def test_fuseNNPACKConvReluInplaceRelu(self):
+    def test_transformer_FuseNNPACKConvReluInplaceRelu(self):
         net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
         net.Relu(["Y"], ["Y"])
-        addNNPACK(net)  # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
         assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 1)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 1
         has_activation_arg = False
         for arg in net.Proto().op[0].arg:
             if str_compare(arg.name, "activation"):
@@ -120,19 +106,15 @@ class TestTransformations(test_util.TestCase):
         assert has_activation_arg
         assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0]
 
-    def test_fuseNNPACKConvReluPingPongNaming(self):
+    def test_transformer_FuseNNPACKConvReluPingPongNaming(self):
         net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
         net.Relu(["Y"], ["X"])
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
-        addNNPACK(net)  # get the NNPACK engine
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
+        transformer.AddNNPACK(net)  # get the NNPACK engine
         assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 2)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 2
         has_activation_arg = False
         for arg in net.Proto().op[0].arg:
             if str_compare(arg.name, "activation"):
@@ -142,20 +124,16 @@ class TestTransformations(test_util.TestCase):
         assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0]
         assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0]
 
-    def test_fuseNNPACKConvReluFollowedByMultipleInputOp(self):
+    def test_transformer_FuseNNPACKConvReluFollowedByMultipleInputOp(self):
         net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
         net.Relu(["Y"], ["Y2"])
-        net.Conv(
-            ["Y2", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["Y2", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
         net.Relu(["Y"], ["Y2"])
-        addNNPACK(net)  # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
         assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 2)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 2
         has_activation_arg = False
         for arg in net.Proto().op[0].arg:
             if str_compare(arg.name, "activation"):
@@ -165,20 +143,16 @@ class TestTransformations(test_util.TestCase):
         assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0]
         assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0]
 
-    def test_fuseNNPACKConvReluInplaceFollowedByMultipleInputOp(self):
+    def test_transformer_FuseNNPACKConvReluInplaceFollowedByMultipleInputOp(self):
         net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
         net.Relu(["Y"], ["Y"])
-        net.Conv(
-            ["Y", "w", "b"], ["Y2"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["Y", "w", "b"], ["Y2"], stride=1, pad=0, kernel=3, order="NCHW")
         net.Relu(["Y2"], ["Y2"])
-        addNNPACK(net)  # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
         assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 2)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 2
         has_activation_arg = False
         for arg in net.Proto().op[0].arg:
             if str_compare(arg.name, "activation"):
@@ -188,14 +162,12 @@ class TestTransformations(test_util.TestCase):
         assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0]
         assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0]
 
-    def test_sinkMaxPool(self):
+    def test_transformer_SinkMaxPool(self):
         net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
         net.MaxPool(["Y"], ["Y1"], kernel=3)
         net.Relu(["Y1"], ["Y1"])
-        sinkMaxPool(net)
+        transformer.SinkMaxPool(net)
         assert str_compare(net.Proto().op[1].type, "Relu")
         assert str_compare(net.Proto().op[2].type, "MaxPool")
 
@@ -204,9 +176,9 @@ class TestTransformations(test_util.TestCase):
         input_channels=st.integers(1, 10),
         seed=st.integers(0, 65535),
         order=st.sampled_from(["NCHW", "NHWC"]),
-        epsilon=st.floats(min_value=1e-5, max_value=1e-2)
+        epsilon=st.floats(min_value=1e-5, max_value=1e-2),
     )
-    def test_fuseConvBN(self, size, input_channels, seed, order, epsilon):
+    def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon):
         net = core.Net("net")
         c = input_channels
         h = size
@@ -214,31 +186,20 @@ class TestTransformations(test_util.TestCase):
         k = 3
         net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=k, order=order)
         net.SpatialBN(
-            ["Y", "scale", "bias", "mean", "var"], ["Y2"],
+            ["Y", "scale", "bias", "mean", "var"],
+            ["Y2"],
             is_test=True,
             order=order,
-            epsilon=epsilon
+            epsilon=epsilon,
         )
 
         np.random.seed(seed)
         if order == "NCHW":
-            workspace.FeedBlob(
-                "X",
-                np.random.rand(1, c, h, w).astype(np.float32)
-            )
-            workspace.FeedBlob(
-                "w",
-                np.random.rand(c, c, k, k).astype(np.float32)
-            )
+            workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32))
+            workspace.FeedBlob("w", np.random.rand(c, c, k, k).astype(np.float32))
         else:
-            workspace.FeedBlob(
-                "X",
-                np.random.rand(1, h, w, c).astype(np.float32)
-            )
-            workspace.FeedBlob(
-                "w",
-                np.random.rand(c, k, k, c).astype(np.float32)
-            )
+            workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32))
+            workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32))
         workspace.FeedBlob("b", np.random.rand(c).astype(np.float32))
         workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32))
         workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32))
@@ -246,11 +207,13 @@ class TestTransformations(test_util.TestCase):
         workspace.FeedBlob("var", np.random.rand(c).astype(np.float32))
         workspace.RunNetOnce(net)
         preTransformOutput = workspace.FetchBlob("Y2")
-        fuseConvBN(net)
+        transformer.FuseConvBN(net)
 
         # Ensure fusion
-        assert (len(net.Proto().op) == 1)
+        assert len(net.Proto().op) == 1
         workspace.RunNetOnce(net)
         postTransformOutput = workspace.FetchBlob("Y2")
         # Check that there is no numerical difference
-        assert (np.allclose(preTransformOutput, postTransformOutput, rtol=1e-05, atol=1e-08))
+        assert np.allclose(
+            preTransformOutput, postTransformOutput, rtol=1e-05, atol=1e-08
+        )
@ -163,7 +163,7 @@ bool RebatchingQueue::enqueueOne(
|
||||||
auto& tensorVector = splittedInputs.back();
|
auto& tensorVector = splittedInputs.back();
|
||||||
tensorVector.reserve(inputs.size());
|
tensorVector.reserve(inputs.size());
|
||||||
for (const auto* tensorPtr : inputs) {
|
for (const auto* tensorPtr : inputs) {
|
||||||
tensorVector.push_back(*tensorPtr);
|
tensorVector.push_back(tensorPtr->Clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
return enqueue(std::move(splittedInputs));
|
return enqueue(std::move(splittedInputs));
|
||||||
|
|
|
||||||
|
|
@@ -111,19 +111,19 @@ class YellowFinOp final : public Operator<Context> {
   bool RunOnDevice() override {
     // Iter live on the CPU
 
 #define CAFFE2_YF_READ_INPUT(INPUT_NAME, VAR_NAME) \
-  const auto VAR_NAME##_tensor = Input(INPUT_NAME); \
+  const auto& VAR_NAME##_tensor = Input(INPUT_NAME); \
   VAR_NAME##_ = VAR_NAME##_tensor.template data<T>();
 
     CAFFE2_YF_READ_INPUT(PARAM, param)
     CAFFE2_YF_READ_INPUT(MOMENT, moment)
     CAFFE2_YF_READ_INPUT(LR_AVG, lr_avg)
     CAFFE2_YF_READ_INPUT(MU_AVG, mu_avg)
     CAFFE2_YF_READ_INPUT(CURV_WIN, curv_win)
     CAFFE2_YF_READ_INPUT(G_AVG, g_avg)
     CAFFE2_YF_READ_INPUT(G2_AVG, g2_avg)
     CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory)
     CAFFE2_YF_READ_INPUT(GRAD, grad)
 #undef CAFFE2_YF_READ_OUTPUT
 
     CAFFE_ENFORCE(OperatorBase::InputIsType<TensorCPU>(ITER));
29 rsync_exclude.txt Normal file
@@ -0,0 +1,29 @@
+# To do syncs, check out caffe2 under ~/local, check out the fbsync branch,
+# and then execute
+# rsync -arv --delete --exclude-from=rsync_exclude.txt ./ ~/local/caffe2/
+# Make sure you do a dry run before actually doing anything.
+
+.git
+caffe/
+caffe2/fb/
+caffe2/experiments/
+third_party/
+PLATFORM
+caffe2/proto/fb_protobuf.sh
+README.facebook
+rsync_exclude.txt
+TARGETS
+.gitmodules
+.ipynb_checkpoints
+*.tmp
+
+# These two files are created by patch commands and are not needed.
+*.orig
+*.rej
+
+# We have these two files under fbcode for convenience.
+caffe2/contrib/nervana/nervana_c_api.cu
+caffe2/contrib/nervana/nervana_c_api.h
+
+# We have decided to delay open-sourcing the mobile engine of conv transpose.
+caffe2/operators/conv_transpose_op_mobile*
1 submodules/tbb-rev.txt Normal file
@@ -0,0 +1 @@
+Subproject commit 633b01ad27e012e1dc4e392c3230250d1f4967a4
@@ -340,10 +340,10 @@ TEST_CASE("integration/mnist", "[cuda]") {
   auto linear2 = model->add(Linear(50, 10), "linear2");
 
   auto forward = [&](torch::Tensor x) {
-    x = at::max_pool2d(conv1->forward(x), {2, 2}).relu();
+    x = std::get<0>(at::max_pool2d(conv1->forward(x), {2, 2})).clamp_min(0);
     x = conv2->forward(x);
     x = drop2d->forward(x);
-    x = at::max_pool2d(x, {2, 2}).relu();
+    x = std::get<0>(at::max_pool2d(x, {2, 2})).clamp_min(0);
 
     x = x.view({-1, 320});
     x = linear1->forward(x).clamp_min(0);
@@ -377,10 +377,10 @@ TEST_CASE("integration/mnist/batchnorm", "[cuda]") {
   auto linear2 = model->add(Linear(50, 10), "linear2");
 
   auto forward = [&](torch::Tensor x) {
-    x = at::max_pool2d(conv1->forward(x), {2, 2}).relu();
+    x = std::get<0>(at::max_pool2d(conv1->forward(x), {2, 2})).clamp_min(0);
     x = batchnorm2d->forward(x);
     x = conv2->forward(x);
-    x = at::max_pool2d(x, {2, 2}).relu();
+    x = std::get<0>(at::max_pool2d(x, {2, 2})).clamp_min(0);
 
     x = x.view({-1, 320});
     x = linear1->forward(x).clamp_min(0);
23 test/test_distributed_trap.py Normal file
@@ -0,0 +1,23 @@
+import os
+import tempfile
+import sys
+import random
+import __test_main__
+
+tmp_dir = tempfile.TemporaryDirectory()
+os.environ["TEMP_DIR"] = tmp_dir.name
+os.mkdir(os.path.join(tmp_dir.name, "barrier"))
+os.mkdir(os.path.join(tmp_dir.name, "test_dir"))
+init_dir_path = os.path.join(tmp_dir.name, "init_dir")
+os.mkdir(init_dir_path)
+init_method = os.environ.get('INIT_METHOD')
+if init_method is not None and init_method == "zeus":
+    os.environ['INIT_METHOD'] = 'zeus://unittest_' + \
+        str(random.randint(1, 1000000000000))
+else:
+    os.environ['INIT_METHOD'] = 'file://' + \
+        os.path.join(init_dir_path, 'shared_init_file')
+
+
+if __name__ == '__main__':
+    __test_main__.main(sys.argv)
7 third_party/nccl/CMakeLists.txt vendored
@@ -7,13 +7,14 @@ ENDIF()
 
 include("${CMAKE_UTILS_PATH}")
 torch_cuda_get_nvcc_gencode_flag(NVCC_GENCODE)
-string(REPLACE "-gencode;" "-gencode=" NVCC_GENCODE "${NVCC_GENCODE}")
-message(STATUS "Set NVCC_GENCODE for building NCCL: ${NVCC_GENCODE}")
+string(REPLACE ";" " " NVCC_GENCODE "${NVCC_GENCODE}")
+string(REPLACE "-gencode " "-gencode=" NVCC_GENCODE "${NVCC_GENCODE}")
+message(INFO "Set NVCC_GENCODE for building NCCL: ${NVCC_GENCODE}")
 
 ADD_CUSTOM_COMMAND(
   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
   OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib/libnccl.so
-  COMMAND env CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR} NVCC=${CUDA_NVCC_EXECUTABLE} BUILDDIR=${CMAKE_CURRENT_BINARY_DIR} NVCC_GENCODE="${NVCC_GENCODE}" make -j${NUM_JOBS}
+  COMMAND env CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR} NVCC=${CUDA_NVCC_EXECUTABLE} BUILDDIR=${CMAKE_CURRENT_BINARY_DIR} NVCC_GENCODE="${NVCC_GENCODE}" make -j `getconf _NPROCESSORS_ONLN`
 )
 
 ADD_CUSTOM_TARGET(nccl ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/lib/libnccl.so)
@@ -915,11 +915,11 @@
 - name: fractional_max_pool2d_forward(Tensor self, IntList kernel_size, IntList output_size, Tensor random_samples)
   self: fractional_max_pool2d_backward(grad, self, kernel_size, output_size, indices)
 
-- name: max_pool2d_with_indices_forward(Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode)
-  self: max_pool2d_with_indices_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, indices)
+- name: max_pool2d_forward(Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode)
+  self: max_pool2d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, indices)
 
-- name: max_pool3d_with_indices_forward(Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode)
-  self: max_pool3d_with_indices_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, indices)
+- name: max_pool3d_forward(Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode)
+  self: max_pool3d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, indices)
 
 - name: max_unpool2d_forward(Tensor self, Tensor indices, IntList output_size)
   self: max_unpool2d_backward(grad, self, indices, output_size)
@@ -1041,11 +1041,11 @@
   grad_output: leaky_relu_backward(grad, self, negative_slope)
   self: zeros_like(grad)
 
-- name: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, Tensor indices)
+- name: max_pool2d_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, Tensor indices)
   grad_output: max_pool_double_backward(grad, indices, 2);
   self: zeros_like(self)
 
-- name: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, Tensor indices)
+- name: max_pool3d_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, Tensor indices)
   grad_output: max_pool_double_backward(grad, indices, 3);
   self: zeros_like(self)
@@ -25,7 +25,7 @@ SKIP_PYTHON_BINDINGS = [
     'index',
     '_indexCopy_', 'max_values', 'min_values', 'argmax', 'argmin',
     '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', '_th_sum.*', '_th_prod.*',
-    'arange.*', 'range.*', '_gesv.*', 'slice', 'max_pool1d', 'max_pool2d', 'max_pool3d'
+    'arange.*', 'range.*', '_gesv.*', 'slice',
 ]
 
 PY_VARIABLE_METHOD_VARARGS = CodeTemplate("""\
@@ -37,6 +37,7 @@ outputs = [
     'torch/csrc/autograd/generated/python_nn_functions_dispatch.h',
     'torch/csrc/autograd/generated/python_variable_methods.cpp',
     'torch/csrc/autograd/generated/python_variable_methods_dispatch.h',
+    'torch/csrc/autograd/generated/variable_factories.h',
     'torch/csrc/autograd/generated/VariableType.cpp',
     'torch/csrc/autograd/generated/VariableType.h',
     'torch/csrc/jit/generated/aten_dispatch.cpp',
@@ -341,7 +341,7 @@ def max_pool1d(input, kernel_size, stride=None, padding=0, dilation=1,
 
     See :class:`~torch.nn.MaxPool1d` for details.
     """
-    ret = torch.max_pool1d_with_indices(input, kernel_size, stride, padding, dilation, ceil_mode)
+    ret = torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)
     return ret if return_indices else ret[0]
 
 
@@ -352,7 +352,7 @@ def max_pool2d(input, kernel_size, stride=None, padding=0, dilation=1,
 
     See :class:`~torch.nn.MaxPool2d` for details.
     """
-    ret = torch._C._nn.max_pool2d_with_indices(input, kernel_size, stride, padding, dilation, ceil_mode)
+    ret = torch._C._nn.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
     return ret if return_indices else ret[0]
 
 
@@ -363,7 +363,7 @@ def max_pool3d(input, kernel_size, stride=None, padding=0, dilation=1,
 
     See :class:`~torch.nn.MaxPool3d` for details.
    """
-    ret = torch._C._nn.max_pool3d_with_indices(input, kernel_size, stride, padding, dilation, ceil_mode)
+    ret = torch._C._nn.max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode)
    return ret if return_indices else ret[0]
|
||||||
return g.op('Softplus', self)
|
return g.op('Softplus', self)
|
||||||
|
|
||||||
|
|
||||||
def max_pool1d_with_indices(g, input, kernel_size, stride, padding, dilation, ceil_mode):
|
def max_pool1d(g, input, kernel_size, stride, padding, dilation, ceil_mode):
|
||||||
if ceil_mode:
|
if ceil_mode:
|
||||||
return _unimplemented("max_pool1d_with_indices", "ceil_mode")
|
return _unimplemented("max_pool1d", "ceil_mode")
|
||||||
if set(_single(dilation)) != {1}:
|
if set(_single(dilation)) != {1}:
|
||||||
return _unimplemented("max_pool1d_with_indices", "dilation")
|
return _unimplemented("max_pool1d", "dilation")
|
||||||
if stride is None:
|
if stride is None:
|
||||||
stride = kernel_size
|
stride = kernel_size
|
||||||
r = g.op("MaxPool", input,
|
r = g.op("MaxPool", input,
|
||||||
|
|
@ -410,11 +410,11 @@ def max_pool1d_with_indices(g, input, kernel_size, stride, padding, dilation, ce
|
||||||
return r, None
|
return r, None
|
||||||
|
|
||||||
|
|
||||||
def max_pool2d_with_indices(g, input, kernel_size, stride, padding, dilation, ceil_mode):
|
def max_pool2d(g, input, kernel_size, stride, padding, dilation, ceil_mode):
|
||||||
if ceil_mode:
|
if ceil_mode:
|
||||||
return _unimplemented("max_pool2d_with_indices", "ceil_mode")
|
return _unimplemented("max_pool2d", "ceil_mode")
|
||||||
if set(_pair(dilation)) != {1}:
|
if set(_pair(dilation)) != {1}:
|
||||||
return _unimplemented("max_pool2d_with_indices", "dilation")
|
return _unimplemented("max_pool2d", "dilation")
|
||||||
if not stride:
|
if not stride:
|
||||||
stride = kernel_size
|
stride = kernel_size
|
||||||
r = g.op("MaxPool", input,
|
r = g.op("MaxPool", input,
|
||||||
|
|
@ -424,11 +424,11 @@ def max_pool2d_with_indices(g, input, kernel_size, stride, padding, dilation, ce
|
||||||
return r, None
|
return r, None
|
||||||
|
|
||||||
|
|
||||||
def max_pool3d_with_indices(g, input, kernel_size, stride, padding, dilation, ceil_mode):
|
def max_pool3d(g, input, kernel_size, stride, padding, dilation, ceil_mode):
|
||||||
if ceil_mode:
|
if ceil_mode:
|
||||||
return _unimplemented("max_pool3d_with_indices", "ceil_mode")
|
return _unimplemented("max_pool3d", "ceil_mode")
|
||||||
if set(_triple(dilation)) != {1}:
|
if set(_triple(dilation)) != {1}:
|
||||||
return _unimplemented("max_pool3d_with_indices", "dilation")
|
return _unimplemented("max_pool3d", "dilation")
|
||||||
if not stride:
|
if not stride:
|
||||||
stride = kernel_size
|
stride = kernel_size
|
||||||
r = g.op("MaxPool", input,
|
r = g.op("MaxPool", input,
|
||||||
|
|
|
||||||
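Since each symbolic returns `(r, None)`, the indices output is simply dropped on export and only the ONNX MaxPool node survives. A minimal export sketch (illustrative; the module and file name are arbitrary examples, not from the diff):

    import torch
    import torch.nn as nn

    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.MaxPool2d(2))
    dummy = torch.randn(1, 3, 16, 16)
    # The MaxPool node is emitted via the max_pool2d symbolic above; ceil_mode
    # or non-trivial dilation would hit the "unimplemented" checks there.
    torch.onnx.export(model, dummy, "pool.onnx")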