Mirror of https://github.com/zebrajr/pytorch.git

Commit bcea409c82 ("sync"), parent f01f2063dd
@@ -11,12 +11,70 @@ CAFFE2_DEFINE_int(
     "Chunk size to split tensor data into");
 
 namespace caffe2 {
 
+namespace {
+/**
+ * @brief StringSerializer is the serializer for String.
+ *
+ * StringSerializer takes in a blob that contains a String, and serializes it
+ * into a BlobProto protocol buffer.
+ */
+class StringSerializer : public BlobSerializerBase {
+ public:
+  StringSerializer() {}
+  ~StringSerializer() {}
+  /**
+   * Serializes a Blob. Note that this blob has to contain a std::string,
+   * otherwise this function produces a fatal error.
+   */
+  void Serialize(
+      const Blob& blob,
+      const string& name,
+      SerializationAcceptor acceptor) override {
+    CHECK(blob.IsType<std::string>());
+
+    BlobProto blob_proto;
+    blob_proto.set_name(name);
+    blob_proto.set_type("std::string");
+    blob_proto.set_content(blob.template Get<std::string>());
+    acceptor(name, blob_proto.SerializeAsString());
+  }
+};
+
+/**
+ * @brief StringDeserializer is the deserializer for Strings.
+ *
+ */
+class StringDeserializer : public BlobDeserializerBase {
+ public:
+  bool Deserialize(const BlobProto& proto, Blob* blob) override {
+    *blob->GetMutable<std::string>() = proto.content();
+    return true;
+  }
+};
+}  // namespace
+
+namespace {
+
+// We can't use DeviceType_Name because of a protobuf-lite constraint.
+std::string tensorDeviceTypeName(const DeviceType& d) {
+  switch (d) {
+    case CPU:
+      return "TensorCPU";
+    case CUDA:
+      return "TensorCUDA";
+    default:
+      CAFFE_THROW("Unknown device: ", d);
+      return "";
+  }
+};
+}  // namespace
+
 // The blob serialization member function implementation.
 void Blob::Serialize(
     const string& name,
     BlobSerializerBase::SerializationAcceptor acceptor) const {
   std::unique_ptr<BlobSerializerBase> serializer(CreateSerializer(meta_.id()));
+  CAFFE_ENFORCE(serializer, "No known serializer for ", meta_.name());
   serializer->Serialize(*this, name, acceptor);
 }
@@ -33,7 +91,6 @@ std::string Blob::Serialize(const string& name) const {
   return data.str();
 }
 
-
 // Specialization for StoreDeviceDetail for CPU - nothing needs to be done.
 template <>
 void TensorSerializer<CPUContext>::StoreDeviceDetail(
@@ -60,9 +117,8 @@ bool Blob::Deserialize(const BlobProto& blob_proto) {
   if (blob_proto.has_tensor()) {
     // This is a tensor object. Depending on the device type, we will
     // use the corresponding TensorDeserializer.
-    auto deserializer = CreateDeserializer(
-        "Tensor" +
-        DeviceType_Name(blob_proto.tensor().device_detail().device_type()));
+    auto deserializer = CreateDeserializer(tensorDeviceTypeName(
+        blob_proto.tensor().device_detail().device_type()));
     // Tensor's deserializer should always be registered, but we will double
     // check if it is not null anyway.
     return CHECK_NOTNULL(deserializer.get())->Deserialize(blob_proto, this);
@@ -82,5 +138,8 @@ REGISTER_BLOB_SERIALIZER(
     (TypeMeta::Id<TensorCPU>()),
     TensorSerializer<CPUContext>);
 REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer<CPUContext>);
+// Serialize std::string
+REGISTER_BLOB_SERIALIZER((TypeMeta::Id<std::string>()), StringSerializer);
+REGISTER_BLOB_DESERIALIZER(std::string, StringDeserializer);
 }  // namespace
 }  // namespace caffe2
@@ -69,6 +69,20 @@ TEST(BlobTest, BlobWrongType) {
   ASSERT_THROW(blob.Get<int>(), EnforceNotMet);
 }
 
+TEST(BlobTest, StringSerialization) {
+  const std::string kTestString = "Hello world?";
+  Blob blob;
+  *blob.GetMutable<std::string>() = kTestString;
+
+  string serialized = blob.Serialize("test");
+  BlobProto proto;
+  CHECK(proto.ParseFromString(serialized));
+  EXPECT_EQ(proto.name(), "test");
+  EXPECT_EQ(proto.type(), "std::string");
+  EXPECT_FALSE(proto.has_tensor());
+  EXPECT_EQ(proto.content(), kTestString);
+}
+
 TEST(TensorNonTypedTest, TensorChangeType) {
   vector<int> dims(3);
   dims[0] = 2;
@@ -5,8 +5,9 @@
 #include <cstdlib>
 #include <random>
 
-#include "caffe2/proto/caffe2.pb.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/core/typeid.h"
+#include "caffe2/proto/caffe2.pb.h"
 #include "caffe2/utils/math.h"
 
 namespace caffe2 {
@@ -103,6 +104,7 @@ class CPUContext final {
   // Two copy functions that deals with cross-device copies.
   template <class SrcContext, class DstContext>
   inline void CopyBytes(size_t nbytes, const void* src, void* dst);
 
   template <typename T, class SrcContext, class DstContext>
   inline void Copy(size_t n, const T* src, T* dst) {
     if (std::is_fundamental<T>::value) {
@@ -116,6 +118,16 @@ class CPUContext final {
     }
   }
 
+  template <class SrcContext, class DstContext>
+  inline void
+  CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) {
+    if (meta.copy()) {
+      meta.copy()(src, dst, n);
+    } else {
+      CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
+    }
+  }
+
  protected:
   // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
   int random_seed_{1701};
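Note: CopyItems is the piece that lets tensors hold non-POD element types. If the TypeMeta carries a registered per-item copy function, that is used; otherwise the copy degenerates to raw CopyBytes. A minimal standalone sketch of the same dispatch, using a simplified Meta struct in place of caffe2's TypeMeta (the names here are illustrative, not the real API):

    #include <cstring>
    #include <string>
    #include <vector>

    // Simplified stand-in for caffe2::TypeMeta: an item size plus an optional
    // per-item copy function for types that cannot be memcpy'd.
    struct Meta {
      size_t itemsize;
      void (*copy)(const void* src, void* dst, size_t n);
    };

    // Per-item copier used for std::string: invokes the copy assignment
    // operator instead of copying raw bytes.
    void CopyStrings(const void* src, void* dst, size_t n) {
      auto* s = static_cast<const std::string*>(src);
      auto* d = static_cast<std::string*>(dst);
      for (size_t i = 0; i < n; ++i) d[i] = s[i];
    }

    // Mirrors the dispatch in CPUContext::CopyItems: typed copy if the meta
    // provides one, raw byte copy (memcpy on CPU) otherwise.
    void CopyItems(const Meta& meta, size_t n, const void* src, void* dst) {
      if (meta.copy) {
        meta.copy(src, dst, n);
      } else {
        std::memcpy(dst, src, n * meta.itemsize);
      }
    }

    int main() {
      std::vector<std::string> a{"x", "y"}, b(2);
      CopyItems(Meta{sizeof(std::string), CopyStrings}, 2, a.data(), b.data());

      std::vector<float> f{1.f, 2.f}, g(2);
      CopyItems(Meta{sizeof(float), nullptr}, 2, f.data(), g.data());
      return (b[1] == "y" && g[1] == 2.f) ? 0 : 1;
    }

Tensor::Extend below relies on exactly this helper so that growing a tensor of strings does not corrupt its elements.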
@@ -7,6 +7,16 @@ thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;
 
 namespace {
 bool Caffe2UsePinnedCPUAllocator(int*, char***) {
+#ifdef __SANITIZE_ADDRESS__
+  // Note(jiayq): for more details, see
+  //     https://github.com/google/sanitizers/issues/629
+  LOG(WARNING) << "There are known issues between address sanitizer and "
+                  "cudaMallocHost. As a result, caffe2 will not enable pinned "
+                  "memory allocation in asan mode. If you are expecting any "
+                  "behavior that depends on asan, be advised that it is not "
+                  "turned on.";
+  return true;
+#else
   if (!HasCudaGPU()) {
     VLOG(1) << "No GPU present. I won't use pinned allocator then.";
     return true;
@@ -14,6 +24,7 @@ bool Caffe2UsePinnedCPUAllocator(int*, char***) {
   VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator.";
   SetCPUAllocator(new PinnedCPUAllocator());
   return true;
+#endif
 }
 
 REGISTER_CAFFE2_INIT_FUNCTION(Caffe2UsePinnedCPUAllocator,
@@ -116,7 +116,9 @@ CAFFE_DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
  */
 inline unique_ptr<DB> CreateDB(
     const string& db_type, const string& source, Mode mode) {
-  return Caffe2DBRegistry()->Create(db_type, source, mode);
+  auto result = Caffe2DBRegistry()->Create(db_type, source, mode);
+  VLOG(1) << ((!result) ? "not found db " : "found db ") << db_type;
+  return result;
 }
 
 /**
@@ -68,13 +68,13 @@ class InitRegisterer {
 
 #define REGISTER_CAFFE2_INIT_FUNCTION(name, function, description)  \
   namespace {                                                       \
-  ::caffe2::InitRegisterer g_caffe2_initregisterer_name(            \
+  ::caffe2::InitRegisterer g_caffe2_initregisterer_##name(          \
       function, false, description);                                \
   }  // namespace
 
 #define REGISTER_CAFFE2_EARLY_INIT_FUNCTION(name, function, description) \
   namespace {                                                            \
-  ::caffe2::InitRegisterer g_caffe2_initregisterer_name(                 \
+  ::caffe2::InitRegisterer g_caffe2_initregisterer_##name(               \
      function, true, description);                                       \
   }  // namespace
 
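Note: the `##` is the substantive fix here. Without token pasting, every expansion of the macro declares the same identifier, g_caffe2_initregisterer_name, so registering two init functions in one translation unit would be a redefinition error. A minimal sketch of the difference, with a hypothetical RegisterFn standing in for the InitRegisterer machinery:

    #include <cstdio>

    int RegisterFn(const char* n) {
      std::printf("registered %s\n", n);
      return 0;
    }

    // Before the fix: every expansion would declare the same `g_reg_name`,
    // so two registrations in one file collide.
    // #define REGISTER(name) static int g_reg_name = RegisterFn(#name)

    // After the fix: token pasting yields a unique identifier per call site.
    #define REGISTER(name) static int g_reg_##name = RegisterFn(#name)

    REGISTER(alpha);  // declares g_reg_alpha
    REGISTER(beta);   // declares g_reg_beta; no collision

    int main() { return 0; }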
@@ -1,14 +1,70 @@
 #include "caffe2/core/net.h"
 
 #include "caffe2/core/context_gpu.h"
+#include "caffe2/core/flags.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/timer.h"
 #include "caffe2/proto/caffe2.pb.h"
 
+#ifdef CAFFE2_USE_NVTX
+#include <nvToolsExt.h>
+#endif
+
+CAFFE2_DEFINE_bool(caffe2_use_nvtx, false, "Use NVTX ranges for profiling");
+
 namespace caffe2 {
 
 namespace {
 
+using Color = int32_t;
+constexpr Color kRunColor = 0x0000CCFF;    // blue
+constexpr Color kRecordColor = 0x00FF3300; // red
+constexpr Color kWaitColor = 0x0066FF33;   // green
+
+#ifdef CAFFE2_USE_NVTX
+
+class ProfiledRange {
+ public:
+  ProfiledRange(const OperatorDef& def, Color color) {
+    if (!FLAGS_caffe2_use_nvtx) {
+      return;
+    }
+    nvtxEventAttributes_t eventAttrib = {0};
+    eventAttrib.version = NVTX_VERSION;
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+    eventAttrib.colorType = NVTX_COLOR_ARGB;
+    eventAttrib.color = color;
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+    eventAttrib.message.ascii = def.type().c_str();
+    range_ = nvtxRangeStartEx(&eventAttrib);
+    CHECK(range_);
+  }
+
+  ~ProfiledRange() {
+    if (!FLAGS_caffe2_use_nvtx) {
+      return;
+    }
+    nvtxRangeEnd(range_);
+  }
+
+ private:
+  nvtxRangeId_t range_ = 0;
+  DISABLE_COPY_AND_ASSIGN(ProfiledRange);
+};
+
+#else
+
+class ProfiledRange {
+ public:
+  ProfiledRange(const OperatorDef& def, Color color) {}
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(ProfiledRange);
+};
+
+#endif  // ifdef CAFFE2_USE_NVTX
+
 struct Stream;
 
 struct Event {
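Note: ProfiledRange is a scope guard. The NVTX range opens in the constructor and is guaranteed to close in the destructor, even on early return from the annotated scope, and the empty fallback class compiles to nothing when CAFFE2_USE_NVTX is off. A generic sketch of the same RAII pattern using a wall-clock timer instead of NVTX, so it runs without CUDA:

    #include <chrono>
    #include <cstdio>

    // Scope guard in the spirit of ProfiledRange: work happens between
    // construction and destruction, and the destructor always runs.
    class ScopedRange {
     public:
      explicit ScopedRange(const char* label)
          : label_(label), start_(std::chrono::steady_clock::now()) {}
      ~ScopedRange() {
        auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                      std::chrono::steady_clock::now() - start_)
                      .count();
        std::printf("%s: %lld us\n", label_, static_cast<long long>(us));
      }
      ScopedRange(const ScopedRange&) = delete;
      ScopedRange& operator=(const ScopedRange&) = delete;

     private:
      const char* label_;
      std::chrono::steady_clock::time_point start_;
    };

    int main() {
      ScopedRange r("run");  // analogous to ProfiledRange(def, kRunColor)
      volatile long x = 0;
      for (long i = 0; i < 1000000; ++i) x += i;
      return 0;
    }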
@@ -69,6 +125,7 @@ struct Stream {
 
   int gpu_id_{-1};
   cudaStream_t stream_{nullptr};
 
  private:
   DISABLE_COPY_AND_ASSIGN(Stream);
 };
@@ -128,18 +185,24 @@ class AsyncDAGNet : public DAGNetBase {
     }));
 
     for (auto source_parent_idx : operator_nodes_[source_idx].parents_) {
+      ProfiledRange r(
+          operator_nodes_[source_parent_idx].operator_->def(), kWaitColor);
       stream.wait(events_[source_parent_idx].get());
     }
 
     // We've waited on all our parent indices.
     bool success = true;
     for (auto idx : chain) {
+      ProfiledRange r(operator_nodes_[idx].operator_->def(), kRunColor);
       success &= operator_nodes_[idx].operator_->RunAsync();
     }
 
     // Record an event for the sink of the chain.
     const auto& sink_idx = chain.back();
+    {
+      ProfiledRange r(operator_nodes_[sink_idx].operator_->def(), kRecordColor);
       events_[sink_idx]->record(stream);
+    }
     CHECK(!eventRecorded_[sink_idx]);
     eventRecorded_[sink_idx] = 1;
     return success;
@@ -157,9 +220,11 @@ class AsyncDAGNet : public DAGNetBase {
     Stream stream{device_option};
 
     // Potential optimization: we can pre-compute outstanding events.
-    for (auto& event : events_) {
+    for (auto i = 0; i < events_.size(); ++i) {
+      auto& event = events_[i];
       if (event->outstanding_) {
         VLOG(2) << "Synchronizing host on outstanding event";
+        ProfiledRange r(operator_nodes_[i].operator_->def(), kWaitColor);
         stream.wait(event.get());
       }
     }
@@ -2,17 +2,6 @@
 #include "caffe2/core/flags.h"
 
 CAFFE2_DEFINE_bool(
-    caffe2_keep_on_shrink, false,
+    caffe2_keep_on_shrink,
+    true,
     "If set, keeps memory when a tensor is shrinking its size.");
-
-namespace caffe2 {
-
-namespace detail {
-
-vector<TIndex>& shape(size_t n) {
-  static thread_local vector<TIndex> r;
-  r.resize(n);
-  return r;
-}
-}
-}
@@ -150,6 +150,40 @@ class Tensor {
 
   virtual ~Tensor() {}
 
+  /**
+   * @brief Extends the outer-most dimension of this tensor by num elements,
+   * preserving the existing data.
+   *
+   * The underlying data may be reallocated in order to accommodate the new
+   * elements, in which case this tensor's capacity is grown at a factor of
+   * growthPct. This ensures that Extend runs in amortized O(1) time
+   * complexity.
+   */
+  template <class ContextForCopy>
+  void Extend(TIndex num, int growthPct, ContextForCopy* context) {
+    CHECK_GE(dims_.size(), 1);
+    auto oldSize = size_;
+    auto newDims = dims_;
+    newDims[0] += num;
+    if (!data_) {
+      Resize(newDims);
+      return;
+    }
+    auto newSize = std::accumulate(
+        newDims.begin(), newDims.end(), 1, std::multiplies<TIndex>());
+    if (newSize * meta_.itemsize() > capacity_) {
+      auto newCapacity = dims_;
+      newCapacity[0] = std::max(newDims[0], dims_[0] * (growthPct + 100) / 100);
+      auto oldData = std::move(data_);
+      Resize(newCapacity);
+      auto* newData = raw_mutable_data(meta_);
+      context->template CopyItems<ContextForCopy, ContextForCopy>(
+          meta_, oldSize, oldData.get(), newData);
+    }
+    dims_ = newDims;
+    size_ = newSize;
+  }
+
   /**
    * @brief Resizes a tensor.
    *
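Note on the amortized O(1) claim: the new capacity is at least dims_[0] * (growthPct + 100) / 100, so repeatedly extending by one row reallocates only logarithmically often and the total bytes copied stay linear in the final size. A rough sketch of the growth rule with illustrative numbers; this models only the capacity arithmetic, not the real Tensor bookkeeping:

    #include <algorithm>
    #include <cstdio>

    int main() {
      const int growthPct = 40;
      long dim0 = 1, capacity0 = 1, reallocs = 0;
      // Extend by one outer row at a time, as Tensor::Extend(1, 40, ctx) would.
      for (int step = 0; step < 10000; ++step) {
        ++dim0;
        if (dim0 > capacity0) {
          capacity0 = std::max(dim0, capacity0 * (growthPct + 100) / 100);
          ++reallocs;  // this is when Extend copies the old data over
        }
      }
      // Far fewer reallocations than rows: geometric growth at work.
      std::printf("rows=%ld capacity=%ld reallocs=%ld\n",
                  dim0, capacity0, reallocs);
      return 0;
    }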
@@ -297,9 +331,12 @@ class Tensor {
     CHECK(data_.get() || size_ == 0)
         << "The tensor is uninitialized. You probably need to call "
         << "Resize() and mutable_data() first.";
-    CHECK(IsType<T>())
-        << "Tensor type mistmatch, caller expects elements to be "
-        << TypeMeta::Name<T>() << " while tensor contains " << meta_.name();
+    CAFFE_ENFORCE(
+        IsType<T>(),
+        "Tensor type mismatch, caller expects elements to be ",
+        TypeMeta::Name<T>(),
+        " while tensor contains ",
+        meta_.name());
     return static_cast<T*>(data_.get());
   }
 
@@ -12,34 +12,39 @@
 namespace caffe2 {
 
 namespace {
-// Returns a function that returns `true` if we should continue
-// iterating, given the current iteration count.
-std::function<bool(int)> getContinuationTest(
-    Workspace* ws,
-    const ExecutionStep& step) {
-  if (step.has_criteria_network()) {
-    CHECK(!step.has_num_iter())
-        << "Must not specify num_iter if critera_network is set";
-  }
-  if (!step.has_criteria_network()) {
-    int iterations = step.has_num_iter() ? step.num_iter() : 1;
-    VLOG(1) << "Executing step for " << iterations << " iterations.";
-    return [=](int i) { return i < iterations; };
-  }
-  auto* criteria_network = ws->GetNet(step.criteria_network());
-  CHECK_NOTNULL(criteria_network);
-  CHECK_EQ(criteria_network->external_output().size(), 1);
-  const auto& criteria_output = criteria_network->external_output().front();
-  VLOG(1) << "Executing step controlled by criteria output: "
-          << criteria_output;
-  return [=](int) {
-    criteria_network->Run();
-    const auto& blob = ws->GetBlob(criteria_output)->Get<TensorCPU>();
-    CHECK_EQ(blob.size(), 1);
-    CHECK(blob.IsType<bool>());
-    return blob.template data<bool>()[0] > 0;
-  };
+// Try to get the should_stop signal, a scalar bool blob value.
+// If the blob doesn't exist or is not initialized, return false.
+const bool getShouldStop(const Blob* b) {
+  if (!b || !b->meta().id()) { // does not exist or uninitialized
+    return false;
+  }
+
+  const auto& t = b->Get<TensorCPU>();
+  CAFFE_ENFORCE(t.IsType<bool>() && t.size() == 1, "expects a scalar boolean");
+  return *(t.template data<bool>());
+}
+
+// Returns a function that returns `true` if we should continue
+// iterating, given the current iteration count.
+std::function<bool(int64_t)> getContinuationTest(
+    Workspace* ws,
+    const ExecutionStep& step) {
+  if (step.has_should_stop_blob()) {
+    CAFFE_ENFORCE(
+        !step.has_num_iter(),
+        "Must not specify num_iter if should_stop_blob is set");
+  }
+
+  if (!step.has_should_stop_blob()) {
+    int64_t iterations = step.has_num_iter() ? step.num_iter() : 1;
+    VLOG(1) << "Will execute step " << step.name() << " for " << iterations
+            << " iterations.";
+    return [=](int64_t i) { return i < iterations; };
+  } else {
+    VLOG(1) << "Will execute step " << step.name() << " until stopped by blob "
+            << step.should_stop_blob();
+    return [](int64_t i) { return true; };
+  }
 };
 }  // namespace
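Note: returning std::function<bool(int64_t)> lets the caller AND the step's own continuation test with an externally supplied one; ExecuteStepRecursive below builds exactly that conjunction. A minimal sketch of the composition with hypothetical predicates, independent of the Workspace API:

    #include <cstdio>
    #include <functional>

    using ShouldContinue = std::function<bool(int64_t)>;

    int main() {
      // Analogue of a num_iter-based test: run 5 iterations.
      ShouldContinue netShouldContinue = [](int64_t i) { return i < 5; };
      // Analogue of an external stop signal: give up after 3 iterations.
      ShouldContinue externalShouldContinue = [](int64_t i) { return i < 3; };

      // The same conjunction ExecuteStepRecursive builds before its loop.
      auto shouldContinue = [&](int64_t iter) {
        return externalShouldContinue(iter) && netShouldContinue(iter);
      };

      for (int64_t iter = 0; shouldContinue(iter); ++iter) {
        std::printf("iteration %lld\n", static_cast<long long>(iter));
      }
      return 0;  // prints iterations 0, 1, 2
    }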
@@ -229,10 +234,17 @@ struct Reporter {
   }
 
+#define CHECK_SHOULD_STOP(shouldStop)                     \
+  if (getShouldStop(shouldStop)) {                        \
+    VLOG(1) << "Execution stopped by should_stop_blob";   \
+    return true;                                          \
+  }
+
 bool Workspace::ExecuteStepRecursive(
     const ExecutionStep& step,
     ShouldContinue externalShouldContinue) {
-  LOG(INFO) << "Running execution step " << step.name();
+  VLOG(1) << "Running execution step " << step.name();
 
   if (!(step.substep_size() == 0 || step.network_size() == 0)) {
     LOG(ERROR) << "An ExecutionStep should either have substep or networks "
                << "but not both.";
@@ -247,22 +259,40 @@ bool Workspace::ExecuteStepRecursive(
     if (net_map_.count(step.report_net()) == 0) {
       LOG(ERROR) << "Report net " << step.report_net() << " not found.";
     }
+    VLOG(1) << "Starting reporter net";
     reporter.start(net_map_[step.report_net()].get(), step.report_interval());
   }
 
+  const Blob* shouldStop = nullptr;
+  if (step.has_should_stop_blob()) {
+    shouldStop = GetBlob(step.should_stop_blob());
+    CAFFE_ENFORCE(
+        shouldStop, "blob ", step.should_stop_blob(), " does not exist");
+  }
+
   const auto netShouldContinue = getContinuationTest(this, step);
-  const auto shouldContinue = [&](int iter) {
+  const auto shouldContinue = [&](int64_t iter) {
     return externalShouldContinue(iter) && netShouldContinue(iter);
   };
   if (step.substep_size()) {
-    for (int iter = 0; shouldContinue(iter); ++iter) {
-      // we assume that, if we have substeps, each substep is going to take a
-      // reasonable amount of time, so logging here is fine
-      LOG(INFO) << "Execution step " << step.name()
-                << ": Starting iteration " << iter;
+    for (int64_t iter = 0; shouldContinue(iter); ++iter) {
+      VLOG(1) << "Execution step " << step.name() << ": iteration " << iter;
+
+      if (!step.concurrent_substeps() || step.substep().size() <= 1) {
+        auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
+          return externalShouldContinue(iter);
+        };
+
+        for (auto& ss : step.substep()) {
+          if (!ExecuteStepRecursive(ss, substepShouldContinue)) {
+            return false;
+          }
+          CHECK_SHOULD_STOP(shouldStop);
+        }
+      } else {
       std::atomic<int> next_substep{0};
       std::atomic<bool> got_failure{false};
-      auto substepShouldContinue = [&, externalShouldContinue](int iter) {
+      auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
         return !got_failure && externalShouldContinue(iter);
       };
       auto worker = [&]() {
@@ -271,26 +301,26 @@ bool Workspace::ExecuteStepRecursive(
         if (got_failure || (substep_id >= step.substep().size())) {
           break;
         }
-        if (!ExecuteStepRecursive(step.substep().Get(substep_id),
-                                  substepShouldContinue)) {
+        if (!ExecuteStepRecursive(
+                step.substep().Get(substep_id), substepShouldContinue)) {
          got_failure = true;
         }
       }
     };
-    if (!step.concurrent_substeps() || step.substep().size() <= 1) {
-      worker();
-    } else {
       std::vector<std::thread> threads;
-      for (int i = 0; i < step.substep().size(); ++i) {
+      for (int64_t i = 0; i < step.substep().size(); ++i) {
        threads.emplace_back(worker);
      }
      for (auto& thread: threads) {
        thread.join();
      }
-    }
       if (got_failure) {
         return false;
       }
+      // concurrent substeps should be careful about setting should_stop_blob
+      CHECK_SHOULD_STOP(shouldStop);
+      }
     }
     return true;
   } else {
@@ -305,16 +335,19 @@ bool Workspace::ExecuteStepRecursive(
       VLOG(1) << "Going to execute network " << network_name;
       networks.push_back(net_map_[network_name].get());
     }
-    for (int iter = 0; shouldContinue(iter); ++iter) {
+    for (int64_t iter = 0; shouldContinue(iter); ++iter) {
       VLOG(1) << "Executing network iteration " << iter;
       for (NetBase* network : networks) {
         if (!network->Run()) {
           return false;
         }
+        CHECK_SHOULD_STOP(shouldStop);
       }
     }
   }
   return true;
 }
 
+#undef CHECK_SHOULD_STOP
+
 }  // namespace caffe2
caffe2/operators/atomic_ops.cc (new file, 73 lines)
@@ -0,0 +1,73 @@
+#include <mutex>
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+
+namespace caffe2 {
+namespace fb {
+namespace {
+
+class CreateMutexOp final : public Operator<CPUContext> {
+ public:
+  CreateMutexOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    *OperatorBase::Output<std::unique_ptr<std::mutex>>(0) =
+        std::unique_ptr<std::mutex>(new std::mutex);
+    return true;
+  }
+};
+
+class AtomicFetchAddOp final : public Operator<CPUContext> {
+ public:
+  AtomicFetchAddOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    auto& mutex = OperatorBase::Input<std::unique_ptr<std::mutex>>(0);
+    auto& a = Input(1);
+    auto& b = Input(2);
+    auto* c = Output(0);
+    auto* d = Output(1);
+    c->Resize(std::vector<TIndex>());
+    d->Resize(std::vector<TIndex>());
+    auto* aPtr = a.data<int32_t>();
+    auto* bPtr = b.data<int32_t>();
+    auto* cPtr = c->mutable_data<int32_t>();
+    auto* dPtr = d->mutable_data<int32_t>();
+    std::lock_guard<std::mutex> lg(*mutex);
+    *dPtr = *aPtr;
+    *cPtr = *aPtr + *bPtr;
+    return true;
+  }
+};
+
+REGISTER_CPU_OPERATOR(CreateMutex, CreateMutexOp);
+REGISTER_CPU_OPERATOR(AtomicFetchAdd, AtomicFetchAddOp);
+
+OPERATOR_SCHEMA(CreateMutex)
+    .NumInputs(0)
+    .NumOutputs(1)
+    .SetDoc("Creates an unlocked mutex and returns it in a unique_ptr blob.")
+    .Output(0, "mutex_ptr", "Blob containing a std::unique_ptr<mutex>.");
+
+OPERATOR_SCHEMA(AtomicFetchAdd)
+    .NumInputs(3)
+    .NumOutputs(2)
+    .SetDoc(R"DOC(
+Given a mutex and two int32 scalar tensors, performs an atomic fetch add
+by mutating the first argument and adding it to the second input
+argument. Returns the updated integer and the value prior to the update.
+)DOC")
+    .Input(0, "mutex_ptr", "Blob containing a unique_ptr<mutex>.")
+    .Input(1, "mut_value", "Value to be mutated after the sum.")
+    .Input(2, "increment", "Value to add to the first operand.")
+    .Output(0, "mut_value", "Mutated value after sum. Usually same as input 1.")
+    .Output(1, "fetched_value", "Value of the first operand before sum.")
+    .AllowInplace({{1, 0}});
+
+SHOULD_NOT_DO_GRADIENT(CreateMutex);
+SHOULD_NOT_DO_GRADIENT(AtomicFetchAdd);
+}  // namespace
+}  // namespace fb
+}  // namespace caffe2
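Note: the contract of AtomicFetchAdd is that output 1 receives the old value of mut_value and output 0 receives old + increment, with the whole read-modify-write serialized by the mutex blob; AllowInplace({{1, 0}}) permits output 0 to alias input 1. A standalone sketch of the same semantics using a plain std::mutex and no Caffe2 types (the write-back shown models the aliased in-place case, which is an assumption about that configuration):

    #include <cstdint>
    #include <cstdio>
    #include <mutex>

    // The same read-modify-write AtomicFetchAdd performs under its mutex blob:
    // *fetched gets the old value, *updated gets old + increment.
    void AtomicFetchAdd(std::mutex& m, int32_t& value, int32_t increment,
                        int32_t* updated, int32_t* fetched) {
      std::lock_guard<std::mutex> lg(m);
      *fetched = value;
      *updated = value + increment;
      value = *updated;  // models the in-place case, as with AllowInplace
    }

    int main() {
      std::mutex m;
      int32_t counter = 5, updated = 0, fetched = 0;
      AtomicFetchAdd(m, counter, 2, &updated, &fetched);
      std::printf("fetched=%d updated=%d\n", fetched, updated);  // 5 and 7
      return 0;
    }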
caffe2/operators/boolean_mask_ops.cc (new file, 143 lines)
@@ -0,0 +1,143 @@
+#include "caffe2/core/operator.h"
+#include "caffe2/core/tensor.h"
+
+namespace caffe2 {
+namespace {
+
+template <class Context>
+class BooleanMaskLengthsOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  BooleanMaskLengthsOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
+  }
+
+  template <typename T>
+  bool DoRunWithType() {
+    auto& lengths = Input(0);
+    auto& mask = Input(1);
+    auto* lengthsOut = Output(0);
+    CAFFE_ENFORCE(lengths.ndim() == 1);
+    CAFFE_ENFORCE(mask.ndim() == 1);
+    const auto* lengthsPtr = lengths.template data<T>();
+    const auto* maskPtr = mask.template data<bool>();
+    auto totalLength =
+        std::accumulate(lengthsPtr, lengthsPtr + lengths.size(), 0);
+    CAFFE_ENFORCE(mask.size() == totalLength);
+    lengthsOut->ResizeLike(lengths);
+    auto* lengthsOutPtr = lengthsOut->template mutable_data<T>();
+    int p = 0;
+    for (int i = 0; i < lengths.size(); ++i) {
+      T lengthOut = 0;
+      for (int j = 0; j < lengthsPtr[i]; ++j) {
+        if (maskPtr[p++]) {
+          ++lengthOut;
+        }
+      }
+      lengthsOutPtr[i] = lengthOut;
+    }
+    return true;
+  }
+};
+
+template <class Context>
+class BooleanMaskOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  BooleanMaskOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    auto& data = Input(0);
+    auto& mask = Input(1);
+    auto* dataOut = Output(0);
+    CAFFE_ENFORCE(data.ndim() >= 1);
+    CAFFE_ENFORCE(mask.ndim() == 1);
+    CAFFE_ENFORCE(data.dims()[0] == mask.dims()[0]);
+
+    const auto* maskPtr = mask.template data<bool>();
+    int numOutputs = 0;
+    int outerSize = mask.size();
+    for (int i = 0; i < outerSize; ++i) {
+      if (maskPtr[i]) {
+        ++numOutputs;
+      }
+    }
+    std::vector<TIndex> outShape;
+    outShape.push_back(numOutputs);
+    outShape.insert(outShape.end(), data.dims().begin() + 1, data.dims().end());
+    dataOut->Resize(outShape);
+    if (numOutputs == 0) {
+      return true;
+    }
+    auto innerSizeBytes = std::accumulate(
+                              data.dims().begin() + 1,
+                              data.dims().end(),
+                              1,
+                              std::multiplies<TIndex>()) *
+        data.meta().itemsize();
+    TIndex lastStart = -1;
+    const auto* inPtr = (char*)data.raw_data();
+    auto* outPtr = (char*)dataOut->raw_mutable_data(data.meta());
+    TIndex outStart = 0;
+    for (TIndex i = 0;; ++i) {
+      // mask was true and either a) became false, or b) sequence finished
+      if (lastStart != -1 && ((i >= outerSize) || !maskPtr[i])) {
+        const auto* src = inPtr + lastStart * innerSizeBytes;
+        auto* dst = outPtr + outStart * innerSizeBytes;
+        int numItems = i - lastStart;
+        if (data.meta().copy()) {
+          data.meta().copy()(src, dst, numItems);
+        } else {
+          context_.template CopyBytes<CPUContext, CPUContext>(
+              numItems * data.meta().itemsize(), src, dst);
+        }
+        outStart += numItems;
+        lastStart = -1;
+      }
+      if (i >= outerSize) {
+        break;
+      }
+      // mask was false and became true
+      if (lastStart == -1 && maskPtr[i]) {
+        lastStart = i;
+      }
+    }
+    return true;
+  }
+};
+
+REGISTER_CPU_OPERATOR(BooleanMask, BooleanMaskOp<CPUContext>);
+REGISTER_CPU_OPERATOR(BooleanMaskLengths, BooleanMaskLengthsOp<CPUContext>);
+
+OPERATOR_SCHEMA(BooleanMask)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Given a 1D data tensor and a mask (boolean) tensor of the same shape, returns
+a tensor containing only the elements corresponding to positions where the
+mask is true.
+)DOC")
+    .Input(0, "data", "The 1D, original data tensor.")
+    .Input(1, "mask", "A tensor of bools of same shape as `data`.")
+    .Output(0, "masked_data", "A tensor of same type as `data`.");
+
+OPERATOR_SCHEMA(BooleanMaskLengths)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Given a tensor of int32 segment lengths and a mask (boolean) tensor, return
+the segment lengths of a corresponding segmented tensor after BooleanMask is
+applied.
+)DOC")
+    .Input(0, "lengths", "A 1D int32 tensor representing segment lengths.")
+    .Input(1, "mask", "A 1D bool tensor of values to keep.")
+    .Output(0, "masked_lengths", "Segment lengths of a masked tensor.");
+
+NO_GRADIENT(BooleanMask);
+NO_GRADIENT(BooleanMaskLengths);
+}  // namespace
+}  // namespace caffe2
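Note: BooleanMaskOp avoids one copy per kept row by tracking runs of consecutive true mask values (lastStart) and flushing each whole run with a single copy. A standalone sketch of that run-detection loop on plain ints, illustrative only:

    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Copies the rows of `data` where mask is true, flushing contiguous runs
    // in single memcpy calls, mirroring BooleanMaskOp's lastStart/outStart loop.
    std::vector<int> BooleanMask(const std::vector<int>& data,
                                 const std::vector<bool>& mask) {
      std::vector<int> out;
      long lastStart = -1;
      for (size_t i = 0;; ++i) {
        // A run was open and either ended or the sequence finished: flush it.
        if (lastStart != -1 && (i >= mask.size() || !mask[i])) {
          size_t numItems = i - lastStart;
          size_t outStart = out.size();
          out.resize(outStart + numItems);
          std::memcpy(out.data() + outStart, data.data() + lastStart,
                      numItems * sizeof(int));
          lastStart = -1;
        }
        if (i >= mask.size()) break;
        if (lastStart == -1 && mask[i]) lastStart = i;  // a run begins
      }
      return out;
    }

    int main() {
      auto out = BooleanMask({1, 2, 3, 4, 5}, {true, true, false, true, false});
      for (int v : out) std::printf("%d ", v);  // prints: 1 2 4
      std::printf("\n");
      return 0;
    }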
@@ -10,8 +10,14 @@ OPERATOR_SCHEMA(Concat).NumInputs(1, INT_MAX).NumOutputs(2);
 // Backward compatibility names.
 REGISTER_CPU_OPERATOR(DepthSplit, SplitOp<CPUContext>);
 REGISTER_CPU_OPERATOR(DepthConcat, ConcatOp<CPUContext>);
-OPERATOR_SCHEMA(DepthSplit).NumInputs(1, 2).NumOutputs(1, INT_MAX);
-OPERATOR_SCHEMA(DepthConcat).NumInputs(1, INT_MAX).NumOutputs(2);
+OPERATOR_SCHEMA(DepthSplit)
+    .NumInputs(1, 2)
+    .NumOutputs(1, INT_MAX)
+    .SetDoc("Backward compatible operator name for Split.");
+OPERATOR_SCHEMA(DepthConcat)
+    .NumInputs(1, INT_MAX)
+    .NumOutputs(2)
+    .SetDoc("Backward compatible operator name for Concat.");
 
 class GetSplitGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
@@ -85,7 +85,9 @@ class ConcatOp final : public Operator<Context> {
 template <class Context>
 bool SplitOp<Context>::RunOnDevice() {
   auto& input = Input(0);
+  const int input_channels = input.dim32(axis_);
   const int* axis_data;
+  vector<int> equal_split;
   if (InputSize() == 2) {
     // We obtain split from the input tensor.
     CHECK_EQ(split_.size(), 0)
@@ -94,13 +96,21 @@ bool SplitOp<Context>::RunOnDevice() {
     auto& split_tensor = OperatorBase::Input<TensorCPU>(1);
     CHECK_EQ(split_tensor.size(), OutputSize());
     axis_data = split_tensor.template data<int>();
+  } else if (split_.size() == 0) {
+    CAFFE_ENFORCE(input_channels % OutputSize() == 0,
+                  "If you did not specify split explicitly, the number of "
+                  "input channels should be divisible by the output size.");
+    equal_split.resize(OutputSize(), input_channels / OutputSize());
+    axis_data = equal_split.data();
   } else {
     // We obtain split from the parameters.
-    CHECK_EQ(split_.size(), OutputSize());
+    CAFFE_ENFORCE(split_.size() == OutputSize(),
+                  "The number of splits specified should be equal to the "
+                  "number of outputs.");
     axis_data = split_.data();
   }
   CHECK_LT(axis_, input.ndim());
-  const int input_channels = input.dim32(axis_);
   CHECK_EQ(std::accumulate(axis_data, axis_data + OutputSize(), 0),
            input_channels)
       << "Sum of split dimensions do not match: should be " << input_channels;
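Note: the new branch lets SplitOp infer an even split when neither the split argument nor a split input is given: 9 input channels across 3 outputs yields {3, 3, 3}, and a non-divisible channel count is an error. A small sketch of the inference, without the operator machinery:

    #include <cassert>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    // Mirrors SplitOp's fallback: divide input channels evenly across outputs.
    std::vector<int> EqualSplit(int input_channels, int output_size) {
      assert(input_channels % output_size == 0 &&
             "input channels should be divisible by the output size");
      return std::vector<int>(output_size, input_channels / output_size);
    }

    int main() {
      auto split = EqualSplit(9, 3);  // -> {3, 3, 3}
      // SplitOp also re-validates that the splits sum to the channel count.
      int sum = std::accumulate(split.begin(), split.end(), 0);
      std::printf("splits: %d %d %d (sum %d)\n",
                  split[0], split[1], split[2], sum);
      return 0;
    }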
@@ -22,11 +22,11 @@ template <typename ArrayOfcudnnConvolutionAlgoPerf_t>
 inline void LogCuDNNPerfStats(
     const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat,
     int returned_algo_count) {
-  LOG(INFO) << "Perf result: (algo: stat, time, memory)";
+  VLOG(1) << "Perf result: (algo: stat, time, memory)";
   for (int i = 0; i < returned_algo_count; ++i) {
     const auto& stat = perf_stat[i];
-    LOG(INFO) << stat.algo << ": " << stat.status
-              << " " << stat.time << " " << stat.memory;
+    VLOG(1) << stat.algo << ": " << stat.status << " " << stat.time << " "
+            << stat.memory;
   }
 }
 }  // namespace
@@ -193,7 +193,7 @@ bool CudnnConvOp<T>::RunOnDevice() {
   if (deterministic_) {
     algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
   } else if (exhaustive_search_) {
-    LOG(INFO) << "CUDNN Convolution: doing exhaustive search.";
+    VLOG(1) << "CUDNN Convolution: doing exhaustive search.";
     // When we do an exhaustive search, we will ignore the workspace size
     // limit and simply go for the fastest algorithm. If you happen to run
     // out of memory later, you will be on your own...
@@ -229,8 +229,8 @@ bool CudnnConvOp<T>::RunOnDevice() {
         cudnn_wrapper_.inline_cudnn_handle(),
         bottom_desc_, filter_desc_, conv_desc_, top_desc_,
         algo_, &cudnn_ws_nbytes_));
-    LOG(INFO) << "CuDNN algorithm: " << algo_;
-    LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
+    VLOG(1) << "CuDNN algorithm: " << algo_;
+    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
   }
 
   // Now, actually run the computation.
@@ -346,7 +346,7 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
     bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
     bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
   } else if (exhaustive_search_) {
-    LOG(INFO) << "CUDNN Convolution bwd: doing exhaustive search.";
+    VLOG(1) << "CUDNN Convolution bwd: doing exhaustive search.";
     // When we do an exhaustive search, we will ignore the workspace size
     // limit and simply go for the fastest algorithm. If you happen to run
     // out of memory later, you will be on your own...
@@ -416,9 +416,9 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
         bwd_data_algo_, &bwd_data_ws_size));
     cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size);
 
-    LOG(INFO) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", "
-              << bwd_data_algo_;
-    LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
+    VLOG(1) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", "
+            << bwd_data_algo_;
+    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
   }
 
   // Now, actually run the computation.
579
caffe2/operators/conv_transpose_op_cudnn.cc
Normal file
579
caffe2/operators/conv_transpose_op_cudnn.cc
Normal file
|
|
@ -0,0 +1,579 @@
|
||||||
|
#include "caffe2/core/common_cudnn.h"
|
||||||
|
#include "caffe2/core/context_gpu.h"
|
||||||
|
#include "caffe2/operators/conv_transpose_op.h"
|
||||||
|
|
||||||
|
namespace caffe2 {
|
||||||
|
|
||||||
|
// Earlier in the days Caffe sets the default cudnn workspace to 8MB. We bump
|
||||||
|
// it up to 64MB in Caffe2, as this enables the use of Winograd in many cases,
|
||||||
|
// something very beneficial to more recent CNN models.
|
||||||
|
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024;
|
||||||
|
|
||||||
|
// Manually specified number of algorithms implemented in CuDNN.
|
||||||
|
// This does not have any performance implications, as we will always find the
|
||||||
|
// fastest algorithm; setting them to the right number of algorithms will enable
|
||||||
|
// us to best report the statistics when doing an exhaustive search, though.
|
||||||
|
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
|
||||||
|
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
|
||||||
|
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
template <typename ArrayOfcudnnConvolutionAlgoPerf_t>
|
||||||
|
inline void LogCuDNNPerfStats(
|
||||||
|
const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat,
|
||||||
|
int returned_algo_count) {
|
||||||
|
LOG(INFO) << "Perf result: (algo: stat, time, memory)";
|
||||||
|
for (int i = 0; i < returned_algo_count; ++i) {
|
||||||
|
const auto& stat = perf_stat[i];
|
||||||
|
LOG(INFO) << stat.algo << ": " << stat.status << " " << stat.time << " "
|
||||||
|
<< stat.memory;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
class CudnnConvTransposeOpBase : public ConvTransposeUnpoolBase<CUDAContext> {
|
||||||
|
public:
|
||||||
|
CudnnConvTransposeOpBase(const OperatorDef& operator_def, Workspace* ws)
|
||||||
|
: ConvTransposeUnpoolBase<CUDAContext>(operator_def, ws),
|
||||||
|
cudnn_wrapper_(&context_),
|
||||||
|
cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
|
||||||
|
"ws_nbytes_limit",
|
||||||
|
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
|
||||||
|
exhaustive_search_(
|
||||||
|
OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)),
|
||||||
|
deterministic_(
|
||||||
|
OperatorBase::GetSingleArgument<int>("deterministic", 0)),
|
||||||
|
cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)) {
|
||||||
|
CHECK(!deterministic_ || !exhaustive_search_);
|
||||||
|
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
|
||||||
|
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
|
||||||
|
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
|
||||||
|
CUDNN_CHECK(cudnnCreateTensorDescriptor(&top_desc_));
|
||||||
|
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
|
||||||
|
}
|
||||||
|
|
||||||
|
~CudnnConvTransposeOpBase() {
|
||||||
|
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bottom_desc_));
|
||||||
|
CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
|
||||||
|
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
|
||||||
|
CUDNN_CHECK(cudnnDestroyTensorDescriptor(top_desc_));
|
||||||
|
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
vector<TIndex> cudnn_input_dims_;
|
||||||
|
vector<TIndex> cudnn_filter_dims_;
|
||||||
|
|
||||||
|
CuDNNWrapper cudnn_wrapper_;
|
||||||
|
cudnnTensorDescriptor_t bottom_desc_;
|
||||||
|
cudnnFilterDescriptor_t filter_desc_;
|
||||||
|
cudnnTensorDescriptor_t bias_desc_;
|
||||||
|
cudnnTensorDescriptor_t top_desc_;
|
||||||
|
cudnnConvolutionDescriptor_t conv_desc_;
|
||||||
|
const size_t cudnn_ws_nbytes_limit_;
|
||||||
|
size_t cudnn_ws_nbytes_;
|
||||||
|
bool exhaustive_search_;
|
||||||
|
bool deterministic_;
|
||||||
|
size_t cudnn_state_;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class CudnnConvTransposeOp final : public CudnnConvTransposeOpBase {
|
||||||
|
public:
|
||||||
|
CudnnConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
|
||||||
|
: CudnnConvTransposeOpBase(operator_def, ws) {}
|
||||||
|
|
||||||
|
~CudnnConvTransposeOp() {}
|
||||||
|
|
||||||
|
bool RunOnDevice() override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
|
||||||
|
// Input: X, W, b
|
||||||
|
// Output: Y
|
||||||
|
INPUT_TAGS(INPUT, FILTER, BIAS);
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class CudnnConvTransposeGradientOp final : public CudnnConvTransposeOpBase {
|
||||||
|
public:
|
||||||
|
CudnnConvTransposeGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||||
|
: CudnnConvTransposeOpBase(operator_def, ws) {}
|
||||||
|
|
||||||
|
~CudnnConvTransposeGradientOp() {}
|
||||||
|
|
||||||
|
bool RunOnDevice() override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudnnConvolutionFwdAlgo_t algo_;
|
||||||
|
cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
|
||||||
|
// input: X, W, dY
|
||||||
|
// output: dW, db, and optionally dX
|
||||||
|
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
|
||||||
|
OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD);
|
||||||
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Implementations
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool CudnnConvTransposeOp<T>::RunOnDevice() {
|
||||||
|
auto& X = Input(INPUT);
|
||||||
|
auto& filter = Input(FILTER);
|
||||||
|
auto& bias = Input(BIAS);
|
||||||
|
auto* Y = Output(0);
|
||||||
|
int C = 0;
|
||||||
|
switch (order_) {
|
||||||
|
case StorageOrder::NHWC:
|
||||||
|
C = filter.dim32(3);
|
||||||
|
break;
|
||||||
|
case StorageOrder::NCHW:
|
||||||
|
C = filter.dim32(1);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
LOG(FATAL) << "Unknown storage order: " << order_;
|
||||||
|
}
|
||||||
|
ConvTransposeUnpoolBase<CUDAContext>::SetOutputSize(X, Y, C);
|
||||||
|
|
||||||
|
int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0;
|
||||||
|
switch (order_) {
|
||||||
|
case StorageOrder::NHWC:
|
||||||
|
N = X.dim32(0);
|
||||||
|
H = X.dim32(1);
|
||||||
|
W = X.dim32(2);
|
||||||
|
M = X.dim32(3);
|
||||||
|
H_out = Y->dim32(1);
|
||||||
|
W_out = Y->dim32(2);
|
||||||
|
DCHECK_EQ(filter.dim32(1), kernel_h_);
|
||||||
|
DCHECK_EQ(filter.dim32(1), kernel_h_);
|
||||||
|
DCHECK_EQ(filter.dim32(2), kernel_w_);
|
||||||
|
DCHECK_EQ(filter.dim32(3), C);
|
||||||
|
break;
|
||||||
|
case StorageOrder::NCHW:
|
||||||
|
N = X.dim32(0);
|
||||||
|
M = X.dim32(1);
|
||||||
|
H = X.dim32(2);
|
||||||
|
W = X.dim32(3);
|
||||||
|
H_out = Y->dim32(2);
|
||||||
|
W_out = Y->dim32(3);
|
||||||
|
DCHECK_EQ(filter.dim32(1), C);
|
||||||
|
DCHECK_EQ(filter.dim32(2), kernel_h_);
|
||||||
|
DCHECK_EQ(filter.dim32(3), kernel_w_);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
LOG(FATAL) << "Unknown storage order: " << order_;
|
||||||
|
}
|
||||||
|
|
||||||
|
DCHECK_EQ(bias.ndim(), 1);
|
||||||
|
DCHECK_EQ(bias.dim32(0), C);
|
||||||
|
|
||||||
|
// Set up the cudnn algorithms & workspace if necessary
|
||||||
|
bool input_changed = (X.dims() != cudnn_input_dims_);
|
||||||
|
bool filter_changed = (filter.dims() != cudnn_filter_dims_);
|
||||||
|
|
||||||
|
if (input_changed || filter_changed) {
|
||||||
|
VLOG(1) << "Changing the cudnn descriptor configurations.";
|
||||||
|
if (input_changed) {
|
||||||
|
cudnn_input_dims_ = X.dims();
|
||||||
|
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
|
||||||
|
bottom_desc_,
|
||||||
|
GetCudnnTensorFormat(order_),
|
||||||
|
cudnnTypeWrapper<T>::type,
|
||||||
|
N,
|
||||||
|
M,
|
||||||
|
H,
|
||||||
|
W));
|
||||||
|
}
|
||||||
|
if (filter_changed) {
|
||||||
|
cudnn_filter_dims_ = filter.dims();
|
||||||
|
CUDNN_CHECK(cudnnSetFilter4dDescriptor(
|
||||||
|
filter_desc_,
|
||||||
|
cudnnTypeWrapper<T>::type,
|
||||||
|
GetCudnnTensorFormat(order_),
|
||||||
|
M,
|
||||||
|
C,
|
||||||
|
kernel_h_,
|
||||||
|
kernel_w_));
|
||||||
|
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
|
||||||
|
bias_desc_,
|
||||||
|
GetCudnnTensorFormat(order_),
|
||||||
|
cudnnTypeWrapper<T>::type,
|
||||||
|
1,
|
||||||
|
C,
|
||||||
|
1,
|
||||||
|
1));
|
||||||
|
}
|
||||||
|
// Set the output
|
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        top_desc_,
        GetCudnnTensorFormat(order_),
        cudnnTypeWrapper<T>::type,
        N,
        C,
        H_out,
        W_out));
    // Set the convolution descriptor
    CHECK_EQ(pad_t_, pad_b_)
        << "The current padding scheme leads to unequal padding on the top and "
           "bottom, which is not supported by cudnn.";
    CHECK_EQ(pad_l_, pad_r_)
        << "The current padding scheme leads to unequal padding on the left "
           "and right, which is not supported by cudnn.";
    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
        conv_desc_,
        pad_t_,
        pad_l_,
        stride_h_,
        stride_w_,
        1,
        1,
        CUDNN_CROSS_CORRELATION));
    if (deterministic_) {
      bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    } else if (exhaustive_search_) {
      int returned_algo_count;
      std::array<cudnnConvolutionBwdDataAlgoPerf_t, kNUM_CUDNN_BWD_DATA_ALGS>
          data_perf_stat;
      cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
        state->workspace().reset();
        CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
            state->cudnn_handle(),
            filter_desc_,
            bottom_desc_,
            conv_desc_,
            top_desc_,
            kNUM_CUDNN_BWD_DATA_ALGS,
            &returned_algo_count,
            data_perf_stat.data()));
      });

      LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
      bwd_data_algo_ = data_perf_stat[0].algo;
    } else {
      CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          filter_desc_,
          bottom_desc_,
          conv_desc_,
          top_desc_,
          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &bwd_data_algo_));
    }

    size_t bwd_data_ws_size;
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        filter_desc_,
        bottom_desc_,
        conv_desc_,
        top_desc_,
        bwd_data_algo_,
        &bwd_data_ws_size));
    cudnn_ws_nbytes_ = bwd_data_ws_size;
    LOG(INFO) << "CuDNN algorithm: " << bwd_data_algo_;
    LOG(INFO) << "CuDNN workspace size: " << bwd_data_ws_size;
  }

  // Now, actually run the computation. The "forward" pass of ConvTranspose
  // maps onto cudnn's backward-data convolution kernel.
  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
    CUDNN_CHECK(cudnnConvolutionBackwardData(
        state->cudnn_handle(),
        cudnnTypeWrapper<T>::kOne(),
        filter_desc_,
        filter.template data<T>(),
        bottom_desc_,
        X.template data<T>(),
        conv_desc_,
        bwd_data_algo_,
        state->workspace().get(cudnn_ws_nbytes_),
        cudnn_ws_nbytes_,
        cudnnTypeWrapper<T>::kZero(),
        top_desc_,
        Y->template mutable_data<T>()));
  });
  // Bias
  CUDNN_CHECK(cudnnAddTensor(
      cudnn_wrapper_.inline_cudnn_handle(),
      cudnnTypeWrapper<T>::kOne(),
      bias_desc_,
      bias.template data<T>(),
      cudnnTypeWrapper<T>::kOne(),
      top_desc_,
      Y->template mutable_data<T>()));
  // Done.
  return true;
}

// TODO(Yangqing): a lot of the function contents are very similar. Consider
// consolidating them.
template <typename T>
bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);
  auto* dfilter = Output(FILTER_GRAD);
  auto* dbias = Output(BIAS_GRAD);
  DCHECK_EQ(X.ndim(), 4);
  DCHECK_EQ(filter.ndim(), 4);
  auto* Y = Output(0);
  int C = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      C = filter.dim32(3);
      break;
    case StorageOrder::NCHW:
      C = filter.dim32(1);
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }
  ConvTransposeUnpoolBase<CUDAContext>::SetOutputSize(X, Y, C);

  int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0);
      H = X.dim32(1);
      W = X.dim32(2);
      M = X.dim32(3);
      H_out = dY.dim32(1);
      W_out = dY.dim32(2);
      DCHECK_EQ(filter.dim32(1), kernel_h_);
      DCHECK_EQ(filter.dim32(2), kernel_w_);
      DCHECK_EQ(filter.dim32(3), C);
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0);
      M = X.dim32(1);
      H = X.dim32(2);
      W = X.dim32(3);
      H_out = dY.dim32(2);
      W_out = dY.dim32(3);
      DCHECK_EQ(filter.dim32(1), C);
      DCHECK_EQ(filter.dim32(2), kernel_h_);
      DCHECK_EQ(filter.dim32(3), kernel_w_);
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }
  // Since we only handle LegacyPadding::NOTSET, we don't need to
  // compute padding.
  dfilter->ResizeLike(filter);
  dbias->Resize(C);

  // Set up the cudnn algorithms & workspace if necessary
  bool input_changed = (X.dims() != cudnn_input_dims_);
  bool filter_changed = (filter.dims() != cudnn_filter_dims_);
  if (input_changed || filter_changed) {
    VLOG(1) << "Changing the cudnn descriptor configurations.";
    if (input_changed) {
      cudnn_input_dims_ = X.dims();
      CUDNN_CHECK(cudnnSetTensor4dDescriptor(
          bottom_desc_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T>::type,
          N,
          M,
          H,
          W));
    }
    if (filter_changed) {
      cudnn_filter_dims_ = filter.dims();
      CUDNN_CHECK(cudnnSetFilter4dDescriptor(
          filter_desc_,
          cudnnTypeWrapper<T>::type,
          GetCudnnTensorFormat(order_),
          M,
          C,
          kernel_h_,
          kernel_w_));
      CUDNN_CHECK(cudnnSetTensor4dDescriptor(
          bias_desc_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T>::type,
          1,
          C,
          1,
          1));
    }
    // Set the output
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        top_desc_,
        GetCudnnTensorFormat(order_),
        cudnnTypeWrapper<T>::type,
        N,
        C,
        H_out,
        W_out));
    // Set the convolution descriptor
    CHECK_EQ(pad_t_, pad_b_)
        << "The current padding scheme leads to unequal padding on the top and "
           "bottom, which is not supported by cudnn.";
    CHECK_EQ(pad_l_, pad_r_)
        << "The current padding scheme leads to unequal padding on the left "
           "and right, which is not supported by cudnn.";
    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
        conv_desc_,
        pad_t_,
        pad_l_,
        stride_h_,
        stride_w_,
        1,
        1,
        CUDNN_CROSS_CORRELATION));
    // Set the workspace

    size_t bwd_filter_ws_size, fwd_ws_size;

    if (deterministic_) {
      algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
    } else if (exhaustive_search_) {
      LOG(INFO) << "CUDNN Convolution bwd: doing exhaustive search.";
      // When we do an exhaustive search, we will ignore the workspace size
      // limit and simply go for the fastest algorithm. If you happen to run
      // out of memory later, you will be on your own...
      int returned_algo_count;
      // We clean up the current workspace memory so that the algorithm
      // search is free to allocate memory.
      // Actually run the search.
      std::array<cudnnConvolutionBwdFilterAlgoPerf_t, kNUM_CUDNN_BWD_FILTER_ALGS>
          filter_perf_stat;

      cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
        state->workspace().reset();
        CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
            state->cudnn_handle(),
            top_desc_,
            bottom_desc_,
            conv_desc_,
            filter_desc_,
            kNUM_CUDNN_BWD_FILTER_ALGS,
            &returned_algo_count,
            filter_perf_stat.data()));
      });
      LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
      bwd_filter_algo_ = filter_perf_stat[0].algo;

      std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
          fwd_perf_stat;
      cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
        state->workspace().reset();
        CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
            state->cudnn_handle(),
            top_desc_,
            filter_desc_,
            conv_desc_,
            bottom_desc_,
            kNUM_CUDNN_FWD_ALGS,
            &returned_algo_count,
            fwd_perf_stat.data()));
      });

      LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count);
      algo_ = fwd_perf_stat[0].algo;
    } else {
      // choose backward algorithm for the filter gradient
      CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          top_desc_,
          bottom_desc_,
          conv_desc_,
          filter_desc_,
          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &bwd_filter_algo_));
      // choose a forward algorithm, used below to compute the input gradient
      CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          top_desc_,
          filter_desc_,
          conv_desc_,
          bottom_desc_,
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &algo_));
    }
    // get workspace size for the backward filter algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        top_desc_,
        bottom_desc_,
        conv_desc_,
        filter_desc_,
        bwd_filter_algo_,
        &bwd_filter_ws_size));
    // get workspace size for the forward algorithm
    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        top_desc_,
        filter_desc_,
        conv_desc_,
        bottom_desc_,
        algo_,
        &fwd_ws_size));
    cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, fwd_ws_size);

    LOG(INFO) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", " << algo_;
    LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
  }

  // Now, actually run the computation.
  CUDNN_CHECK(cudnnConvolutionBackwardBias(
      cudnn_wrapper_.inline_cudnn_handle(),
      cudnnTypeWrapper<T>::kOne(),
      top_desc_,
      dY.template data<T>(),
      cudnnTypeWrapper<T>::kZero(),
      bias_desc_,
      dbias->template mutable_data<T>()));

  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
    CUDNN_CHECK(cudnnConvolutionBackwardFilter(
        state->cudnn_handle(),
        cudnnTypeWrapper<T>::kOne(),
        top_desc_,
        dY.template data<T>(),
        bottom_desc_,
        X.template data<T>(),
        conv_desc_,
        bwd_filter_algo_,
        state->workspace().get(cudnn_ws_nbytes_),
        cudnn_ws_nbytes_,
        cudnnTypeWrapper<T>::kZero(),
        filter_desc_,
        dfilter->template mutable_data<T>()));
    if (OutputSize() == 3) {
      // Compute the gradient w.r.t. the input. For ConvTranspose this is a
      // regular forward convolution of dY with the filter.
      auto* dX = Output(INPUT_GRAD);
      dX->ResizeLike(X);
      CUDNN_CHECK(cudnnConvolutionForward(
          state->cudnn_handle(),
          cudnnTypeWrapper<T>::kOne(),
          top_desc_,
          dY.template data<T>(),
          filter_desc_,
          filter.template data<T>(),
          conv_desc_,
          algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          cudnnTypeWrapper<T>::kZero(),
          bottom_desc_,
          dX->template mutable_data<T>()));
    }
  });
  return true;
}

REGISTER_CUDNN_OPERATOR(ConvTranspose, CudnnConvTransposeOp<float>);
REGISTER_CUDNN_OPERATOR(
    ConvTransposeGradient,
    CudnnConvTransposeGradientOp<float>);

}  // namespace caffe2
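
A note on the kernel pairing in the two ops above (our annotation, not part of the commit): the calls can look inverted, since the gradient op invokes cudnnConvolutionForward. Writing each output tap of ConvTranspose as multiplication by the transposed filter matrix makes the mapping clear:

y = W^{\top} x
\quad\Rightarrow\quad
\frac{\partial \ell}{\partial x} = W\,\frac{\partial \ell}{\partial y},
\qquad
\frac{\partial \ell}{\partial W} = f\!\left(x, \frac{\partial \ell}{\partial y}\right),

so the forward pass maps onto cudnnConvolutionBackwardData, the input gradient onto cudnnConvolutionForward, and the filter gradient onto cudnnConvolutionBackwardFilter, which is exactly the set of calls made above.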

@@ -118,9 +118,13 @@ class ConvTransposeUnpoolBase : public Operator<Context> {
    return true;
  }

-  virtual bool RunOnDeviceWithOrderNCHW() = 0;
+  virtual bool RunOnDeviceWithOrderNCHW() {
+    CAFFE_THROW("Not implemented");
+  }

-  virtual bool RunOnDeviceWithOrderNHWC() = 0;
+  virtual bool RunOnDeviceWithOrderNHWC() {
+    CAFFE_THROW("Not implemented");
+  }

  virtual ~ConvTransposeUnpoolBase() {}

46  caffe2/operators/counter_ops.cc  Normal file

@@ -0,0 +1,46 @@
#include "counter_ops.h"

namespace caffe2 {
namespace {

REGISTER_CPU_OPERATOR(CreateCounter, CreateCounterOp<int32_t>);
REGISTER_CPU_OPERATOR(ResetCounter, ResetCounterOp<int32_t>);
REGISTER_CPU_OPERATOR(CountDown, CountDownOp<int32_t>);

OPERATOR_SCHEMA(CreateCounter)
    .NumInputs(0)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Creates a count-down counter with initial value specified by the 'init_count'
argument.
)DOC")
    .Output(0, "counter", "A blob pointing to an instance of a new counter.")
    .Arg("init_count", "Initial count for the counter; must be >= 0.");

OPERATOR_SCHEMA(ResetCounter)
    .NumInputs(1)
    .NumOutputs(0)
    .SetDoc(R"DOC(
Resets a count-down counter to the value specified by the 'init_count'
argument.
)DOC")
    .Input(0, "counter", "A blob pointing to an instance of an existing counter.")
    .Arg("init_count", "Resets counter to this value; must be >= 0.");

OPERATOR_SCHEMA(CountDown)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
If the internal count value > 0, decreases the count value by 1 and outputs
false; otherwise outputs true.
)DOC")
    .Input(0, "counter", "A blob pointing to an instance of a counter.")
    .Output(0, "should_stop", "false unless the internal count is zero.");

SHOULD_NOT_DO_GRADIENT(CreateCounter);
SHOULD_NOT_DO_GRADIENT(ResetCounter);
SHOULD_NOT_DO_GRADIENT(CountDown);

}  // namespace

}  // namespace caffe2
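
A rough end-to-end sketch of the three operators (our illustration, not part of the commit). It assumes the standard caffe2 Workspace and CreateOperator API from caffe2/core; the blob names and the TensorCPU read at the end are illustrative:

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

void CounterLoopSketch() {
  caffe2::Workspace ws;
  // CreateCounter: no inputs, one output blob holding the counter instance.
  caffe2::OperatorDef create_def;
  create_def.set_type("CreateCounter");
  create_def.add_output("counter");
  auto* init = create_def.add_arg();
  init->set_name("init_count");
  init->set_i(3);
  caffe2::CreateOperator(create_def, &ws)->Run();

  // CountDown: consumes the counter, produces a scalar bool "should_stop".
  caffe2::OperatorDef count_def;
  count_def.set_type("CountDown");
  count_def.add_input("counter");
  count_def.add_output("should_stop");
  auto count_op = caffe2::CreateOperator(count_def, &ws);
  // Expected: three iterations with should_stop == false, then true.
  while (true) {
    count_op->Run();
    const auto& stop = ws.GetBlob("should_stop")->Get<caffe2::TensorCPU>();
    if (*stop.data<bool>()) {
      break;
    }
  }
}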

89  caffe2/operators/counter_ops.h  Normal file

@@ -0,0 +1,89 @@
#ifndef CAFFE2_OPERATORS_COUNTER_OPS_H
#define CAFFE2_OPERATORS_COUNTER_OPS_H

#include <atomic>

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"

namespace caffe2 {
namespace {
template <typename T>
class Counter {
 public:
  explicit Counter(T count) : count_(count) {}
  bool CountDown() {
    // Decrement-and-test in a single atomic operation, so that two threads
    // racing at count_ == 1 cannot both observe a positive count.
    if (count_-- > 0) {
      return false;
    }
    return true;
  }

  void reset(T init_count) {
    count_ = init_count;
  }

 private:
  std::atomic<T> count_;
};
}  // namespace

template <typename T, class Context = CPUContext>
class CreateCounterOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  CreateCounterOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        init_count_(OperatorBase::GetSingleArgument<T>("init_count", 0)) {
    CHECK_LE(0, init_count_) << "negative init_count is not permitted.";
  }

  bool RunOnDevice() override {
    *OperatorBase::Output<std::unique_ptr<Counter<T>>>(0) =
        std::unique_ptr<Counter<T>>(new Counter<T>(init_count_));
    return true;
  }

 private:
  T init_count_ = 0;
};

template <typename T, class Context = CPUContext>
class ResetCounterOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  ResetCounterOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        init_count_(OperatorBase::GetSingleArgument<T>("init_count", 0)) {
    CHECK_LE(0, init_count_) << "negative init_count is not permitted.";
  }

  bool RunOnDevice() override {
    auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
    counterPtr->reset(init_count_);
    return true;
  }

 private:
  T init_count_;
};

template <typename T, class Context = CPUContext>
class CountDownOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  CountDownOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
    auto* output = Output(0);
    output->Resize(std::vector<int>{});
    *output->template mutable_data<bool>() = counterPtr->CountDown();
    return true;
  }
};
}  // namespace caffe2
#endif  // CAFFE2_OPERATORS_COUNTER_OPS_H

@@ -2,6 +2,17 @@

namespace caffe2 {

namespace {

inline float sigmoid_xent_forward(float lgt, float tgt) {
  return lgt * (tgt - (lgt >= 0)) - log(1 + exp(lgt - 2 * lgt * (lgt >= 0)));
}

inline float sigmoid_xent_backward(float lgt, float tgt) {
  return tgt - 1. / (1. + exp(-lgt));
}
}  // namespace
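
A note on the helpers above (our annotation, not part of the commit): sigmoid_xent_forward is the overflow-safe form of the per-element log-likelihood for logit x and target t, and sigmoid_xent_backward is its derivative. With the indicator [x >= 0]:

f(x, t) = t\,x - \log\bigl(1 + e^{x}\bigr)
        = x\bigl(t - [x \ge 0]\bigr) - \log\bigl(1 + e^{\,x - 2x[x \ge 0]}\bigr),
\qquad
\frac{\partial f}{\partial x} = t - \sigma(x), \quad \sigma(x) = \frac{1}{1 + e^{-x}}.

The rewritten exponent x - 2x[x >= 0] is never positive, so exp cannot overflow; the ops below average -f over the inner dimension, matching out_ptr[i] = -value / inner_size.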

template <>
bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);

@@ -26,6 +37,68 @@ bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
  return true;
}

template <>
bool SigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
  auto& logits = Input(0);
  auto& targets = Input(1);
  CAFFE_ENFORCE(logits.dims() == targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;

  auto* out = Output(0);
  if (logits.ndim() == 0) {
    out->Resize(std::vector<TIndex>{});
  } else {
    std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
    out->Resize(dims);
  }
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();

  auto in_idx = 0;
  for (int i = 0; i < outer_size; ++i) {
    float value = 0;
    for (int j = 0; j < inner_size; ++j) {
      value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]);
      ++in_idx;
    }
    out_ptr[i] = -value / inner_size;
  }
  return true;
}

template <>
bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
  auto& g = Input(0);
  auto& logits = Input(1);
  auto& targets = Input(2);
  CAFFE_ENFORCE(logits.dims() == targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;
  CAFFE_ENFORCE(g.size() == outer_size);

  auto* out = Output(0);
  out->ResizeLike(logits);
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();
  auto* g_ptr = g.data<float>();

  auto in_idx = 0;
  for (int i = 0; i < outer_size; ++i) {
    auto g_factor = -g_ptr[i] / inner_size;
    // Distinct inner index so the outer loop variable is not shadowed.
    for (int j = 0; j < inner_size; ++j) {
      out_ptr[in_idx] = g_factor *
          sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]);
      ++in_idx;
    }
  }
  return true;
}

template <>
bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);

@@ -129,6 +202,13 @@ REGISTER_CPU_OPERATOR(MakeTwoClass,
REGISTER_CPU_OPERATOR(MakeTwoClassGradient,
                      MakeTwoClassGradientOp<float, CPUContext>);

REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyWithLogits,
    SigmoidCrossEntropyWithLogitsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyWithLogitsGradient,
    SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(MakeTwoClass)
    .NumInputs(1)
    .NumOutputs(1)

@@ -145,6 +225,22 @@ OPERATOR_SCHEMA(MakeTwoClassGradient)
    .NumInputs(1)
    .NumOutputs(1);

OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogits)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given two matrices logits and targets, of the same shape
(batch_size, num_classes), computes the sigmoid cross entropy between the two.
Returns a tensor of shape (batch_size,) of losses for each example.
)DOC")
    .Input(0, "logits", "matrix of logits for each example and class.")
    .Input(1, "targets", "matrix of targets, same shape as logits.")
    .Output(0, "xentropy", "Vector with the total xentropy for each example.");

OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogitsGradient)
    .NumInputs(3)
    .NumOutputs(1);

struct GetMakeTwoClassGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {

@@ -156,5 +252,20 @@ struct GetMakeTwoClassGradient : public GradientMakerBase {
  }
};
REGISTER_GRADIENT(MakeTwoClass, GetMakeTwoClassGradient);

struct GetSigmoidCrossEntropyWithLogitsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SigmoidCrossEntropyWithLogitsGradient",
        "",
        vector<string>{GO(0), I(0), I(1)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(
    SigmoidCrossEntropyWithLogits,
    GetSigmoidCrossEntropyWithLogitsGradient);

}  // namespace
}  // namespace caffe2

@@ -62,6 +62,22 @@ class MakeTwoClassGradientOp final
  // Output: dX
};

template <typename T, class Context>
class SigmoidCrossEntropyWithLogitsOp final : public Operator<Context> {
 public:
  USE_SIMPLE_CTOR_DTOR(SigmoidCrossEntropyWithLogitsOp);
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override;
};

template <typename T, class Context>
class SigmoidCrossEntropyWithLogitsGradientOp final : public Operator<Context> {
 public:
  USE_SIMPLE_CTOR_DTOR(SigmoidCrossEntropyWithLogitsGradientOp);
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override;
};

}  // namespace caffe2

#endif  // CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_

734  caffe2/operators/dataset_ops.cc  Normal file

@@ -0,0 +1,734 @@
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/string_utils.h"

namespace caffe2 {
namespace {

const char kDatasetFieldSeparator = ':';
const char* kDatasetLengthField = "lengths";

// by what percentage to grow the dataset when needed
const int kDatasetGrowthPct = 40;

// used for lengths tensors in the dataset
using TLength = int32_t;
// used for all internal dataset operations (offsets, sizes to read, etc.)
using TOffset = int64_t;

/**
 * Provides functionality to iterate across a list of tensors where some
 * of those tensors represent lengths in a hierarchical structure.
 */
class TreeIterator {
 public:
  struct FieldDesc {
    int id;
    int lengthFieldId = -1;
    std::string name;
  };

  explicit TreeIterator(const std::vector<std::string>& fields) {
    // populate field vector and split field names
    fields_.resize(fields.size());
    std::vector<std::vector<std::string>> nameParts(fields_.size());
    for (int i = 0; i < fields.size(); ++i) {
      auto& field = fields_.at(i);
      field.name = fields[i];
      field.id = i;
      field.lengthFieldId = -1;
      nameParts.at(i) = split(kDatasetFieldSeparator, field.name);
    }

    // populate lengthFields
    for (const auto& field : fields_) {
      const auto& parts = nameParts.at(field.id);
      if (!parts.empty() && parts.back() == kDatasetLengthField) {
        lengthFieldIds_.push_back(field.id);
      }
    }

    // find length-field with maximum prefix matching for each field
    for (auto& field : fields_) {
      // by default, we are matching against the root domain
      int maxMatchLevel = 1;
      int maxMatchLengthFieldId = -1;
      for (int j = 0; j < numLengthFields(); ++j) {
        const auto& lenField = lengthField(j);
        // a length field can't have itself as its length field
        if (field.id == lenField.id) {
          continue;
        }
        auto lf = nameParts.at(lenField.id);
        auto lfEnd = lf.end() - 1;
        // check whether this lengthField is a prefix for this field name
        if (std::mismatch(lf.begin(), lfEnd, nameParts.at(field.id).begin())
                .first != lfEnd) {
          continue;
        }
        if (lf.size() > maxMatchLevel) {
          maxMatchLevel = lf.size();
          maxMatchLengthFieldId = j;
        }
      }
      field.lengthFieldId = maxMatchLengthFieldId;
    }

    // check that fields are topologically sorted
    // (no length field depends on a length defined afterwards)
    for (const auto& field : fields_) {
      const auto* lengthField = lengthFieldFor(field);
      CAFFE_ENFORCE(
          (lengthField == nullptr) || (lengthField->id < field.id),
          "Error: Field ",
          field.id,
          " (",
          field.name,
          ") ",
          "depends on a field defined afterwards: ",
          lengthField->id,
          " (",
          lengthField->name,
          ").");
    }
  }

  void advance(
      const std::vector<const TLength*>& lengths,
      std::vector<TOffset>& offsets,
      std::vector<TOffset>& sizes,
      std::vector<TOffset>& limits,
      TOffset num) {
    thread_local std::vector<TOffset> newOffsets;
    CHECK_EQ(lengths.size(), numLengthFields());
    CHECK_EQ(offsets.size(), numOffsetFields());
    sizes.resize(offsets.size());
    newOffsets.resize(offsets.size());
    // first index, top level
    {
      auto limit = limits[0];
      auto offset = offsets[0];
      CAFFE_ENFORCE(limit >= offset, "Tried to advance past end of cursor.");
      TOffset total = std::min(limit - offset, num);
      sizes[0] = total;
      newOffsets[0] = offset + total;
    }
    // child indices
    for (int j = 1; j < numOffsetFields(); ++j) {
      TOffset total = 0;
      int parentOffsetId = offsetFieldIdFor(lengthField(j - 1));
      const TLength* length = lengths[j - 1] + offsets[parentOffsetId];
      for (int k = 0; k < sizes[parentOffsetId]; ++k) {
        total += *(length++);
      }
      auto offset = offsets[j];
      CAFFE_ENFORCE(
          offset + total <= limits[j],
          "Inconsistent field length: ",
          "tried to advance past the end of field ",
          j);
      sizes[j] = total;
      newOffsets[j] = offset + total;
    }
    offsets = newOffsets;
  }

  // Corresponds to the number of fields that have "lengths" as their last name
  int numLengthFields() const {
    return lengthFieldIds_.size();
  }

  // Corresponds to the number of length fields + 1 (for the top-level domain)
  int numOffsetFields() const {
    return numLengthFields() + 1;
  }

  // Get lengthField description for the given field
  const FieldDesc* lengthFieldFor(const FieldDesc& desc) {
    return (desc.lengthFieldId == -1)
        ? nullptr
        : &fields_.at(lengthFieldIds_.at(desc.lengthFieldId));
  }

  // Get lengthField description for the given lengthFieldId, where
  // 0 <= lengthFieldId < numLengthFields()
  const FieldDesc& lengthField(int lengthFieldId) {
    return fields_.at(lengthFieldIds_.at(lengthFieldId));
  }

  // Returns the index into the 'offset' vector for the given field.
  int offsetFieldIdFor(const FieldDesc& fieldDesc) {
    return fieldDesc.lengthFieldId + 1;
  }

  // Returns the field description for all fields.
  const std::vector<FieldDesc>& fields() {
    return fields_;
  }

 private:
  // Description of each field
  std::vector<FieldDesc> fields_;
  // Index into fields_ above for the fields that are lengths.
  std::vector<int> lengthFieldIds_;
};

class TreeCursor {
 public:
  explicit TreeCursor(const TreeIterator& iterator) : it(iterator) {}
  std::vector<TOffset> offsets;
  std::mutex mutex_;
  TreeIterator it;
};

class CreateTreeCursorOp : public Operator<CPUContext> {
 public:
  CreateTreeCursorOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws),
        fields_(OperatorBase::GetRepeatedArgument<std::string>("fields")) {}

  bool RunOnDevice() override {
    *OperatorBase::Output<std::unique_ptr<TreeCursor>>(0) =
        std::unique_ptr<TreeCursor>(new TreeCursor(TreeIterator(fields_)));
    return true;
  }

 private:
  std::vector<std::string> fields_;
};

class ResetCursorOp : public Operator<CPUContext> {
 public:
  ResetCursorOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
    std::lock_guard<std::mutex> lock(cursor->mutex_);
    cursor->offsets.clear();
    return true;
  }
};

class CheckDatasetConsistencyOp : public Operator<CPUContext> {
 public:
  CheckDatasetConsistencyOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws),
        iterator_(OperatorBase::GetRepeatedArgument<std::string>("fields")) {}

  bool RunOnDevice() override {
    thread_local std::vector<const TLength*> lengths;
    thread_local std::vector<TOffset> limits;
    thread_local std::vector<TOffset> sizes;
    thread_local std::vector<TOffset> offsets;
    CAFFE_ENFORCE(
        InputSize() == iterator_.fields().size(),
        "Invalid number of fields. Expected ",
        iterator_.fields().size(),
        ", got ",
        InputSize());
    sizes.resize(iterator_.numOffsetFields());
    // gather length data
    lengths.resize(iterator_.numLengthFields());
    for (int i = 0; i < lengths.size(); ++i) {
      lengths[i] = Input(iterator_.lengthField(i).id).data<TLength>();
    }
    // gather size limits
    limits.assign(sizes.size(), std::numeric_limits<TOffset>::max());
    for (int i = 0; i < iterator_.fields().size(); ++i) {
      int lengthIdx = iterator_.fields()[i].lengthFieldId + 1;
      TOffset size = (TOffset)Input(i).dims()[0];
      if (limits[lengthIdx] == std::numeric_limits<TOffset>::max()) {
        limits[lengthIdx] = size;
      } else {
        CAFFE_ENFORCE(
            limits[lengthIdx] == size,
            "Inconsistent sizes for fields belonging to same domain.",
            " Field: ",
            i,
            " (",
            iterator_.fields()[i].name,
            "); Length field index: ",
            lengthIdx,
            "; Previous size: ",
            limits[lengthIdx],
            "; New size: ",
            size);
      }
    }
    // advance to the end
    offsets.assign(sizes.size(), 0);
    iterator_.advance(lengths, offsets, sizes, limits, limits[0]);
    for (int i = 0; i < limits.size(); ++i) {
      CAFFE_ENFORCE(limits[i] == offsets[i]);
    }
    return true;
  }

 private:
  TreeIterator iterator_;
};

class ReadNextBatchOp : public Operator<CPUContext> {
 public:
  ReadNextBatchOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws),
        batchSize_(OperatorBase::GetSingleArgument<int>("batch_size", 1)) {}

  bool RunOnDevice() override {
    auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
    CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1);
    thread_local std::vector<const TLength*> lengths;
    thread_local std::vector<TOffset> limits;
    thread_local std::vector<TOffset> sizes;
    thread_local std::vector<TOffset> offsets;
    sizes.resize(cursor->it.numOffsetFields());
    // gather length data
    lengths.resize(cursor->it.numLengthFields());
    for (int i = 0; i < lengths.size(); ++i) {
      lengths[i] = Input(cursor->it.lengthField(i).id + 1).data<int>();
    }
    // gather size limits
    limits.assign(sizes.size(), std::numeric_limits<TOffset>::max());
    for (int i = 0; i < cursor->it.fields().size(); ++i) {
      int lengthFieldIdx = cursor->it.fields()[i].lengthFieldId + 1;
      limits[lengthFieldIdx] =
          std::min(limits[lengthFieldIdx], (TOffset)Input(i + 1).dims()[0]);
    }
    // advance cursor
    {
      std::lock_guard<std::mutex> lock(cursor->mutex_);
      if (cursor->offsets.empty()) {
        cursor->offsets.assign(sizes.size(), 0);
      }
      offsets = cursor->offsets;
      cursor->it.advance(lengths, cursor->offsets, sizes, limits, batchSize_);
    }
    // gather data
    thread_local std::vector<TIndex> outDim;
    for (int i = 0; i < cursor->it.fields().size(); ++i) {
      auto lengthIdx = cursor->it.fields()[i].lengthFieldId + 1;
      auto size = sizes[lengthIdx];
      auto offset = offsets[lengthIdx];
      auto& in = Input(i + 1);
      auto innerSize = in.size_from_dim(1);
      outDim = in.dims();
      outDim[0] = size;
      auto* out = Output(i);
      out->Resize(outDim);
      if (out->size() == 0) {
        continue;
      }
      void* src =
          (char*)in.raw_data() + offset * innerSize * in.meta().itemsize();
      void* dst = out->raw_mutable_data(in.meta());
      context_.template CopyItems<CPUContext, CPUContext>(
          in.meta(), out->size(), src, dst);
    }
    return true;
  }
  int batchSize_;
};

class ComputeOffsetOp : public Operator<CPUContext> {
 public:
  ComputeOffsetOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
    CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1);
    auto* out = Output(0);
    std::vector<const TLength*> lengths;
    std::vector<TOffset> limits;
    std::vector<TOffset> sizes;
    std::vector<TOffset> offsets;
    sizes.resize(cursor->it.numOffsetFields());
    // gather length data
    lengths.resize(cursor->it.numLengthFields());
    for (int i = 0; i < lengths.size(); ++i) {
      lengths[i] = Input(cursor->it.lengthField(i).id + 1).data<int>();
    }
    // gather size limits
    limits.assign(sizes.size(), std::numeric_limits<TOffset>::max());
    for (int i = 0; i < cursor->it.fields().size(); ++i) {
      int lengthFieldIdx = cursor->it.fields()[i].lengthFieldId + 1;
      limits[lengthFieldIdx] =
          std::min(limits[lengthFieldIdx], (TOffset)Input(i + 1).dims()[0]);
    }
    out->Resize(limits.at(0) + 1, sizes.size());
    auto* out_data = out->mutable_data<int64_t>();
    for (int k = 0; k <= limits.at(0); k++) {
      // advance cursor
      if (cursor->offsets.empty()) {
        cursor->offsets.assign(sizes.size(), 0);
      }
      // write output
      std::copy(cursor->offsets.begin(), cursor->offsets.end(), out_data);
      out_data += sizes.size();
      cursor->it.advance(lengths, cursor->offsets, sizes, limits, 1);
    }
    cursor->offsets.assign(sizes.size(), 0); // reset after getting meta info
    return true;
  }
};

class ReadRandomBatchOp : public Operator<CPUContext> {
 public:
  ReadRandomBatchOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws),
        batchSize_(OperatorBase::GetSingleArgument<int>("batch_size", 1)) {}
  bool RunOnDevice() override {
    auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
    auto& idxblob = Input(1);
    auto& offsetsmat = Input(2);
    CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 3);
    auto idxvec = idxblob.template data<int64_t>();
    auto& offsetdim = offsetsmat.dims();
    // gather data
    thread_local std::vector<TIndex> outDim;
    int64_t idx;
    {
      std::lock_guard<std::mutex> lock(cursor->mutex_);
      cursor->offsets.resize(1);
      idx = cursor->offsets.at(0);
      cursor->offsets.at(0) += batchSize_;
    }

    for (int i = 0; i < cursor->it.fields().size(); ++i) {
      auto lengthIdx = cursor->it.fields()[i].lengthFieldId + 1;
      auto& in = Input(i + 3);
      outDim = in.dims();
      outDim.at(0) = 0;
      auto idxbegin = idx;
      for (int j = 0; j < batchSize_; ++j) {
        if (idx >= idxblob.size()) {
          break;
        }
        CAFFE_ENFORCE(
            (idxvec[idx] + 1) * offsetdim[1] + lengthIdx < offsetsmat.size(),
            "Out of bound when trying to get elem from offsetsmat");
        auto offsetptr = offsetsmat.template data<TOffset>() +
            idxvec[idx] * offsetdim[1] + lengthIdx;
        auto offset = *offsetptr;
        auto size = *(offsetptr + offsetdim[1]) - offset;
        outDim.at(0) += size; // accumulate over the batch
        idx++;
      }
      idx = idxbegin; // reset
      auto* out = Output(i);
      out->Resize(outDim);
      if (out->size() == 0) {
        continue;
      }
      auto innerSize = in.size_from_dim(1);
      auto dst = static_cast<char*>(out->raw_mutable_data(in.meta()));
      int block_size = in.size() / in.dim(0);
      int block_bytesize = in.nbytes() / in.dim(0);
      int start = 0;
      for (int j = 0; j < batchSize_; ++j) {
        if (idx >= idxblob.size()) {
          break;
        }
        auto offsetptr = offsetsmat.template data<TOffset>() +
            idxvec[idx] * offsetdim[1] + lengthIdx;
        auto offset = *offsetptr;
        auto size = *(offsetptr + offsetdim[1]) - offset;
        // copy data
        void* src =
            (char*)in.raw_data() + offset * innerSize * in.meta().itemsize();
        context_.template CopyItems<CPUContext, CPUContext>(
            in.meta(), size * block_size, src, dst + start * block_bytesize);
        start += size;
        idx++;
      }
      idx = idxbegin; // reset
    }
    return true;
  }
  int batchSize_;
};

template <class Context>
class AppendOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  AppendOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& a = Input(0);
    auto& b = Input(1);
    auto* c = Output(0);
    CAFFE_ENFORCE(b.ndim() >= 1);
    if (a.size() == 0) {
      c->CopyFrom(b);
      return true;
    }
    CAFFE_ENFORCE(&a == c, "First argument must be in-place.");
    CAFFE_ENFORCE(b.ndim() == c->ndim());
    CAFFE_ENFORCE(a.meta() == b.meta());
    for (int i = 1; i < a.ndim(); ++i) {
      CAFFE_ENFORCE(a.dims()[i] == b.dims()[i]);
    }
    auto oldSize = c->size();
    c->Extend(b.dims()[0], kDatasetGrowthPct, &context_);
    auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize();
    context_.template CopyItems<Context, Context>(
        b.meta(), b.size(), b.raw_data(), dst);
    return true;
  }
};

template <class Context>
class AtomicAppendOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  AtomicAppendOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& mutex = OperatorBase::Input<std::unique_ptr<std::mutex>>(0);
    const auto numFields = (InputSize() - 1) / 2;
    CAFFE_ENFORCE(OutputSize() == numFields);

    std::lock_guard<std::mutex> guard(*mutex);

    // 1: checks
    for (int i = 0; i < numFields; ++i) {
      auto& a = Input(1 + i);
      auto& b = Input(1 + i + numFields);
      auto* c = Output(i);
      CAFFE_ENFORCE(b.ndim() >= 1);
      if (a.size() == 0) {
        continue;
      }
      CAFFE_ENFORCE(
          (void*)&a == (void*)c, "Appended-to arguments must be in-place.");
      CAFFE_ENFORCE(b.ndim() == c->ndim());
      CAFFE_ENFORCE(a.meta() == b.meta());
      for (int j = 1; j < a.ndim(); ++j) {
        CAFFE_ENFORCE(a.dims()[j] == b.dims()[j]);
      }
    }

    // 2: copies
    for (int i = 0; i < numFields; ++i) {
      auto& a = Input(1 + i);
      auto& b = Input(1 + i + numFields);
      auto* c = Output(i);
      if (a.size() == 0) {
        c->CopyFrom(b);
        continue;
      }
      auto oldSize = c->size();
      c->Extend(b.dims()[0], kDatasetGrowthPct, &context_);
      auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize();
      context_.template CopyItems<Context, Context>(
          b.meta(), b.size(), b.raw_data(), dst);
    }
    return true;
  }
};

REGISTER_CPU_OPERATOR(CreateTreeCursor, CreateTreeCursorOp);
REGISTER_CPU_OPERATOR(ResetCursor, ResetCursorOp);
REGISTER_CPU_OPERATOR(ReadNextBatch, ReadNextBatchOp);
REGISTER_CPU_OPERATOR(ComputeOffset, ComputeOffsetOp);
REGISTER_CPU_OPERATOR(ReadRandomBatch, ReadRandomBatchOp);
REGISTER_CPU_OPERATOR(CheckDatasetConsistency, CheckDatasetConsistencyOp);
REGISTER_CPU_OPERATOR(Append, AppendOp<CPUContext>);
REGISTER_CPU_OPERATOR(AtomicAppend, AtomicAppendOp<CPUContext>);

OPERATOR_SCHEMA(CreateTreeCursor)
    .NumInputs(0)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Creates a cursor to iterate through a list of tensors, where some of those
tensors contain the lengths in a nested schema. The schema is determined by
the `fields` argument.

For example, to represent the following schema:

  Struct(
      a=Int(),
      b=List(List(Int)),
      c=List(
          Struct(
             c1=String,
             c2=List(Int),
          ),
      ),
  )

the field list will be:
  [
      "a",
      "b:lengths",
      "b:values:lengths",
      "b:values:values",
      "c:lengths",
      "c:c1",
      "c:c2:lengths",
      "c:c2:values",
  ]

And for the following instance of the struct:

  Struct(
      a=3,
      b=[[4, 5], [6, 7, 8], [], [9]],
      c=[
          Struct(c1='alex', c2=[10, 11]),
          Struct(c1='bob', c2=[12]),
      ],
  )

The values of the fields will be:
  {
      "a": [3],
      "b:lengths": [4],
      "b:values:lengths": [2, 3, 0, 1],
      "b:values:values": [4, 5, 6, 7, 8, 9],
      "c:lengths": [2],
      "c:c1": ["alex", "bob"],
      "c:c2:lengths": [2, 1],
      "c:c2:values": [10, 11, 12],
  }

In general, every field name in the format "{prefix}:lengths" defines a domain
"{prefix}", and every subsequent field in the format "{prefix}:{field}" will
be in that domain, and the length of the domain is provided for each entry of
the parent domain. In the example, "b:lengths" defines a domain of length 4, so
every field under domain "b" will have 4 entries.
The "lengths" field for a given domain must appear before any reference to
that domain.

Returns a pointer to an instance of the Cursor, which keeps the current offset
on each of the domains defined by `fields`. Cursor also ensures thread-safety
such that ReadNextBatch and ResetCursor can be used safely in parallel.

A cursor does not contain data per se, so calls to ReadNextBatch actually need
to pass a list of blobs containing the data to read for each one of the fields.
)DOC")
    .Output(0, "cursor", "A blob pointing to an instance of a new TreeCursor.")
    .Arg(
        "fields",
        "A list of strings each one representing a field of the dataset.");

OPERATOR_SCHEMA(ResetCursor)
    .NumInputs(1)
    .NumOutputs(0)
    .SetDoc(R"DOC(
Resets the offsets for the given TreeCursor. This operation is thread safe.
)DOC")
    .Input(0, "cursor", "A blob containing a pointer to the cursor.");

OPERATOR_SCHEMA(ReadNextBatch)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1, INT_MAX)
    .SetDoc(R"DOC(
Read the next batch of examples out of the given cursor and data blobs.

Input(0) is a blob pointing to a TreeCursor, and
[Input(1),... Input(num_fields)] a list of tensors containing the data for
each field of the dataset.

ReadNextBatch is thread safe.
)DOC")
    .Input(0, "cursor", "A blob containing a pointer to the cursor.")
    .Input(1, "dataset_field_0", "First dataset field")
    .Output(0, "field_0", "Tensor containing the next batch for field 0.")
    .Arg("batch_size", "Number of top-level entries to read.");

OPERATOR_SCHEMA(ComputeOffset)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Compute the offsets matrix given cursor and data blobs. Needs to be run at
the beginning or after resetting the cursor.

Input(0) is a blob pointing to a TreeCursor, and
[Input(1),... Input(num_fields)] a list of tensors containing the data for
each field of the dataset.

ComputeOffset is thread safe.
)DOC")
    .Input(0, "cursor", "A blob containing a pointer to the cursor.")
    .Input(1, "dataset_field_0", "First dataset field")
    .Output(0, "field_0", "Tensor containing offset info for this chunk.");

OPERATOR_SCHEMA(ReadRandomBatch)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1, INT_MAX)
    .SetDoc(R"DOC(
Read the next batch of examples out of the given cursor,
idx blob, offset matrix and data blobs.

Input(0) is a blob pointing to a TreeCursor,
Input(1) is a blob pointing to the shuffled idx,
Input(2) is a blob pointing to the offset matrix and
[Input(3),... Input(num_fields)] a list of tensors containing the data for
each field of the dataset.

ReadRandomBatch is thread safe.
)DOC")
    .Input(0, "cursor", "A blob containing a pointer to the cursor.")
    .Input(1, "idx", "idx with a shuffled order.")
    .Input(2, "offsetsmat", "offset matrix containing length offset info.")
    .Input(3, "dataset_field_0", "First dataset field")
    .Output(0, "field_0", "Tensor containing the next batch for field 0.")
    .Arg("batch_size", "Number of top-level entries to read.");

OPERATOR_SCHEMA(CheckDatasetConsistency)
    .NumInputs(1, INT_MAX)
    .NumOutputs(0)
    .SetDoc(R"DOC(
Checks that the given data fields represent a consistent dataset under
the schema specified by the `fields` argument. The operator fails if the
fields are not consistent. If data is consistent, each field's data can be
safely appended to an existing dataset, keeping it consistent.
)DOC")
    .Input(0, "field_0", "Data for field 0.")
    .Arg(
        "fields",
        "List of strings representing the string names in the format "
        "specified in the doc for CreateTreeCursor.");

OPERATOR_SCHEMA(Append)
    .NumInputs(2)
    .NumOutputs(1)
    .EnforceInplace({{0, 0}})
    .SetDoc(R"DOC(
Append input 2 to the end of input 1.
Input 1 must be the same as the output, that is, it is required to be in-place.
Input 1 may have to be re-allocated in order to accommodate the new size.
Currently, an exponential growth ratio is used in order to ensure amortized
constant time complexity.
All except the outer-most dimension must be the same between input 1 and 2.
)DOC")
    .Input(0, "dataset", "The tensor to be appended to.")
    .Input(1, "new_data", "Tensor to append to the end of dataset.")
    .Output(0, "dataset", "Same as input 0, representing the mutated tensor.");

OPERATOR_SCHEMA(AtomicAppend)
    .NumInputs(3, INT_MAX)
    .NumOutputs(1, INT_MAX)
    .AllowInplace([](int in, int out) { return in == out + 1; });

SHOULD_NOT_DO_GRADIENT(CreateTreeCursor);
SHOULD_NOT_DO_GRADIENT(ResetCursor);
SHOULD_NOT_DO_GRADIENT(ReadNextBatch);
SHOULD_NOT_DO_GRADIENT(ComputeOffset);
SHOULD_NOT_DO_GRADIENT(ReadRandomBatch);
SHOULD_NOT_DO_GRADIENT(CheckDatasetConsistency);
SHOULD_NOT_DO_GRADIENT(Append);
SHOULD_NOT_DO_GRADIENT(AtomicAppend);
}  // namespace
}  // namespace caffe2

@@ -2,9 +2,9 @@
#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_

#include "caffe2/core/context.h"
+#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
-#include "caffe2/core/logging.h"

namespace caffe2 {

@@ -21,12 +21,26 @@ struct TypeForOutput<SameTypeAsInput, InputType> {
  using value = InputType;
};

-template <typename InputTypes, class Context, class Functor,
-          class OutputType = SameTypeAsInput>
-class UnaryElementwiseOp : public Operator<Context> {
+/**
+ * Generic meta-operator that is able to process element-wise operations on
+ * a single input tensor, returning a tensor of the same shape, and either of
+ * the same type as the input or of a specified result type.
+ *
+ * The functor provided must implement operator() as a template on input and
+ * output types, and on a Context. Moreover, it needs to provide a constructor
+ * that takes OperatorBase& as argument. This is in order to consume arguments
+ * passed to the operator instance.
+ */
+template <
+    typename InputTypes,
+    class Context,
+    class Functor,
+    class OutputType = SameTypeAsInput>
+class UnaryElementwiseWithArgsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
-  USE_SIMPLE_CTOR_DTOR(UnaryElementwiseOp);
+  UnaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws), functor(*this) {}

  bool RunOnDevice() override {
    return DispatchHelper<InputTypes>::call(this, Input(0));

@@ -38,11 +52,67 @@ class UnaryElementwiseOp : public Operator<Context> {
    auto* output = Output(0);
    output->ResizeLike(input);
    using R = typename TypeForOutput<OutputType, T>::value;
-    Functor()(input.size(), input.template data<T>(),
-              output->template mutable_data<R>(), &context_);
+    functor(
+        input.size(),
+        input.template data<T>(),
+        output->template mutable_data<R>(),
+        &context_);
    return true;
  }

+  Functor functor;
};
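
As a concrete instance of the functor contract just described, here is a hypothetical functor (ClampFunctor and its "max_value" argument are our own illustration, not part of the commit) that consumes an operator argument at construction time:

// Reads "max_value" once when the operator is created, then clamps
// every element against it.
struct ClampFunctor {
  explicit ClampFunctor(OperatorBase& op)
      : max_(op.GetSingleArgument<float>("max_value", 1.0f)) {}

  template <typename In, typename Out, typename Context>
  void operator()(int n, const In* in, Out* out, Context* /*context*/) {
    for (int i = 0; i < n; ++i) {
      out[i] = in[i] > max_ ? max_ : in[i];
    }
  }

  float max_;
};

// Wired up, this would look like:
// REGISTER_CPU_OPERATOR(
//     Clamp,
//     UnaryElementwiseWithArgsOp<TensorTypes<float>, CPUContext, ClampFunctor>);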

/**
 * WithDefaultConstructor is a functor that can be used as the functor of an
 * UnaryElementwiseWithArgsOp. It simply forwards the operator() call into
 * another functor that doesn't accept arguments in its constructor.
 */
template <typename Functor>
struct WithDefaultConstructor {
  explicit WithDefaultConstructor(OperatorBase& op) {}

  template <typename In, typename Out, typename Context>
  void operator()(int n, const In* in, Out* out, Context* c) {
    Functor()(n, in, out, c);
  }
};

/**
 * UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the
 * difference that it takes a functor with a default constructor, e.g. one
 * that does not need to take into consideration any arguments during operator
 * creation.
 */
template <
    typename InputTypes,
    class Context,
    class Functor,
    class OutputType = SameTypeAsInput>
using UnaryElementwiseOp = UnaryElementwiseWithArgsOp<
    InputTypes,
    Context,
    WithDefaultConstructor<Functor>,
    OutputType>;

/**
 * ForEach is a unary functor that forwards each element of the input array
 * into the elementwise Functor provided, and gathers the results of each
 * call into the resulting array. Use it as an adaptor if you want to create
 * a UnaryElementwiseOp that acts on each element of the tensor per function
 * call -- this is reasonable for complex types where vectorization wouldn't
 * be much of a gain, performance-wise.
 */
template <typename Functor>
struct ForEach {
  explicit ForEach(OperatorBase& op) : functor(op) {}

  template <typename In, typename Out, typename Context>
  void operator()(int n, const In* in, Out* out, Context* c) {
    for (int i = 0; i < n; ++i) {
      out[i] = functor(in[i]);
    }
  }
  Functor functor;
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
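To make the functor contract above concrete, here is a minimal sketch of a stateful scalar functor plugged into ForEach; the ClampBelow name and its `min` argument are hypothetical and not part of this change:

    // Hypothetical scalar functor: reads a threshold from the operator's
    // arguments once, then clamps each element from below. ForEach adapts
    // its scalar operator() to the (n, in, out, context) signature that
    // UnaryElementwiseWithArgsOp invokes.
    struct ClampBelow {
      explicit ClampBelow(OperatorBase& op)
          : min_(op.GetSingleArgument<float>("min", 0.f)) {}
      float operator()(float x) {
        return x < min_ ? min_ : x;
      }
      float min_;
    };

    // Illustrative registration:
    // REGISTER_CPU_OPERATOR(
    //     ClampBelow,
    //     UnaryElementwiseWithArgsOp<
    //         TensorTypes<float>, CPUContext, ForEach<ClampBelow>>);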
@@ -192,8 +262,12 @@ class DivGradientOp final : public Operator<Context> {
   }                                                      \
 };                                                       \
 template <class DC>                                      \
-using name##Op = BinaryElementwiseOp<NumericTypes, DC,   \
-    name##Functor, SameTypeAsInput, true>
+using name##Op = BinaryElementwiseOp<                    \
+    NumericTypes,                                        \
+    DC,                                                  \
+    name##Functor,                                       \
+    SameTypeAsInput,                                     \
+    true>

 CAFFE2_BINARY_FUNCTOR_WRAPPER(Add);
 CAFFE2_BINARY_FUNCTOR_WRAPPER(Sub);

@@ -225,8 +299,8 @@ CAFFE2_BINARY_FUNCTOR_WRAPPER(Div);
   }                                                      \
 };                                                       \
 template <class DC>                                      \
-using name##Op = BinaryElementwiseOp<                    \
-    NumericTypes, DC, name##Functor, bool, true>
+using name##Op =                                         \
+    BinaryElementwiseOp<NumericTypes, DC, name##Functor, bool, true>

 CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(LT);
 CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(LE);
@@ -18,6 +18,7 @@ REGISTER_CPU_OPERATOR(UniformFill, UniformFillOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(UniformIntFill, UniformFillOp<int, CPUContext>);
 REGISTER_CPU_OPERATOR(ConstantFill, ConstantFillOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(ConstantIntFill, ConstantFillOp<int, CPUContext>);
+REGISTER_CPU_OPERATOR(ConstantBoolFill, ConstantFillOp<bool, CPUContext>);
 REGISTER_CPU_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(GivenTensorIntFill, GivenTensorFillOp<int, CPUContext>);
 REGISTER_CPU_OPERATOR(GaussianFill, GaussianFillOp<float, CPUContext>);

@@ -30,6 +31,10 @@ OPERATOR_SCHEMA(UniformFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
 OPERATOR_SCHEMA(UniformIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
 OPERATOR_SCHEMA(ConstantFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
 OPERATOR_SCHEMA(ConstantIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
+OPERATOR_SCHEMA(ConstantBoolFill)
+    .NumInputs(0, 1)
+    .NumOutputs(1)
+    .AllowInplace({{0, 0}});
 OPERATOR_SCHEMA(GivenTensorFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
 OPERATOR_SCHEMA(GivenTensorIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
 OPERATOR_SCHEMA(GaussianFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
@@ -56,14 +56,12 @@ class FullyConnectedOp final : public Operator<Context> {
     CAFFE_ENFORCE(N == b.dim32(0), dimErrorString());
     CAFFE_ENFORCE(N == b.size(), dimErrorString());

-    // Create the Y shape (without allocation)
-    static thread_local vector<TIndex> Y_shape;
-    Y_shape = X.dims();
+    Y_shape_cache_ = X.dims();
     // This is an invariant of canonical_axis, so we can DCHECK.
-    DCHECK_LE(canonical_axis + 1, Y_shape.size());
-    Y_shape.resize(canonical_axis + 1);
-    Y_shape[canonical_axis] = N;
-    Y->Resize(Y_shape);
+    DCHECK_LE(canonical_axis + 1, Y_shape_cache_.size());
+    Y_shape_cache_.resize(canonical_axis + 1);
+    Y_shape_cache_[canonical_axis] = N;
+    Y->Resize(Y_shape_cache_);
     CAFFE_ENFORCE(M * N == Y->size(), dimErrorString());

     // W * x

@@ -88,6 +86,9 @@ class FullyConnectedOp final : public Operator<Context> {

  protected:
   size_t axis_{1};
+  // A local vector to cache the output shape so we don't need to recreate
+  // a vector object every time we run Run().
+  vector<TIndex> Y_shape_cache_;
   Tensor<Context> bias_multiplier_;
 };
@@ -7,7 +7,6 @@
 #include <limits>

 namespace caffe2 {

 namespace {
 using IndexKeyTypes = TensorTypes<int32_t, int64_t, std::string>;
 using TIndexValue = int64_t;

@@ -22,12 +21,17 @@ struct IndexBase {
   void Freeze() { frozen_ = true; }
   virtual ~IndexBase() {}
   const TypeMeta& Type() const { return meta_; }
+  TIndexValue Size() {
+    std::lock_guard<std::mutex> guard(dictMutex_);
+    return nextId_;
+  }

  protected:
   int64_t maxElements_;
   TypeMeta meta_;
   TIndexValue nextId_{1}; // guarded by dictMutex_
   std::atomic<bool> frozen_{false};
+  std::mutex dictMutex_;
 };

 template<typename T>

@@ -96,7 +100,6 @@ struct Index: IndexBase {
   }

   std::unordered_map<T, TIndexValue> dict_;
-  std::mutex dictMutex_;
 };

 template<class T>

@@ -142,7 +145,9 @@ class IndexGetOp: public Operator<CPUContext> {
 class IndexLoadOp: public Operator<CPUContext> {
  public:
   IndexLoadOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator(operator_def, ws) {}
+      : Operator(operator_def, ws),
+        skipFirstEntry_(
+            OperatorBase::GetSingleArgument<int>("skip_first_entry", 0)) {}

   bool RunOnDevice() override {
     return DispatchHelper<IndexKeyTypes>::call(this, Input(1));

@@ -153,8 +158,18 @@ class IndexLoadOp: public Operator<CPUContext> {
     auto* dict = dynamic_cast_if_rtti<Index<T>*>(base.get());
     CAFFE_ENFORCE(dict, "Wrong dictionary type given input keys.");
     const auto& keys = Input(1);
-    return dict->Load(keys.data<T>(), keys.size());
+    const auto* keys_data = keys.data<T>();
+    auto keys_size = keys.size();
+    if (skipFirstEntry_) {
+      CAFFE_ENFORCE(keys.size() > 0);
+      ++keys_data;
+      --keys_size;
+    }
+    return dict->Load(keys_data, keys_size);
   }

+ private:
+  bool skipFirstEntry_;
 };

 class IndexStoreOp: public Operator<CPUContext> {
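A minimal standalone model of the skip_first_entry logic above (illustrative only, not the operator itself): the leading placeholder entry is dropped before the keys reach Load().

    #include <cstddef>
    #include <utility>
    #include <vector>

    template <typename T>
    std::pair<const T*, size_t> maybe_skip_first(
        const std::vector<T>& keys, bool skip_first_entry) {
      const T* data = keys.data();
      size_t size = keys.size();
      if (skip_first_entry && size > 0) {
        ++data; // skip the entry reserved for the default 0 index
        --size;
      }
      return {data, size};
    }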
@@ -188,6 +203,19 @@ class IndexFreezeOp: public Operator<CPUContext> {
   }
 };

+class IndexSizeOp : public Operator<CPUContext> {
+ public:
+  IndexSizeOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    auto& base = OperatorBase::Input<std::unique_ptr<IndexBase>>(0);
+    auto* out = Output(0);
+    out->Resize(std::vector<TIndex>{});
+    *out->mutable_data<TIndexValue>() = base->Size();
+    return true;
+  }
+};
+
 REGISTER_CPU_OPERATOR(IntIndexCreate, IndexCreateOp<int32_t>);
 REGISTER_CPU_OPERATOR(LongIndexCreate, IndexCreateOp<int64_t>);

@@ -197,6 +225,7 @@ REGISTER_CPU_OPERATOR(IndexGet, IndexGetOp);
 REGISTER_CPU_OPERATOR(IndexLoad, IndexLoadOp);
 REGISTER_CPU_OPERATOR(IndexStore, IndexStoreOp);
 REGISTER_CPU_OPERATOR(IndexFreeze, IndexFreezeOp);
+REGISTER_CPU_OPERATOR(IndexSize, IndexSizeOp);

 OPERATOR_SCHEMA(IntIndexCreate)
     .NumInputs(0)

@@ -250,7 +279,6 @@ Should not be called concurrently with IndexGet.
 )DOC")
     .Input(0, "handle", "Pointer to an Index instance.");
-

 OPERATOR_SCHEMA(IndexLoad)
     .NumInputs(2)
     .NumOutputs(0)

@@ -259,7 +287,12 @@ Loads the index from the given 1-D tensor. Elements in the tensor will be given
 consecutive indexes starting at 1. Fails if tensor contains repeated elements.
 )DOC")
     .Input(0, "handle", "Pointer to an Index instance.")
-    .Input(1, "items", "1-D tensor with elements starting with index 1.");
+    .Input(1, "items", "1-D tensor with elements starting with index 1.")
+    .Arg(
+        "skip_first_entry",
+        "If set, skips the first entry of the tensor. This allows "
+        "loading tensors that are aligned with an embedding, where the first "
+        "entry corresponds to the default 0 index entry.");

 OPERATOR_SCHEMA(IndexStore)
     .NumInputs(1)

@@ -271,6 +304,15 @@ for unknowns, the first element of the output tensor will be element of index 1.
     .Input(0, "handle", "Pointer to an Index instance.")
     .Output(0, "items", "1-D tensor with elements starting with index 1.");

+OPERATOR_SCHEMA(IndexSize)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Returns the number of entries currently present in the index.
+)DOC")
+    .Input(0, "handle", "Pointer to an Index instance.")
+    .Output(0, "items", "Scalar int64 tensor with number of entries.");
+
 NO_GRADIENT(IndexGetOp);
 NO_GRADIENT(IntIndexCreate);
 NO_GRADIENT(LongIndexCreate);

@@ -278,5 +320,5 @@ NO_GRADIENT(StringIndexCreate);
 SHOULD_NOT_DO_GRADIENT(IndexFreeze);
 SHOULD_NOT_DO_GRADIENT(IndexLoad);
 SHOULD_NOT_DO_GRADIENT(IndexStore);
+SHOULD_NOT_DO_GRADIENT(IndexSize);
 } // namespace caffe2
@@ -3,6 +3,7 @@

 #include <cstdio>
 #include <map>
+#include <unordered_set>

 #include "caffe2/core/context.h"
 #include "caffe2/core/db.h"

@@ -65,12 +66,18 @@ class LoadOp final : public Operator<Context> {
     // chunks. This way we can make sure that all chunks were loaded in the end.
     // This is a map from output index to current size of the blob
     std::map<int, size_t> blobSizes;
+    std::unordered_set<string> loaded;
     for (; cursor->Valid(); cursor->Next()) {
       const string& key = cursor->key();
       if (!output_indices_.count(key)) {
         VLOG(1) << "Key " << key << " not used. Skipping.";
       } else {
+        CAFFE_ENFORCE(
+            loaded.count(key) == 0,
+            "Multiple copies of blob ",
+            key,
+            " found in the db.");
+
         VLOG(2) << "Deserializing blob " << key;
         BlobProto proto;
         CHECK(proto.ParseFromString(cursor->value()));

@@ -101,6 +108,15 @@ class LoadOp final : public Operator<Context> {
           blobSize.first->second = blob->Get<Tensor<Context>>().size();
         }
       }
+
+        if (!proto.has_tensor() ||
+            blobSize.first->second >= blob->Get<Tensor<Context>>().size()) {
+          loaded.insert(key);
+        }
+
+        if (loaded.size() >= OutputSize()) {
+          break;
+        }
       }
     }

@@ -116,6 +132,8 @@ class LoadOp final : public Operator<Context> {
             blobSize.second);
       }
     }
+
+    CHECK_EQ(loaded.size(), OutputSize());
   }

  private:
caffe2/operators/one_hot_ops.cc (new file, 107 lines)
@@ -0,0 +1,107 @@
+#include "caffe2/core/operator.h"
+#include "caffe2/core/tensor.h"
+
+namespace caffe2 {
+namespace {
+
+class OneHotOp : public Operator<CPUContext> {
+ public:
+  OneHotOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    auto& indices = Input(0);
+    auto& index_size_tensor = Input(1);
+    CAFFE_ENFORCE(indices.ndim() == 1);
+    CAFFE_ENFORCE(index_size_tensor.size() == 1);
+    auto batch_size = indices.size();
+    auto index_size = *index_size_tensor.data<int64_t>();
+
+    auto* indices_ptr = indices.data<int64_t>();
+    auto* one_hots = Output(0);
+    one_hots->Resize(std::vector<TIndex>{batch_size, index_size});
+    if (one_hots->size() == 0) {
+      return true;
+    }
+    auto* one_hots_ptr = one_hots->mutable_data<float>();
+    memset(one_hots_ptr, 0, one_hots->nbytes());
+    for (int i = 0; i < batch_size; ++i) {
+      auto label_idx = indices_ptr[i];
+      DCHECK((0 <= label_idx) && (label_idx < index_size));
+      one_hots_ptr[label_idx] = 1.0;
+      one_hots_ptr += index_size;
+    }
+    return true;
+  }
+};
+
+class SegmentOneHotOp : public Operator<CPUContext> {
+ public:
+  SegmentOneHotOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    auto& lengths = Input(0);
+    auto& indices = Input(1);
+    auto& index_size_tensor = Input(2);
+    CAFFE_ENFORCE(lengths.ndim() == 1);
+    CAFFE_ENFORCE(indices.ndim() == 1);
+    CAFFE_ENFORCE(index_size_tensor.size() == 1);
+    auto batch_size = lengths.size();
+    auto index_size = *index_size_tensor.data<int64_t>();
+    CAFFE_ENFORCE(index_size > 0);
+
+    auto* lengths_ptr = lengths.data<int32_t>();
+    auto* indices_ptr = indices.data<int64_t>();
+    auto* one_hots = Output(0);
+    one_hots->Resize(std::vector<TIndex>{batch_size, index_size});
+    auto* one_hots_ptr = one_hots->mutable_data<float>();
+    if (one_hots->size() == 0) {
+      return true;
+    }
+    memset(one_hots_ptr, 0, one_hots->nbytes());
+    int el_idx = 0;
+    for (int i = 0; i < batch_size; ++i) {
+      for (int j = 0; j < lengths_ptr[i]; ++j) {
+        DCHECK(el_idx < indices.size());
+        auto label_idx = indices_ptr[el_idx++];
+        DCHECK((0 <= label_idx) && (label_idx < index_size));
+        one_hots_ptr[label_idx] = 1.0;
+      }
+      one_hots_ptr += index_size;
+    }
+    return true;
+  }
+};
+
+REGISTER_CPU_OPERATOR(OneHot, OneHotOp);
+REGISTER_CPU_OPERATOR(SegmentOneHot, SegmentOneHotOp);
+
+OPERATOR_SCHEMA(OneHot)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Given a sequence of indices, one for each example in a batch, returns a matrix
+where each inner dimension has the size of the index and has 1.0 in the index
+active in the given example, and 0.0 everywhere else.
+)DOC")
+    .Input(0, "indices", "The active index for each example in the batch.")
+    .Input(1, "index_size_tensor", "Scalar with the size of the index.")
+    .Output(0, "one_hots", "Matrix of size len(indices) x index_size");
+
+OPERATOR_SCHEMA(SegmentOneHot)
+    .NumInputs(3)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Given a sequence of indices, segmented by the lengths tensor, returns a matrix
+that has the elements in each sequence set to 1.0, and 0.0 everywhere else.
+)DOC")
+    .Input(0, "lengths", "Size of each segment.")
+    .Input(1, "indices", "Active indices, of size sum(lengths)")
+    .Input(2, "index_size_tensor", "Size of the index")
+    .Output(0, "one_hots", "Matrix of size len(lengths) x index_size");
+
+NO_GRADIENT(OneHot);
+NO_GRADIENT(SegmentOneHot);
+}
+}
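For reference, a standalone sketch of the OneHot inner loop above (types simplified; illustrative, not the operator itself): with indices = {2, 0} and index_size = 3 it produces the rows {0,0,1} and {1,0,0}.

    #include <vector>

    std::vector<float> one_hot(const std::vector<long>& indices,
                               long index_size) {
      std::vector<float> out(indices.size() * index_size, 0.f);
      float* row = out.data();
      for (long idx : indices) {
        row[idx] = 1.f;    // mark the active column of this row
        row += index_size; // advance to the next row
      }
      return out;
    }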
@@ -124,6 +124,67 @@ struct LogSumExpRangeReducerDef {
       "input slices. Operation doesn't change the shape of individual blocks.";
 };

+template <typename T, class Context>
+class LogMeanExpRangeReducer;
+template <typename T, class Context>
+class LogMeanExpRangeReducerGradient;
+
+template <typename T>
+class LogMeanExpRangeReducer<T, CPUContext> {
+ public:
+  void operator()(
+      const TIndex block_size,
+      const TIndex blocks,
+      const T* in,
+      T* out,
+      CPUContext* context) {
+    for (int j = 0; j < block_size; ++j) {
+      T max_value = std::numeric_limits<T>::lowest();
+      for (int i = 0; i < blocks; ++i) {
+        max_value = std::max(max_value, in[i * block_size + j]);
+      }
+      T scaled_exp_sum = 0;
+      for (int i = 0; i < blocks; ++i) {
+        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
+      }
+      scaled_exp_sum /= blocks;
+      *(out++) = std::log(scaled_exp_sum) + max_value;
+    }
+  }
+};
+
+template <typename T, class Context>
+class LogMeanExpRangeReducerGradient {
+ public:
+  void operator()(
+      const TIndex block_size,
+      const TIndex blocks,
+      const T* segment_grad, // GO
+      T* data_grad, // GI
+      const T* data_in, // I
+      const T* data_out, // O
+      Context* context) {
+    for (int j = 0; j < block_size; ++j) {
+      const T out_grad = *(segment_grad++);
+      const T offset = *(data_out++);
+      for (int i = 0; i < blocks; ++i) {
+        auto idx = i * block_size + j;
+        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks;
+      }
+    }
+  }
+};
+
+struct LogMeanExpRangeReducerDef {
+  template <typename T, class Context>
+  using Reducer = LogMeanExpRangeReducer<T, Context>;
+  template <typename T, class Context>
+  using ReducerGradient = LogMeanExpRangeReducerGradient<T, Context>;
+  static constexpr const char* name = "LogMeanExp";
+  static constexpr const char* doc =
+      "LogMeanExp computes the element-wise log of the mean of exponentials "
+      "of input slices. Operation doesn't change the shape of individual "
+      "blocks.";
+};
+
 template <typename T, class Context>
 class MeanRangeReducer;

@@ -180,10 +241,78 @@ struct MeanRangeReducerDef {
   static constexpr const char* doc =
       "Mean computation is done element-wise, so that each element of the "
       "output slice corresponds to the average value of the respective "
-      "elements in the input slives. Operation doesn't change the shape of "
+      "elements in the input slices. Operation doesn't change the shape of "
       "individual blocks.";
 };

+template <typename T, class Context>
+class MaxRangeReducer;
+template <typename T, class Context>
+class MaxRangeReducerGradient;
+
+template <typename T>
+class MaxRangeReducer<T, CPUContext> {
+ public:
+  void operator()(
+      const TIndex block_size,
+      const TIndex blocks,
+      const T* in,
+      T* out,
+      CPUContext* context) {
+    for (int j = 0; j < block_size; ++j) {
+      T max_value = std::numeric_limits<T>::lowest();
+      for (int i = 0; i < blocks; ++i) {
+        max_value = std::max(max_value, in[i * block_size + j]);
+      }
+      *(out++) = max_value;
+    }
+  }
+};
+
+template <typename T, class Context>
+class MaxRangeReducerGradient {
+ public:
+  void operator()(
+      const TIndex block_size,
+      const TIndex blocks,
+      const T* segment_grad, // GO
+      T* data_grad, // GI
+      const T* data_in, // I
+      const T* data_out, // O
+      Context* context) {
+    std::memset(
+        static_cast<void*>(data_grad), 0, blocks * block_size * sizeof(T));
+    for (int j = 0; j < block_size; ++j) {
+      const T out_grad = *(segment_grad++);
+      const T out = data_out[j];
+      for (int i = 0; i < blocks; ++i) {
+        auto idx = i * block_size + j;
+        if (out == data_in[idx]) {
+          data_grad[idx] = out_grad;
+          break;
+        }
+      }
+    }
+  }
+};
+
+struct MaxRangeReducerDef {
+  template <typename T, class Context>
+  using Reducer = MaxRangeReducer<T, Context>;
+  template <typename T, class Context>
+  using ReducerGradient = MaxRangeReducerGradient<T, Context>;
+  static constexpr const char* name = "Max";
+  static constexpr const char* doc =
+      "Max computation is done element-wise, so that each element of the "
+      "output slice corresponds to the max value of the respective "
+      "elements in the input slices. Operation doesn't change the shape of "
+      "individual blocks. This implementation imitates torch nn.Max operator. "
+      "If the maximum value occurs more than once, the operator will return "
+      "the first occurrence of value. When computing the gradient using the "
+      "backward propagation, the gradient input corresponding to the first "
+      "occurrence of the maximum value will be used.";
+};
+
 ////////////////////////////////////////////////////////////////////////////////
 // Incremental reducers: consume elements one by one
 ////////////////////////////////////////////////////////////////////////////////
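For reference, the reduction implemented by LogMeanExpRangeReducer above is the numerically stabilized log-mean-exp; with m = max_i x_i:

    \mathrm{LogMeanExp}(x_1,\dots,x_n)
        = \log\Big(\frac{1}{n}\sum_{i=1}^{n} e^{x_i}\Big)
        = m + \log\Big(\frac{1}{n}\sum_{i=1}^{n} e^{x_i - m}\Big),
    \qquad
    \frac{\partial}{\partial x_i}\,\mathrm{LogMeanExp}(x)
        = \frac{e^{\,x_i - \mathrm{LogMeanExp}(x)}}{n}.

Subtracting m before exponentiating avoids overflow; the gradient functor multiplies the second expression by the incoming segment gradient, which is exactly the line data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks.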
@@ -32,12 +32,16 @@ class CuDNNReluOp final : public Operator<CUDAContext> {
     if (X.dims() != cudnn_input_dims_) {
       VLOG(1) << "Setting descriptors.";
       cudnn_input_dims_ = X.dims();
-      int C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
-      int H = 1;
-      int W = 1;
+      int C = 1, H = 1, W = 1;
       if (X.ndim() == 4) {
+        // Normal 4-dimensional tensors for images.
+        C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
         H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1));
         W = (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2));
+      } else {
+        // If X is not 4-dimensional, we will simply use H = 1 and W = 1
+        // and wrap everything into C.
+        C = X.size() / X.dim32(0);
       }
       CUDNN_CHECK(cudnnSetTensor4dDescriptor(
           data_desc_, GetCudnnTensorFormat(order_),

@@ -93,12 +97,16 @@ class CuDNNReluGradientOp final : public Operator<CUDAContext> {
     if (Y.dims() != cudnn_input_dims_) {
       VLOG(1) << "Setting descriptors.";
       cudnn_input_dims_ = Y.dims();
-      int C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3));
-      int H = 1;
-      int W = 1;
+      int C = 1, H = 1, W = 1;
       if (Y.ndim() == 4) {
+        // Normal 4-dimensional tensors for images.
+        C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3));
         H = (order_ == StorageOrder::NCHW ? Y.dim32(2) : Y.dim32(1));
         W = (order_ == StorageOrder::NCHW ? Y.dim32(3) : Y.dim32(2));
+      } else {
+        // If Y is not 4-dimensional, we will simply use H = 1 and W = 1
+        // and wrap everything into C.
+        C = Y.size() / Y.dim32(0);
       }
       CUDNN_CHECK(cudnnSetTensor4dDescriptor(
           data_desc_, GetCudnnTensorFormat(order_),
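A small sketch of the shape folding used above (resting on the assumption, stated in the diff's comments, that ReLU is pointwise so the descriptor layout does not change the result): a non-4D tensor of shape (N, d1, ..., dk) is presented to cuDNN as N x C x 1 x 1 with C = d1 * ... * dk.

    #include <cstddef>
    #include <vector>

    int folded_channels(const std::vector<int>& dims) {
      int c = 1;
      for (size_t i = 1; i < dims.size(); ++i) {
        c *= dims[i]; // equivalent to X.size() / X.dim32(0)
      }
      return c;
    }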
caffe2/operators/reverse_packed_segs_op.cc (new file, 35 lines)
@@ -0,0 +1,35 @@
+#include "caffe2/operators/reverse_packed_segs_op.h"
+
+namespace caffe2 {
+namespace {
+REGISTER_CPU_OPERATOR(ReversePackedSegs, ReversePackedSegsOp<CPUContext>);
+
+OPERATOR_SCHEMA(ReversePackedSegs)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Reverse segments in a 3-D tensor (lengths, segments, embeddings), leaving
+paddings unchanged. This operator is used to reverse input of a recurrent neural
+network to make it a BRNN.
+)DOC")
+    .Input(0, "data", "a 3-D (lengths, segments, embeddings) tensor.")
+    .Input(1, "lengths", "length of each segment.")
+    .Output(
+        0,
+        "reversed data",
+        "a (lengths, segments, embeddings) tensor with each segment reversed "
+        "and paddings unchanged.");
+
+class GetReversePackedSegsGradient : public GradientMakerBase {
+  using GradientMakerBase::GradientMakerBase;
+  vector<OperatorDef> GetGradientDefs() override {
+    return SingleGradientDef(
+        "ReversePackedSegs",
+        "",
+        vector<string>{GO(0), I(1)},
+        vector<string>{GI(0)});
+  }
+};
+REGISTER_GRADIENT(ReversePackedSegs, GetReversePackedSegsGradient);
+} // namespace
+} // namespace caffe2
caffe2/operators/reverse_packed_segs_op.h (new file, 84 lines)
@@ -0,0 +1,84 @@
+#ifndef CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_
+#define CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+
+namespace caffe2 {
+
+template <class Context>
+class ReversePackedSegsOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  USE_SIMPLE_CTOR_DTOR(ReversePackedSegsOp);
+  USE_DISPATCH_HELPER;
+
+  bool RunOnDevice() override {
+    return DispatchHelper<TensorTypes<float, double, int, long, bool>>::call(
+        this, Input(DATA));
+  }
+
+  template <typename T>
+  bool DoRunWithType() {
+    if (Input(LENGTHS).template IsType<int>()) {
+      DoRunWithLengthType<T, int>();
+    } else {
+      DoRunWithLengthType<T, long>();
+    }
+    return true;
+  }
+
+ private:
+  INPUT_TAGS(DATA, LENGTHS);
+
+  template <typename T, typename LengthType>
+  void DoRunWithLengthType() {
+    const auto& data = Input(DATA);
+    const auto& lengths = Input(LENGTHS);
+
+    CAFFE_ENFORCE(
+        data.ndim() == 3,
+        "DATA should be 3-D tensor <lengths, "
+        "segments, embeddings>");
+    CAFFE_ENFORCE(lengths.ndim() == 1, "LENGTHS should be 1-D");
+
+    auto* output = Output(0);
+    const auto& shape = data.dims();
+    output->Resize(shape);
+
+    const auto& max_length = data.dims()[0];
+    const auto& batch_size = data.dims()[1];
+    const auto& block_size = data.dims()[2];
+    CAFFE_ENFORCE(
+        lengths.dims()[0] == batch_size,
+        "lengths size should be"
+        " equal to batch size");
+
+    const T* data_ptr = data.template data<T>();
+    const LengthType* lengths_ptr = lengths.template data<LengthType>();
+    T* rev_data_ptr = output->template mutable_data<T>();
+    for (TIndex i = 0; i < batch_size; i++) {
+      const auto& seg_length = lengths_ptr[i];
+      CHECK_LE(seg_length, max_length);
+      TIndex j = 0;
+      for (; j < seg_length; j++) {
+        const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size;
+        T* rev_data_block_ptr =
+            rev_data_ptr + ((seg_length - 1 - j) * batch_size + i) * block_size;
+        context_.template Copy<T, Context, Context>(
+            block_size, data_block_ptr, rev_data_block_ptr);
+      }
+      for (; j < max_length; j++) {
+        const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size;
+        T* rev_data_block_ptr =
+            rev_data_ptr + (j * batch_size + i) * block_size;
+        context_.template Copy<T, Context, Context>(
+            block_size, data_block_ptr, rev_data_block_ptr);
+      }
+    }
+  }
+};
+
+} // namespace caffe2
+
+#endif // CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_
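A standalone sketch of the indexing above (illustrative, float-only): the data is packed time-major with shape (max_length, batch_size, block_size), so element (t, b) starts at offset (t * batch_size + b) * block_size; the first lengths[b] steps of each sequence are mirrored and the padding tail is copied through unchanged.

    #include <algorithm>

    void reverse_packed(const float* in, float* out, const int* lengths,
                        int max_length, int batch_size, int block_size) {
      for (int b = 0; b < batch_size; ++b) {
        for (int t = 0; t < max_length; ++t) {
          int src = (t * batch_size + b) * block_size;
          int dst_t = (t < lengths[b]) ? lengths[b] - 1 - t : t;
          int dst = (dst_t * batch_size + b) * block_size;
          std::copy(in + src, in + src + block_size, out + dst);
        }
      }
    }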
@@ -1066,9 +1066,15 @@ REGISTER_SEGMENT_DEF(
 REGISTER_SEGMENT_DEF(
     AbstractSortedSegmentRangeDef<float, int, CPUContext,
                                   LogSumExpRangeReducerDef>);
+REGISTER_SEGMENT_DEF(AbstractSortedSegmentRangeDef<
+                     float,
+                     int,
+                     CPUContext,
+                     LogMeanExpRangeReducerDef>);
 REGISTER_SEGMENT_DEF(
-    AbstractSortedSegmentRangeDef<float, int, CPUContext,
-                                  MeanRangeReducerDef>);
+    AbstractSortedSegmentRangeDef<float, int, CPUContext, MeanRangeReducerDef>);
+REGISTER_SEGMENT_DEF(
+    AbstractSortedSegmentRangeDef<float, int, CPUContext, MaxRangeReducerDef>);

 #define REGISTER_REDUCER_WITH_ALL_OPS(reducer_def) \
   REGISTER_SEGMENT_DEF( \
caffe2/operators/sparse_to_dense_mask_op.cc (new file, 152 lines)
@@ -0,0 +1,152 @@
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/tensor.h"
+
+namespace caffe2 {
+
+using ValueTypes = TensorTypes<int32_t, int64_t, float, double, string, bool>;
+
+class SparseToDenseMaskOp : public Operator<CPUContext> {
+ public:
+  SparseToDenseMaskOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws) {
+    std::vector<int> mask = GetRepeatedArgument<int>("mask");
+    featuresCount_ = mask.size();
+    auto biggest = *std::max_element(mask.begin(), mask.end());
+    dense_.assign(std::min(kMaxDenseSize, biggest + 1), -1);
+    for (int i = 0; i < mask.size(); i++) {
+      int id = mask[i];
+      CAFFE_ENFORCE(id >= 0, "Only non-negative IDs are allowed.");
+      if (id >= kMaxDenseSize) {
+        sparse_[id] = i;
+      } else {
+        dense_[id] = i;
+      }
+    }
+  }
+
+  bool RunOnDevice() override {
+    const TypeMeta& meta = Input(INDICES).meta();
+    if (meta.Match<int32_t>()) {
+      return DoRunWithIndexType<int32_t>();
+    } else if (meta.Match<int64_t>()) {
+      return DoRunWithIndexType<int64_t>();
+    } else {
+      CAFFE_THROW("Unsupported type of tensor: ", meta.name());
+      return false;
+    }
+  }
+
+  template <typename TInd>
+  bool DoRunWithIndexType() {
+    if (InputSize() < 4) {
+      return DoRunWithLengthType<TInd, int32_t>();
+    } else {
+      const TypeMeta& meta = Input(LENGTHS).meta();
+      if (meta.Match<int32_t>()) {
+        return DoRunWithLengthType<TInd, int32_t>();
+      } else if (meta.Match<int64_t>()) {
+        return DoRunWithLengthType<TInd, int64_t>();
+      } else {
+        CAFFE_THROW("Unsupported type of tensor: ", meta.name());
+        return false;
+      }
+    }
+  }
+
+  template <typename TInd, typename TLen>
+  bool DoRunWithLengthType() {
+    return DispatchHelper<ValueTypes, TInd, TLen>::call(this, Input(VALUES));
+  }
+
+  template <typename TInd, typename TLen, typename TVal>
+  bool DoRunWithType() {
+    auto& sparse_indices = Input(INDICES);
+    CAFFE_ENFORCE(sparse_indices.ndim() == 1);
+    auto& sparse_values = Input(VALUES);
+    CAFFE_ENFORCE(sparse_values.ndim() == 1);
+    CAFFE_ENFORCE(sparse_indices.size() == sparse_values.size());
+    auto& default_value = Input(DEFAULT);
+    CAFFE_ENFORCE(default_value.size() == 1);
+
+    const TInd* sparse_indices_vec = sparse_indices.data<TInd>();
+    const TVal* sparse_values_vec = sparse_values.template data<TVal>();
+    const TVal* default_val = default_value.template data<TVal>();
+
+    int cols = featuresCount_;
+    int rows = 0;
+    TLen default_length = sparse_indices.dim32(0);
+    const TLen* lengths_vec = nullptr;
+    auto* output = Output(0);
+    if (InputSize() == 4) {
+      auto& lengths = Input(LENGTHS);
+      CAFFE_ENFORCE(lengths.ndim() == 1);
+      lengths_vec = lengths.data<TLen>();
+      rows = lengths.dim32(0);
+      output->Resize(rows, cols);
+    }
+    if (rows == 0) {
+      // if the LENGTHS is not set or it is empty, the output will be a vector
+      rows = 1;
+      lengths_vec = &default_length;
+      output->Resize(cols);
+    }
+
+    // init
+    TVal* output_data = output->template mutable_data<TVal>();
+    for (int i = 0; i < cols * rows; i++) {
+      output_data[i] = default_val[0];
+    }
+
+    TLen offset = 0;
+    for (int r = 0; r < rows; r++) {
+      for (int c = 0; c < lengths_vec[r]; c++) {
+        int idx = getFeatureIdx(sparse_indices_vec[offset + c]);
+        if (idx != -1) {
+          output_data[r * cols + idx] = sparse_values_vec[offset + c];
+        }
+      }
+      offset += lengths_vec[r];
+    }
+
+    return true;
+  }
+
+ private:
+  const int kMaxDenseSize = 1024 * 128;
+
+  std::unordered_map<int, int> sparse_;
+  std::vector<int> dense_;
+  int featuresCount_;
+
+  inline int getFeatureIdx(int id) const {
+    if (id >= kMaxDenseSize) {
+      const auto& iter = sparse_.find(id);
+      if (iter == sparse_.end()) {
+        return -1;
+      } else {
+        return iter->second;
+      }
+    } else {
+      return (id >= dense_.size()) ? -1 : dense_[id];
+    }
+  }
+
+  INPUT_TAGS(INDICES, VALUES, DEFAULT, LENGTHS);
+};
+
+namespace {
+REGISTER_CPU_OPERATOR(SparseToDenseMask, SparseToDenseMaskOp);
+
+OPERATOR_SCHEMA(SparseToDenseMask)
+    .NumInputs(3, 4)
+    .NumOutputs(1)
+    .SetDoc("Convert sparse representations to dense with given indices.")
+    .Output(0, "output", "1-D or 2-D dense tensor.");
+
+NO_GRADIENT(SparseToDenseMask);
+} // namespace
+} // namespace caffe2
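The mask lookup above amounts to the following (illustrative sketch of the dense path only, not the operator): a feature id's position in the `mask` argument determines its output column, and unknown ids are dropped so the default value stays in place.

    #include <vector>

    int feature_column(const std::vector<int>& mask, int id) {
      for (int i = 0; i < static_cast<int>(mask.size()); ++i) {
        if (mask[i] == id) {
          return i; // the operator precomputes this as dense_[id] = i
        }
      }
      return -1; // unknown id: the default value remains in the output
    }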
caffe2/operators/string_ops.cc (new file, 124 lines)
@@ -0,0 +1,124 @@
+#include "caffe2/operators/string_ops.h"
+#include "caffe2/core/operator.h"
+
+namespace caffe2 {
+namespace {
+
+struct StartsWith {
+  explicit StartsWith(OperatorBase& op)
+      : prefix_(op.GetSingleArgument<std::string>("prefix", "")) {}
+  bool operator()(const std::string& str) {
+    return std::mismatch(prefix_.begin(), prefix_.end(), str.begin()).first ==
+        prefix_.end();
+  }
+
+ private:
+  std::string prefix_;
+};
+
+struct EndsWith {
+  explicit EndsWith(OperatorBase& op)
+      : suffix_(op.GetSingleArgument<std::string>("suffix", "")) {}
+  bool operator()(const std::string& str) {
+    return std::mismatch(suffix_.rbegin(), suffix_.rend(), str.rbegin())
+               .first == suffix_.rend();
+  }
+
+ private:
+  std::string suffix_;
+};
+
+struct Prefix {
+  explicit Prefix(OperatorBase& op)
+      : length_(op.GetSingleArgument<int>("length", 3)) {}
+  std::string operator()(const std::string& str) {
+    return std::string(str.begin(), std::min(str.end(), str.begin() + length_));
+  }
+
+ private:
+  int length_;
+};
+
+struct Suffix {
+  explicit Suffix(OperatorBase& op)
+      : length_(op.GetSingleArgument<int>("length", 3)) {}
+  std::string operator()(const std::string& str) {
+    return std::string(std::max(str.begin(), str.end() - length_), str.end());
+  }
+
+ private:
+  int length_;
+};
+
+template <typename ScalarFunctor, typename OutputType = std::string>
+using StringElementwiseOp = UnaryElementwiseWithArgsOp<
+    TensorTypes<std::string>,
+    CPUContext,
+    ForEach<ScalarFunctor>,
+    OutputType>;
+
+REGISTER_CPU_OPERATOR(StringPrefix, StringElementwiseOp<Prefix>);
+REGISTER_CPU_OPERATOR(StringSuffix, StringElementwiseOp<Suffix>);
+REGISTER_CPU_OPERATOR(StringStartsWith, StringElementwiseOp<StartsWith, bool>);
+REGISTER_CPU_OPERATOR(StringEndsWith, StringElementwiseOp<EndsWith, bool>);
+
+OPERATOR_SCHEMA(StringPrefix)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Computes the element-wise string prefix of the string tensor.
+Input strings that are shorter than prefix length will be returned unchanged.
+NOTE: Prefix is computed on number of bytes, which may lead to wrong behavior
+and potentially invalid strings for variable-length encodings such as utf-8.
+)DOC")
+    .Arg("length", "Maximum size of the prefix, in bytes.")
+    .Input(0, "strings", "Tensor of std::string.")
+    .Output(
+        0,
+        "prefixes",
+        "Tensor of std::string containing prefixes for each input.");
+
+OPERATOR_SCHEMA(StringSuffix)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Computes the element-wise string suffix of the string tensor.
+Input strings that are shorter than suffix length will be returned unchanged.
+NOTE: Suffix is computed on number of bytes, which may lead to wrong behavior
+and potentially invalid strings for variable-length encodings such as utf-8.
+)DOC")
+    .Input(0, "strings", "Tensor of std::string.")
+    .Output(
+        0,
+        "suffixes",
+        "Tensor of std::string containing suffixes for each input.")
+    .Arg("length", "Maximum size of the suffix, in bytes.");
+
+OPERATOR_SCHEMA(StringStartsWith)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Performs the starts-with check on each string in the input tensor.
+Returns a tensor of bools of the same dimension as the input.
+)DOC")
+    .Arg("prefix", "The prefix to check input strings against.")
+    .Input(0, "strings", "Tensor of std::string.")
+    .Output(0, "bools", "Tensor of bools of same shape as input.");
+
+OPERATOR_SCHEMA(StringEndsWith)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Performs the ends-with check on each string in the input tensor.
+Returns a tensor of bools of the same dimension as the input.
+)DOC")
+    .Arg("suffix", "The suffix to check input strings against.")
+    .Input(0, "strings", "Tensor of std::string.")
+    .Output(0, "bools", "Tensor of bools of same shape as input.");
+
+SHOULD_NOT_DO_GRADIENT(StringPrefix);
+SHOULD_NOT_DO_GRADIENT(StringSuffix);
+SHOULD_NOT_DO_GRADIENT(StringStartsWith);
+SHOULD_NOT_DO_GRADIENT(StringEndsWith);
+}
+} // namespace caffe2
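The starts-with test above uses the three-iterator std::mismatch idiom: walk the prefix and the string together and check that the whole prefix was consumed. A guarded standalone variant (the length check is an addition here; the code above leaves it to its callers):

    #include <algorithm>
    #include <string>

    bool starts_with(const std::string& str, const std::string& prefix) {
      return str.size() >= prefix.size() &&
          std::mismatch(prefix.begin(), prefix.end(), str.begin()).first ==
          prefix.end();
    }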
caffe2/operators/string_ops.h (new file, 13 lines)
@@ -0,0 +1,13 @@
+#pragma once
+#include "caffe2/core/operator.h"
+#include "caffe2/operators/elementwise_op.h"
+
+namespace caffe2 {
+
+template <typename ScalarFunctor, typename OutputType = std::string>
+using StringElementwiseOp = UnaryElementwiseWithArgsOp<
+    TensorTypes<std::string>,
+    CPUContext,
+    ForEach<ScalarFunctor>,
+    OutputType>;
+}
@@ -10,24 +10,25 @@ namespace caffe2 {
 #define COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS 5

 namespace {
-// TODO(jiayq): one possible optimization is to copy the buffer into a shared memory
-// location to speed up access.
+// TODO(jiayq): one possible optimization is to copy the buffer into a shared
+// memory location to speed up access.
 template <typename Dtype>
 __global__ void transpose_gpu(const int nthreads, const Dtype* from_data,
     Dtype* to_data, const int* buffer, const int num_axes) {
   int from_inds[COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS];
   const int* from_counts = buffer;
   const int* to_counts = buffer + num_axes;
-  const int* map = buffer + num_axes * 2;
+  const int* axes = buffer + num_axes * 2;
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int from_index = index, to_index = 0;
-    for (int i = 0; i < num_axes; i++) {
-      from_inds[i] = from_index / from_counts[i];
-      from_index = from_index % from_counts[i];
+    for (int i = num_axes - 1; i >= 0; --i) {
+      from_inds[i] = from_index % from_counts[i];
+      from_index = from_index / from_counts[i];
     }
-    for (int i = 0; i < num_axes; i++) {
-      to_index += from_inds[map[i]] * to_counts[i];
+    for (int i = 0; i < num_axes - 1; i++) {
+      to_index = (to_index + from_inds[axes[i]]) * to_counts[i + 1];
     }
+    to_index += from_inds[axes[num_axes - 1]];
     to_data[to_index] = from_data[index];
   }
 }

@@ -42,7 +43,7 @@ bool TransposeOp<CUDAContext>::DoRunWithType() {
   int ndim = input.ndim();
   CAFFE_ENFORCE(count < std::numeric_limits<int>::max(),
                 "Transpose op on GPU only supports int32");
-  CAFFE_ENFORCE(count < COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS,
+  CAFFE_ENFORCE(ndim < COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS,
                 "Input ndim exceeds compile time max.");
   // Buffer contains the following data:
   // (1) the dimensions of the inputs
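A standalone sketch of the corrected index math above: the fixed kernel decomposes the flat source index into per-axis coordinates starting from the least significant axis, then recomposes the destination index over the permuted axes with Horner's scheme (here `dims` are the source dimensions and `out_dims` the permuted ones; this mirrors the kernel, it is not the kernel itself).

    void flat_to_coords(int flat, const int* dims, int ndim, int* coords) {
      for (int i = ndim - 1; i >= 0; --i) {
        coords[i] = flat % dims[i]; // coordinate along axis i
        flat /= dims[i];
      }
    }

    int coords_to_flat(const int* coords, const int* axes,
                       const int* out_dims, int ndim) {
      int flat = 0;
      for (int i = 0; i < ndim - 1; ++i) {
        flat = (flat + coords[axes[i]]) * out_dims[i + 1]; // Horner step
      }
      return flat + coords[axes[ndim - 1]];
    }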
@@ -45,7 +45,8 @@ class TransposeOp final : public Operator<Context> {
     }
     Y->Resize(new_dims_);
     // Do the actual transpose, which is implemented in DoRunWithType().
-    return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
+    return DispatchHelper<TensorTypes<float, double, int, long>>::call(
+        this, Input(0));
   }

  protected:
@@ -17,6 +17,7 @@ REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(Copy, CopyOp<CPUContext, CPUContext, CPUContext>);
 REGISTER_CPU_OPERATOR(Shape, ShapeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(HasElements, HasElementsOp<CPUContext>);
+REGISTER_CPU_OPERATOR(IsEmpty, IsEmptyOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Gather, GatherOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(Unique, UniqueOp<CPUContext>);
 REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp<CPUContext>);

@@ -24,6 +25,7 @@ REGISTER_CPU_OPERATOR(SegmentIdsToLengths, SegmentIdsToLengthsOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Slice, SliceOp<int, CPUContext>);
 REGISTER_CPU_OPERATOR(Squeeze, SqueezeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(ExpandDims, ExpandDimsOp<CPUContext>);
+REGISTER_CPU_OPERATOR(And, AndOp<CPUContext>);

 OPERATOR_SCHEMA(Print)
     .NumInputs(1)

@@ -209,6 +211,13 @@ OPERATOR_SCHEMA(HasElements)
         "has_elements",
         "Scalar bool tensor. True if input is not empty.");

+OPERATOR_SCHEMA(IsEmpty)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc("Returns true iff the input tensor has size == 0")
+    .Input(0, "tensor", "Tensor of any type.")
+    .Output(0, "is_empty", "Scalar bool tensor. True if input is empty.");
+
 OPERATOR_SCHEMA(Gather)
     .NumInputs(2)
     .NumOutputs(1)

@@ -340,9 +349,21 @@ If the same blob is provided in input and output, the operation is copy-free.
     .Input(0, "data", "Original tensor")
     .Output(0, "expanded", "Reshaped tensor with same data as input.");

+OPERATOR_SCHEMA(And)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .AllowInplace({{0, 0}})
+    .SetDoc(R"DOC(
+Outputs true iff both input blob values are true.
+)DOC")
+    .Input(0, "input_0", "first boolean input.")
+    .Input(1, "input_1", "second boolean input.")
+    .Output(0, "output", "input_0 && input_1.");
+
 SHOULD_NOT_DO_GRADIENT(Print);
 SHOULD_NOT_DO_GRADIENT(Shape);
 SHOULD_NOT_DO_GRADIENT(HasElements);
+SHOULD_NOT_DO_GRADIENT(IsEmpty);

 class GetSqueezeGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;

@@ -433,6 +454,7 @@ SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds);
 SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths);
 // TODO(azzolini): Add support for slice gradient
 SHOULD_NOT_DO_GRADIENT(Slice);
+SHOULD_NOT_DO_GRADIENT(And);

 } // namespace
@ -675,6 +675,21 @@ class HasElementsOp : public Operator<Context> {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <class Context>
|
||||||
|
class IsEmptyOp : public Operator<Context> {
|
||||||
|
public:
|
||||||
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||||
|
USE_SIMPLE_CTOR_DTOR(IsEmptyOp);
|
||||||
|
|
||||||
|
bool RunOnDevice() override {
|
||||||
|
auto& input = Input(0);
|
||||||
|
auto* output = OperatorBase::Output<TensorCPU>(0);
|
||||||
|
output->Resize(std::vector<TIndex>{});
|
||||||
|
*output->template mutable_data<bool>() = (input.size() == 0);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// RecordShapeOp records the shape of the input tensor to a vector of int. You
|
// RecordShapeOp records the shape of the input tensor to a vector of int. You
|
||||||
// mostly don't need this operator explicitly, and it is mostly used in the
|
// mostly don't need this operator explicitly, and it is mostly used in the
|
||||||
// autodiff process.
|
// autodiff process.
|
||||||
|
|
@@ -911,6 +926,23 @@ class UniqueOp : public Operator<Context> {
  public:
  OUTPUT_TAGS(UNIQUE, REMAPPING);
 };

+template <class Context>
+class AndOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  AndOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    const auto* i1 = Input(0).template data<bool>();
+    const auto* i2 = Input(1).template data<bool>();
+    auto* output = Output(0);
+    output->Resize(std::vector<int>{});
+    *output->template mutable_data<bool>() = (*i1 && *i2);
+    return true;
+  }
+};
+
 } // namespace caffe2

 #endif // CAFFE2_OPERATORS_UTILITY_OPS_H_
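For reference, a minimal sketch of driving the two new operators from the Python side, mirroring patterns used in the tests further below (blob names are illustrative, and producing bools via Cast follows the new test code):

    import numpy as np
    from caffe2.python import core, workspace

    # IsEmpty emits a scalar bool that is True iff the input has no elements.
    workspace.FeedBlob('data', np.array([], dtype=np.float32))
    workspace.RunOperatorOnce(
        core.CreateOperator('IsEmpty', ['data'], ['is_empty']))
    print(workspace.FetchBlob('is_empty'))  # True

    # And consumes two scalar bool blobs; the bools are produced via Cast,
    # the same way the new execution-step tests do it.
    workspace.RunOperatorOnce(
        core.CreateOperator('ConstantFill', [], ['one'], shape=[1], value=1.0))
    workspace.RunOperatorOnce(
        core.CreateOperator('Cast', ['one'], ['t'], to='bool'))
    workspace.RunOperatorOnce(
        core.CreateOperator('And', ['t', 't'], ['t_and_t']))
    print(workspace.FetchBlob('t_and_t'))  # [ True]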
@@ -199,7 +199,7 @@ message ExecutionStep {
   // Criteria network specifies a single output (TensorCPU<bool>) of
   // size (1), is run on every iteration by the executor, and
   // execution terminates when the output[0] is `false`.
-  optional string criteria_network = 5;
+  optional string criteria_network = 5 [deprecated=true];

   // If specified, run report_net asynchronously every `report_interval`
   // seconds. Report_net is guaranteed to run at least once after all

@@ -210,6 +210,20 @@ message ExecutionStep {
   // If false or not set, execute sub-steps serially.
   // If true, execute all substeps concurrently, each one in a separate thread.
   optional bool concurrent_substeps = 6;

+  // Name of a scalar boolean tensor.
+  // ES checks this blob AFTER every substep/subnet.
+  // If specified, and the value is true, then ES will skip the rest and return
+  // immediately.
+  // This means that the report_net and the first step will always be called.
+  // Use cases:
+  // 1) the first substep stops the rest if a data condition is not met
+  // 2) the first substep decides which of the remaining steps should be run.
+  // 3) external control
+  //
+  // ** It is the user's responsibility not to put this blob in race conditions.
+  // ** For example, when setting this blob in concurrent substeps
+  optional string should_stop_blob = 9;
 }

 message PlanDef {
@@ -87,7 +87,6 @@ const TypeMeta& NumpyTypeToCaffe(int numpy_type) {
       {NPY_UINT8, TypeMeta::Make<uint8_t>()},
       {NPY_UINT16, TypeMeta::Make<uint16_t>()},
       {NPY_OBJECT, TypeMeta::Make<std::string>()},
-      {NPY_STRING, TypeMeta::Make<std::string>()},
       // Note: Add more types here.
   };
   static TypeMeta unknown_type;
@@ -565,11 +564,10 @@ PyObject* FetchBlob(PyObject* self, PyObject* args) {

 PyObject* FeedBlob(PyObject* self, PyObject* args) {
   char* name_char;
-  PyArrayObject* array = nullptr;
+  PyObject* arg = nullptr;
   PyObject* device_option_string = nullptr;
-  // TODO(dzhulgakov): implement accepting other types (at least string)
-  if (!PyArg_ParseTuple(args, "sO!|O", &name_char, &PyArray_Type, &array,
-                        &device_option_string)) {
+  if (!PyArg_ParseTuple(
+          args, "sO|O", &name_char, &arg, &device_option_string)) {
     PyErr_SetString(PyExc_ValueError, "Incorrect arguments.");
     return nullptr;
   }

@@ -584,13 +582,25 @@ PyObject* FeedBlob(PyObject* self, PyObject* args) {
   }
   Blob* blob = gWorkspace->CreateBlob(name);

+  if (PyArray_Check(arg)) { // numpy array
+    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(arg);
     auto feeder = CreateFeeder(option.device_type());
     if (!feeder) {
-      PyErr_SetString(PyExc_TypeError,
-                      "Unknown device type encountered in FeedBlob.");
+      PyErr_SetString(
+          PyExc_TypeError, "Unknown device type encountered in FeedBlob.");
       return nullptr;
     }
     return feeder->Feed(option, array, blob);
+  } else if (PyString_Check(arg)) { // string
+    *blob->GetMutable<std::string>() = PyBytesToStdString(arg);
+    Py_RETURN_TRUE;
+  } else {
+    PyErr_SetString(
+        PyExc_ValueError,
+        "Unexpected type of argument - only numpy array or string are "
+        "supported for feeding");
+    return nullptr;
+  }
 }

 // A simple macro to avoid writing repeated symbols.
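With this change, cc_FeedBlob accepts a Python string in addition to an ndarray; a sketch of the round trip, assuming the Python-level workspace wrappers pass non-array values straight through to cc_FeedBlob:

    from caffe2.python import workspace

    workspace.FeedBlob('greeting', 'hello caffe2')  # stored as std::string
    print(workspace.FetchBlob('greeting'))          # 'hello caffe2'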
@@ -620,7 +630,7 @@ PyMethodDef* GetCaffe2PythonMethods() {
       {"cc_RunPlan", RunPlan, METH_VARARGS, ""},
       _PYNAME(CreateBlob),
       _PYNAME(SerializeBlob),
-      _PYNAME(FetchBlob),
+      {"cc_FetchBlob", FetchBlob, METH_VARARGS, ""},
       {"cc_FeedBlob", FeedBlob, METH_VARARGS, ""},
       {nullptr, nullptr, 0, nullptr}, // end of python methods.
   };
@@ -160,7 +160,10 @@ class TensorFetcher : public BlobFetcherBase {
           Py_DECREF(outObj[j]);
         }
         Py_DECREF(array);
-        LOG(FATAL) << "Failed to allocate string for ndarray of strings.";
+        PyErr_SetString(
+            PyExc_TypeError,
+            "Failed to allocate string for ndarray of strings.");
+        return nullptr;
       }
     }
     return array;

@@ -217,21 +220,14 @@ class TensorFeeder : public BlobFeederBase {
           char* str;
           Py_ssize_t strSize;
           if (PyBytes_AsStringAndSize(input[i], &str, &strSize) == -1) {
-            LOG(FATAL) << "Unsupported pyhton object type passed into ndarray.";
+            PyErr_SetString(
+                PyExc_TypeError,
+                "Unsupported python object type passed into ndarray.");
+            return nullptr;
           }
           outPtr[i] = std::string(str, strSize);
         }
       } break;
-      case NPY_STRING: {
-        char* inputData = PyArray_BYTES(array);
-        auto* outPtr = tensor->template mutable_data<std::string>();
-        auto itemSize = PyArray_ITEMSIZE(array);
-        for (int i = 0; i < tensor->size(); ++i) {
-          auto start = inputData + i * itemSize;
-          auto end = std::find(start, start + itemSize, '\0');
-          outPtr[i] = std::string(start, end - start);
-        }
-      } break;
       default:
         context.template CopyBytes<CPUContext, Context>(
             tensor->size() * meta.itemsize(),
@@ -613,6 +613,7 @@ def GetArgumentParser():
     )
     parser.add_argument("--net_type", type=str, default="dag")
     parser.add_argument("--num_workers", type=int, default=2)
+    parser.add_argument("--use-nvtx", default=False, action='store_true')
     return parser

@@ -624,7 +625,9 @@ if __name__ == '__main__':
     ):
         GetArgumentParser().print_help()

-    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
+    workspace.GlobalInit(
+        ['caffe2', '--caffe2_log_level=0'] +
+        ['--caffe2_use_nvtx'] if args.use_nvtx else [])
     model_map = {
         'AlexNet': AlexNet,
         'OverFeat': OverFeat,
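One caveat worth noting about the added lines: in Python the conditional expression binds looser than `+`, so as written the whole argument list collapses to `[]` whenever args.use_nvtx is False. A sketch of the presumably intended grouping:

    workspace.GlobalInit(
        ['caffe2', '--caffe2_log_level=0'] +
        (['--caffe2_use_nvtx'] if args.use_nvtx else []))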
@@ -146,7 +146,7 @@ def ScopedBlobReference(name, *args, **kwargs):
     return BlobReference(scope.NAMESCOPE + name, *args, **kwargs)


-def _RectifyInputOutput(blobs):
+def _RectifyInputOutput(blobs, net=None):
     """A helper function to rectify the input or output of the CreateOperator
     interface.
     """

@@ -154,18 +154,18 @@ def _RectifyInputOutput(blobs):
         # If blobs is a single string, prepend scope.NAMESCOPE and put it as a
         # list.
         # TODO(jiayq): enforce using BlobReference instead of raw strings.
-        return [ScopedBlobReference(blobs)]
+        return [ScopedBlobReference(blobs, net=net)]
     elif type(blobs) is BlobReference:
         # If blob is a BlobReference, simply put it as a list.
-        return [BlobReference(str(blobs))]
+        return [blobs]
-    elif type(blobs) is list:
+    elif type(blobs) in (list, tuple):
         # If blob is a list, we go through it and type check.
         rectified = []
         for blob in blobs:
             if isinstance(blob, basestring):
-                rectified.append(ScopedBlobReference(blob))
+                rectified.append(ScopedBlobReference(blob, net=net))
             elif type(blob) is BlobReference:
-                rectified.append(BlobReference(str(blob)))
+                rectified.append(blob)
             else:
                 raise TypeError(
                     "I/O blob #{} of unsupported type: {} of type {}"
@@ -670,8 +670,19 @@ def get_op_ids_in_path(ssa, blob_versions, inputs, outputs):


 class Net(object):
+    _net_names_used = set()
     operator_registry_ = {}

+    @staticmethod
+    def _get_next_net_name(basename):
+        name = basename
+        next_idx = 1
+        while name in Net._net_names_used:
+            name = basename + '_' + str(next_idx)
+            next_idx += 1
+        Net._net_names_used |= set([name])
+        return name
+
     def __init__(self, name_or_proto):
         """
         Create a Net.
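The effect of the new class-level registry, sketched with an illustrative base name:

    from caffe2.python import core

    n1 = core.Net('train')
    n2 = core.Net('train')
    n3 = core.Net('train')
    print(n1, n2, n3)  # train train_1 train_2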
@@ -706,29 +717,29 @@ class Net(object):
             else:
                 self._next_name_index = 0
         else:
-            name = name_or_proto
             self._net = caffe2_pb2.NetDef()
-            self._net.name = name
+            self._net.name = name_or_proto
             self._next_name_index = 0

+        # make sure that this net name hasn't been used before
+        self._net.name = Net._get_next_net_name(self._net.name)
+
     def __str__(self):
         return self._net.name

-    def DefinesBlob(self, blob):
+    def BlobIsDefined(self, blob):
         """
         Returns true if the given BlobReference is produced as output of
         an operator in this net, or if it is provided as an external input.
         """
-        if isinstance(blob, BlobReference):
-            assert blob.Net() == self, 'Reference belongs to different net'
         blob_name = str(blob)
+        for input in self._net.external_input:
+            if input == blob_name:
+                return True
         for op in self._net.op:
             for output in op.output:
                 if output == blob_name:
                     return True
-        for input in self._net.external_input:
-            if input == blob_name:
-                return True
         return False

     def UsesBlob(self, blob):

@@ -753,7 +764,7 @@ class Net(object):
         raises KeyError.
         """
         blob_name = str(blob_name)
-        if not self.DefinesBlob(blob_name):
+        if not self.BlobIsDefined(blob_name):
             raise KeyError('Net does not define blob %s' % blob_name)
         return BlobReference(blob_name, self)
@@ -818,13 +829,16 @@ class Net(object):
             new_outputs: list of BlobReferences corresponding to the
                          outputs produced by new_net.
         """
-        inputs = inputs if isinstance(inputs, dict) else {i: i for i in inputs}
+        input_is_pair_list = isinstance(inputs, list) and all(
+            isinstance(i, tuple) and len(i) == 2 for i in inputs)
+        inputs = (
+            inputs if isinstance(inputs, (dict, OrderedDict)) else
+            OrderedDict(inputs) if input_is_pair_list else
+            OrderedDict(zip(inputs, inputs)))
+        for output in outputs:
+            assert self.BlobIsDefined(output)
         input_names = {str(k): str(v) for k, v in inputs.items()}
         output_names = [str(o) for o in outputs]
-        for input in inputs.keys():
-            assert self.UsesBlob(input)
-        for output in outputs:
-            assert self.DefinesBlob(output)
         proto = self._net
         ssa, blob_versions = get_ssa(proto)
         used_op_ids = get_op_ids_in_path(ssa, blob_versions, inputs, outputs)
@@ -859,9 +873,21 @@ class Net(object):
     def Proto(self):
         return self._net

-    def NextName(self):
+    def NextName(self, prefix=None, output_id=None):
         """Returns the next name to be used, if you do not want to explicitly
         name your blob."""
+        if prefix:
+            output_name_base = self._net.name + '/' + prefix
+            output_name = output_name_base
+            if output_id is not None:
+                output_name += ':' + str(output_id)
+            index = 2
+            while self.BlobIsDefined(output_name):
+                output_name = output_name_base + '_' + str(index)
+                if output_id is not None:
+                    output_name += ':' + str(output_id)
+                index += 1
+        else:
             output_name = self._net.name + '_blob_' + str(self._next_name_index)
             self._next_name_index += 1
         return str(output_name)
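A sketch of the names NextName now produces for a net called 'test' (assuming that name was still free and no blob collides):

    net = core.Net('test')
    print(net.NextName(prefix='Relu'))               # test/Relu
    print(net.NextName(prefix='Relu', output_id=0))  # test/Relu:0
    print(net.NextName())                            # test_blob_0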
@@ -900,16 +926,18 @@ class Net(object):
         self._net.op.extend(grad_ops)
         return input_to_grad

-    def AddExternalInput(self, input_name):
-        input_name = str(input_name)
+    def AddExternalInput(self, input):
+        input_name = str(input)
         assert input_name not in self._net.external_input, (
             'Net already contains an input named %s' % input_name)
         self._net.external_input.extend([input_name])
-        return BlobReference(input_name, self)
+        return (
+            input if isinstance(input, BlobReference)
+            else BlobReference(input_name))

     def AddExternalOutput(self, output):
         assert isinstance(output, BlobReference)
-        assert self.DefinesBlob(output)
+        assert self.BlobIsDefined(output)
         self.Proto().external_output.extend([str(output)])

     def DeduplicateGradientSlices(self, g):
@@ -931,14 +959,22 @@ class Net(object):
     def _CreateAndAddToSelf(self, op_type, inputs, outputs=None, **kwargs):
         """A helper function to create an operator and add it to self.
         """
+        inputs = _RectifyInputOutput(inputs)
+        for input in inputs:
+            if not self.BlobIsDefined(input):
+                assert input.Net() != self
+                self.AddExternalInput(input)
         if outputs is None:
             # If we do not specify an output, we will assume that this op
             # produces one output in this case.
-            outputs = self.NextName()
+            outputs = self.NextName(prefix=op_type)
         elif type(outputs) is int:
             # In this case, we will auto-fill the given number of outputs
             # with auto-generated names.
-            outputs = [self.NextName() for i in range(outputs)]
+            outputs = [
+                self.NextName(prefix=op_type, output_id=i)
+                for i in range(outputs)]
+        outputs = _RectifyInputOutput(outputs, net=self)
         op = CreateOperator(op_type, inputs, outputs, **kwargs)
         self._net.op.extend([op])
         if len(op.output) == 0:
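Combined with the BlobReference changes above, an output of one net can now be fed directly into another net, which registers it as an external input automatically; a sketch using standard fill/copy operators (the printed name assumes the auto-generated 'init/ConstantFill'):

    init = core.Net('init')
    w = init.ConstantFill([], shape=[10], value=1.0)

    train = core.Net('train')
    # w belongs to `init`, so _CreateAndAddToSelf adds it to
    # train's external_input before creating the operator.
    train.Copy(w, 'w_copy')
    print(list(train.Proto().external_input))  # ['init/ConstantFill']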
@@ -1036,10 +1072,11 @@ class ExecutionStep(object):
         self._assert_can_mutate()
         self._step.num_iter = num_iter

-    def SetCriteriaNet(self, criteria_net):
+    def SetShouldStopBlob(self, should_stop_blob):
+        assert isinstance(should_stop_blob, BlobReference), (
+            "expects BlobReference here, got {}".format(type(should_stop_blob)))
         self._assert_can_mutate()
-        _add_net_to_dict(self._net_dict, criteria_net)
-        self._step.criteria_network = get_net_name(criteria_net)
+        self._step.should_stop_blob = str(should_stop_blob)

     def SetReportNet(self, report_net, report_interval):
         self._assert_can_mutate()

@@ -1053,7 +1090,7 @@ class ExecutionStep(object):
         if isinstance(substep, ExecutionStep):
             substep._notify_is_used()
             if not substep.HasNets() and not substep.HasSubsteps():
-                return
+                return self
             for net in substep.Nets():
                 _add_net_to_dict(self._net_dict, net)
             self._substeps.append(substep)

@@ -1061,6 +1098,7 @@ class ExecutionStep(object):
         else:
             proto = substep
             self._step.substep.add().CopyFrom(proto)
+        return self

     def SetConcurrentSubsteps(self, concurrent_substeps):
         self._assert_can_mutate()

@@ -1073,6 +1111,7 @@ class ExecutionStep(object):
         assert isinstance(net, Net)
         _add_net_to_dict(self._net_dict, net)
         self._step.network.extend([get_net_name(net)])
+        return self


 class Plan(object):
@@ -1107,11 +1146,11 @@ class Plan(object):

 def execution_step(default_name,
                    steps_or_nets,
-                   criteria=None,
                    num_iter=None,
                    report_net=None,
                    report_interval=None,
-                   concurrent_substeps=None):
+                   concurrent_substeps=None,
+                   should_stop_blob=None):
     """
     Helper for creating an ExecutionStep.
     - steps_or_nets can be:

@@ -1120,18 +1159,20 @@ def execution_step(default_name,
       - ExecutionStep
       - list<Net>
       - list<ExecutionStep>
-    - criteria is either None or a Net
-    - if no criteria or num_iter is provided, defaults to num_iter=1
+    - should_stop_blob is either None or a scalar boolean blob.
+      - This blob is checked AFTER every substep/subnet.
+      - If specified and true, then this step will return immediately.
+      - Be sure to handle race conditions if setting from concurrent threads.
+    - if no should_stop_blob or num_iter is provided, defaults to num_iter=1
     """
-    assert criteria is None or isinstance(criteria, Net)
-    assert criteria is None or num_iter is None, (
-        'Cannot set both criteria and num_iter.')
-    if criteria is None and num_iter is None:
+    assert should_stop_blob is None or num_iter is None, (
+        'Cannot set both should_stop_blob and num_iter.')
+    if should_stop_blob is None and num_iter is None:
         num_iter = 1

-    def set_criteria(step):
-        if criteria is not None:
-            step.SetCriteriaNet(criteria)
+    def set_step_attr(step):
+        if should_stop_blob is not None:
+            step.SetShouldStopBlob(should_stop_blob)
         else:
             step.SetIter(num_iter)
         if concurrent_substeps is not None:
@@ -1144,18 +1185,20 @@ def execution_step(default_name,
     if not steps_or_nets:
         return ExecutionStep(default_name)
     if isinstance(steps_or_nets, ExecutionStep):
-        return set_criteria(steps_or_nets)
+        step = set_step_attr(ExecutionStep(default_name))
+        step.AddSubstep(steps_or_nets)
+        return step
     elif isinstance(steps_or_nets, Net):
-        step = set_criteria(ExecutionStep(default_name))
+        step = set_step_attr(ExecutionStep(default_name))
         step.AddNet(steps_or_nets)
         return step
     elif isinstance(steps_or_nets, list):
         if isinstance(steps_or_nets[0], Net):
-            step = set_criteria(ExecutionStep(default_name))
+            step = set_step_attr(ExecutionStep(default_name))
             map(step.AddNet, steps_or_nets)
             return step
         elif isinstance(steps_or_nets[0], ExecutionStep):
-            step = set_criteria(ExecutionStep(default_name))
+            step = set_step_attr(ExecutionStep(default_name))
             map(step.AddSubstep, steps_or_nets)
             return step
     else:
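Putting the proto field, the ExecutionStep plumbing, and the helper together: a data-driven loop without a criteria network, a sketch close to the rewritten test further below:

    import numpy as np
    from caffe2.python import core, workspace

    workspace.FeedBlob('iter', np.array([0], dtype=np.int32))
    workspace.FeedBlob('num_iters', np.array([10], dtype=np.int32))

    body = core.Net('body')
    body.Iter(['iter'], ['iter'])

    stop_net = core.Net('stop_net')
    stop_net.GE(['iter', 'num_iters'], ['stop'])
    stop_net.Proto().external_output.extend(['stop'])

    plan = core.Plan('plan')
    plan.AddStep(core.execution_step(
        'loop', [stop_net, body],
        should_stop_blob=core.BlobReference('stop')))
    workspace.RunPlan(plan)
    print(workspace.FetchBlob('iter'))  # [10]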
caffe2/python/dataset.py (new file, 276 lines)
@@ -0,0 +1,276 @@
"""
Implementation of an in-memory dataset with structured schema.

Use this to store and iterate through datasets with complex schema that
fit in memory.

Iterating through entries of this dataset is very fast since the dataset
is stored as a set of native Caffe2 tensors, thus no type conversion or
deserialization is necessary.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, workspace
from caffe2.python.io import Reader, Writer
from caffe2.python.schema import Struct
import numpy as np


class _DatasetReader(Reader):
    def __init__(self, field_names, field_blobs, cursor, name):
        """Don't call this directly. Instead, use dataset.reader()"""
        self.field_names = field_names
        self.field_blobs = field_blobs
        self.cursor = cursor
        self.name = name

    def read(self, read_net, batch_size=1):
        with core.NameScope(read_net.NextName(self.name)):
            fields = read_net.ReadNextBatch(
                [self.cursor] + self.field_blobs,
                self.field_names,
                batch_size=batch_size)
            return (read_net.IsEmpty([fields[0]]), fields)

    def reset(self, net):
        net.ResetCursor([self.cursor], [])


class _DatasetRandomReader(Reader):
    def __init__(self, field_names, field_blobs, cursor, name, indices):
        """Don't call this directly. Instead, use dataset.random_reader()"""
        self.field_names = field_names
        self.field_blobs = field_blobs
        self.cursor = cursor
        self.name = name
        self.indices = indices

    def reset(self, net):
        net.ResetCursor([self.cursor], [])

    def computeoffset(self, net):
        self.reset(net)
        offsets = net.ComputeOffset(
            [self.cursor] + self.field_blobs,
            'offsets')
        self.offsets = offsets

    def read(self, read_net, batch_size=1):
        fields = read_net.ReadRandomBatch(
            [self.cursor, self.indices, self.offsets] + self.field_blobs,
            self.field_names,
            batch_size=batch_size)
        return (read_net.IsEmpty([fields[0]]), fields)


class _DatasetWriter(Writer):
    def __init__(self, fields, field_blobs, init_net):
        """Don't call this directly. Use dataset.writer() instead."""
        self.fields = fields
        self.field_blobs = field_blobs
        self.mutex = init_net.CreateMutex([])

    def write(self, writer_net, fields):
        """
        Add operations to `net` that append the blobs in `fields` to the end
        of the dataset. An additional operator will also be added that checks
        the consistency of the data in `fields` against the dataset schema.

        Args:
            writer_net: The net that will contain the Append operators.
            fields: A list of BlobReference to be appended to this dataset.
        """
        assert len(fields) == len(self.fields), (
            'Expected %s fields, got %s.' % (len(self.fields), len(fields)))
        writer_net.CheckDatasetConsistency(fields, [], fields=self.fields)
        writer_net.AtomicAppend(
            [self.mutex] + list(self.field_blobs) + list(fields),
            self.field_blobs)

    def commit(self, finish_net):
        """Commit is a no-op for an in-memory dataset."""
        pass


def to_ndarray_list(values, schema):
    """
    Given a list of values and a dataset schema, produce a list of ndarrays in
    the right format.

    This function will perform some checks to make sure that the arrays
    produced have the right dtype and rank.
    """
    assert isinstance(schema, Struct), 'schema must be a Struct.'
    names = schema.field_names()
    types = schema.field_types()
    assert len(types) == len(values), (
        'Values must have %d elements, got %d' % (len(types), len(values)))

    arrays = []
    for value, dtype, name in zip(values, types, names):
        array = np.array(value, dtype=dtype.base)
        # if array is empty we may need to reshape a little
        if array.size == 0:
            array = array.reshape((0,) + dtype.shape)
        # check that the inner dimensions match the schema
        assert (array.shape[1:] == dtype.shape), (
            'Invalid array shape for field %s. Expected (%s), got (%s).' % (
                name,
                ', '.join(['_'] + map(str, dtype.shape)),
                ', '.join(map(str, array.shape))))
        arrays.append(array)
    return arrays


def Const(net, value, dtype=None, name=None):
    """
    Create a 'constant' by first creating an external input in the given
    net, and then feeding the corresponding blob with its provided value
    in the current workspace. The name is automatically generated in order
    to avoid clashes with existing blob names.
    """
    assert isinstance(net, core.Net), 'net must be a core.Net instance.'
    value = np.array(value, dtype=dtype)
    blob = net.AddExternalInput(net.NextName(prefix=name))
    workspace.FeedBlob(str(blob), value)
    return blob


class Dataset(object):
    """Represents an in-memory dataset with fixed schema.

    Use this to store and iterate through datasets with complex schema that
    fit in memory.

    Iterating through entries of this dataset is very fast since the dataset
    is stored as a set of native Caffe2 tensors, thus no type conversion or
    deserialization is necessary.
    """

    def __init__(self, fields, name=None):
        """Create an un-initialized dataset with schema provided by `fields`.

        Before this dataset can be used, it must be initialized, either by
        `init_empty` or `init_from_dataframe`.

        Args:
            fields: either a schema.Struct or a list of field names in a format
                    compatible with the one described in schema.py.
            name: optional name to prepend to blobs that will store the data.
        """
        assert isinstance(fields, list) or isinstance(fields, Struct), (
            'fields must be either a Struct or a list of raw field names.')
        self.schema = fields
        self.fields = (
            fields.field_names() if isinstance(fields, Struct) else fields)
        self.field_types = (
            fields.field_types() if isinstance(fields, Struct) else
            [np.dtype(np.void)] * len(self.fields))
        self.name = name or 'dataset'
        self.field_blobs = None

    def init_empty(self, init_net):
        """Initialize the blobs for this dataset with empty values.

        Empty arrays will be immediately fed into the current workspace,
        and `init_net` will take those blobs as external inputs.
        """
        self.field_blobs = [Const(init_net, [], name=f) for f in self.fields]

    def init_from_dataframe(self, net, dataframe):
        """Initialize the blobs for this dataset from a Pandas dataframe.

        Each column of the dataframe will be immediately fed into the current
        workspace, and the `net` will take these blobs as external inputs.
        """
        assert len(self.fields) == len(dataframe.columns)
        self.field_blobs = [
            Const(net, dataframe.as_matrix([col]).flatten(), name=field)
            for col, field in enumerate(self.fields)]

    def get_blobs(self):
        """
        Return the list of BlobReference pointing to the blobs that contain
        the data for this dataset.
        """
        assert self.field_blobs, 'Dataset not initialized.'
        return self.field_blobs

    def field_names(self):
        """Return the list of field names for this dataset."""
        return self.fields

    def field_types(self):
        """
        Return the list of field dtypes for this dataset.

        If a list of strings, not a schema.Struct, was passed to the
        constructor, this will return a list of dtype(np.void).
        """
        return self.field_types

    def reader(self, init_net, cursor_name=None):
        """Create a Reader object that is used to iterate through the dataset.

        This will append operations to `init_net` that create a TreeCursor,
        used to iterate through the data.

        NOTE: Currently, it is not safe to append to a dataset while reading.

        Args:
            init_net: net that will be run once to create the cursor.
            cursor_name: optional name for the blob containing a pointer
                         to the cursor.

        Returns:
            A _DatasetReader that can be used to create operators that will
            iterate through the dataset.
        """
        assert self.field_blobs, 'Dataset not initialized.'
        cursor_name = cursor_name or (self.name + '_cursor')
        cursor = init_net.CreateTreeCursor(
            [],
            [cursor_name],
            fields=self.fields)
        return _DatasetReader(
            self.fields, self.field_blobs, cursor, cursor_name)

    def random_reader(self, init_net, indices, cursor_name=None):
        """Create a Reader object that is used to iterate through the dataset.

        NOTE: The reader order depends on the order in indices.

        Args:
            Similar to reader
            indices: blob of reading order

        Returns:
            A _DatasetRandomReader that can be used to create operators that
            will iterate through the dataset according to indices.
        """
        assert self.field_blobs, 'Dataset not initialized.'
        cursor_name = cursor_name or (self.name + '_cursor')
        cursor = init_net.CreateTreeCursor(
            [],
            [cursor_name],
            fields=self.fields)
        return _DatasetRandomReader(
            self.fields, self.field_blobs, cursor, cursor_name, indices)

    def writer(self, init_net):
        """Create a Writer that can be used to append entries into the dataset.

        NOTE: Currently, it is not safe to append to a dataset
        while reading from it.
        NOTE: The current implementation of writer is not thread safe.
        TODO: fixme

        Args:
            init_net: net that will be run once in order to create the writer.
                      (currently not used)
        """
        assert self.field_blobs, 'Dataset not initialized.'
        return _DatasetWriter(self.fields, self.field_blobs, init_net)
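An end-to-end sketch of the new Dataset API, assembled from the docstrings above and the io.Reader.execution_step example further below (field names and values are illustrative):

    import numpy as np
    from caffe2.python import core, workspace, dataset

    ds = dataset.Dataset(['label', 'value'], name='demo')

    init_net = core.Net('init')
    ds.init_empty(init_net)
    writer = ds.writer(init_net)

    write_net = core.Net('write')
    labels = dataset.Const(write_net, np.array([1, 2], dtype=np.int32))
    values = dataset.Const(write_net, np.array([0.5, 0.9], dtype=np.float32))
    writer.write(write_net, [labels, values])

    # read everything back, one batch per loop iteration
    read_step, fields = ds.reader(init_net).execution_step('read')
    consume = core.Net('consume')
    consume.Print(fields[0], [])

    plan = core.Plan('demo_plan')
    plan.AddStep(core.execution_step('init', init_net))
    plan.AddStep(core.execution_step('write', write_net))
    plan.AddStep(read_step.AddNet(consume))
    workspace.RunPlan(plan)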
@@ -1,6 +1,6 @@
 import numpy as np
 import copy
-from caffe2.python import core, workspace
+from caffe2.python import workspace


 class DeviceChecker(object):

@@ -41,9 +41,8 @@ class DeviceChecker(object):
             op.device_option.CopyFrom(device_option)
             workspace.RunOperatorOnce(op)
             results.append(
-                [workspace.FetchBlob(op.output[idx]) for idx in outputs_to_check
-                 ]
-            )
+                [workspace.FetchBlob(op.output[idx])
+                 for idx in outputs_to_check])
             # Everything is done, reset the workspace.
             workspace.ResetWorkspace()
         # After running on all devices, check correctness

@@ -68,8 +67,8 @@ class DeviceChecker(object):
         return success

     def CheckNet(self, net, inputs={}, blobs_to_check=None, ignore=set()):
-        """Checks a network by inspecting all of its intermediate results, and see
-        if things match.
+        """Checks a network by inspecting all of its intermediate results, and
+        see if things match.
         """
         old_ws_name = workspace.CurrentWorkspace()
         results = []

@@ -78,7 +77,7 @@ class DeviceChecker(object):
         blobs_to_check = [b for b in blobs_to_check if b not in ignore]
         workspace.SwitchWorkspace("_device_check_", True)
         for i, device_option in enumerate(self._device_options):
-            for name, arr in inputs.iteritems():
+            for name, arr in inputs.items():
                 # print 'feeding', name
                 workspace.FeedBlob(name, arr, device_option)
             for op in net.op:

@@ -93,15 +92,18 @@ class DeviceChecker(object):
         for j in range(len(blobs_to_check)):
             x = results[i][j]
             y = results[0][j]
-            if np.any(np.abs(x - y) > self._threshold):
+            if not np.allclose(x, y,
+                               atol=self._threshold, rtol=self._threshold):
                 print('Failure in checking device option {}'
                       ' and output {}. The outputs are:'
                       .format(i, blobs_to_check[j]))
                 print(x.flatten())
                 print(y.flatten())
+                print(np.max(np.abs(x - y)))
                 success = False
             # else:
             #     print ('Passed device pair (%d, %d), %s %s: %s' %
-            #            (i, j, blobs_to_check[j], y.shape, str(y.flatten())))
+            #            (i, j, blobs_to_check[j], y.shape,
+            #             str(y.flatten())))
         workspace.SwitchWorkspace(old_ws_name)
         return success
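The move to np.allclose changes the tolerance semantics from a purely absolute bound to a mixed absolute/relative one; a small sketch of the difference:

    import numpy as np

    x = np.array([100.0, 0.001])
    y = np.array([100.4, 0.001])
    t = 5e-3
    # allclose passes when |x - y| <= atol + rtol * |y| elementwise:
    # here 0.4 <= 0.005 + 0.005 * 100.4, so large values get
    # proportionally more slack than small ones.
    print(np.allclose(x, y, atol=t, rtol=t))  # True
    print(np.any(np.abs(x - y) > t))          # True, i.e. the old test failed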
@@ -178,7 +178,12 @@ class TestOperators(hu.HypothesisTestCase):
         return st.sampled_from([np.float32, np.float64])

     _test_binary(
-        "Div", ref, filter_=non_zero, test_gradient=True, dtypes=div_dtypes
+        "Div", ref, filter_=non_zero, test_gradient=True,
+        dtypes=div_dtypes, gcs=hu.gcs_cpu_only
+    )(self)
+    _test_binary(
+        "Div", ref, filter_=non_zero, test_gradient=False,
+        dtypes=div_dtypes
     )(self)
     _test_binary_broadcast(
         "Div", ref, filter_=non_zero, dtypes=div_dtypes)(self)

@@ -269,8 +274,8 @@ class TestOperators(hu.HypothesisTestCase):
         for param, _ in enumerate(inputs):
             self.assertGradientChecks(gc, op, inputs, param, [0])

-    @unittest.skipIf(True,
-                     "Recurrent only works on CUDA 7.5 and above")
+    @unittest.skipIf(not workspace.has_gpu_support,
+                     "Skipping test due to no gpu present.")
     @given(hidden_size=st.integers(min_value=1, max_value=3),
            num_layers=st.integers(min_value=1, max_value=3),
            bidirectional=st.booleans(),

@@ -371,10 +376,10 @@ class TestOperators(hu.HypothesisTestCase):
            pad_l=st.integers(0, 3),
            pad_b=st.integers(0, 3),
            pad_r=st.integers(0, 3),
-           kernel=st.integers(1, 5),
-           size=st.integers(7, 10),
-           input_channels=st.integers(1, 8),
-           output_channels=st.integers(1, 8),
+           kernel=st.integers(3, 5),
+           size=st.integers(8, 8),
+           input_channels=st.integers(1, 3),
+           output_channels=st.integers(1, 3),
            batch_size=st.integers(1, 3),
            order=st.sampled_from(["NCHW", "NHWC"]),
            engine=st.sampled_from([""]),

@@ -661,7 +666,8 @@ class TestOperators(hu.HypothesisTestCase):
            output_channels=st.integers(1, 8),
            batch_size=st.integers(1, 3),
            order=st.sampled_from(["NCHW", "NHWC"]),
-           engine=st.sampled_from([""]), **hu.gcs)
+           engine=st.sampled_from(["", "CUDNN"]), **hu.gcs)
+    @settings(max_examples=2, timeout=100)
     def test_convolution_transpose_gradients(self, stride, pad, kernel,
                                              size, input_channels,
                                              output_channels, batch_size,

@@ -698,7 +704,7 @@ class TestOperators(hu.HypothesisTestCase):
            input_channels=st.integers(1, 8),
            output_channels=st.integers(1, 8),
            batch_size=st.integers(1, 3),
-           engine=st.sampled_from([""]), **hu.gcs)
+           engine=st.sampled_from(["", "CUDNN"]), **hu.gcs)
     def test_convolution_transpose_layout(self, stride, pad, kernel,
                                           size, input_channels,
                                           output_channels, batch_size,

@@ -1049,7 +1055,7 @@ class TestOperators(hu.HypothesisTestCase):
     @given(target_probabilities=hu.arrays(
         dims=[10], elements=st.floats(allow_nan=False,
                                       allow_infinity=False,
-                                      min_value=0,
+                                      min_value=0.01,
                                       max_value=1)),
            **hu.gcs)
     def test_perplexity(self, target_probabilities, gc, dc):

@@ -1477,9 +1483,13 @@ class TestOperators(hu.HypothesisTestCase):
         op = core.CreateOperator("HasElements", ["data"], ["has_elements"])
         self.assertReferenceChecks(gc, op, [data], lambda x: (len(x) > 0, ))

+        op = core.CreateOperator("IsEmpty", ["data"], ["is_empty"])
+        self.assertReferenceChecks(gc, op, [data], lambda x: (len(x) == 0, ))
+
     @given(initial_iters=st.integers(0, 100),
            max_iters=st.integers(0, 100))
-    def test_criteria_net_with_execution_step(self, initial_iters, max_iters):
+    def test_should_stop_as_criteria_net_execution_step(
+            self, initial_iters, max_iters):
         net = core.Net("net")
         net.Iter(["iter"], ["iter"])
         workspace.FeedBlob(

@@ -1487,16 +1497,87 @@ class TestOperators(hu.HypothesisTestCase):
         workspace.FeedBlob(
             "num_iters", np.asarray([max_iters]).astype(np.int32))
         criteria_net = core.Net("criteria")
-        criteria_net.LT(["iter", "num_iters"], ["continue"])
-        criteria_net.Proto().external_output.extend(["continue"])
+        criteria_net.GE(["iter", "num_iters"], ["stop"])
+        criteria_net.Proto().external_output.extend(["stop"])

         plan = core.Plan('plan')
-        plan.AddStep(core.execution_step('step', net, criteria=criteria_net))
+        plan.AddStep(core.execution_step(
+            'step', [criteria_net, net],
+            should_stop_blob=core.BlobReference("stop")))
         workspace.RunPlan(plan)
         iters = workspace.FetchBlob("iter")
         self.assertEqual(iters.dtype, np.int32)
         self.assertEqual(iters[0], max(initial_iters, max_iters))

+    def test_disabled_execution_step(self):
+        def createNets(i, disabled):
+            should_stop = 'should_stop_{}'.format(i)
+            output = 'output_{}'.format(i)
+
+            # init content and stop signal
+            init = core.Net("init_{}".format(i))
+            init.ConstantFill(
+                [],
+                [output],
+                shape=[1],
+                value=0.0
+            )
+            init.Cast([output], [should_stop], to='bool')
+
+            # decide if disabled or not
+            criterion = core.Net("criterion_{}".format(i))
+            tmp = criterion.ConstantFill(
+                [],
+                shape=[1],
+                value=1.0 if disabled else 0.0
+            )
+            criterion.Cast([tmp], [should_stop], to='bool')
+            criterion.Proto().external_output.extend([should_stop])
+
+            # the body net is just to turn a 0 blob to 1
+            net = core.Net("net_{}".format(i))
+            net.ConstantFill(
+                [],
+                [output],
+                shape=[1],
+                value=1.0
+            )
+
+            # always end the loop
+            ender = core.Net("ender_{}".format(i))
+            tmp = ender.ConstantFill(
+                [],
+                shape=[1],
+                value=1.0
+            )
+            ender.Cast([tmp], [should_stop], to='bool')
+            ender.Proto().external_output.extend([should_stop])
+
+            return [init, criterion, net, ender]
+
+        nets = [createNets(1, False),
+                createNets(2, True),
+                createNets(3, False)]
+        steps = [
+            core.execution_step(
+                'step_1', nets[0],
+                should_stop_blob=core.BlobReference('should_stop_1')),
+            core.execution_step(
+                'step_2', nets[1],
+                should_stop_blob=core.BlobReference('should_stop_2')),
+            core.execution_step('step_3', nets[2])
+        ]
+        expected = [1.0, 0.0, 1.0]
+
+        plan = core.Plan('plan')
+        plan.AddStep(core.execution_step('all_steps', steps, num_iter=3))
+        workspace.RunPlan(plan)
+
+        for i, net in enumerate(nets):
+            self.assertEqual(
+                workspace.FetchBlob('output_{}'.format(i + 1))[0],
+                expected[i])
+
     @given(initial_iters=st.integers(0, 100),
            num_iters=st.integers(0, 100))
     def test_iter_count_with_execution_step(self, initial_iters, num_iters):
@@ -1523,6 +1604,13 @@ class TestOperators(hu.HypothesisTestCase):
     def test_cast(self, a, src, dst, use_name, gc, dc):
         a = a.astype(src)

+        # Casting from a float type outside the range of the integral
+        # type is UB.
+        ftypes = [np.float32, np.float64]
+        if src in ftypes and dst not in ftypes and dst is not np.bool:
+            info = np.iinfo(dst)
+            a = np.clip(a, info.min, info.max)
+
         def ref(data):
             return [data.astype(dst)]
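A quick illustration of the undefined behavior that the clipping above avoids (the exact result is platform dependent):

    import numpy as np

    # 1e20 is far outside int32 range; the float-to-int conversion the Cast
    # operator performs in C++ is undefined for such values. numpy exposes
    # the same hazard, commonly wrapping to INT_MIN on x86.
    print(np.float32(1e20).astype(np.int32))  # e.g. -2147483648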
@@ -1571,7 +1659,8 @@ class TestOperators(hu.HypothesisTestCase):
         self.assertDeviceChecks(dc, op, [X], [0])
         self.assertGradientChecks(gc, op, [X], 0, [0])

-    @given(X=hu.tensor(), seed=st.integers(min_value=0, max_value=65536),
+    @given(X=_dtypes().flatmap(lambda dtype: hu.tensor(dtype=dtype)),
+           seed=st.integers(min_value=0, max_value=65536),
            null_axes=st.booleans(),
            **hu.gcs)
     def test_transpose(self, X, seed, null_axes, gc, dc):

@@ -1589,6 +1678,7 @@ class TestOperators(hu.HypothesisTestCase):

         self.assertReferenceChecks(gc, op, [X, axes],
                                    transpose_ref)
+        if X.dtype != np.int32 and X.dtype != np.int64:
             self.assertGradientChecks(gc, op, [X], 0, [0])

     @given(n=st.integers(1, 3),
@@ -263,7 +263,11 @@ class HypothesisTestCase(test_util.TestCase):
         outs = []
         for (n, ref) in zip(op.output, reference_outputs):
             output = workspace.FetchBlob(n)
-            np.testing.assert_allclose(output, ref, atol=1e-4, rtol=1e-4)
+            if output.dtype.kind in ('S', 'O'):
+                np.testing.assert_array_equal(output, ref)
+            else:
+                np.testing.assert_allclose(
+                    output, ref, atol=1e-4, rtol=1e-4)
             outs.append(output)
         if grad_reference and output_to_grad:
             self._assertGradReferenceChecks(
138
caffe2/python/io.py
Normal file
138
caffe2/python/io.py
Normal file
|
|
@ -0,0 +1,138 @@
|
||||||
|
"""
|
||||||
|
Defines the base interface for reading and writing operations.
|
||||||
|
|
||||||
|
Readers/Writers are objects that produce operations that read/write sequences
|
||||||
|
of data. Each operation reads or writes a list of BlobReferences.
|
||||||
|
|
||||||
|
Readers and Writers must be implemented such that read and write operations
|
||||||
|
are atomic and thread safe.
|
||||||
|
|
||||||
|
Examples of possible Readers and Writers:
|
||||||
|
HiveReader, HiveWriter,
|
||||||
|
QueueReader, QueueWriter,
|
||||||
|
DatasetReader, DatasetWriter,
|
||||||
|
        DBReader, DBWriter,

See `dataset.py` for an example of implementation.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core


class Reader(object):
    """
    Reader is an abstract class to be implemented in order to provide
    operations capable of iterating through a dataset or stream of data.

    A Reader must implement at least one operation, `read`, which
    adds operations to a net that read the next batch of data. Readers can
    optionally support the `reset` operation, which is useful when multiple
    passes over the data are required.
    """
    def read(self, read_net, batch_size=1, *args):
        """
        Add operations to `read_net` that will read the next batch of data
        and return a list of BlobReference representing the blobs that will
        contain the batches produced.

        Operations added to `read_net` must be thread safe and atomic, that
        is, it should be possible to clone `read_net` and run multiple
        instances of it in parallel.

        Args:
            read_net: the net that will be appended with read operations
            batch_size: number of entries to read

        Returns:
            A tuple (should_stop, fields), with:
                should_stop: BlobReference pointing to a boolean scalar
                             blob that indicates whether the read operation
                             was successful or whether the end of data has
                             been reached.
                fields: A tuple of BlobReference containing the latest batch
                        of data that was read.
        """
        raise NotImplementedError('Readers must implement `read`.')

    def reset(self, net):
        """Append operations to `net` that will reset the reader.

        This can be used to read the data multiple times.
        Not all readers support this operation.
        """
        raise NotImplementedError('This reader cannot be reset.')

    def execution_step(self, reader_net_name=None, batch_size=1):
        """Create an execution step with a net containing read operators.

        The execution step will contain a `stop_blob` that knows how to stop
        the execution loop when the end of data is reached.

        E.g.:

            read_step, fields = reader.execution_step()
            consume_net = core.Net('consume')
            consume_net.Print(fields[0], [])
            p = core.Plan('reader')
            p.AddStep(read_step.AddNet(consume_net))
            core.RunPlan(p)

        Args:
            reader_net_name: (optional) the name of the reader_net to be
                             created. The execution step will
                             be named accordingly.
            batch_size: the batch size

        Returns:
            A tuple (read_step, fields), with:
                read_step: A newly created execution step containing a net
                           with read operations. The step will have
                           `stop_blob` set, in order to stop the loop on
                           end of data.
                fields: A tuple of BlobReference containing the latest batch
                        of data that was read.
        """
        reader_net_name = reader_net_name or 'reader'
        reader_net = core.Net(reader_net_name)
        should_stop, fields = self.read(reader_net, batch_size=batch_size)
        read_step = core.execution_step(
            '{}_step'.format(reader_net_name),
            reader_net,
            should_stop_blob=should_stop)
        return (read_step, fields)


class Writer(object):
    """
    Writer is an abstract class to be implemented in order to provide
    operations capable of feeding a data stream or a dataset.

    A Writer must implement 2 operations:
    `write`, which adds operations to a net that write the next batch of
    data, and `commit`, which adds operations to a net in order to indicate
    that no more data will be written.
    """

    def write(self, writer_net, fields):
        """Add operations to `writer_net` that write the next batch of data.

        Operations added to the net must be thread-safe and unique, that is:
        multiple writers must be able to write to the dataset in parallel.

        Args:
            fields: a tuple of BlobReference containing the batch of data to
                    write.
        """
        raise NotImplementedError('Writers must implement write.')

    def commit(self, finish_net):
        """Add operations to `finish_net` that signal end of data.

        This must be implemented by all Writers, but may be a no-op for some
        of them.
        """
        raise NotImplementedError('Writers must implement commit.')
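To make the `read` contract concrete, here is a minimal sketch of a Reader
built on the CreateCounter/CountDown operators exercised later in this
commit; the constructor wiring and the net-method shorthand are assumptions
for illustration, not part of this module:

    class CounterReader(Reader):
        """Toy reader: emits a constant batch until a counter is exhausted."""

        def __init__(self, counter_blob):
            # counter_blob: output of a CreateCounter op run in some init
            # net (hypothetical setup, shown only to illustrate the contract)
            self._counter = counter_blob

        def read(self, read_net, batch_size=1):
            # CountDown's output becomes true once the count reaches zero,
            # matching the `should_stop` semantics documented above.
            should_stop = read_net.CountDown([self._counter])
            batch = read_net.ConstantFill([], shape=[batch_size], value=1.0)
            return should_stop, (batch,)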
@@ -116,7 +116,7 @@ class TestMiniAlexNet(test_util.TestCase):
         gpu_device = caffe2_pb2.DeviceOption()
         gpu_device.device_type = caffe2_pb2.CUDA
 
-        checker = device_checker.DeviceChecker(1e-2, [cpu_device, gpu_device])
+        checker = device_checker.DeviceChecker(0.05, [cpu_device, gpu_device])
         ret = checker.CheckNet(
             model.net.Proto(),
             inputs,
@@ -126,15 +126,16 @@ class TestMiniAlexNet(test_util.TestCase):
         )
         self.assertEqual(ret, True)
 
-    def testMiniAlexNet(self):
+    @unittest.skipIf(not workspace.has_gpu_support,
+                     "No GPU support. Skipping test.")
+    def testMiniAlexNetNCHW(self):
         self._testMiniAlexNet("NCHW")
+
+    @unittest.skipIf(not workspace.has_gpu_support,
+                     "No GPU support. Skipping test.")
+    def testMiniAlexNetNHWC(self):
         self._testMiniAlexNet("NHWC")
 
 
 if __name__ == '__main__':
-    if not workspace.has_gpu_support:
-        print('No GPU support. Skipping gpu test.')
-    elif workspace.NumCudaDevices() == 0:
-        print('No GPU device. Skipping gpu test.')
-    else:
-        unittest.main()
+    unittest.main()
@@ -3,6 +3,7 @@ from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
 import argparse
+import json
 from collections import defaultdict
 from caffe2.python import utils
 
@@ -53,6 +54,11 @@ def _rectify_operator_and_name(operators_or_net, name):
     return operators, name
 
 
+def _escape_label(name):
+    # json.dumps is poor man's escaping
+    return json.dumps(name)
+
+
 def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
     operators, name = _rectify_operator_and_name(operators_or_net, name)
     graph = pydot.Dot(name, rankdir=rankdir)
 
@@ -73,7 +79,7 @@ def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
         if input_name not in pydot_nodes:
             input_node = pydot.Node(
                 input_name + str(pydot_node_counts[input_name]),
-                label=input_name,
+                label=_escape_label(input_name),
                 **BLOB_STYLE
             )
             pydot_nodes[input_name] = input_node
 
@@ -87,7 +93,7 @@ def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
             pydot_node_counts[output_name] += 1
             output_node = pydot.Node(
                 output_name + str(pydot_node_counts[output_name]),
-                label=output_name,
+                label=_escape_label(output_name),
                 **BLOB_STYLE
             )
             pydot_nodes[output_name] = output_node
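The json.dumps escaping added here matters because pydot labels may contain
quotes or other characters with special meaning in dot syntax; for example:

    import json

    print(json.dumps('conv1/weights "shared"'))
    # "conv1/weights \"shared\""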
caffe2/python/operator_test/atomic_ops_test.py (new file)
@@ -0,0 +1,43 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase


class TestAtomicOps(TestCase):
    def test_atomic_ops(self):
        """
        Test that both countdown and checksum are updated atomically by
        having the countdown count from 20k to 0 across parallel workers,
        each feeding the fetched value into the checksum. If the operations
        are truly atomic, each value from 1 to 20k is fetched exactly once
        from the countdown and added exactly once to the checksum, so at
        the end the checksum must equal sum[i=1..20000](i).
        """
        init_net = core.Net('init')
        mutex_countdown = init_net.CreateMutex([])
        mutex_checksum = init_net.CreateMutex([])
        countdown = init_net.ConstantIntFill([], shape=[], value=20000.)
        checksum = init_net.ConstantIntFill([], shape=[], value=0.)
        minus_one = init_net.ConstantIntFill([], shape=[], value=-1.)
        steps = []
        for i in range(0, 100):
            net = core.Net('net:%d' % i)
            _, fetched_count = net.AtomicFetchAdd(
                [mutex_countdown, countdown, minus_one],
                [countdown, 'fetched_count:%d' % i])
            net.AtomicFetchAdd(
                [mutex_checksum, checksum, fetched_count],
                [checksum, 'not_used'])
            steps.append(
                core.execution_step('worker:%d' % i, net, num_iter=200))
        super_step = core.execution_step(
            'parent', steps, concurrent_substeps=True)
        plan = core.Plan('plan')
        plan.AddStep(core.execution_step('init', init_net))
        plan.AddStep(super_step)
        workspace.RunPlan(plan)
        # checksum = sum[i=1..20000](i) = 20000 * 20001 / 2 = 200010000
        self.assertEquals(workspace.FetchBlob(checksum), 200010000)
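The expected value in the final assertion is just the arithmetic series the
docstring describes; a quick check of that constant:

    expected = 20000 * (20000 + 1) // 2
    assert expected == 200010000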
caffe2/python/operator_test/counter_ops_test.py (new file)
@@ -0,0 +1,42 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase


class TestCounterOps(TestCase):
    def test_counter_ops(self):
        workspace.RunOperatorOnce(core.CreateOperator(
            'CreateCounter', [], ['c'], init_count=1))
        workspace.RunOperatorOnce(core.CreateOperator(
            'CountDown', ['c'], ['t1']))  # 1 -> 0
        assert not workspace.FetchBlob('t1')

        workspace.RunOperatorOnce(core.CreateOperator(
            'CountDown', ['c'], ['t2']))  # 0 -> 0
        assert workspace.FetchBlob('t2')

        workspace.RunOperatorOnce(core.CreateOperator(
            'ResetCounter', ['c'], [], init_count=1))  # -> 1
        workspace.RunOperatorOnce(core.CreateOperator(
            'CountDown', ['c'], ['t3']))  # 1 -> 0
        assert not workspace.FetchBlob('t3')

        workspace.RunOperatorOnce(core.CreateOperator(
            'ConstantBoolFill', [], ['t4'], value=0.0, shape=[]))
        assert workspace.FetchBlob('t4') == workspace.FetchBlob('t1')

        workspace.RunOperatorOnce(core.CreateOperator(
            'ConstantBoolFill', [], ['t5'], value=1.0, shape=[]))
        assert workspace.FetchBlob('t5') == workspace.FetchBlob('t2')

        assert workspace.RunOperatorOnce(core.CreateOperator(
            'And', ['t1', 't2'], ['t6']))
        assert not workspace.FetchBlob('t6')  # True && False

        assert workspace.RunOperatorOnce(core.CreateOperator(
            'And', ['t2', 't5'], ['t7']))
        assert workspace.FetchBlob('t7')  # True && True
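These counter ops compose with `should_stop_blob` in execution steps, which
is how bounded loops are expressed in a plan. A sketch under the semantics
demonstrated above (CountDown's output turning true once the count is
exhausted); the net-method shorthand mirrors the CreateMutex/AtomicFetchAdd
usage in atomic_ops_test.py:

    from caffe2.python import core

    init_net = core.Net('init')
    counter = init_net.CreateCounter([], init_count=5)

    loop_net = core.Net('loop')
    done = loop_net.CountDown([counter])  # flips to True after 5 iterations

    # The step re-runs loop_net until `done` becomes true.
    step = core.execution_step('bounded_loop', loop_net, should_stop_blob=done)
    plan = core.Plan('counter_demo')
    plan.AddStep(core.execution_step('init', init_net))
    plan.AddStep(step)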
caffe2/python/operator_test/cross_entropy_ops_test.py (new file)
@@ -0,0 +1,71 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def sigmoid_cross_entropy_with_logits(x, z):
    return np.maximum(x, 0) - x * z + np.log(1 + np.exp(-np.abs(x)))


def sigmoid_cross_entropy_with_logits_grad(x, z):
    return z - sigmoid(x)


class TestCrossEntropyOps(hu.HypothesisTestCase):
    @given(
        inputs=st.lists(
            elements=st.integers(min_value=1, max_value=5),
            min_size=1,
            max_size=2,
            average_size=2,
        ).flatmap(
            lambda shape: st.tuples(
                hu.arrays(
                    dims=shape,
                    elements=st.one_of(
                        st.floats(min_value=-1.0, max_value=-0.1),
                        st.floats(min_value=0.1, max_value=1.0),
                    )),
                hu.arrays(
                    dims=shape,
                    elements=st.sampled_from([0.0, 1.0]),
                ),
            )
        ),
    )
    def test_sigmoid_cross_entropy_with_logits(self, inputs):
        logits, targets = inputs

        def sigmoid_xentr_logit_ref(logits, targets):
            s = sigmoid_cross_entropy_with_logits(logits, targets)
            m = np.mean(s, axis=len(logits.shape) - 1)
            return (m, )

        def sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs):
            fwd_logits, fwd_targets = fwd_inputs
            inner_size = fwd_logits.shape[-1]
            m = fwd_targets - sigmoid(fwd_logits)
            g_in = -np.expand_dims(g_out, axis=-1) * m / inner_size
            return (g_in, None)

        op = core.CreateOperator(
            'SigmoidCrossEntropyWithLogits',
            ['logits', 'targets'],
            ['xentropy'])
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [logits, targets],
            sigmoid_xentr_logit_ref,
            output_to_grad='xentropy',
            grad_reference=sigmoid_xentr_logit_grad_ref)
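The reference function above is the standard numerically stable rewriting of
the sigmoid cross-entropy; in LaTeX, for a logit $x$ and target $z$:

    $$-z\log\sigma(x) - (1-z)\log\bigl(1-\sigma(x)\bigr)
        = \max(x, 0) - xz + \log\bigl(1 + e^{-|x|}\bigr)$$

which avoids overflowing $e^{x}$ for large $|x|$.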
caffe2/python/operator_test/dataset_ops_test.py (new file)
@@ -0,0 +1,284 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from caffe2.python import core, workspace, dataset
from caffe2.python.dataset import Const
from caffe2.python.schema import List, Struct, Scalar, Map
from caffe2.python.test_util import TestCase


def _assert_arrays_equal(actual, ref, err_msg):
    if ref.dtype.kind in ('S', 'O'):
        np.testing.assert_array_equal(actual, ref, err_msg=err_msg)
    else:
        np.testing.assert_allclose(
            actual, ref, atol=1e-4, rtol=1e-4, err_msg=err_msg)


class TestDatasetOps(TestCase):
    def test_dataset_ops(self):
        """
        1. Defining the schema of our dataset.

        This example schema could represent, for example, a search query log.
        """
        schema = Struct(
            # fixed size vector, which will be stored as a matrix when batched
            ('dense', Scalar((np.float32, 3))),
            # could represent a feature map from feature ID to float value
            ('floats', Map(
                Scalar(np.int32),
                Scalar(np.float32))),
            # could represent a multi-valued categorical feature map
            ('int_lists', Map(
                Scalar(np.int32),
                List(Scalar(np.int64)),
            )),
            # could represent a multi-valued, weighted categorical feature map
            ('id_score_pairs', Map(
                Scalar(np.int32),
                Map(
                    Scalar(np.int64),
                    Scalar(np.float32),
                    keys_name='ids',
                    values_name='scores'),
            )),
            # additional scalar information
            ('metadata', Struct(
                ('user_id', Scalar(np.int64)),
                ('user_embed', Scalar((np.float32, 2))),
                ('query', Scalar(str)),
            )),
        )

        """
        This is what the flattened fields for this schema look like, along
        with their types. Each one of these fields will be stored, read and
        written as a tensor.
        """
        expected_fields = [
            ('dense', (np.float32, 3)),
            ('floats:lengths', np.int32),
            ('floats:values:keys', np.int32),
            ('floats:values:values', np.float32),
            ('int_lists:lengths', np.int32),
            ('int_lists:values:keys', np.int32),
            ('int_lists:values:values:lengths', np.int32),
            ('int_lists:values:values:values', np.int64),
            ('id_score_pairs:lengths', np.int32),
            ('id_score_pairs:values:keys', np.int32),
            ('id_score_pairs:values:values:lengths', np.int32),
            ('id_score_pairs:values:values:values:ids', np.int64),
            ('id_score_pairs:values:values:values:scores', np.float32),
            ('metadata:user_id', np.int64),
            ('metadata:user_embed', (np.float32, 2)),
            ('metadata:query', str),
        ]
        zipped = zip(
            expected_fields,
            schema.field_names(),
            schema.field_types())
        for (ref_name, ref_type), name, dtype in zipped:
            self.assertEquals(ref_name, name)
            self.assertEquals(np.dtype(ref_type), dtype)

        """
        2. The contents of our dataset.

        Contents as defined below could represent, for example, a log of
        search queries along with dense, sparse features and metadata.
        The dataset below has 3 top-level entries.
        """
        contents_raw = [
            # dense
            [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
            # floats
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
            # int lists
            [2, 0, 2],  # len
            [11, 12, 31, 32],  # key
            [2, 4, 3, 1],  # value:len
            [111, 112, 121, 122, 123, 124, 311, 312, 313, 321],  # value:value
            # id score pairs
            [1, 2, 2],  # len
            [11, 21, 22, 31, 32],  # key
            [1, 1, 2, 2, 3],  # value:len
            [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
            [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
            # metadata
            [123, 234, 456],  # user_id
            [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
            ['dog posts', 'friends who like to', 'posts about ca'],  # query
        ]
        # convert the above content to ndarrays, checking against the schema
        contents = dataset.to_ndarray_list(contents_raw, schema)

        """
        3. Creating and appending to the dataset.
        We first create an empty dataset with the given schema.
        Then, a Writer is used to append these entries to the dataset.
        """
        ds = dataset.Dataset(schema)
        net = core.Net('init')
        ds.init_empty(net)

        blobs_to_append = [Const(net, c) for c in contents]
        writer = ds.writer(init_net=net)
        writer.write(net, blobs_to_append)
        workspace.RunNetOnce(net)

        """
        4. Iterating through the dataset contents.

        If we were to iterate through the top level entries of our dataset,
        this is what we should expect to see:
        """
        entries_raw = [
            (
                [[1.1, 1.2, 1.3]],  # dense
                [1], [11], [1.1],  # floats
                [2], [11, 12], [2, 4], [111, 112, 121, 122, 123, 124],  # intlst
                [1], [11], [1], [111], [11.1],  # id score pairs
                [123], [[0.2, 0.8]], ['dog posts'],  # metadata
            ),
            (
                [[2.1, 2.2, 2.3]],  # dense
                [2], [21, 22], [2.1, 2.2],  # floats
                [0], [], [], [],  # int list
                [2], [21, 22], [1, 2], [211, 221, 222], [21.1, 22.1, 22.2],
                [234], [[0.5, 0.5]], ['friends who like to'],  # metadata
            ),
            (
                [[3.1, 3.2, 3.3]],  # dense
                [3], [31, 32, 33], [3.1, 3.2, 3.3],  # floats
                [2], [31, 32], [3, 1], [311, 312, 313, 321],  # int lst
                [2], [31, 32], [2, 3], [311, 312, 321, 322, 323],
                [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
                [456], [[0.7, 0.3]], ['posts about ca'],  # metadata
            ),
            # after the end of the dataset, we will keep getting empty vectors
            ([],) * 16,
            ([],) * 16,
        ]
        entries = [dataset.to_ndarray_list(e, schema) for e in entries_raw]

        """
        Let's go ahead and create the reading nets.
        We will run the `read` net multiple times and assert that we are
        reading the entries the way we stated above.
        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')
        reader = ds.reader(read_init_net)
        should_continue, batch_blobs = reader.read(read_next_net)

        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net)
        read_next_net_name = str(read_next_net)

        for i, entry in enumerate(entries):
            workspace.RunNet(read_next_net_name)
            for name, blob, base in zip(ds.field_names(), batch_blobs, entry):
                data = workspace.FetchBlob(str(blob))
                _assert_arrays_equal(
                    data, base,
                    err_msg='Mismatch in entry %d, field %s' % (i, name))

        """
        5. Reading/writing in a single plan

        If all operations on the data are expressible as Caffe2 operators,
        we don't need to load the data into Python: we can iterate through
        the dataset in a single Plan.

        Here we will process the dataset a little and store it in a second
        dataset. We can reuse the same Reader since it supports reset.
        """
        reset_net = core.Net('reset_net')
        reader.reset(reset_net)
        read_step, fields = reader.execution_step()

        """ We will add the line number * 1000 to the feature ids. """
        process_net = core.Net('process')
        line_no = Const(process_net, 0, dtype=np.int32)
        const_one = Const(process_net, 1000, dtype=np.int32)
        process_net.Add([line_no, const_one], [line_no])
        fid = schema.floats.values.keys.id()
        process_net.Print(fields[fid], [])
        process_net.Add([fields[fid], line_no], fields[fid], broadcast=1)

        """ Let's create a second dataset and append to it. """
        ds2 = dataset.Dataset(schema, name='dataset2')
        ds2.init_empty(reset_net)
        writer = ds2.writer(reset_net)
        writer.write(process_net, fields)
        # commit is not necessary for DatasetWriter, but we add it for
        # generality of the example
        commit_net = core.Net('commit')
        writer.commit(commit_net)

        """ Time to create and run a plan which will do the processing """
        plan = core.Plan('process')
        plan.AddStep(core.execution_step('reset', reset_net))
        plan.AddStep(read_step.AddNet(process_net))
        plan.AddStep(core.execution_step('commit', commit_net))
        workspace.RunPlan(plan)

        """
        Now we should have dataset2 populated.
        """
        ds2blobs = ds2.get_blobs()
        for i, (name, blob) in enumerate(zip(schema.field_names(), ds2blobs)):
            data = workspace.FetchBlob(str(blob))
            content = contents[i]
            if i == fid:
                # one of our fields has been added with line numbers * 1000
                content += [1000, 2000, 2000, 3000, 3000, 3000]
            _assert_arrays_equal(
                data, contents[i], err_msg='Mismatch in field %s.' % name)

        """
        6. Slicing a dataset

        You can create a new schema from pieces of another schema and reuse
        the same data.
        """
        subschema = Struct(('top_level', schema.int_lists.values))
        int_list_contents = contents[schema.int_lists.values.slice()]
        self.assertEquals(len(subschema.field_names()), len(int_list_contents))

        """
        7. Random access to a dataset
        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        workspace.FeedBlob('idx', idx)

        reader = ds.random_reader(read_init_net, 'idx')
        reader.computeoffset(read_init_net)

        should_continue, batch_blobs = reader.read(read_next_net)

        workspace.CreateNet(read_init_net)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net)
        read_next_net_name = str(read_next_net)

        for i in range(len(entries)):
            k = idx[i] if i in idx else i
            entry = entries[k]
            workspace.RunNet(read_next_net_name)
            for name, blob, base in zip(ds.field_names(), batch_blobs, entry):
                data = workspace.FetchBlob(str(blob))
                _assert_arrays_equal(
                    data, base,
                    err_msg='Mismatch in entry %d, field %s' % (i, name))
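The `id()` call in step 5 resolves a scalar's position in the flattened
field list, so the same index works for `field_names()`, `field_types()`,
and the blob tuples returned by readers. With the schema above:

    fid = schema.floats.values.keys.id()
    assert schema.field_names()[fid] == 'floats:values:keys'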
@@ -2,9 +2,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
-import numpy as np
 from caffe2.python import core, workspace
 from caffe2.python.test_util import TestCase
+import numpy as np
 
 
 class TestIndexOps(TestCase):
@@ -47,6 +47,13 @@ class TestIndexOps(TestCase):
         result2 = workspace.FetchBlob('result2')
         np.testing.assert_array_equal([0, 5, 1, 0, 0], result2)
 
+        workspace.RunOperatorOnce(core.CreateOperator(
+            'IndexSize',
+            ['index'],
+            ['index_size']))
+        size = workspace.FetchBlob('index_size')
+        self.assertEquals(size, 6)
+
         workspace.RunOperatorOnce(core.CreateOperator(
             'IndexStore',
             ['index'],
@@ -55,3 +62,21 @@ class TestIndexOps(TestCase):
         new_entries = np.array(['new_entry1', 'new_entry2'], dtype=str)
         np.testing.assert_array_equal(
             np.concatenate((entries, new_entries)), stored_actual)
+
+        workspace.RunOperatorOnce(core.CreateOperator(
+            'StringIndexCreate',
+            [],
+            ['index2']))
+
+        workspace.RunOperatorOnce(core.CreateOperator(
+            'IndexLoad',
+            ['index2', 'stored_entries'],
+            [],
+            skip_first_entry=1))
+
+        workspace.RunOperatorOnce(core.CreateOperator(
+            'IndexSize',
+            ['index2'],
+            ['index2_size']))
+        index2_size = workspace.FetchBlob('index2_size')
+        self.assertEquals(index2_size, 5)
caffe2/python/operator_test/one_hot_ops_test.py (new file)
@@ -0,0 +1,76 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np


def _one_hots():
    index_size = st.integers(min_value=1, max_value=5)
    lengths = st.lists(
        elements=st.integers(min_value=0, max_value=5))
    return st.tuples(index_size, lengths).flatmap(
        lambda x: st.tuples(
            st.just(x[0]),
            st.just(x[1]),
            st.lists(
                elements=st.integers(min_value=0, max_value=x[0] - 1),
                min_size=sum(x[1]),
                max_size=sum(x[1]))))


class TestOneHotOps(hu.HypothesisTestCase):
    @given(
        hot_indices=hu.tensor(
            min_dim=1, max_dim=1, dtype=np.int64,
            elements=st.integers(min_value=0, max_value=42)),
        end_padding=st.integers(min_value=0, max_value=2))
    def test_one_hot(self, hot_indices, end_padding):

        def one_hot_ref(hot_indices, size):
            out = np.zeros([len(hot_indices), size], dtype=float)
            for i, x in enumerate(hot_indices):
                out[i, x] = 1.
            return (out, )

        size = np.array(max(hot_indices) + end_padding + 1, dtype=np.int64)
        if size == 0:
            size = 1
        op = core.CreateOperator('OneHot', ['hot_indices', 'size'], ['output'])
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [hot_indices, size],
            one_hot_ref)

    @given(hot_indices=_one_hots())
    def test_segment_one_hot(self, hot_indices):
        index_size, lengths, indices = hot_indices

        index_size = np.array(index_size, dtype=np.int64)
        lengths = np.array(lengths, dtype=np.int32)
        indices = np.array(indices, dtype=np.int64)

        def segment_one_hot_ref(lengths, hot_indices, size):
            offset = 0
            out = np.zeros([len(lengths), size], dtype=float)
            for i, length in enumerate(lengths):
                for idx in hot_indices[offset:offset + length]:
                    out[i, idx] = 1.
                offset += length
            return (out, )

        op = core.CreateOperator(
            'SegmentOneHot',
            ['lengths', 'hot_indices', 'size'],
            ['output'])
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [lengths, indices, index_size],
            segment_one_hot_ref)
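For concreteness, the dense layout `one_hot_ref` produces, recomputed with
plain NumPy indexing:

    import numpy as np

    hot_indices = np.array([0, 2, 1], dtype=np.int64)
    size = 3
    out = np.zeros([len(hot_indices), size])
    out[np.arange(len(hot_indices)), hot_indices] = 1.
    # out == [[1., 0., 0.],
    #         [0., 0., 1.],
    #         [0., 1., 0.]]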
@@ -83,6 +83,10 @@ def logsumexp_grad(grad_out, outputs, inputs):
         axis=0) * np.exp(inputs[0])
 
 
+def logmeanexp(x):
+    return np.log(np.mean(np.exp(x), axis=0))
+
+
 def mean(x):
     return np.mean(x, axis=0)
 
@@ -94,6 +98,30 @@ def mean_grad(grad_out, outputs, inputs):
         axis=0)
 
 
+def max(x):
+    return np.amax(x, axis=0)
+
+
+def max_grad(grad_out, outputs, inputs):
+    flat_inputs = inputs[0].flatten()
+    flat_outputs = np.array(outputs[0]).flatten()
+    flat_grad_in = np.zeros(flat_inputs.shape)
+    flat_grad_out = np.array(grad_out).flatten()
+    blocks = inputs[0].shape[0]
+    block_size = flat_inputs.shape[0] // blocks
+
+    for i in range(block_size):
+        out_grad = flat_grad_out[i]
+        out = flat_outputs[i]
+        for j in range(blocks):
+            idx = j * block_size + i
+            if out == flat_inputs[idx]:
+                flat_grad_in[idx] = out_grad
+                break
+
+    return np.resize(flat_grad_in, inputs[0].shape)
+
+
 REFERENCES_ALL = [
     ('Sum', partial(np.sum, axis=0), sum_grad),
 ]
@@ -101,7 +129,10 @@ REFERENCES_ALL = [
 REFERENCES_SORTED = [
     ('RangeSum', partial(np.sum, axis=0), sum_grad),
     ('RangeLogSumExp', logsumexp, logsumexp_grad),
+    # gradient is the same as sum
+    ('RangeLogMeanExp', logmeanexp, logsumexp_grad),
     ('RangeMean', mean, mean_grad),
+    ('RangeMax', max, max_grad),
 ]
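RangeLogMeanExp can reuse `logsumexp_grad` because log-mean-exp differs from
log-sum-exp only by a constant, so their derivatives coincide:

    $$\log\frac{1}{n}\sum_{i=1}^{n} e^{x_i}
        = \log\sum_{i=1}^{n} e^{x_i} - \log n
    \quad\Longrightarrow\quad
    \frac{\partial}{\partial x_j}\,\mathrm{logmeanexp}(x)
        = \frac{e^{x_j}}{\sum_i e^{x_i}}$$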
@@ -185,3 +185,37 @@ class TestSequenceOps(hu.HypothesisTestCase):
             op,
             [padded_data, padded_lengths],
             partial(_gather_padding_ref, start_pad_width, end_pad_width))
+
+    @given(data=hu.tensor(min_dim=3, max_dim=3, dtype=np.float32,
+                          elements=st.floats(min_value=-np.inf,
+                                             max_value=np.inf),
+                          min_value=1, max_value=10),
+           **hu.gcs_cpu_only)
+    def test_reverse_packed_segs(self, data, gc, dc):
+        max_length = data.shape[0]
+        batch_size = data.shape[1]
+        lengths = np.random.randint(max_length + 1, size=batch_size)
+
+        op = core.CreateOperator(
+            "ReversePackedSegs",
+            ["data", "lengths"],
+            ["reversed_data"])
+
+        def op_ref(data, lengths):
+            rev_data = np.array(data, copy=True)
+            for i in range(batch_size):
+                seg_length = lengths[i]
+                for j in range(seg_length):
+                    rev_data[j][i] = data[seg_length - 1 - j][i]
+            return (rev_data,)
+
+        def op_grad_ref(grad_out, outputs, inputs):
+            return op_ref(grad_out, inputs[1]) + (None,)
+
+        self.assertReferenceChecks(
+            device_option=gc,
+            op=op,
+            inputs=[data, lengths],
+            reference=op_ref,
+            output_to_grad='reversed_data',
+            grad_reference=op_grad_ref)
caffe2/python/operator_test/string_ops_test.py (new file)
@@ -0,0 +1,106 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np


def _string_lists(alphabet=None):
    return st.lists(
        elements=st.text(alphabet=alphabet, average_size=3),
        min_size=0,
        max_size=3)


class TestStringOps(hu.HypothesisTestCase):
    @given(strings=_string_lists())
    def test_string_prefix(self, strings):
        length = 3
        # although we are utf-8 encoding below to avoid python exceptions,
        # StringPrefix op deals with byte-length prefixes, which may produce
        # an invalid utf-8 string. The goal here is just to avoid python
        # complaining about the unicode -> str conversion.
        strings = np.array(
            map(lambda a: a.encode('utf-8'), strings), dtype=np.object)

        def string_prefix_ref(strings):
            return (
                np.array(map(lambda a: a[:length], strings), dtype=object), )

        op = core.CreateOperator(
            'StringPrefix',
            ['strings'],
            ['stripped'],
            length=length)
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [strings],
            string_prefix_ref)

    @given(strings=_string_lists())
    def test_string_suffix(self, strings):
        length = 3
        strings = np.array(
            map(lambda a: a.encode('utf-8'), strings), dtype=np.object)

        def string_suffix_ref(strings):
            return (
                np.array(map(lambda a: a[-length:], strings), dtype=object), )

        op = core.CreateOperator(
            'StringSuffix',
            ['strings'],
            ['stripped'],
            length=length)
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [strings],
            string_suffix_ref)

    @given(strings=st.text(alphabet=['a', 'b'], average_size=3))
    def test_string_starts_with(self, strings):
        prefix = 'a'
        strings = np.array(
            map(lambda a: str(strings), strings), dtype=np.object)

        def string_starts_with_ref(strings):
            return (np.array(
                map(lambda a: a.startswith(prefix), strings), dtype=bool), )

        op = core.CreateOperator(
            'StringStartsWith',
            ['strings'],
            ['bools'],
            prefix=prefix)
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [strings],
            string_starts_with_ref)

    @given(strings=st.text(alphabet=['a', 'b'], average_size=3))
    def test_string_ends_with(self, strings):
        suffix = 'a'
        strings = np.array(
            map(lambda a: str(strings), strings), dtype=np.object)

        def string_ends_with_ref(strings):
            return (np.array(
                map(lambda a: a.endswith(suffix), strings), dtype=bool), )

        op = core.CreateOperator(
            'StringEndsWith',
            ['strings'],
            ['bools'],
            suffix=suffix)
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [strings],
            string_ends_with_ref)
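Note that these tests lean on Python 2's `map` returning a list; under
Python 3, `map` returns an iterator and `np.array(map(...))` would produce
a 0-d object array instead of a string vector. A Python 3 friendly
equivalent of the conversion would be:

    strings = np.array(
        [a.encode('utf-8') for a in strings], dtype=np.object)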
caffe2/python/schema.py (new file)
@@ -0,0 +1,348 @@
"""
Defines a minimal set of data types that allow representing datasets with
arbitrary nested structure, including objects of variable length, such as
maps and lists.

This defines a columnar storage format for such datasets on top of caffe2
tensors. In terms of capacity of representation, it can represent most of
the data types supported by the Parquet, ORC and DWRF file formats.

See comments in operator_test/dataset_ops_test.py for an example and
walkthrough on how to use schema to store and iterate through a structured
in-memory dataset.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from collections import OrderedDict
import logging
import numpy as np

logger = logging.getLogger(__name__)


def _join_field_name(prefix, suffix):
    if prefix and suffix:
        return '{}:{}'.format(prefix, suffix)
    elif prefix:
        return prefix
    elif suffix:
        return suffix
    else:
        return ''


class Field(object):
    """Represents an abstract field type in a dataset.
    """
    def __init__(self, children):
        """Derived classes must call this after their initialization."""
        self._parent = (None, 0)
        offset = 0
        self._field_offsets = []
        for child in children:
            self._field_offsets.append(offset)
            offset += len(child.field_names())
        self._field_offsets.append(offset)

    def field_names(self):
        """Return the children field names for this field."""
        raise NotImplementedError('Field is an abstract class.')

    def field_types(self):
        """Return the numpy.dtype for each of the children fields."""
        raise NotImplementedError('Field is an abstract class.')

    def clone(self):
        """Clone this Field along with its children."""
        raise NotImplementedError('Field is an abstract class.')

    def _set_parent(self, parent, relative_id):
        self._parent = (parent, relative_id)

    def slice(self):
        """
        Returns a slice representing the range of field ids that belong to
        this field. This slice can be used to index a list of fields.

        E.g.:

            >>> s = Struct(
            >>>     ('a', Scalar()),
            >>>     ('b', Struct(
            >>>         ('b1', Scalar()),
            >>>         ('b2', Scalar()),
            >>>     )),
            >>>     ('c', Scalar()),
            >>> )
            >>> field_data = ['da', 'db1', 'db2', 'dc']
            >>> field_data[s.b.slice()]
            ['db1', 'db2']
        """
        base_id = self._child_base_id()
        return slice(base_id, base_id + len(self.field_names()))

    def _child_base_id(self, child_index=None):
        """Get the base id of the given child"""
        p, i = self._parent
        pos = 0 if child_index is None else self._field_offsets[child_index]
        if p:
            pos += p._child_base_id(i)
        return pos

    def __eq__(self, other):
        """Equivalence of two schemas"""
        return ((self.field_names() == other.field_names()) and
                (self.field_types() == other.field_types()))


class List(Field):
    """Represents a variable-length list.

    Values of a list can also be complex fields such as Lists and Structs.
    In addition to the fields exposed by its `values` field, a List exposes
    an additional `lengths` field, which will contain the size of each list
    under the parent domain.
    """
    def __init__(self, values):
        assert isinstance(values, Field)
        self.lengths = Scalar(np.int32)
        self.values = values.clone()
        self.lengths._set_parent(self, 0)
        self.values._set_parent(self, 1)
        Field.__init__(self, [self.lengths, self.values])

    def field_names(self):
        value_fields = self.values.field_names()
        return (
            ['lengths'] +
            [_join_field_name('values', v) for v in value_fields])

    def field_types(self):
        return self.lengths.field_types() + self.values.field_types()

    def clone(self):
        return List(self.values)


class Struct(Field):
    """Represents a named list of fields sharing the same domain.
    """
    def __init__(self, *fields):
        for field in fields:
            assert len(field) == 2
            assert field[0], 'Field names cannot be empty'
            assert field[0] != 'lengths', (
                'Struct cannot contain a field named `lengths`.')
            assert isinstance(field[1], Field)
        fields = [(name, field.clone()) for name, field in fields]
        for id, (name, field) in enumerate(fields):
            field._set_parent(self, id)
        self.fields = OrderedDict(fields)
        Field.__init__(self, self.fields.values())

    def field_names(self):
        names = []
        for name, field in self.fields.items():
            names += [_join_field_name(name, f) for f in field.field_names()]
        return names

    def field_types(self):
        types = []
        for name, field in self.fields.items():
            types += field.field_types()
        return types

    def clone(self):
        return Struct(*self.fields.items())

    def __getattr__(self, item):
        return self.fields[item]


class Scalar(Field):
    """Represents a typed scalar or tensor of fixed shape.

    A Scalar is a leaf in a schema tree, translating to exactly one tensor in
    the dataset's underlying storage.

    Usually, the tensor storing the actual values of this field is a 1D
    tensor, representing a series of values in its domain. It is possible
    however to have higher rank values stored as a Scalar, as long as all
    entries have the same shape.

    E.g.:

        Scalar(np.float64)

            Scalar field of type float64. Caffe2 will expect readers and
            datasets to expose it as a 1D tensor of doubles (vector), where
            the size of the vector is determined by this field's domain.

        Scalar((np.int32, 5))

            Tensor field of type int32. Caffe2 will expect readers and
            datasets to implement it as a 2D tensor (matrix) of shape (L, 5),
            where L is determined by this field's domain.

        Scalar((str, (10, 20)))

            Tensor field of type str. Caffe2 will expect readers and
            datasets to implement it as a 3D tensor of shape (L, 10, 20),
            where L is determined by this field's domain.

    If the field type is unknown at construction time, call Scalar(), which
    will default to np.void as its dtype.

    It is an error to pass a structured dtype to Scalar, since it would
    contain more than one field. Instead, use from_dtype, which will
    construct a nested `Struct` field reflecting the given dtype's structure.
    """
    def __init__(self, dtype=None):
        self._original_dtype = dtype
        self.dtype = np.dtype(dtype or np.void)
        assert not self.dtype.fields, (
            'Cannot create Scalar with a structured dtype. ' +
            'Use from_dtype instead.')
        Field.__init__(self, [])

    def field_names(self):
        return ['']

    def field_types(self):
        return [self.dtype]

    def clone(self):
        return Scalar(self._original_dtype)

    def id(self):
        """
        Return the zero-indexed position of this scalar field in its schema.
        Used in order to index into the field_blob list returned by readers
        or accepted by writers.
        """
        return self._child_base_id()


def Map(keys, values, keys_name='keys', values_name='values'):
    """A map is a List of Struct containing keys and values fields.
    Optionally, you can provide custom names for the keys and values fields.
    """
    return List(Struct((keys_name, keys), (values_name, values)))


def from_dtype(dtype, _outer_shape=()):
    """Constructs a Caffe2 schema from the given numpy's dtype.

    Numpy supports scalar, array-like and structured datatypes, as long as
    all the shapes are fixed. This function breaks down the given dtype into
    a Caffe2 schema containing `Struct` and `Scalar` types.

    Fields containing byte offsets are not currently supported.
    """
    if not isinstance(dtype, np.dtype):
        # wrap into an np.dtype
        shape = _outer_shape
        dtype = np.dtype((dtype, _outer_shape))
    else:
        # concatenate shapes if necessary
        shape = _outer_shape + dtype.shape
        if shape != dtype.shape:
            dtype = np.dtype((dtype.base, shape))

    if not dtype.fields:
        return Scalar(dtype)

    struct_fields = []
    for name, (fdtype, offset) in dtype.fields.items():
        assert offset == 0, ('Fields with byte offsets are not supported.')
        struct_fields.append((name, from_dtype(fdtype, _outer_shape=shape)))
    return Struct(*struct_fields)


class _SchemaNode(object):
    """This is a private class used to represent a Schema Node"""
    def __init__(self, name, type_str=''):
        self.name = name
        self.children = []
        self.type_str = type_str
        self.field = None

    def add_child(self, name, type_str=''):
        for child in self.children:
            if child.name == name and child.type_str == type_str:
                return child
        child = _SchemaNode(name, type_str)
        self.children.append(child)
        return child

    def get_field(self):

        list_names = ['lengths', 'values']
        map_names = ['lengths', 'keys', 'values']

        if len(self.children) == 0 or self.field is not None:
            assert self.field is not None
            return self.field

        child_names = []
        for child in self.children:
            child_names.append(child.name)

        if (set(child_names) == set(list_names)):
            for child in self.children:
                if child.name == 'values':
                    self.field = List(child.get_field())
                    self.type_str = "List"
                    return self.field

        elif (set(child_names) == set(map_names)):
            for child in self.children:
                if child.name == 'keys':
                    key_field = child.get_field()
                elif child.name == 'values':
                    values_field = child.get_field()
            self.field = Map(key_field, values_field)
            self.type_str = "Map"
            return self.field

        else:
            struct_fields = []
            for child in self.children:
                if child.field is not None:
                    struct_fields.append((child.name, child.field))
                else:
                    struct_fields.append((child.name, child.get_field()))

            self.field = Struct(*struct_fields)
            self.type_str = "Struct"
            return self.field

    def print_recursively(self):
        for child in self.children:
            child.print_recursively()
        logger.info("Printing node: Name and type")
        logger.info(self.name)
        logger.info(self.type_str)


def from_column_list(column_names, column_types):

    root = _SchemaNode('root', 'Struct')
    for column_name, column_type in zip(column_names, column_types):
        columns = column_name.split(':')
        current = root
        for i in range(len(columns)):
            name = columns[i]
            type_str = ''
            field = None
            if i == len(columns) - 1:
                type_str = column_type
                field = Scalar(column_type)
            next = current.add_child(name, type_str)
            if field is not None:
                next.field = field
            current = next

    return root.get_field()
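A small sketch of how `from_dtype` mirrors a structured NumPy dtype into the
types defined above (the field names and values are chosen purely for
illustration):

    import numpy as np

    dt = np.dtype([('x', np.float32), ('y', (np.int64, 2))])
    s = from_dtype(dt)
    # Equivalent to Struct(('x', Scalar(np.float32)),
    #                      ('y', Scalar((np.int64, (2,)))))
    assert s.field_names() == ['x', 'y']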
caffe2/python/sparse_to_dense_mask_test.py (new file)
@@ -0,0 +1,82 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase

import numpy as np


class TestSparseToDenseMask(TestCase):

    def test_sparse_to_dense_mask_float(self):
        op = core.CreateOperator(
            'SparseToDenseMask',
            ['indices', 'values', 'default', 'lengths'],
            ['output'],
            mask=[999999999, 2, 6])
        workspace.FeedBlob(
            'indices',
            np.array([2, 4, 6, 1, 2, 999999999, 2], dtype=np.int32))
        workspace.FeedBlob(
            'values',
            np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.float))
        workspace.FeedBlob('default', np.array(-1, dtype=np.float))
        workspace.FeedBlob('lengths', np.array([3, 4], dtype=np.int32))
        workspace.RunOperatorOnce(op)
        output = workspace.FetchBlob('output')
        expected = np.array([[-1, 1, 3], [6, 7, -1]], dtype=np.float)
        self.assertEqual(output.shape, expected.shape)
        self.assertFalse(np.any(output - expected))

    def test_sparse_to_dense_mask_string(self):
        op = core.CreateOperator(
            'SparseToDenseMask',
            ['indices', 'values', 'default', 'lengths'],
            ['output'],
            mask=[999999999, 2, 6])
        workspace.FeedBlob(
            'indices',
            np.array([2, 4, 6, 1, 2, 999999999, 2], dtype=np.int32))
        workspace.FeedBlob(
            'values',
            np.array(['1', '2', '3', '4', '5', '6', '7'], dtype=np.str))
        workspace.FeedBlob('default', np.array('-1', dtype=np.str))
        workspace.FeedBlob('lengths', np.array([3, 4], dtype=np.int32))
        workspace.RunOperatorOnce(op)
        output = workspace.FetchBlob('output')
        expected = np.array([['-1', '1', '3'], ['6', '7', '-1']], dtype=np.str)
        self.assertEqual(output.shape, expected.shape)
        self.assertTrue(np.all(np.equal(output, expected)))

    def test_sparse_to_dense_mask_empty_lengths(self):
        op = core.CreateOperator(
            'SparseToDenseMask',
            ['indices', 'values', 'default', 'lengths'],
            ['output'],
            mask=[1, 2, 6])
        workspace.FeedBlob('indices', np.array([2, 4, 6], dtype=np.int32))
        workspace.FeedBlob('values', np.array([1, 2, 3], dtype=np.float))
        workspace.FeedBlob('default', np.array(-1, dtype=np.float))
        workspace.FeedBlob('lengths', np.array([], dtype=np.int32))
        workspace.RunOperatorOnce(op)
        output = workspace.FetchBlob('output')
        expected = np.array([-1, 1, 3], dtype=np.float)
        self.assertEqual(output.shape, expected.shape)
        self.assertFalse(np.any(output - expected))

    def test_sparse_to_dense_mask_no_lengths(self):
        op = core.CreateOperator(
            'SparseToDenseMask',
            ['indices', 'values', 'default'],
            ['output'],
            mask=[1, 2, 6])
        workspace.FeedBlob('indices', np.array([2, 4, 6], dtype=np.int32))
        workspace.FeedBlob('values', np.array([1, 2, 3], dtype=np.float))
        workspace.FeedBlob('default', np.array(-1, dtype=np.float))
        workspace.RunOperatorOnce(op)
        output = workspace.FetchBlob('output')
        expected = np.array([-1, 1, 3], dtype=np.float)
        self.assertEqual(output.shape, expected.shape)
        self.assertFalse(np.any(output - expected))
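To see how the first expected output comes about: each row gathers, for
every position in `mask`, the value whose sparse index matches, falling back
to `default`. Restated with plain NumPy and a dict (an illustration, not the
operator's implementation):

    import numpy as np

    mask = [999999999, 2, 6]
    # row 0 consumes the first lengths[0] == 3 (index, value) pairs
    lookup = dict(zip([2, 4, 6], [1, 2, 3]))
    row0 = np.array([lookup.get(m, -1) for m in mask], dtype=np.float64)
    assert (row0 == np.array([-1., 1., 3.])).all()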
@ -6,10 +6,18 @@ import shutil
|
||||||
import socket
|
import socket
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from caffe2.proto import caffe2_pb2
|
from caffe2.proto import caffe2_pb2
|
||||||
from caffe2.python import scope, utils
|
from caffe2.python import scope, utils
|
||||||
from ._import_c_extension import * # noqa
|
from ._import_c_extension import * # noqa
|
||||||
|
|
||||||
|
# Python 2 and 3 compatibility: test if basestring exists
|
||||||
|
try:
|
||||||
|
basestring # NOQA
|
||||||
|
except NameError:
|
||||||
|
# This is python3 so we define basestring.
|
||||||
|
basestring = str
|
||||||
|
|
||||||
|
|
||||||
def _GetFreeFlaskPort():
|
def _GetFreeFlaskPort():
|
||||||
"""Get a free flask port."""
|
"""Get a free flask port."""
|
||||||
|
|
@ -86,7 +94,9 @@ def ResetWorkspace(root_folder=None):
|
||||||
return cc_ResetWorkspace(root_folder)
|
return cc_ResetWorkspace(root_folder)
|
||||||
|
|
||||||
|
|
||||||
def CreateNet(net, input_blobs=[]):
|
def CreateNet(net, input_blobs=None):
|
||||||
|
if input_blobs is None:
|
||||||
|
input_blobs = []
|
||||||
for input_blob in input_blobs:
|
for input_blob in input_blobs:
|
||||||
CreateBlob(input_blob)
|
CreateBlob(input_blob)
|
||||||
return cc_CreateNet(StringfyProto(net))
|
return cc_CreateNet(StringfyProto(net))
|
||||||
|
|
@ -112,6 +122,14 @@ def RunPlan(plan):
|
||||||
return cc_RunPlan(StringfyProto(plan))
|
return cc_RunPlan(StringfyProto(plan))
|
||||||
|
|
||||||
|
|
||||||
|
def _StringifyBlobName(name):
|
||||||
|
if isinstance(name, basestring):
|
||||||
|
return name
|
||||||
|
assert type(name).__name__ == 'BlobReference', \
|
||||||
|
"Expected a string or BlobReference"
|
||||||
|
return str(name)
|
||||||
|
|
||||||
|
|
||||||
def FeedBlob(name, arr, device_option=None):
|
def FeedBlob(name, arr, device_option=None):
|
||||||
"""Feeds a blob into the workspace.
|
"""Feeds a blob into the workspace.
|
||||||
|
|
||||||
|
|
@@ -125,6 +143,10 @@ def FeedBlob(name, arr, device_option=None):
     """
     if type(arr) is caffe2_pb2.TensorProto:
         arr = utils.Caffe2TensorToNumpyArray(arr)
+    if type(arr) is np.ndarray and arr.dtype.kind == 'S':
+        # Plain NumPy strings are weird, let's use objects instead
+        arr = arr.astype(np.object)
+    name = _StringifyBlobName(name)
     if device_option is not None:
         return cc_FeedBlob(name, arr, StringfyProto(device_option))
     elif scope.DEVICESCOPE is not None:
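The new dtype check works around fixed-width NumPy byte strings (dtype kind 'S'): every element is padded to the array's item size with NUL bytes, so a consumer of the raw buffer cannot tell padding from real data, and trailing NULs are silently dropped on element access. Object arrays keep each element as a real Python string with its exact length. A small standalone demonstration (the exact C++-side motivation is inferred):

    import numpy as np

    a = np.array([b'ab', b'wxyz'])   # dtype '|S4': items padded to 4 bytes
    print(a.tobytes())               # b'ab\x00\x00wxyz' -- padding looks like data
    print(np.array([b'ab\x00'])[0])  # b'ab' -- a trailing NUL is lost on access
    o = a.astype(object)             # object array (np.object in the diff is the old alias)
    print([len(x) for x in o])       # [2, 4] -- exact lengths preserved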
@@ -133,6 +155,40 @@ def FeedBlob(name, arr, device_option=None):
     return cc_FeedBlob(name, arr)


+def FetchBlob(name):
+    """Fetches a blob from the workspace.
+
+    Inputs:
+        name: the name of the blob - a string or a BlobReference
+    Returns:
+        Fetched blob (numpy array or string) if successful
+    """
+    name = _StringifyBlobName(name)
+    return cc_FetchBlob(name)
+
+
+class _BlobDict(object):
+    """Provides python dict compatible way to do fetching and feeding"""
+
+    def __getitem__(self, key):
+        return FetchBlob(key)
+
+    def __setitem__(self, key, value):
+        return FeedBlob(key, value)
+
+    def __len__(self):
+        return len(Blobs())
+
+    def __iter__(self):
+        return Blobs().__iter__()
+
+    def __contains__(self, item):
+        return HasBlob(item)
+
+
+blobs = _BlobDict()
+
+
 class Model(object):
     def __init__(self, net, parameters, inputs, outputs, device_option=None):
         """Initializes a model.
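The _BlobDict wrapper adds dict-style sugar over the workspace API; each operation delegates to the functions above (FetchBlob, FeedBlob, Blobs, HasBlob). A short usage sketch:

    from caffe2.python import workspace
    import numpy as np

    workspace.blobs['x'] = np.ones((2, 3), dtype=np.float32)  # FeedBlob under the hood
    x = workspace.blobs['x']                                  # FetchBlob under the hood
    print('x' in workspace.blobs)                             # membership via HasBlob
    print(len(workspace.blobs))                               # size via Blobs()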
@@ -8,7 +8,8 @@ from caffe2.python import core, test_util, workspace
 class TestWorkspace(unittest.TestCase):
     def setUp(self):
         self.net = core.Net("test-net")
-        self.net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0)
+        self.testblob_ref = self.net.ConstantFill(
+            [], "testblob", shape=[1, 2, 3, 4], value=1.0)
         workspace.ResetWorkspace()

     def testRootFolder(self):
@@ -64,6 +65,20 @@ class TestWorkspace(unittest.TestCase):
         self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
         np.testing.assert_array_equal(fetched_again, 2.0)

+    def testFetchFeedBlobViaBlobReference(self):
+        self.assertEqual(
+            workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
+        fetched = workspace.FetchBlob(self.testblob_ref)
+        # check if fetched is correct.
+        self.assertEqual(fetched.shape, (1, 2, 3, 4))
+        np.testing.assert_array_equal(fetched, 1.0)
+        fetched[:] = 2.0
+        self.assertEqual(workspace.FeedBlob(self.testblob_ref, fetched), True)
+        fetched_again = workspace.FetchBlob("testblob")  # fetch by name now
+        self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
+        np.testing.assert_array_equal(fetched_again, 2.0)
+
     def testFetchFeedBlobTypes(self):
         for dtype in [np.float16, np.float32, np.float64, np.bool,
                       np.int8, np.int16, np.int32, np.int64,
@@ -101,7 +116,8 @@ class TestWorkspace(unittest.TestCase):
         strs = np.array([
             ' '.join(10 * ['long string']),
             ' '.join(128 * ['very long string']),
-            'small string'])
+            'small \0\1\2 string',
+            "Hello, world! I have special \0 symbols \1!"])
         workspace.FeedBlob('my_str_tensor', strs)
         strs2 = workspace.FetchBlob('my_str_tensor')
         self.assertEqual(strs.shape, strs2.shape)
@@ -117,6 +133,32 @@ class TestWorkspace(unittest.TestCase):
         for i in range(0, strs.shape[0]):
             self.assertEqual(strs[i], strs2[i])

+    def testFetchFeedPlainString(self):
+        # this is actual string, not a tensor of strings
+        s = "Hello, world! I have special \0 symbols \1!"
+        workspace.FeedBlob('my_plain_string', s)
+        s2 = workspace.FetchBlob('my_plain_string')
+        self.assertEqual(s, s2)
+
+    def testFetchFeedViaBlobDict(self):
+        self.assertEqual(
+            workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
+        fetched = workspace.blobs["testblob"]
+        # check if fetched is correct.
+        self.assertEqual(fetched.shape, (1, 2, 3, 4))
+        np.testing.assert_array_equal(fetched, 1.0)
+        fetched[:] = 2.0
+        workspace.blobs["testblob"] = fetched
+        fetched_again = workspace.blobs["testblob"]
+        self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
+        np.testing.assert_array_equal(fetched_again, 2.0)
+
+        self.assertTrue("testblob" in workspace.blobs)
+        self.assertFalse("non_existant" in workspace.blobs)
+        self.assertEqual(len(workspace.blobs), 1)
+        for key in workspace.blobs:
+            self.assertEqual(key, "testblob")
+

 class TestMultiWorkspaces(unittest.TestCase):
     def setUp(self):
@@ -14,7 +14,8 @@ void adagrad_update(
     float epsilon,
     const float* lr,
     Context* context) {
-#pragma omp parallel for
+  // TODO(cxj): use OMP when it is reliable
+  // #pragma omp parallel for
   for (auto i = 0; i < N; ++i) {
     float gi = g[i];
     float hi = nh[i] = h[i] + gi * gi;
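For reference, the loop above is the standard dense AdaGrad rule; a NumPy sketch under that assumption (the final write of the new weights is inferred from the standard rule, since the hunk ends before it; the + sign follows Caffe2's convention of folding the descent direction into lr/grad):

    import numpy as np

    def adagrad_update(w, g, h, lr, epsilon):
        nh = h + g * g                             # accumulate squared gradients
        nw = w + lr * g / (np.sqrt(nh) + epsilon)  # per-coordinate scaled step
        return nw, nh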
@@ -78,8 +79,8 @@ class SparseAdagradOp final : public Operator<Context> {
     const auto* momentIn = Input(MOMENT_1).template data<T>();
     auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data<T>();
     auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data<T>();
-#pragma omp parallel for
+    // TODO(cxj): use OMP when it is reliable
+    // #pragma omp parallel for
     for (auto i = 0; i < n; ++i) {
       auto idx = indices[i];
       if (block_size == 1) {
@@ -41,7 +41,8 @@ void ftrl_update(
     T* new_nz,
    const FtrlParams<T>& params,
    Context* context) {
-#pragma omp parallel for
+  // TODO(cxj): use OMP when it is reliable
+  // #pragma omp parallel for
  for (auto i = 0; i < N; ++i) {
    ftrl_compute(
        w[i],
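The per-coordinate ftrl_compute call is not shown in this hunk; for orientation only, a sketch of the textbook FTRL-Proximal update (McMahan et al.), which may differ in detail from Caffe2's actual ftrl_compute:

    import math

    def ftrl_step(w, z, n, g, alpha, beta, lambda1, lambda2):
        # Textbook FTRL-Proximal, one coordinate; illustrative only.
        sigma = (math.sqrt(n + g * g) - math.sqrt(n)) / alpha
        z += g - sigma * w
        n += g * g
        if abs(z) <= lambda1:
            w = 0.0
        else:
            w = -(z - math.copysign(lambda1, z)) / ((beta + math.sqrt(n)) / alpha + lambda2)
        return w, z, n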
@@ -93,7 +94,9 @@ void SparseFtrlOp<T>::DoRun() {
   T* nz = n_z->template mutable_data<T>();
   const SIndex* idxs = indices.template data<SIndex>();
   const T* g = grad.template data<T>();
-#pragma omp parallel for
+
+  // TODO(cxj): use OMP when it is reliable
+  // #pragma omp parallel for
   for (TIndex i = 0; i < K; ++i) {
     SIndex idx = idxs[i];
     DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
@@ -11,9 +11,11 @@
 // platforms, it allows one to quickly port Caffe2 to different platforms
 // where BLAS may not be present.

-#include <random>
 #include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
 #include <atomic>
+#include <random>

 #ifdef CAFFE2_USE_MKL
 #include <mkl.h>
@@ -486,6 +488,7 @@ void Set<T, CPUContext>(const int N, const T alpha, T *Y, \
 CAFFE2_SPECIALIZED_SET(float);
 CAFFE2_SPECIALIZED_SET(double);
 CAFFE2_SPECIALIZED_SET(int);
+CAFFE2_SPECIALIZED_SET(bool);
 #undef CAFFE2_SPECIALIZED_SET

 #define CAFFE2_INSTANTIATE_BINARY_OP(name, op, T) \
caffe2/utils/string_utils.cc (new file, 14 lines)
@@ -0,0 +1,14 @@
+#include "caffe2/utils/string_utils.h"
+
+namespace caffe2 {
+
+std::vector<std::string> split(char separator, const std::string& string) {
+  std::vector<std::string> pieces;
+  std::stringstream ss(string);
+  std::string item;
+  while (getline(ss, item, separator)) {
+    pieces.push_back(std::move(item));
+  }
+  return pieces;
+}
+}
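One behavioral note on the getline-based split above: unlike Python's str.split, it produces no trailing empty piece when the input ends with the separator, and an empty input yields an empty vector. The equivalent behavior, sketched in Python for comparison:

    def cpp_like_split(separator, s):
        # Mirrors the std::getline loop: a trailing separator adds no empty piece.
        pieces = s.split(separator)
        if pieces and pieces[-1] == '':
            pieces.pop()
        return pieces

    print(cpp_like_split(',', 'a,b,'))  # ['a', 'b']  (str.split gives ['a', 'b', ''])
    print(cpp_like_split(',', ''))      # []          (str.split gives [''])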
caffe2/utils/string_utils.h (new file, 10 lines)
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace caffe2 {
+
+std::vector<std::string> split(char separator, const std::string& string);
+}