mirror of https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00

commit bcea409c82 ("sync"), parent f01f2063dd
@@ -11,12 +11,70 @@ CAFFE2_DEFINE_int(
    "Chunk size to split tensor data into");

namespace caffe2 {
namespace {
/**
 * @brief StringSerializer is the serializer for String.
 *
 * StringSerializer takes in a blob that contains a String, and serializes it
 * into a BlobProto protocol buffer.
 */
class StringSerializer : public BlobSerializerBase {
 public:
  StringSerializer() {}
  ~StringSerializer() {}
  /**
   * Serializes a Blob. Note that this blob has to contain std::string,
   * otherwise this function produces a fatal error.
   */
  void Serialize(
      const Blob& blob,
      const string& name,
      SerializationAcceptor acceptor) override {
    CHECK(blob.IsType<std::string>());

    BlobProto blob_proto;
    blob_proto.set_name(name);
    blob_proto.set_type("std::string");
    blob_proto.set_content(blob.template Get<std::string>());
    acceptor(name, blob_proto.SerializeAsString());
  }
};

/**
 * @brief StringDeserializer is the deserializer for Strings.
 *
 */
class StringDeserializer : public BlobDeserializerBase {
 public:
  bool Deserialize(const BlobProto& proto, Blob* blob) override {
    *blob->GetMutable<std::string>() = proto.content();
    return true;
  }
};
}

namespace {

// We can't use DeviceType_Name because of a protobuf-lite constraint.
std::string tensorDeviceTypeName(const DeviceType& d) {
  switch (d) {
    case CPU:
      return "TensorCPU";
    case CUDA:
      return "TensorCUDA";
    default:
      CAFFE_THROW("Unknown device: ", d);
      return "";
  }
};
}

// The blob serialization member function implementation.
void Blob::Serialize(
    const string& name,
    BlobSerializerBase::SerializationAcceptor acceptor) const {
  std::unique_ptr<BlobSerializerBase> serializer(CreateSerializer(meta_.id()));
  CAFFE_ENFORCE(serializer, "No known serializer for ", meta_.name());
  serializer->Serialize(*this, name, acceptor);
}
@@ -33,7 +91,6 @@ std::string Blob::Serialize(const string& name) const {
  return data.str();
}

// Specialization for StoreDeviceDetail for CPU - nothing needs to be done.
template <>
void TensorSerializer<CPUContext>::StoreDeviceDetail(
@@ -60,9 +117,8 @@ bool Blob::Deserialize(const BlobProto& blob_proto) {
  if (blob_proto.has_tensor()) {
    // This is a tensor object. Depending on the device type, we will
    // use the corresponding TensorDeserializer.
-    auto deserializer = CreateDeserializer(
-        "Tensor" +
-        DeviceType_Name(blob_proto.tensor().device_detail().device_type()));
+    auto deserializer = CreateDeserializer(tensorDeviceTypeName(
+        blob_proto.tensor().device_detail().device_type()));
    // Tensor's deserializer should always be registered, but we will double
    // check if it is not null anyway.
    return CHECK_NOTNULL(deserializer.get())->Deserialize(blob_proto, this);
@@ -82,5 +138,8 @@ REGISTER_BLOB_SERIALIZER(
    (TypeMeta::Id<TensorCPU>()),
    TensorSerializer<CPUContext>);
REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer<CPUContext>);
// Serialize std::string
REGISTER_BLOB_SERIALIZER((TypeMeta::Id<std::string>()), StringSerializer);
REGISTER_BLOB_DESERIALIZER(std::string, StringDeserializer);
} // namespace
} // namespace caffe2
@@ -69,6 +69,20 @@ TEST(BlobTest, BlobWrongType) {
  ASSERT_THROW(blob.Get<int>(), EnforceNotMet);
}

TEST(BlobTest, StringSerialization) {
  const std::string kTestString = "Hello world?";
  Blob blob;
  *blob.GetMutable<std::string>() = kTestString;

  string serialized = blob.Serialize("test");
  BlobProto proto;
  CHECK(proto.ParseFromString(serialized));
  EXPECT_EQ(proto.name(), "test");
  EXPECT_EQ(proto.type(), "std::string");
  EXPECT_FALSE(proto.has_tensor());
  EXPECT_EQ(proto.content(), kTestString);
}

TEST(TensorNonTypedTest, TensorChangeType) {
  vector<int> dims(3);
  dims[0] = 2;
@@ -5,8 +5,9 @@
#include <cstdlib>
#include <random>

-#include "caffe2/proto/caffe2.pb.h"
+#include "caffe2/core/logging.h"
#include "caffe2/core/typeid.h"
+#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/math.h"

namespace caffe2 {
@@ -103,6 +104,7 @@ class CPUContext final {
  // Two copy functions that deal with cross-device copies.
  template <class SrcContext, class DstContext>
  inline void CopyBytes(size_t nbytes, const void* src, void* dst);

  template <typename T, class SrcContext, class DstContext>
  inline void Copy(size_t n, const T* src, T* dst) {
    if (std::is_fundamental<T>::value) {
@@ -116,6 +118,16 @@ class CPUContext final {
    }
  }

  template <class SrcContext, class DstContext>
  inline void
  CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) {
    if (meta.copy()) {
      meta.copy()(src, dst, n);
    } else {
      CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
    }
  }
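  // Editorial sketch, not part of this commit: CopyItems is the type-aware
  // counterpart of CopyBytes. A hypothetical caller (assuming TypeMeta::Make
  // from typeid.h):
  //
  //   CPUContext ctx;
  //   std::string src[2] = {"a", "b"}, dst[2];
  //   ctx.CopyItems<CPUContext, CPUContext>(
  //       TypeMeta::Make<std::string>(), 2, src, dst);
  //
  // For std::string the registered meta.copy() function copies element-wise;
  // for POD types meta.copy() is null and the call falls back to CopyBytes.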

 protected:
  // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
  int random_seed_{1701};
@@ -7,6 +7,16 @@ thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;

namespace {
bool Caffe2UsePinnedCPUAllocator(int*, char***) {
#ifdef __SANITIZE_ADDRESS__
  // Note(jiayq): for more details, see
  //     https://github.com/google/sanitizers/issues/629
  LOG(WARNING) << "There are known issues between address sanitizer and "
                  "cudaMallocHost. As a result, caffe2 will not enable pinned "
                  "memory allocation in asan mode. If you are expecting any "
                  "behavior that depends on asan, be advised that it is not "
                  "turned on.";
  return true;
#else
  if (!HasCudaGPU()) {
    VLOG(1) << "No GPU present. I won't use pinned allocator then.";
    return true;
@@ -14,6 +24,7 @@ bool Caffe2UsePinnedCPUAllocator(int*, char***) {
  VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator.";
  SetCPUAllocator(new PinnedCPUAllocator());
  return true;
#endif
}

REGISTER_CAFFE2_INIT_FUNCTION(Caffe2UsePinnedCPUAllocator,
@@ -116,7 +116,9 @@ CAFFE_DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
 */
inline unique_ptr<DB> CreateDB(
    const string& db_type, const string& source, Mode mode) {
-  return Caffe2DBRegistry()->Create(db_type, source, mode);
+  auto result = Caffe2DBRegistry()->Create(db_type, source, mode);
+  VLOG(1) << ((!result) ? "not found db " : "found db ") << db_type;
+  return result;
}

/**
@@ -68,13 +68,13 @@ class InitRegisterer {

#define REGISTER_CAFFE2_INIT_FUNCTION(name, function, description)  \
  namespace {                                                       \
-  ::caffe2::InitRegisterer g_caffe2_initregisterer_name(           \
+  ::caffe2::InitRegisterer g_caffe2_initregisterer_##name(         \
      function, false, description);                                \
  } // namespace

#define REGISTER_CAFFE2_EARLY_INIT_FUNCTION(name, function, description) \
  namespace {                                                            \
-  ::caffe2::InitRegisterer g_caffe2_initregisterer_name(                 \
+  ::caffe2::InitRegisterer g_caffe2_initregisterer_##name(               \
      function, true, description);                                      \
  } // namespace
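// Editorial note, not part of this commit: the fix above matters because all
// anonymous-namespace blocks in one translation unit are the same namespace.
// Without the ## paste, every expansion declared the identical variable
// g_caffe2_initregisterer_name, so registering two init functions in one file
// failed to compile; g_caffe2_initregisterer_##name yields a distinct
// registerer object per registration.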
@@ -1,14 +1,70 @@
#include "caffe2/core/net.h"

#include "caffe2/core/context_gpu.h"
#include "caffe2/core/flags.h"

#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include "caffe2/proto/caffe2.pb.h"

#ifdef CAFFE2_USE_NVTX
#include <nvToolsExt.h>
#endif

CAFFE2_DEFINE_bool(caffe2_use_nvtx, false, "Use NVTX ranges for profiling");

namespace caffe2 {

namespace {

using Color = int32_t;
constexpr Color kRunColor = 0x0000CCFF; // blue
constexpr Color kRecordColor = 0x00FF3300; // red
constexpr Color kWaitColor = 0x0066FF33; // green

#ifdef CAFFE2_USE_NVTX

class ProfiledRange {
 public:
  ProfiledRange(const OperatorDef& def, Color color) {
    if (!FLAGS_caffe2_use_nvtx) {
      return;
    }
    nvtxEventAttributes_t eventAttrib = {0};
    eventAttrib.version = NVTX_VERSION;
    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
    eventAttrib.colorType = NVTX_COLOR_ARGB;
    eventAttrib.color = color;
    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
    eventAttrib.message.ascii = def.type().c_str();
    range_ = nvtxRangeStartEx(&eventAttrib);
    CHECK(range_);
  }

  ~ProfiledRange() {
    if (!FLAGS_caffe2_use_nvtx) {
      return;
    }
    nvtxRangeEnd(range_);
  }

 private:
  nvtxRangeId_t range_ = 0;
  DISABLE_COPY_AND_ASSIGN(ProfiledRange);
};

#else

class ProfiledRange {
 public:
  ProfiledRange(const OperatorDef& def, Color color) {}

 private:
  DISABLE_COPY_AND_ASSIGN(ProfiledRange);
};

#endif // ifdef CAFFE2_USE_NVTX
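// Editorial note, not part of this commit: ProfiledRange is an RAII guard.
// Constructing it opens an NVTX range colored per phase (kRunColor for
// operator runs, kRecordColor for event records, kWaitColor for stream
// waits) and the destructor closes it, so each operator shows up as a
// colored span on the profiler timeline; with CAFFE2_USE_NVTX off it
// compiles to a no-op.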

struct Stream;

struct Event {
@@ -69,6 +125,7 @@ struct Stream {

  int gpu_id_{-1};
  cudaStream_t stream_{nullptr};

 private:
  DISABLE_COPY_AND_ASSIGN(Stream);
};
@@ -128,18 +185,24 @@ class AsyncDAGNet : public DAGNetBase {
    }));

    for (auto source_parent_idx : operator_nodes_[source_idx].parents_) {
+      ProfiledRange r(
+          operator_nodes_[source_parent_idx].operator_->def(), kWaitColor);
      stream.wait(events_[source_parent_idx].get());
    }

    // We've waited on all our parent indices.
    bool success = true;
-    for (auto idx: chain) {
+    for (auto idx : chain) {
+      ProfiledRange r(operator_nodes_[idx].operator_->def(), kRunColor);
      success &= operator_nodes_[idx].operator_->RunAsync();
    }

    // Record an event for the sink of the chain.
    const auto& sink_idx = chain.back();
+    {
+      ProfiledRange r(operator_nodes_[sink_idx].operator_->def(), kRecordColor);
      events_[sink_idx]->record(stream);
+    }
    CHECK(!eventRecorded_[sink_idx]);
    eventRecorded_[sink_idx] = 1;
    return success;
@@ -157,9 +220,11 @@ class AsyncDAGNet : public DAGNetBase {
    Stream stream{device_option};

    // Potential optimization: we can pre-compute outstanding events.
-    for (auto& event : events_) {
+    for (auto i = 0; i < events_.size(); ++i) {
+      auto& event = events_[i];
      if (event->outstanding_) {
        VLOG(2) << "Synchronizing host on outstanding event";
+        ProfiledRange r(operator_nodes_[i].operator_->def(), kWaitColor);
        stream.wait(event.get());
      }
    }
@@ -2,17 +2,6 @@
#include "caffe2/core/flags.h"

CAFFE2_DEFINE_bool(
-    caffe2_keep_on_shrink, false,
+    caffe2_keep_on_shrink,
+    true,
    "If set, keeps memory when a tensor is shrinking its size.");
-
-namespace caffe2 {
-
-namespace detail {
-
-vector<TIndex>& shape(size_t n) {
-  static thread_local vector<TIndex> r;
-  r.resize(n);
-  return r;
-}
-}
-}
@@ -150,6 +150,40 @@ class Tensor {

  virtual ~Tensor() {}

  /**
   * @brief Extends the outer-most dimension of this tensor by num elements,
   * preserving the existing data.
   *
   * The underlying data may be reallocated in order to accommodate the new
   * elements, in which case this tensor's capacity is grown at a factor of
   * growthPct. This ensures that Extend runs in amortized O(1) time
   * complexity.
   */
  template <class ContextForCopy>
  void Extend(TIndex num, int growthPct, ContextForCopy* context) {
    CHECK_GE(dims_.size(), 1);
    auto oldSize = size_;
    auto newDims = dims_;
    newDims[0] += num;
    if (!data_) {
      Resize(newDims);
      return;
    }
    auto newSize = std::accumulate(
        newDims.begin(), newDims.end(), 1, std::multiplies<TIndex>());
    if (newSize * meta_.itemsize() > capacity_) {
      auto newCapacity = dims_;
      newCapacity[0] = std::max(newDims[0], dims_[0] * (growthPct + 100) / 100);
      auto oldData = std::move(data_);
      Resize(newCapacity);
      auto* newData = raw_mutable_data(meta_);
      context->template CopyItems<ContextForCopy, ContextForCopy>(
          meta_, oldSize, oldData.get(), newData);
    }
    dims_ = newDims;
    size_ = newSize;
  }
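  // Editorial note, not part of this commit: the amortized O(1) claim above
  // follows from the geometric over-allocation. With growthPct = 50, for
  // example, Extend(1, 50, &context) called n times reallocates only when
  // newSize exceeds capacity_, and each reallocation grows capacity_ by
  // 1.5x, so the total bytes copied across n appends form a geometric
  // series bounded by a constant times the final size.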

  /**
   * @brief Resizes a tensor.
   *
@@ -297,9 +331,12 @@ class Tensor {
    CHECK(data_.get() || size_ == 0)
        << "The tensor is uninitialized. You probably need to call "
        << "Resize() and mutable_data() first.";
-    CHECK(IsType<T>())
-        << "Tensor type mistmatch, caller expects elements to be "
-        << TypeMeta::Name<T>() << " while tensor contains " << meta_.name();
+    CAFFE_ENFORCE(
+        IsType<T>(),
+        "Tensor type mistmatch, caller expects elements to be ",
+        TypeMeta::Name<T>(),
+        " while tensor contains ",
+        meta_.name());
    return static_cast<T*>(data_.get());
  }
@@ -12,34 +12,39 @@
namespace caffe2 {

namespace {
-// Returns a function that returns `true` if we should continue
-// iterating, given the current iteration count.
-std::function<bool(int)> getContinuationTest(
-    Workspace* ws,
-    const ExecutionStep& step) {
-  if (step.has_criteria_network()) {
-    CHECK(!step.has_num_iter())
-        << "Must not specify num_iter if critera_network is set";
-  }
-
-  if (!step.has_criteria_network()) {
-    int iterations = step.has_num_iter() ? step.num_iter() : 1;
-    VLOG(1) << "Executing step for " << iterations << " iterations.";
-    return [=](int i) { return i < iterations; };
-  }
-  auto* criteria_network = ws->GetNet(step.criteria_network());
-  CHECK_NOTNULL(criteria_network);
-  CHECK_EQ(criteria_network->external_output().size(), 1);
-  const auto& criteria_output = criteria_network->external_output().front();
-  VLOG(1) << "Executing step controlled by criteria output: "
-          << criteria_output;
-  return [=](int) {
-    criteria_network->Run();
-    const auto& blob = ws->GetBlob(criteria_output)->Get<TensorCPU>();
-    CHECK_EQ(blob.size(), 1);
-    CHECK(blob.IsType<bool>());
-    return blob.template data<bool>()[0] > 0;
-  };
-};
+// try to get the should_stop signal, a scalar bool blob value.
+// if the blob doesn't exist or is not initiaized, return false
+const bool getShouldStop(const Blob* b) {
+  if (!b || !b->meta().id()) { // not exist or uninitialized
+    return false;
+  }
+
+  const auto& t = b->Get<TensorCPU>();
+  CAFFE_ENFORCE(t.IsType<bool>() && t.size() == 1, "expects a scalar boolean");
+  return *(t.template data<bool>());
+}
+
+// Returns a function that returns `true` if we should continue
+// iterating, given the current iteration count.
+std::function<bool(int64_t)> getContinuationTest(
+    Workspace* ws,
+    const ExecutionStep& step) {
+  if (step.has_should_stop_blob()) {
+    CAFFE_ENFORCE(
+        !step.has_num_iter(),
+        "Must not specify num_iter if should_stop_blob is set");
+  }
+
+  if (!step.has_should_stop_blob()) {
+    int64_t iterations = step.has_num_iter() ? step.num_iter() : 1;
+    VLOG(1) << "Will execute step " << step.name() << " for " << iterations
+            << " iterations.";
+    return [=](int64_t i) { return i < iterations; };
+  } else {
+    VLOG(1) << "Will execute step " << step.name() << " until stopped by blob "
+            << step.should_stop_blob();
+    return [](int64_t i) { return true; };
+  }
+}
} // namespace
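// Editorial summary, not part of this commit: the two continuation modes are
//   - num_iter set:         netShouldContinue(i) == (i < num_iter);
//   - should_stop_blob set: netShouldContinue(i) is always true, and the
//     executor instead polls the scalar bool blob via getShouldStop() after
//     each substep or network run (see CHECK_SHOULD_STOP below).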
@@ -229,10 +234,17 @@ struct Reporter {

  }

+#define CHECK_SHOULD_STOP(shouldStop)                    \
+  if (getShouldStop(shouldStop)) {                       \
+    VLOG(1) << "Execution stopped by should_stop_blob";  \
+    return true;                                         \
+  }
+
bool Workspace::ExecuteStepRecursive(
    const ExecutionStep& step,
    ShouldContinue externalShouldContinue) {
-  LOG(INFO) << "Running execution step " << step.name();
+  VLOG(1) << "Running execution step " << step.name();

  if (!(step.substep_size() == 0 || step.network_size() == 0)) {
    LOG(ERROR) << "An ExecutionStep should either have substep or networks "
               << "but not both.";
@@ -247,22 +259,40 @@ bool Workspace::ExecuteStepRecursive(
    if (net_map_.count(step.report_net()) == 0) {
      LOG(ERROR) << "Report net " << step.report_net() << " not found.";
    }
    VLOG(1) << "Starting reporter net";
    reporter.start(net_map_[step.report_net()].get(), step.report_interval());
  }

+  const Blob* shouldStop = nullptr;
+  if (step.has_should_stop_blob()) {
+    shouldStop = GetBlob(step.should_stop_blob());
+    CAFFE_ENFORCE(
+        shouldStop, "blob ", step.should_stop_blob(), " does not exist");
+  }
+
  const auto netShouldContinue = getContinuationTest(this, step);
-  const auto shouldContinue = [&](int iter) {
+  const auto shouldContinue = [&](int64_t iter) {
    return externalShouldContinue(iter) && netShouldContinue(iter);
  };
  if (step.substep_size()) {
-    for (int iter = 0; shouldContinue(iter); ++iter) {
-      // we assume that, if we have substeps, each substep is going to take a
-      // reasonable amount of time, so logging here is fine
-      LOG(INFO) << "Execution step " << step.name()
-                << ": Starting iteration " << iter;
+    for (int64_t iter = 0; shouldContinue(iter); ++iter) {
+      VLOG(1) << "Execution step " << step.name() << ": iteration " << iter;

      if (!step.concurrent_substeps() || step.substep().size() <= 1) {
+        auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
+          return externalShouldContinue(iter);
+        };
+
        for (auto& ss : step.substep()) {
          if (!ExecuteStepRecursive(ss, substepShouldContinue)) {
            return false;
          }
+          CHECK_SHOULD_STOP(shouldStop);
        }
      } else {
        std::atomic<int> next_substep{0};
        std::atomic<bool> got_failure{false};
-        auto substepShouldContinue = [&, externalShouldContinue](int iter) {
+        auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
          return !got_failure && externalShouldContinue(iter);
        };
        auto worker = [&]() {
@@ -271,26 +301,26 @@ bool Workspace::ExecuteStepRecursive(
            if (got_failure || (substep_id >= step.substep().size())) {
              break;
            }
-            if (!ExecuteStepRecursive(step.substep().Get(substep_id),
-                                      substepShouldContinue)) {
+            if (!ExecuteStepRecursive(
+                    step.substep().Get(substep_id), substepShouldContinue)) {
              got_failure = true;
            }
          }
        };
        if (!step.concurrent_substeps() || step.substep().size() <= 1) {
          worker();
        } else {

          std::vector<std::thread> threads;
-          for (int i = 0; i < step.substep().size(); ++i) {
+          for (int64_t i = 0; i < step.substep().size(); ++i) {
            threads.emplace_back(worker);
          }
          for (auto& thread: threads) {
            thread.join();
          }
        }
        if (got_failure) {
          return false;
        }
+        // concurrent substeps should be careful about setting should_stop_blob
+        CHECK_SHOULD_STOP(shouldStop);
      }
    }
    return true;
  } else {
@@ -305,16 +335,19 @@ bool Workspace::ExecuteStepRecursive(
      VLOG(1) << "Going to execute network " << network_name;
      networks.push_back(net_map_[network_name].get());
    }
-    for (int iter = 0; shouldContinue(iter); ++iter) {
+    for (int64_t iter = 0; shouldContinue(iter); ++iter) {
      VLOG(1) << "Executing network iteration " << iter;
      for (NetBase* network : networks) {
        if (!network->Run()) {
          return false;
        }
+        CHECK_SHOULD_STOP(shouldStop);
      }
    }
  }
  return true;
}

+#undef CHECK_SHOULD_STOP
+
} // namespace caffe2

caffe2/operators/atomic_ops.cc (new file, 73 lines)

@@ -0,0 +1,73 @@
#include <mutex>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"

namespace caffe2 {
namespace fb {
namespace {

class CreateMutexOp final : public Operator<CPUContext> {
 public:
  CreateMutexOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws) {}

  bool RunOnDevice() override {
    *OperatorBase::Output<std::unique_ptr<std::mutex>>(0) =
        std::unique_ptr<std::mutex>(new std::mutex);
    return true;
  }
};

class AtomicFetchAddOp final : public Operator<CPUContext> {
 public:
  AtomicFetchAddOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& mutex = OperatorBase::Input<std::unique_ptr<std::mutex>>(0);
    auto& a = Input(1);
    auto& b = Input(2);
    auto* c = Output(0);
    auto* d = Output(1);
    c->Resize(std::vector<TIndex>());
    d->Resize(std::vector<TIndex>());
    auto* aPtr = a.data<int32_t>();
    auto* bPtr = b.data<int32_t>();
    auto* cPtr = c->mutable_data<int32_t>();
    auto* dPtr = d->mutable_data<int32_t>();
    std::lock_guard<std::mutex> lg(*mutex);
    *dPtr = *aPtr;
    *cPtr = *aPtr + *bPtr;
    return true;
  }
};

REGISTER_CPU_OPERATOR(CreateMutex, CreateMutexOp);
REGISTER_CPU_OPERATOR(AtomicFetchAdd, AtomicFetchAddOp);

OPERATOR_SCHEMA(CreateMutex)
    .NumInputs(0)
    .NumOutputs(1)
    .SetDoc("Creates an unlocked mutex and returns it in a unique_ptr blob.")
    .Output(0, "mutex_ptr", "Blob containing a std::unique_ptr<mutex>.");

OPERATOR_SCHEMA(AtomicFetchAdd)
    .NumInputs(3)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Given a mutex and two int32 scalar tensors, performs an atomic fetch add
by mutating the first argument and adding it to the second input
argument. Returns the updated integer and the value prior to the update.
)DOC")
    .Input(0, "mutex_ptr", "Blob containing to a unique_ptr<mutex>")
    .Input(1, "mut_value", "Value to be mutated after the sum.")
    .Input(2, "increment", "Value to add to the first operand.")
    .Output(0, "mut_value", "Mutated value after sum. Usually same as input 1.")
    .Output(1, "fetched_value", "Value of the first operand before sum.")
    .AllowInplace({{1, 0}});
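// Editorial sketch of the semantics, not part of this commit (hypothetical
// blob values): with m = CreateMutex(), v = [5], i = [2],
//   AtomicFetchAdd(m, v, i) -> mut_value == [7], fetched_value == [5],
// and with the in-place binding {{1, 0}} v itself becomes [7]; i.e. a
// fetch-and-add serialized by the mutex rather than by std::atomic.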

SHOULD_NOT_DO_GRADIENT(CreateMutex);
SHOULD_NOT_DO_GRADIENT(AtomicFetchAdd);
}
}
}
caffe2/operators/boolean_mask_ops.cc (new file, 143 lines)

@@ -0,0 +1,143 @@
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
template <class Context>
|
||||
class BooleanMaskLengthsOp final : public Operator<Context> {
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
BooleanMaskLengthsOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool DoRunWithType() {
|
||||
auto& lengths = Input(0);
|
||||
auto& mask = Input(1);
|
||||
auto* lengthsOut = Output(0);
|
||||
CAFFE_ENFORCE(lengths.ndim() == 1);
|
||||
CAFFE_ENFORCE(mask.ndim() == 1);
|
||||
const auto* lengthsPtr = lengths.template data<T>();
|
||||
const auto* maskPtr = mask.template data<bool>();
|
||||
auto totalLength =
|
||||
std::accumulate(lengthsPtr, lengthsPtr + lengths.size(), 0);
|
||||
CAFFE_ENFORCE(mask.size() == totalLength);
|
||||
lengthsOut->ResizeLike(lengths);
|
||||
auto* lengthsOutPtr = lengthsOut->template mutable_data<T>();
|
||||
int p = 0;
|
||||
for (int i = 0; i < lengths.size(); ++i) {
|
||||
T lengthOut = 0;
|
||||
for (int j = 0; j < lengthsPtr[i]; ++j) {
|
||||
if (maskPtr[p++]) {
|
||||
++lengthOut;
|
||||
}
|
||||
}
|
||||
lengthsOutPtr[i] = lengthOut;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <class Context>
|
||||
class BooleanMaskOp final : public Operator<Context> {
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
BooleanMaskOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& data = Input(0);
|
||||
auto& mask = Input(1);
|
||||
auto* dataOut = Output(0);
|
||||
CAFFE_ENFORCE(data.ndim() >= 1);
|
||||
CAFFE_ENFORCE(mask.ndim(), 1);
|
||||
CAFFE_ENFORCE(data.dims()[0] == mask.dims()[0]);
|
||||
|
||||
const auto* maskPtr = mask.template data<bool>();
|
||||
int numOutputs = 0;
|
||||
int outerSize = mask.size();
|
||||
for (int i = 0; i < outerSize; ++i) {
|
||||
if (maskPtr[i]) {
|
||||
++numOutputs;
|
||||
}
|
||||
}
|
||||
std::vector<TIndex> outShape;
|
||||
outShape.push_back(numOutputs);
|
||||
outShape.insert(outShape.end(), data.dims().begin() + 1, data.dims().end());
|
||||
dataOut->Resize(outShape);
|
||||
if (numOutputs == 0) {
|
||||
return true;
|
||||
}
|
||||
auto innerSizeBytes = std::accumulate(
|
||||
data.dims().begin() + 1,
|
||||
data.dims().end(),
|
||||
1,
|
||||
std::multiplies<TIndex>()) *
|
||||
data.meta().itemsize();
|
||||
TIndex lastStart = -1;
|
||||
const auto* inPtr = (char*)data.raw_data();
|
||||
auto* outPtr = (char*)dataOut->raw_mutable_data(data.meta());
|
||||
TIndex outStart = 0;
|
||||
for (TIndex i = 0;; ++i) {
|
||||
// mask was true and either a) became false, or b) sequence finished
|
||||
if (lastStart != -1 && ((i >= outerSize) || !maskPtr[i])) {
|
||||
const auto* src = inPtr + lastStart * innerSizeBytes;
|
||||
auto* dst = outPtr + outStart * innerSizeBytes;
|
||||
int numItems = i - lastStart;
|
||||
if (data.meta().copy()) {
|
||||
data.meta().copy()(src, dst, numItems);
|
||||
} else {
|
||||
context_.template CopyBytes<CPUContext, CPUContext>(
|
||||
numItems * data.meta().itemsize(), src, dst);
|
||||
}
|
||||
outStart += numItems;
|
||||
lastStart = -1;
|
||||
}
|
||||
if (i >= outerSize) {
|
||||
break;
|
||||
}
|
||||
// mask was false and became true
|
||||
if (lastStart == -1 && maskPtr[i]) {
|
||||
lastStart = i;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(BooleanMask, BooleanMaskOp<CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(BooleanMaskLengths, BooleanMaskLengthsOp<CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(BooleanMask)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
Given a data 1D tensor and a mask (boolean) tensor of same shape, returns a
|
||||
tensor containing only the elements corresponding to positions where the mask
|
||||
is true.
|
||||
)DOC")
|
||||
.Input(0, "data", "The 1D, original data tensor.")
|
||||
.Input(1, "mask", "A tensor of bools of same shape as `data`.")
|
||||
.Output(0, "masked_data", "A tensor of same type as `data`.");

OPERATOR_SCHEMA(BooleanMaskLengths)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given a tensor of int32 segment lengths and a mask (boolean) tensor, return
the segment lengths of a corresponding segmented tensor after BooleanMask is
applied.
)DOC")
    .Input(0, "lengths", "A 1D int32 tensor representing segment lengths.")
    .Input(1, "mask", "A 1D bool tensor of values to keep.")
    .Output(0, "masked_lengths", "Segment lengths of a masked tensor.");

NO_GRADIENT(BooleanMask)
NO_GRADIENT(BooleanMaskLengths);
}
}
@@ -10,8 +10,14 @@ OPERATOR_SCHEMA(Concat).NumInputs(1, INT_MAX).NumOutputs(2);
// Backward compatibility names.
REGISTER_CPU_OPERATOR(DepthSplit, SplitOp<CPUContext>);
REGISTER_CPU_OPERATOR(DepthConcat, ConcatOp<CPUContext>);
-OPERATOR_SCHEMA(DepthSplit).NumInputs(1, 2).NumOutputs(1, INT_MAX);
-OPERATOR_SCHEMA(DepthConcat).NumInputs(1, INT_MAX).NumOutputs(2);
+OPERATOR_SCHEMA(DepthSplit)
+    .NumInputs(1, 2)
+    .NumOutputs(1, INT_MAX)
+    .SetDoc("Backward compatible operator name for Split.");
+OPERATOR_SCHEMA(DepthConcat)
+    .NumInputs(1, INT_MAX)
+    .NumOutputs(2)
+    .SetDoc("Backward compatible operator name for Concat.");

class GetSplitGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
@@ -85,7 +85,9 @@ class ConcatOp final : public Operator<Context> {
template <class Context>
bool SplitOp<Context>::RunOnDevice() {
  auto& input = Input(0);
+  const int input_channels = input.dim32(axis_);
  const int* axis_data;
+  vector<int> equal_split;
  if (InputSize() == 2) {
    // We obtain split from the input tensor.
    CHECK_EQ(split_.size(), 0)
@@ -94,13 +96,21 @@ bool SplitOp<Context>::RunOnDevice() {
    auto& split_tensor = OperatorBase::Input<TensorCPU>(1);
    CHECK_EQ(split_tensor.size(), OutputSize());
    axis_data = split_tensor.template data<int>();
+  } else if (split_.size() == 0) {
+    CAFFE_ENFORCE(input_channels % OutputSize() == 0,
+                  "If you did not specify split explicitly, the number of "
+                  "input channels should be divisible by the output size.");
+    equal_split.resize(OutputSize(), input_channels / OutputSize());
+    axis_data = equal_split.data();
  } else {
    // We obtain split from the parameters.
-    CHECK_EQ(split_.size(), OutputSize());
+    CAFFE_ENFORCE(split_.size() == OutputSize(),
+                  "The number of splits specified should be equal to the "
+                  "number of outputs.");
    axis_data = split_.data();
  }
  CHECK_LT(axis_, input.ndim());
-  const int input_channels = input.dim32(axis_);

  CHECK_EQ(std::accumulate(axis_data, axis_data + OutputSize(), 0),
           input_channels)
      << "Sum of split dimensions do not match: should be " << input_channels;
@@ -22,11 +22,11 @@ template <typename ArrayOfcudnnConvolutionAlgoPerf_t>
inline void LogCuDNNPerfStats(
    const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat,
    int returned_algo_count) {
-  LOG(INFO) << "Perf result: (algo: stat, time, memory)";
+  VLOG(1) << "Perf result: (algo: stat, time, memory)";
  for (int i = 0; i < returned_algo_count; ++i) {
    const auto& stat = perf_stat[i];
-    LOG(INFO) << stat.algo << ": " << stat.status
-              << " " << stat.time << " " << stat.memory;
+    VLOG(1) << stat.algo << ": " << stat.status << " " << stat.time << " "
+            << stat.memory;
  }
}
} // namespace
@@ -193,7 +193,7 @@ bool CudnnConvOp<T>::RunOnDevice() {
  if (deterministic_) {
    algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
  } else if (exhaustive_search_) {
-    LOG(INFO) << "CUDNN Convolution: doing exhaustive search.";
+    VLOG(1) << "CUDNN Convolution: doing exhaustive search.";
    // When we do an exhaustive search, we will ignore the workspace size
    // limit and simply go for the fastest algorithm. If you happen to run
    // out of memory later, you will be on your own...
@@ -229,8 +229,8 @@ bool CudnnConvOp<T>::RunOnDevice() {
        cudnn_wrapper_.inline_cudnn_handle(),
        bottom_desc_, filter_desc_, conv_desc_, top_desc_,
        algo_, &cudnn_ws_nbytes_));
-    LOG(INFO) << "CuDNN algorithm: " << algo_;
-    LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
+    VLOG(1) << "CuDNN algorithm: " << algo_;
+    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
  }

  // Now, actually run the computation.
@@ -346,7 +346,7 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
    bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
  } else if (exhaustive_search_) {
-    LOG(INFO) << "CUDNN Convolution bwd: doing exhaustive search.";
+    VLOG(1) << "CUDNN Convolution bwd: doing exhaustive search.";
    // When we do an exhaustive search, we will ignore the workspace size
    // limit and simply go for the fastest algorithm. If you happen to run
    // out of memory later, you will be on your own...
@@ -416,9 +416,9 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
        bwd_data_algo_, &bwd_data_ws_size));
    cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size);

-    LOG(INFO) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", "
+    VLOG(1) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", "
            << bwd_data_algo_;
-    LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
+    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
  }

  // Now, actually run the computation.

caffe2/operators/conv_transpose_op_cudnn.cc (new file, 579 lines)

@@ -0,0 +1,579 @@
#include "caffe2/core/common_cudnn.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/conv_transpose_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Earlier in the days Caffe sets the default cudnn workspace to 8MB. We bump
|
||||
// it up to 64MB in Caffe2, as this enables the use of Winograd in many cases,
|
||||
// something very beneficial to more recent CNN models.
|
||||
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024;
|
||||
|
||||
// Manually specified number of algorithms implemented in CuDNN.
|
||||
// This does not have any performance implications, as we will always find the
|
||||
// fastest algorithm; setting them to the right number of algorithms will enable
|
||||
// us to best report the statistics when doing an exhaustive search, though.
|
||||
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
|
||||
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
|
||||
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
|
||||
|
||||
namespace {
|
||||
template <typename ArrayOfcudnnConvolutionAlgoPerf_t>
|
||||
inline void LogCuDNNPerfStats(
|
||||
const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat,
|
||||
int returned_algo_count) {
|
||||
LOG(INFO) << "Perf result: (algo: stat, time, memory)";
|
||||
for (int i = 0; i < returned_algo_count; ++i) {
|
||||
const auto& stat = perf_stat[i];
|
||||
LOG(INFO) << stat.algo << ": " << stat.status << " " << stat.time << " "
|
||||
<< stat.memory;
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
class CudnnConvTransposeOpBase : public ConvTransposeUnpoolBase<CUDAContext> {
|
||||
public:
|
||||
CudnnConvTransposeOpBase(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvTransposeUnpoolBase<CUDAContext>(operator_def, ws),
|
||||
cudnn_wrapper_(&context_),
|
||||
cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
|
||||
"ws_nbytes_limit",
|
||||
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
|
||||
exhaustive_search_(
|
||||
OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)),
|
||||
deterministic_(
|
||||
OperatorBase::GetSingleArgument<int>("deterministic", 0)),
|
||||
cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)) {
|
||||
CHECK(!deterministic_ || !exhaustive_search_);
|
||||
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
|
||||
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
|
||||
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
|
||||
CUDNN_CHECK(cudnnCreateTensorDescriptor(&top_desc_));
|
||||
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
|
||||
}
|
||||
|
||||
~CudnnConvTransposeOpBase() {
|
||||
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bottom_desc_));
|
||||
CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
|
||||
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
|
||||
CUDNN_CHECK(cudnnDestroyTensorDescriptor(top_desc_));
|
||||
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
|
||||
}
|
||||
|
||||
protected:
|
||||
vector<TIndex> cudnn_input_dims_;
|
||||
vector<TIndex> cudnn_filter_dims_;
|
||||
|
||||
CuDNNWrapper cudnn_wrapper_;
|
||||
cudnnTensorDescriptor_t bottom_desc_;
|
||||
cudnnFilterDescriptor_t filter_desc_;
|
||||
cudnnTensorDescriptor_t bias_desc_;
|
||||
cudnnTensorDescriptor_t top_desc_;
|
||||
cudnnConvolutionDescriptor_t conv_desc_;
|
||||
const size_t cudnn_ws_nbytes_limit_;
|
||||
size_t cudnn_ws_nbytes_;
|
||||
bool exhaustive_search_;
|
||||
bool deterministic_;
|
||||
size_t cudnn_state_;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class CudnnConvTransposeOp final : public CudnnConvTransposeOpBase {
|
||||
public:
|
||||
CudnnConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: CudnnConvTransposeOpBase(operator_def, ws) {}
|
||||
|
||||
~CudnnConvTransposeOp() {}
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
private:
|
||||
cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
|
||||
// Input: X, W, b
|
||||
// Output: Y
|
||||
INPUT_TAGS(INPUT, FILTER, BIAS);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class CudnnConvTransposeGradientOp final : public CudnnConvTransposeOpBase {
|
||||
public:
|
||||
CudnnConvTransposeGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: CudnnConvTransposeOpBase(operator_def, ws) {}
|
||||
|
||||
~CudnnConvTransposeGradientOp() {}
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
private:
|
||||
cudnnConvolutionFwdAlgo_t algo_;
|
||||
cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
|
||||
// input: X, W, dY
|
||||
// output: dW, db, and optionally dX
|
||||
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
|
||||
OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD);
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Implementations
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
bool CudnnConvTransposeOp<T>::RunOnDevice() {
|
||||
auto& X = Input(INPUT);
|
||||
auto& filter = Input(FILTER);
|
||||
auto& bias = Input(BIAS);
|
||||
auto* Y = Output(0);
|
||||
int C = 0;
|
||||
switch (order_) {
|
||||
case StorageOrder::NHWC:
|
||||
C = filter.dim32(3);
|
||||
break;
|
||||
case StorageOrder::NCHW:
|
||||
C = filter.dim32(1);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unknown storage order: " << order_;
|
||||
}
|
||||
ConvTransposeUnpoolBase<CUDAContext>::SetOutputSize(X, Y, C);
|
||||
|
||||
int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0;
|
||||
switch (order_) {
|
||||
case StorageOrder::NHWC:
|
||||
N = X.dim32(0);
|
||||
H = X.dim32(1);
|
||||
W = X.dim32(2);
|
||||
M = X.dim32(3);
|
||||
H_out = Y->dim32(1);
|
||||
W_out = Y->dim32(2);
|
||||
DCHECK_EQ(filter.dim32(1), kernel_h_);
|
||||
DCHECK_EQ(filter.dim32(1), kernel_h_);
|
||||
DCHECK_EQ(filter.dim32(2), kernel_w_);
|
||||
DCHECK_EQ(filter.dim32(3), C);
|
||||
break;
|
||||
case StorageOrder::NCHW:
|
||||
N = X.dim32(0);
|
||||
M = X.dim32(1);
|
||||
H = X.dim32(2);
|
||||
W = X.dim32(3);
|
||||
H_out = Y->dim32(2);
|
||||
W_out = Y->dim32(3);
|
||||
DCHECK_EQ(filter.dim32(1), C);
|
||||
DCHECK_EQ(filter.dim32(2), kernel_h_);
|
||||
DCHECK_EQ(filter.dim32(3), kernel_w_);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unknown storage order: " << order_;
|
||||
}
|
||||
|
||||
DCHECK_EQ(bias.ndim(), 1);
|
||||
DCHECK_EQ(bias.dim32(0), C);
|
||||
|
||||
// Set up the cudnn algorithms & workspace if necessary
|
||||
bool input_changed = (X.dims() != cudnn_input_dims_);
|
||||
bool filter_changed = (filter.dims() != cudnn_filter_dims_);
|
||||
|
||||
if (input_changed || filter_changed) {
|
||||
VLOG(1) << "Changing the cudnn descriptor configurations.";
|
||||
if (input_changed) {
|
||||
cudnn_input_dims_ = X.dims();
|
||||
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
|
||||
bottom_desc_,
|
||||
GetCudnnTensorFormat(order_),
|
||||
cudnnTypeWrapper<T>::type,
|
||||
N,
|
||||
M,
|
||||
H,
|
||||
W));
|
||||
}
|
||||
if (filter_changed) {
|
||||
cudnn_filter_dims_ = filter.dims();
|
||||
CUDNN_CHECK(cudnnSetFilter4dDescriptor(
|
||||
filter_desc_,
|
||||
cudnnTypeWrapper<T>::type,
|
||||
GetCudnnTensorFormat(order_),
|
||||
M,
|
||||
C,
|
||||
kernel_h_,
|
||||
kernel_w_));
|
||||
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
|
||||
bias_desc_,
|
||||
GetCudnnTensorFormat(order_),
|
||||
cudnnTypeWrapper<T>::type,
|
||||
1,
|
||||
C,
|
||||
1,
|
||||
1));
|
||||
}
|
||||
// Set the output
|
||||
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
|
||||
top_desc_,
|
||||
GetCudnnTensorFormat(order_),
|
||||
cudnnTypeWrapper<T>::type,
|
||||
N,
|
||||
C,
|
||||
H_out,
|
||||
W_out));
|
||||
// Set the convolution descriptor
|
||||
CHECK_EQ(pad_t_, pad_b_)
|
||||
<< "The current padding scheme leads to unequal padding on the top and "
|
||||
"bottom, which is not supported by cudnn.";
|
||||
CHECK_EQ(pad_l_, pad_r_)
|
||||
<< "The current padding scheme leads to unequal padding on the left "
|
||||
"and right, which is not supported by cudnn.";
|
||||
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
|
||||
conv_desc_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
1,
|
||||
1,
|
||||
CUDNN_CROSS_CORRELATION));
|
||||
if (deterministic_) {
|
||||
bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
|
||||
} else if (exhaustive_search_) {
|
||||
int returned_algo_count;
|
||||
std::array<cudnnConvolutionBwdDataAlgoPerf_t, kNUM_CUDNN_BWD_DATA_ALGS>
|
||||
data_perf_stat;
|
||||
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
|
||||
state->workspace().reset();
|
||||
CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
|
||||
state->cudnn_handle(),
|
||||
filter_desc_,
|
||||
bottom_desc_,
|
||||
conv_desc_,
|
||||
top_desc_,
|
||||
kNUM_CUDNN_BWD_DATA_ALGS,
|
||||
&returned_algo_count,
|
||||
data_perf_stat.data()));
|
||||
});
|
||||
|
||||
LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
|
||||
bwd_data_algo_ = data_perf_stat[0].algo;
|
||||
} else {
|
||||
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
|
||||
cudnn_wrapper_.inline_cudnn_handle(),
|
||||
filter_desc_,
|
||||
bottom_desc_,
|
||||
conv_desc_,
|
||||
top_desc_,
|
||||
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
|
||||
cudnn_ws_nbytes_limit_,
|
||||
&bwd_data_algo_));
|
||||
}
|
||||
|
||||
size_t bwd_data_ws_size;
|
||||
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
|
||||
cudnn_wrapper_.inline_cudnn_handle(),
|
||||
filter_desc_,
|
||||
bottom_desc_,
|
||||
conv_desc_,
|
||||
top_desc_,
|
||||
bwd_data_algo_,
|
||||
&bwd_data_ws_size));
|
||||
cudnn_ws_nbytes_ = bwd_data_ws_size;
|
||||
LOG(INFO) << "CuDNN algorithm: " << bwd_data_algo_;
|
||||
LOG(INFO) << "CuDNN workspace size: " << bwd_data_ws_size;
|
||||
}
|
||||
|
||||
// Now, actually run the computation.
|
||||
// Filter
|
||||
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
|
||||
CUDNN_CHECK(cudnnConvolutionBackwardData(
|
||||
state->cudnn_handle(),
|
||||
cudnnTypeWrapper<T>::kOne(),
|
||||
filter_desc_,
|
||||
filter.template data<T>(),
|
||||
bottom_desc_,
|
||||
X.template data<T>(),
|
||||
conv_desc_,
|
||||
bwd_data_algo_,
|
||||
state->workspace().get(cudnn_ws_nbytes_),
|
||||
cudnn_ws_nbytes_,
|
||||
cudnnTypeWrapper<T>::kZero(),
|
||||
top_desc_,
|
||||
Y->template mutable_data<T>()));
|
||||
});
|
||||
// Bias
|
||||
CUDNN_CHECK(cudnnAddTensor(
|
||||
cudnn_wrapper_.inline_cudnn_handle(),
|
||||
cudnnTypeWrapper<T>::kOne(),
|
||||
bias_desc_,
|
||||
bias.template data<T>(),
|
||||
cudnnTypeWrapper<T>::kOne(),
|
||||
top_desc_,
|
||||
Y->template mutable_data<T>()));
|
||||
// Done.
|
||||
return true;
|
||||
}
|
||||
|
||||
// TODO(Yangqing): a lot of the function contents are very similar. Consider
|
||||
// consolidating them.
|
||||
template <typename T>
|
||||
bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
|
||||
auto& X = Input(INPUT);
|
||||
auto& filter = Input(FILTER);
|
||||
auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dfilter = Output(FILTER_GRAD);
|
||||
auto* dbias = Output(BIAS_GRAD);
|
||||
DCHECK_EQ(X.ndim(), 4);
|
||||
DCHECK_EQ(filter.ndim(), 4);
|
||||
auto* Y = Output(0);
|
||||
int C = 0;
|
||||
switch (order_) {
|
||||
case StorageOrder::NHWC:
|
||||
C = filter.dim32(3);
|
||||
break;
|
||||
case StorageOrder::NCHW:
|
||||
C = filter.dim32(1);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unknown storage order: " << order_;
|
||||
}
|
||||
ConvTransposeUnpoolBase<CUDAContext>::SetOutputSize(X, Y, C);
|
||||
|
||||
int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0;
|
||||
switch (order_) {
|
||||
case StorageOrder::NHWC:
|
||||
N = X.dim32(0);
|
||||
H = X.dim32(1);
|
||||
W = X.dim32(2);
|
||||
M = X.dim32(3);
|
||||
H_out = dY.dim32(1);
|
||||
W_out = dY.dim32(2);
|
||||
DCHECK_EQ(filter.dim32(1), kernel_h_);
|
||||
DCHECK_EQ(filter.dim32(1), kernel_h_);
|
||||
DCHECK_EQ(filter.dim32(2), kernel_w_);
|
||||
DCHECK_EQ(filter.dim32(3), C);
|
||||
break;
|
||||
case StorageOrder::NCHW:
|
||||
N = X.dim32(0);
|
||||
M = X.dim32(1);
|
||||
H = X.dim32(2);
|
||||
W = X.dim32(3);
|
||||
H_out = dY.dim32(2);
|
||||
W_out = dY.dim32(3);
|
||||
DCHECK_EQ(filter.dim32(1), C);
|
||||
DCHECK_EQ(filter.dim32(2), kernel_h_);
|
||||
DCHECK_EQ(filter.dim32(3), kernel_w_);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unknown storage order: " << order_;
|
||||
}
|
||||
// Since we only handle LegacyPadding::NOTSET, we don't need to
|
||||
// compute padding.
|
||||
dfilter->ResizeLike(filter);
|
||||
dbias->Resize(C);
|
||||
|
||||
// Set up the cudnn algorithms & workspace if necessary
|
||||
bool input_changed = (X.dims() != cudnn_input_dims_);
|
||||
bool filter_changed = (filter.dims() != cudnn_filter_dims_);
|
||||
if (input_changed || filter_changed) {
|
||||
VLOG(1) << "Changing the cudnn descriptor configurations.";
|
||||
if (input_changed) {
|
||||
cudnn_input_dims_ = X.dims();
|
||||
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
|
||||
bottom_desc_,
|
||||
GetCudnnTensorFormat(order_),
|
||||
cudnnTypeWrapper<T>::type,
|
||||
N,
|
||||
M,
|
||||
H,
|
||||
W));
|
||||
}
|
||||
if (filter_changed) {
|
||||
cudnn_filter_dims_ = filter.dims();
|
||||
CUDNN_CHECK(cudnnSetFilter4dDescriptor(
|
||||
filter_desc_,
|
||||
cudnnTypeWrapper<T>::type,
|
||||
GetCudnnTensorFormat(order_),
|
||||
M,
|
||||
C,
|
||||
kernel_h_,
|
||||
kernel_w_));
|
||||
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
|
||||
bias_desc_,
|
||||
GetCudnnTensorFormat(order_),
|
||||
cudnnTypeWrapper<T>::type,
|
||||
1,
|
||||
C,
|
||||
1,
|
||||
1));
|
||||
}
|
||||
// Set the output
|
||||
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
|
||||
top_desc_,
|
||||
GetCudnnTensorFormat(order_),
|
||||
cudnnTypeWrapper<T>::type,
|
||||
N,
|
||||
C,
|
||||
H_out,
|
||||
W_out));
|
||||
// Set the convolution descriptor
|
||||
CHECK_EQ(pad_t_, pad_b_)
|
||||
<< "The current padding scheme leads to unequal padding on the top and "
|
||||
"bottom, which is not supported by cudnn.";
|
||||
CHECK_EQ(pad_l_, pad_r_)
|
||||
<< "The current padding scheme leads to unequal padding on the left "
|
||||
"and right, which is not supported by cudnn.";
|
||||
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
|
||||
conv_desc_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
1,
|
||||
1,
|
||||
CUDNN_CROSS_CORRELATION));
|
||||
// Set the workspace
|
||||
|
||||
size_t bwd_filter_ws_size, fwd_ws_size;
|
||||
|
||||
if (deterministic_) {
|
||||
algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
|
||||
bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
|
||||
} else if (exhaustive_search_) {
|
||||
LOG(INFO) << "CUDNN Convolution bwd: doing exhaustive search.";
|
||||
// When we do an exhaustive search, we will ignore the workspace size
|
||||
// limit and simply go for the fastest algorithm. If you happen to run
|
||||
// out of memory later, you will be on your own...
|
||||
int returned_algo_count;
|
||||
// We clean up the current workspace memory so that the forward algorithm
|
||||
// is free to allocate memory.
|
||||
// Actually run the search.
|
||||
std::
|
||||
array<cudnnConvolutionBwdFilterAlgoPerf_t, kNUM_CUDNN_BWD_FILTER_ALGS>
|
||||
filter_perf_stat;
|
||||
|
||||
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
|
||||
state->workspace().reset();
|
||||
CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
|
||||
state->cudnn_handle(),
|
||||
top_desc_,
|
||||
bottom_desc_,
|
||||
conv_desc_,
|
||||
filter_desc_,
|
||||
kNUM_CUDNN_BWD_FILTER_ALGS,
|
||||
&returned_algo_count,
|
||||
filter_perf_stat.data()));
|
||||
});
|
||||
LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
|
||||
bwd_filter_algo_ = filter_perf_stat[0].algo;
|
||||
|
||||
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
|
||||
fwd_perf_stat;
|
||||
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
|
||||
state->workspace().reset();
|
||||
CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
|
||||
state->cudnn_handle(),
|
||||
top_desc_,
|
||||
filter_desc_,
|
||||
conv_desc_,
|
||||
bottom_desc_,
|
||||
kNUM_CUDNN_BWD_DATA_ALGS,
|
||||
&returned_algo_count,
|
||||
fwd_perf_stat.data()));
|
||||
});
|
||||
|
||||
LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count);
|
||||
algo_ = fwd_perf_stat[0].algo;
|
||||
} else {
|
||||
// choose backward algorithm for filter
|
||||
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
|
||||
cudnn_wrapper_.inline_cudnn_handle(),
|
||||
top_desc_,
|
||||
bottom_desc_,
|
||||
conv_desc_,
|
||||
filter_desc_,
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
|
||||
cudnn_ws_nbytes_limit_,
|
||||
&bwd_filter_algo_));
|
||||
// choose backward algo for data
|
||||
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
|
||||
cudnn_wrapper_.inline_cudnn_handle(),
|
||||
top_desc_,
|
||||
filter_desc_,
|
||||
conv_desc_,
|
||||
bottom_desc_,
|
||||
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
|
||||
cudnn_ws_nbytes_limit_,
|
||||
&algo_));
|
||||
}
|
||||
// get workspace for backwards filter algorithm
|
||||
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
|
||||
cudnn_wrapper_.inline_cudnn_handle(),
|
||||
top_desc_,
|
||||
bottom_desc_,
|
||||
conv_desc_,
|
||||
filter_desc_,
|
||||
bwd_filter_algo_,
|
||||
&bwd_filter_ws_size));
|
||||
// get workspace for backwards data algorithm
|
||||
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
|
||||
cudnn_wrapper_.inline_cudnn_handle(),
|
||||
top_desc_,
|
||||
filter_desc_,
|
||||
conv_desc_,
|
||||
bottom_desc_,
|
||||
algo_,
|
||||
&fwd_ws_size));
|
||||
cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, fwd_ws_size);
|
||||
|
||||
LOG(INFO) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", " << algo_;
|
||||
LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
|
||||
}
|
||||
|
||||
// Now, actually run the computation.
|
||||
CUDNN_CHECK(cudnnConvolutionBackwardBias(
|
||||
cudnn_wrapper_.inline_cudnn_handle(),
|
||||
cudnnTypeWrapper<T>::kOne(),
|
||||
top_desc_,
|
||||
dY.template data<T>(),
|
||||
cudnnTypeWrapper<T>::kZero(),
|
||||
bias_desc_,
|
||||
dbias->template mutable_data<T>()));
|
||||
|
||||
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
|
||||
CUDNN_CHECK(cudnnConvolutionBackwardFilter(
|
||||
state->cudnn_handle(),
|
||||
cudnnTypeWrapper<T>::kOne(),
|
||||
top_desc_,
|
||||
dY.template data<T>(),
|
||||
bottom_desc_,
|
||||
X.template data<T>(),
|
||||
conv_desc_,
|
||||
bwd_filter_algo_,
|
||||
state->workspace().get(cudnn_ws_nbytes_),
|
||||
cudnn_ws_nbytes_,
|
||||
cudnnTypeWrapper<T>::kZero(),
|
||||
filter_desc_,
|
||||
dfilter->template mutable_data<T>()));
|
||||
if (OutputSize() == 3) {
|
||||
// Compute the gradient w.r.t. the input.
|
||||
auto* dX = Output(INPUT_GRAD);
|
||||
dX->ResizeLike(X);
|
||||
CUDNN_CHECK(cudnnConvolutionForward(
|
||||
state->cudnn_handle(),
|
||||
cudnnTypeWrapper<T>::kOne(),
|
||||
top_desc_,
|
||||
dY.template data<T>(),
|
||||
filter_desc_,
|
||||
filter.template data<T>(),
|
||||
conv_desc_,
|
||||
algo_,
|
||||
state->workspace().get(cudnn_ws_nbytes_),
|
||||
cudnn_ws_nbytes_,
|
||||
cudnnTypeWrapper<T>::kZero(),
|
||||
bottom_desc_,
|
||||
dX->template mutable_data<T>()));
|
||||
}
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CUDNN_OPERATOR(ConvTranspose, CudnnConvTransposeOp<float>);
|
||||
REGISTER_CUDNN_OPERATOR(
|
||||
ConvTransposeGradient,
|
||||
CudnnConvTransposeGradientOp<float>);
|
||||
|
||||
} // namespace caffe2
|
||||
|
|
@@ -118,9 +118,13 @@ class ConvTransposeUnpoolBase : public Operator<Context> {
    return true;
  }

-  virtual bool RunOnDeviceWithOrderNCHW() = 0;
+  virtual bool RunOnDeviceWithOrderNCHW() {
+    CAFFE_THROW("Not implemented");
+  }

-  virtual bool RunOnDeviceWithOrderNHWC() = 0;
+  virtual bool RunOnDeviceWithOrderNHWC() {
+    CAFFE_THROW("Not implemented");
+  }

  virtual ~ConvTransposeUnpoolBase() {}

caffe2/operators/counter_ops.cc (new file, 46 lines)

@@ -0,0 +1,46 @@
#include "counter_ops.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
REGISTER_CPU_OPERATOR(CreateCounter, CreateCounterOp<int32_t>);
|
||||
REGISTER_CPU_OPERATOR(ResetCounter, ResetCounterOp<int32_t>);
|
||||
REGISTER_CPU_OPERATOR(CountDown, CountDownOp<int32_t>);
|
||||
|
||||
OPERATOR_SCHEMA(CreateCounter)
|
||||
.NumInputs(0)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
Creates a count-down counter with initial value specified by the 'init_count'
|
||||
argument.
|
||||
)DOC")
|
||||
.Output(0, "counter", "A blob pointing to an instance of a new counter.")
|
||||
.Arg("init_count", "Initial count for the counter, must be >= 0.");
|
||||
|
||||
OPERATOR_SCHEMA(ResetCounter)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(0)
|
||||
.SetDoc(R"DOC(
|
||||
Resets a count-down counter with initial value specified by the 'init_count'
|
||||
argument.
|
||||
)DOC")
|
||||
.Input(0, "counter", "A blob pointing to an instance of a new counter.")
|
||||
.Arg("init_count", "Resets counter to this value, must be >= 0.");
|
||||
|
||||
OPERATOR_SCHEMA(CountDown)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
If the internal count value > 0, decreases count value by 1 and outputs false,
|
||||
otherwise outputs true.
|
||||
)DOC")
|
||||
.Input(0, "counter", "A blob pointing to an instance of a counter.")
|
||||
.Output(0, "should_stop", "false unless the internal count is zero.");

SHOULD_NOT_DO_GRADIENT(CreateCounter);
SHOULD_NOT_DO_GRADIENT(ResetCounter);
SHOULD_NOT_DO_GRADIENT(CountDown);

} // namespace

} // namespace caffe2
caffe2/operators/counter_ops.h (new file, 89 lines)

@@ -0,0 +1,89 @@
#ifndef CAFFE2_OPERATORS_COUNTER_OPS_H
#define CAFFE2_OPERATORS_COUNTER_OPS_H

#include <atomic>

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"

namespace caffe2 {
namespace {
template <typename T>
class Counter {
 public:
  explicit Counter(T count) : count_(count) {}
  bool CountDown() {
    if (count_ > 0) {
      --count_;
      return false;
    }
    return true;
  }

  void reset(T init_count) {
    count_ = init_count;
  }

 private:
  std::atomic<T> count_;
};
}  // namespace

template <typename T, class Context = CPUContext>
class CreateCounterOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  CreateCounterOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        init_count_(OperatorBase::GetSingleArgument<T>("init_count", 0)) {
    CHECK_LE(0, init_count_) << "negative init_count is not permitted.";
  }

  bool RunOnDevice() override {
    *OperatorBase::Output<std::unique_ptr<Counter<T>>>(0) =
        std::unique_ptr<Counter<T>>(new Counter<T>(init_count_));
    return true;
  }

 private:
  T init_count_ = 0;
};

template <typename T, class Context = CPUContext>
class ResetCounterOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  ResetCounterOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        init_count_(OperatorBase::GetSingleArgument<T>("init_count", 0)) {
    CHECK_LE(0, init_count_) << "negative init_count is not permitted.";
  }

  bool RunOnDevice() override {
    auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
    counterPtr->reset(init_count_);
    return true;
  }

 private:
  T init_count_;
};

template <typename T, class Context = CPUContext>
class CountDownOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  CountDownOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
    auto* output = Output(0);
    output->Resize(std::vector<int>{});
    *output->template mutable_data<bool>() = counterPtr->CountDown();
    return true;
  }
};
}  // namespace caffe2
#endif  // CAFFE2_OPERATORS_COUNTER_OPS_H
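A standalone sketch of the counter semantics above (plain C++, nothing Caffe2-specific assumed; MiniCounter is a hypothetical stand-in): CountDown() returns false while budget remains, so a loop driven by it runs exactly init_count times.

#include <atomic>
#include <iostream>

template <typename T>
class MiniCounter {  // mirrors Counter<T> above
 public:
  explicit MiniCounter(T count) : count_(count) {}
  bool CountDown() {
    if (count_ > 0) {
      --count_;
      return false;
    }
    return true;  // exhausted: this plays the role of "should_stop"
  }

 private:
  std::atomic<T> count_;
};

int main() {
  MiniCounter<int> counter(3);  // plays the role of init_count = 3
  int iterations = 0;
  while (!counter.CountDown()) {
    ++iterations;
  }
  std::cout << iterations << "\n";  // prints 3
}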
@ -2,6 +2,17 @@

namespace caffe2 {

namespace {

inline float sigmoid_xent_forward(float lgt, float tgt) {
  return lgt * (tgt - (lgt >= 0)) - log(1 + exp(lgt - 2 * lgt * (lgt >= 0)));
}

inline float sigmoid_xent_backward(float lgt, float tgt) {
  return tgt - 1. / (1. + exp(-lgt));
}
}
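As a reference note (not part of the diff), sigmoid_xent_forward is the numerically stable form of the per-element sigmoid cross entropy. With logit x and target t:

\[
t\log\sigma(x) + (1-t)\log\bigl(1-\sigma(x)\bigr)
    = x\,t - \max(x, 0) - \log\bigl(1 + e^{-|x|}\bigr)
\]

and since x - 2x[x >= 0] = -|x|, the exponent passed to exp in the code is never positive, so the computation cannot overflow.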

template <>
bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);

@ -26,6 +37,68 @@ bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
  return true;
}
template <>
bool SigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
  auto& logits = Input(0);
  auto& targets = Input(1);
  CAFFE_ENFORCE(logits.dims() == targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;

  auto* out = Output(0);
  if (logits.ndim() == 0) {
    out->Resize(std::vector<TIndex>{});
  } else {
    std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
    out->Resize(dims);
  }
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();

  auto in_idx = 0;
  for (int i = 0; i < outer_size; ++i) {
    float value = 0;
    for (int j = 0; j < inner_size; ++j) {
      value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]);
      ++in_idx;
    }
    out_ptr[i] = -value / inner_size;
  }
  return true;
}

template <>
bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
  auto& g = Input(0);
  auto& logits = Input(1);
  auto& targets = Input(2);
  CAFFE_ENFORCE(logits.dims() == targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;
  CAFFE_ENFORCE(g.size() == outer_size);

  auto* out = Output(0);
  out->ResizeLike(logits);
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();
  auto* g_ptr = g.data<float>();

  auto in_idx = 0;
  for (int i = 0; i < outer_size; ++i) {
    auto g_factor = -g_ptr[i] / inner_size;
    for (int j = 0; j < inner_size; ++j) {
      out_ptr[in_idx] = g_factor *
          sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]);
      ++in_idx;
    }
  }
  return true;
}

template <>
bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);

@ -129,6 +202,13 @@ REGISTER_CPU_OPERATOR(MakeTwoClass,
REGISTER_CPU_OPERATOR(MakeTwoClassGradient,
    MakeTwoClassGradientOp<float, CPUContext>);

REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyWithLogits,
    SigmoidCrossEntropyWithLogitsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyWithLogitsGradient,
    SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(MakeTwoClass)
    .NumInputs(1)
    .NumOutputs(1)

@ -145,6 +225,22 @@ OPERATOR_SCHEMA(MakeTwoClassGradient)
    .NumInputs(1)
    .NumOutputs(1);

OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogits)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given two matrices logits and targets, of the same shape
(batch_size, num_classes), computes the sigmoid cross entropy between the two.
Returns a tensor of shape (batch_size,) of losses for each example.
)DOC")
    .Input(0, "logits", "matrix of logits for each example and class.")
    .Input(1, "targets", "matrix of targets, same shape as logits.")
    .Output(0, "xentropy", "Vector with the total xentropy for each example.");

OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogitsGradient)
    .NumInputs(3)
    .NumOutputs(1);

struct GetMakeTwoClassGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {

@ -156,5 +252,20 @@ struct GetMakeTwoClassGradient : public GradientMakerBase {
  }
};
REGISTER_GRADIENT(MakeTwoClass, GetMakeTwoClassGradient);

struct GetSigmoidCrossEntropyWithLogitsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SigmoidCrossEntropyWithLogitsGradient",
        "",
        vector<string>{GO(0), I(0), I(1)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(
    SigmoidCrossEntropyWithLogits,
    GetSigmoidCrossEntropyWithLogitsGradient);

}  // namespace
}  // namespace caffe2
@ -62,6 +62,22 @@ class MakeTwoClassGradientOp final
  // Output: dX
};

template <typename T, class Context>
class SigmoidCrossEntropyWithLogitsOp final : public Operator<Context> {
 public:
  USE_SIMPLE_CTOR_DTOR(SigmoidCrossEntropyWithLogitsOp);
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override;
};

template <typename T, class Context>
class SigmoidCrossEntropyWithLogitsGradientOp final : public Operator<Context> {
 public:
  USE_SIMPLE_CTOR_DTOR(SigmoidCrossEntropyWithLogitsGradientOp);
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override;
};

}  // namespace caffe2

#endif  // CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
caffe2/operators/dataset_ops.cc (new file, 734 lines)

@ -0,0 +1,734 @@
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/string_utils.h"

namespace caffe2 {
namespace {

const char kDatasetFieldSeparator = ':';
const char* kDatasetLengthField = "lengths";

// percentage by which to grow the dataset when it needs to be resized
const int kDatasetGrowthPct = 40;

// used for lengths tensors in the dataset
using TLength = int32_t;
// used for all internal dataset operations (offsets, sizes to read, etc.)
using TOffset = int64_t;

/**
 * Provides functionality to iterate across a list of tensors where some
 * of those tensors represent lengths in a hierarchical structure.
 */
class TreeIterator {
 public:
  struct FieldDesc {
    int id;
    int lengthFieldId = -1;
    std::string name;
  };

  explicit TreeIterator(const std::vector<std::string>& fields) {
    // populate field vector and split field names
    fields_.resize(fields.size());
    std::vector<std::vector<std::string>> nameParts(fields_.size());
    for (int i = 0; i < fields.size(); ++i) {
      auto& field = fields_.at(i);
      field.name = fields[i];
      field.id = i;
      field.lengthFieldId = -1;
      nameParts.at(i) = split(kDatasetFieldSeparator, field.name);
    }

    // populate lengthFields
    for (const auto& field : fields_) {
      const auto& parts = nameParts.at(field.id);
      if (!parts.empty() && parts.back() == kDatasetLengthField) {
        lengthFieldIds_.push_back(field.id);
      }
    }

    // find length-field with maximum prefix matching for each field
    for (auto& field : fields_) {
      // by default, we are matching against the root domain
      int maxMatchLevel = 1;
      int maxMatchLengthFieldId = -1;
      for (int j = 0; j < numLengthFields(); ++j) {
        const auto& lenField = lengthField(j);
        // a length field can't have itself as its length field
        if (field.id == lenField.id) {
          continue;
        }
        auto lf = nameParts.at(lenField.id);
        auto lfEnd = lf.end() - 1;
        // check whether this lengthField is a prefix for this field name
        if (std::mismatch(lf.begin(), lfEnd, nameParts.at(field.id).begin())
                .first != lfEnd) {
          continue;
        }
        if (lf.size() > maxMatchLevel) {
          maxMatchLevel = lf.size();
          maxMatchLengthFieldId = j;
        }
      }
      field.lengthFieldId = maxMatchLengthFieldId;
    }

    // check that fields are topologically sorted
    // (no length field depends on a length defined afterwards)
    for (const auto& field : fields_) {
      const auto* lengthField = lengthFieldFor(field);
      CAFFE_ENFORCE(
          (lengthField == nullptr) || (lengthField->id < field.id),
          "Error: Field ",
          field.id,
          " (",
          field.name,
          ") ",
          "depends on a field defined afterwards: ",
          lengthField->id,
          " (",
          lengthField->name,
          ").");
    }
  }

  void advance(
      const std::vector<const TLength*>& lengths,
      std::vector<TOffset>& offsets,
      std::vector<TOffset>& sizes,
      std::vector<TOffset>& limits,
      TOffset num) {
    thread_local std::vector<TOffset> newOffsets;
    CHECK_EQ(lengths.size(), numLengthFields());
    CHECK_EQ(offsets.size(), numOffsetFields());
    sizes.resize(offsets.size());
    newOffsets.resize(offsets.size());
    // first index, top level
    {
      auto limit = limits[0];
      auto offset = offsets[0];
      CAFFE_ENFORCE(limit >= offset, "Tried to advance past end of cursor.");
      TOffset total = std::min(limit - offset, num);
      sizes[0] = total;
      newOffsets[0] = offset + total;
    }
    // child indices
    for (int j = 1; j < numOffsetFields(); ++j) {
      TOffset total = 0;
      int parentOffsetId = offsetFieldIdFor(lengthField(j - 1));
      const TLength* length = lengths[j - 1] + offsets[parentOffsetId];
      for (int k = 0; k < sizes[parentOffsetId]; ++k) {
        total += *(length++);
      }
      auto offset = offsets[j];
      CAFFE_ENFORCE(
          offset + total <= limits[j],
          "Inconsistent field length: ",
          "tried to advance past the end of field ",
          j);
      sizes[j] = total;
      newOffsets[j] = offset + total;
    }
    offsets = newOffsets;
  }

  // Corresponds to the number of fields that have "lengths" as the last
  // part of their name
  int numLengthFields() const {
    return lengthFieldIds_.size();
  }

  // Corresponds to the number of length fields + 1 (for the top-level domain)
  int numOffsetFields() const {
    return numLengthFields() + 1;
  }

  // Get lengthField description for the given field
  const FieldDesc* lengthFieldFor(const FieldDesc& desc) {
    return (desc.lengthFieldId == -1)
        ? nullptr
        : &fields_.at(lengthFieldIds_.at(desc.lengthFieldId));
  }

  // Get lengthField description for the given lengthFieldId, where
  // 0 <= lengthFieldId < numLengthFields()
  const FieldDesc& lengthField(int lengthFieldId) {
    return fields_.at(lengthFieldIds_.at(lengthFieldId));
  }

  // Returns the index into the 'offset' vector for the given field.
  int offsetFieldIdFor(const FieldDesc& fieldDesc) {
    return fieldDesc.lengthFieldId + 1;
  }

  // Returns the field description for all fields.
  const std::vector<FieldDesc>& fields() {
    return fields_;
  }

 private:
  // Description of each field
  std::vector<FieldDesc> fields_;
  // Index into fields_ above for the fields that are lengths.
  std::vector<int> lengthFieldIds_;
};
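To make the hierarchy bookkeeping in advance() concrete, here is a standalone sketch (plain C++, illustrative only) with a single length field: advancing the top level by num rows consumes num entries of the lengths tensor, and their sum is how far the child values field advances.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // lengths of each top-level entry of a "b" domain, as in "b:lengths"
  std::vector<int32_t> b_lengths = {2, 3, 0, 1};
  int64_t top_offset = 0, child_offset = 0;
  const int64_t num = 2;  // top-level rows per batch
  while (top_offset < (int64_t)b_lengths.size()) {
    int64_t take =
        std::min<int64_t>(num, (int64_t)b_lengths.size() - top_offset);
    int64_t child_size = 0;
    for (int64_t k = 0; k < take; ++k) {
      child_size += b_lengths[top_offset + k];  // sum the consumed lengths
    }
    top_offset += take;
    child_offset += child_size;
    std::cout << "top=" << top_offset << " child=" << child_offset << "\n";
  }
  // prints: top=2 child=5, then top=4 child=6
}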

class TreeCursor {
 public:
  explicit TreeCursor(const TreeIterator& iterator) : it(iterator) {}
  std::vector<TOffset> offsets;
  std::mutex mutex_;
  TreeIterator it;
};

class CreateTreeCursorOp : public Operator<CPUContext> {
 public:
  CreateTreeCursorOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws),
        fields_(OperatorBase::GetRepeatedArgument<std::string>("fields")) {}

  bool RunOnDevice() override {
    *OperatorBase::Output<std::unique_ptr<TreeCursor>>(0) =
        std::unique_ptr<TreeCursor>(new TreeCursor(TreeIterator(fields_)));
    return true;
  }

 private:
  std::vector<std::string> fields_;
};

class ResetCursorOp : public Operator<CPUContext> {
 public:
  ResetCursorOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
    std::lock_guard<std::mutex> lock(cursor->mutex_);
    cursor->offsets.clear();
    return true;
  }
};

class CheckDatasetConsistencyOp : public Operator<CPUContext> {
 public:
  CheckDatasetConsistencyOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws),
        iterator_(OperatorBase::GetRepeatedArgument<std::string>("fields")) {}

  bool RunOnDevice() override {
    thread_local std::vector<const TLength*> lengths;
    thread_local std::vector<TOffset> limits;
    thread_local std::vector<TOffset> sizes;
    thread_local std::vector<TOffset> offsets;
    CAFFE_ENFORCE(
        InputSize() == iterator_.fields().size(),
        "Invalid number of fields. Expected ",
        iterator_.fields().size(),
        ", got ",
        InputSize());
    sizes.resize(iterator_.numOffsetFields());
    // gather length data
    lengths.resize(iterator_.numLengthFields());
    for (int i = 0; i < lengths.size(); ++i) {
      lengths[i] = Input(iterator_.lengthField(i).id).data<TLength>();
    }
    // gather size limits
    limits.assign(sizes.size(), std::numeric_limits<TOffset>::max());
    for (int i = 0; i < iterator_.fields().size(); ++i) {
      int lengthIdx = iterator_.fields()[i].lengthFieldId + 1;
      TOffset size = (TOffset)Input(i).dims()[0];
      if (limits[lengthIdx] == std::numeric_limits<TOffset>::max()) {
        limits[lengthIdx] = size;
      } else {
        CAFFE_ENFORCE(
            limits[lengthIdx] == size,
            "Inconsistent sizes for fields belonging to the same domain.",
            " Field: ",
            i,
            " (",
            iterator_.fields()[i].name,
            "); Length field index: ",
            lengthIdx,
            "; Previous size: ",
            limits[lengthIdx],
            "; New size: ",
            size);
      }
    }
    // advance to the end
    offsets.assign(sizes.size(), 0);
    iterator_.advance(lengths, offsets, sizes, limits, limits[0]);
    for (int i = 0; i < limits.size(); ++i) {
      CAFFE_ENFORCE(limits[i] == offsets[i]);
    }
    return true;
  }

 private:
  TreeIterator iterator_;
};

class ReadNextBatchOp : public Operator<CPUContext> {
 public:
  ReadNextBatchOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws),
        batchSize_(OperatorBase::GetSingleArgument<int>("batch_size", 1)) {}

  bool RunOnDevice() override {
    auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
    CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1);
    thread_local std::vector<const TLength*> lengths;
    thread_local std::vector<TOffset> limits;
    thread_local std::vector<TOffset> sizes;
    thread_local std::vector<TOffset> offsets;
    sizes.resize(cursor->it.numOffsetFields());
    // gather length data
    lengths.resize(cursor->it.numLengthFields());
    for (int i = 0; i < lengths.size(); ++i) {
      lengths[i] = Input(cursor->it.lengthField(i).id + 1).data<int>();
    }
    // gather size limits
    limits.assign(sizes.size(), std::numeric_limits<TOffset>::max());
    for (int i = 0; i < cursor->it.fields().size(); ++i) {
      int lengthFieldIdx = cursor->it.fields()[i].lengthFieldId + 1;
      limits[lengthFieldIdx] =
          std::min(limits[lengthFieldIdx], (TOffset)Input(i + 1).dims()[0]);
    }
    // advance cursor
    {
      std::lock_guard<std::mutex> lock(cursor->mutex_);
      if (cursor->offsets.empty()) {
        cursor->offsets.assign(sizes.size(), 0);
      }
      offsets = cursor->offsets;
      cursor->it.advance(lengths, cursor->offsets, sizes, limits, batchSize_);
    }
    // gather data
    thread_local std::vector<TIndex> outDim;
    for (int i = 0; i < cursor->it.fields().size(); ++i) {
      auto lengthIdx = cursor->it.fields()[i].lengthFieldId + 1;
      auto size = sizes[lengthIdx];
      auto offset = offsets[lengthIdx];
      auto& in = Input(i + 1);
      auto innerSize = in.size_from_dim(1);
      outDim = in.dims();
      outDim[0] = size;
      auto* out = Output(i);
      out->Resize(outDim);
      if (out->size() == 0) {
        continue;
      }
      void* src =
          (char*)in.raw_data() + offset * innerSize * in.meta().itemsize();
      void* dst = out->raw_mutable_data(in.meta());
      context_.template CopyItems<CPUContext, CPUContext>(
          in.meta(), out->size(), src, dst);
    }
    return true;
  }
  int batchSize_;
};

class ComputeOffsetOp : public Operator<CPUContext> {
 public:
  ComputeOffsetOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
    CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1);
    auto* out = Output(0);
    std::vector<const TLength*> lengths;
    std::vector<TOffset> limits;
    std::vector<TOffset> sizes;
    std::vector<TOffset> offsets;
    sizes.resize(cursor->it.numOffsetFields());
    // gather length data
    lengths.resize(cursor->it.numLengthFields());
    for (int i = 0; i < lengths.size(); ++i) {
      lengths[i] = Input(cursor->it.lengthField(i).id + 1).data<int>();
    }
    // gather size limits
    limits.assign(sizes.size(), std::numeric_limits<TOffset>::max());
    for (int i = 0; i < cursor->it.fields().size(); ++i) {
      int lengthFieldIdx = cursor->it.fields()[i].lengthFieldId + 1;
      limits[lengthFieldIdx] =
          std::min(limits[lengthFieldIdx], (TOffset)Input(i + 1).dims()[0]);
    }
    out->Resize(limits.at(0) + 1, sizes.size());
    auto* out_data = out->mutable_data<int64_t>();
    for (int k = 0; k <= limits.at(0); k++) {
      // advance cursor
      if (cursor->offsets.empty()) {
        cursor->offsets.assign(sizes.size(), 0);
      }
      // write output
      std::copy(cursor->offsets.begin(), cursor->offsets.end(), out_data);
      out_data += sizes.size();
      cursor->it.advance(lengths, cursor->offsets, sizes, limits, 1);
    }
    cursor->offsets.assign(sizes.size(), 0); // reset after getting meta info
    return true;
  }
};

class ReadRandomBatchOp : public Operator<CPUContext> {
 public:
  ReadRandomBatchOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws),
        batchSize_(OperatorBase::GetSingleArgument<int>("batch_size", 1)) {}
  bool RunOnDevice() override {
    auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
    auto& idxblob = Input(1);
    auto& offsetsmat = Input(2);
    CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 3);
    auto idxvec = idxblob.template data<int64_t>();
    auto& offsetdim = offsetsmat.dims();
    // gather data
    thread_local std::vector<TIndex> outDim;
    int64_t idx;
    {
      std::lock_guard<std::mutex> lock(cursor->mutex_);
      cursor->offsets.resize(1);
      idx = cursor->offsets.at(0);
      cursor->offsets.at(0) += batchSize_;
    }

    for (int i = 0; i < cursor->it.fields().size(); ++i) {
      auto lengthIdx = cursor->it.fields()[i].lengthFieldId + 1;
      auto& in = Input(i + 3);
      outDim = in.dims();
      outDim.at(0) = 0;
      auto idxbegin = idx;
      for (int j = 0; j < batchSize_; ++j) {
        if (idx >= idxblob.size()) {
          break;
        }
        CAFFE_ENFORCE(
            (idxvec[idx] + 1) * offsetdim[1] + lengthIdx < offsetsmat.size(),
            "Out of bounds when trying to get elem from offsetsmat");
        auto offsetptr = offsetsmat.template data<TOffset>() +
            idxvec[idx] * offsetdim[1] + lengthIdx;
        auto offset = *offsetptr;
        auto size = *(offsetptr + offsetdim[1]) - offset;
        outDim.at(0) += size; // accumulate over the batch
        idx++;
      }
      idx = idxbegin; // reset
      auto* out = Output(i);
      out->Resize(outDim);
      if (out->size() == 0) {
        continue;
      }
      auto innerSize = in.size_from_dim(1);
      auto dst = static_cast<char*>(out->raw_mutable_data(in.meta()));
      int block_size = in.size() / in.dim(0);
      int block_bytesize = in.nbytes() / in.dim(0);
      int start = 0;
      for (int j = 0; j < batchSize_; ++j) {
        if (idx >= idxblob.size()) {
          break;
        }
        auto offsetptr = offsetsmat.template data<TOffset>() +
            idxvec[idx] * offsetdim[1] + lengthIdx;
        auto offset = *offsetptr;
        auto size = *(offsetptr + offsetdim[1]) - offset;
        // copy data
        void* src =
            (char*)in.raw_data() + offset * innerSize * in.meta().itemsize();
        context_.template CopyItems<CPUContext, CPUContext>(
            in.meta(), size * block_size, src, dst + start * block_bytesize);
        start += size;
        idx++;
      }
      idx = idxbegin; // reset
    }
    return true;
  }
  int batchSize_;
};

template <class Context>
class AppendOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  AppendOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& a = Input(0);
    auto& b = Input(1);
    auto* c = Output(0);
    CAFFE_ENFORCE(b.ndim() >= 1);
    if (a.size() == 0) {
      c->CopyFrom(b);
      return true;
    }
    CAFFE_ENFORCE(&a == c, "First argument must be in-place.");
    CAFFE_ENFORCE(c->ndim() == b.ndim());
    CAFFE_ENFORCE(b.ndim() == c->ndim());
    CAFFE_ENFORCE(a.meta() == b.meta());
    for (int i = 1; i < a.ndim(); ++i) {
      CAFFE_ENFORCE(a.dims()[i] == b.dims()[i]);
    }
    auto oldSize = c->size();
    c->Extend(b.dims()[0], kDatasetGrowthPct, &context_);
    auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize();
    context_.template CopyItems<Context, Context>(
        b.meta(), b.size(), b.raw_data(), dst);
    return true;
  }
};

template <class Context>
class AtomicAppendOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  AtomicAppendOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& mutex = OperatorBase::Input<std::unique_ptr<std::mutex>>(0);
    const auto numFields = (InputSize() - 1) / 2;
    CAFFE_ENFORCE(OutputSize() == numFields);

    std::lock_guard<std::mutex> guard(*mutex);

    // 1: checks
    for (int i = 0; i < numFields; ++i) {
      auto& a = Input(1 + i);
      auto& b = Input(1 + i + numFields);
      auto* c = Output(i);
      CAFFE_ENFORCE(b.ndim() >= 1);
      if (a.size() == 0) {
        continue;
      }
      CAFFE_ENFORCE(
          (void*)&a == (void*)c, "Appended-to arguments must be in-place.");
      CAFFE_ENFORCE(c->ndim() == b.ndim());
      CAFFE_ENFORCE(b.ndim() == c->ndim());
      CAFFE_ENFORCE(a.meta() == b.meta());
      for (int j = 1; j < a.ndim(); ++j) {
        CAFFE_ENFORCE(a.dims()[j] == b.dims()[j]);
      }
    }

    // 2: copies
    for (int i = 0; i < numFields; ++i) {
      auto& a = Input(1 + i);
      auto& b = Input(1 + i + numFields);
      auto* c = Output(i);
      if (a.size() == 0) {
        c->CopyFrom(b);
        continue;
      }
      auto oldSize = c->size();
      c->Extend(b.dims()[0], kDatasetGrowthPct, &context_);
      auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize();
      context_.template CopyItems<Context, Context>(
          b.meta(), b.size(), b.raw_data(), dst);
    }
    return true;
  }
};

REGISTER_CPU_OPERATOR(CreateTreeCursor, CreateTreeCursorOp);
REGISTER_CPU_OPERATOR(ResetCursor, ResetCursorOp);
REGISTER_CPU_OPERATOR(ReadNextBatch, ReadNextBatchOp);
REGISTER_CPU_OPERATOR(ComputeOffset, ComputeOffsetOp);
REGISTER_CPU_OPERATOR(ReadRandomBatch, ReadRandomBatchOp);
REGISTER_CPU_OPERATOR(CheckDatasetConsistency, CheckDatasetConsistencyOp);
REGISTER_CPU_OPERATOR(Append, AppendOp<CPUContext>);
REGISTER_CPU_OPERATOR(AtomicAppend, AtomicAppendOp<CPUContext>);

OPERATOR_SCHEMA(CreateTreeCursor)
    .NumInputs(0)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Creates a cursor to iterate through a list of tensors, where some of those
tensors contain the lengths in a nested schema. The schema is determined by
the `fields` argument.

For example, to represent the following schema:

  Struct(
      a=Int(),
      b=List(List(Int)),
      c=List(
          Struct(
             c1=String,
             c2=List(Int),
          ),
      ),
  )

the field list will be:
  [
      "a",
      "b:lengths",
      "b:values:lengths",
      "b:values:values",
      "c:lengths",
      "c:c1",
      "c:c2:lengths",
      "c:c2:values",
  ]

And for the following instance of the struct:

  Struct(
      a=3,
      b=[[4, 5], [6, 7, 8], [], [9]],
      c=[
          Struct(c1='alex', c2=[10, 11]),
          Struct(c1='bob', c2=[12]),
      ],
  )

The values of the fields will be:
  {
      "a": [3],
      "b:lengths": [4],
      "b:values:lengths": [2, 3, 0, 1],
      "b:values:values": [4, 5, 6, 7, 8, 9],
      "c:lengths": [2],
      "c:c1": ["alex", "bob"],
      "c:c2:lengths": [2, 1],
      "c:c2:values": [10, 11, 12],
  }

In general, every field name in the format "{prefix}:lengths" defines a domain
"{prefix}", and every subsequent field in the format "{prefix}:{field}" will
be in that domain, and the length of the domain is provided for each entry of
the parent domain. In the example, "b:lengths" defines a domain of length 4, so
every field under domain "b" will have 4 entries.
The "lengths" field for a given domain must appear before any reference to
that domain.

Returns a pointer to an instance of the Cursor, which keeps the current offset
on each of the domains defined by `fields`. Cursor also ensures thread-safety
such that ReadNextBatch and ResetCursor can be used safely in parallel.

A cursor does not contain data per se, so calls to ReadNextBatch actually need
to pass a list of blobs containing the data to read for each one of the fields.
)DOC")
    .Output(0, "cursor", "A blob pointing to an instance of a new TreeCursor.")
    .Arg(
        "fields",
        "A list of strings each one representing a field of the dataset.");

OPERATOR_SCHEMA(ResetCursor)
    .NumInputs(1)
    .NumOutputs(0)
    .SetDoc(R"DOC(
Resets the offsets for the given TreeCursor. This operation is thread safe.
)DOC")
    .Input(0, "cursor", "A blob containing a pointer to the cursor.");

OPERATOR_SCHEMA(ReadNextBatch)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1, INT_MAX)
    .SetDoc(R"DOC(
Read the next batch of examples out of the given cursor and data blobs.

Input(0) is a blob pointing to a TreeCursor, and
[Input(1),... Input(num_fields)] a list of tensors containing the data for
each field of the dataset.

ReadNextBatch is thread safe.
)DOC")
    .Input(0, "cursor", "A blob containing a pointer to the cursor.")
    .Input(1, "dataset_field_0", "First dataset field")
    .Output(0, "field_0", "Tensor containing the next batch for field 0.")
    .Arg("batch_size", "Number of top-level entries to read.");

OPERATOR_SCHEMA(ComputeOffset)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Compute the offsets matrix given cursor and data blobs. Needs to be run at
the beginning or after resetting the cursor.

Input(0) is a blob pointing to a TreeCursor, and
[Input(1),... Input(num_fields)] a list of tensors containing the data for
each field of the dataset.

ComputeOffset is thread safe.
)DOC")
    .Input(0, "cursor", "A blob containing a pointer to the cursor.")
    .Input(1, "dataset_field_0", "First dataset field")
    .Output(0, "field_0", "Tensor containing offset info for this chunk.");

OPERATOR_SCHEMA(ReadRandomBatch)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1, INT_MAX)
    .SetDoc(R"DOC(
Read the next batch of examples out of the given cursor,
idx blob, offset matrix and data blobs.

Input(0) is a blob pointing to a TreeCursor,
Input(1) is a blob pointing to the shuffled idx
Input(2) is a blob pointing to the offset matrix and
[Input(3),... Input(num_fields)] a list of tensors containing the data for
each field of the dataset.

ReadRandomBatch is thread safe.
)DOC")
    .Input(0, "cursor", "A blob containing a pointer to the cursor.")
    .Input(1, "idx", "idx with a shuffled order.")
    .Input(2, "offsetsmat", "offset matrix containing length offset info.")
    .Input(3, "dataset_field_0", "First dataset field")
    .Output(0, "field_0", "Tensor containing the next batch for field 0.")
    .Arg("batch_size", "Number of top-level entries to read.");

OPERATOR_SCHEMA(CheckDatasetConsistency)
    .NumInputs(1, INT_MAX)
    .NumOutputs(0)
    .SetDoc(R"DOC(
Checks that the given data fields represent a consistent dataset under
the schema specified by the `fields` argument. Operator fails if the fields
are not consistent. If data is consistent, each field's data can be safely
appended to an existing dataset, keeping it consistent.
)DOC")
    .Input(0, "field_0", "Data for field 0.")
    .Arg(
        "fields",
        "List of strings representing the string names in the format "
        "specified in the doc for CreateTreeCursor.");

OPERATOR_SCHEMA(Append)
    .NumInputs(2)
    .NumOutputs(1)
    .EnforceInplace({{0, 0}})
    .SetDoc(R"DOC(
Append input 2 to the end of input 1.
Input 1 must be the same as output, that is, it is required to be in-place.
Input 1 may have to be re-allocated in order to accommodate the new size.
Currently, an exponential growth ratio is used in order to ensure amortized
constant time complexity.
All except the outer-most dimension must be the same between input 1 and 2.
)DOC")
    .Input(0, "dataset", "The tensor to be appended to.")
    .Input(1, "new_data", "Tensor to append to the end of dataset.")
    .Output(0, "dataset", "Same as input 0, representing the mutated tensor.");

OPERATOR_SCHEMA(AtomicAppend)
    .NumInputs(3, INT_MAX)
    .NumOutputs(1, INT_MAX)
    // Input 0 is the mutex, so data input i + 1 is appended in place into
    // output i.
    .AllowInplace([](int in, int out) { return in == out + 1; });

SHOULD_NOT_DO_GRADIENT(CreateTreeCursor);
SHOULD_NOT_DO_GRADIENT(ResetCursor);
SHOULD_NOT_DO_GRADIENT(ReadNextBatch);
SHOULD_NOT_DO_GRADIENT(ComputeOffset);
SHOULD_NOT_DO_GRADIENT(ReadRandomBatch);
SHOULD_NOT_DO_GRADIENT(CheckDatasetConsistency);
SHOULD_NOT_DO_GRADIENT(Append);
SHOULD_NOT_DO_GRADIENT(AtomicAppend);
}  // namespace
}  // namespace caffe2
@ -2,16 +2,16 @@
#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "caffe2/core/logging.h"

namespace caffe2 {

using NumericTypes = TensorTypes<int32_t, int64_t, float, double>;
class SameTypeAsInput {};

template<typename OutputTemplate, typename InputType>
template <typename OutputTemplate, typename InputType>
struct TypeForOutput {
  using value = OutputTemplate;
};
@ -21,12 +21,26 @@ struct TypeForOutput<SameTypeAsInput, InputType> {
  using value = InputType;
};

template <typename InputTypes, class Context, class Functor,
/**
 * Generic meta-operator that is able to process element-wise operations on
 * a single input tensor, returning a tensor with the same shape, and either
 * of the same type as the input or of a specified result type.
 *
 * The functor provided must implement operator() as a template on input and
 * output types, and on a Context. Moreover, it needs to provide a constructor
 * that takes OperatorBase& as argument. This is in order to consume arguments
 * passed to the operator instance.
 */
template <
    typename InputTypes,
    class Context,
    class Functor,
    class OutputType = SameTypeAsInput>
class UnaryElementwiseOp : public Operator<Context> {
class UnaryElementwiseWithArgsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(UnaryElementwiseOp);
  UnaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws), functor(*this) {}

  bool RunOnDevice() override {
    return DispatchHelper<InputTypes>::call(this, Input(0));

@ -38,11 +52,67 @@ class UnaryElementwiseOp : public Operator<Context> {
    auto* output = Output(0);
    output->ResizeLike(input);
    using R = typename TypeForOutput<OutputType, T>::value;
    Functor()(input.size(), input.template data<T>(),
        output->template mutable_data<R>(), &context_);
    functor(
        input.size(),
        input.template data<T>(),
        output->template mutable_data<R>(),
        &context_);
    return true;
  }

  Functor functor;
};
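A hypothetical functor satisfying the contract described above (the name ScaleFunctor and the "scale" argument are illustrative, not part of this diff): the constructor consumes an operator argument, and operator() is templated on input/output types and Context.

// Illustrative sketch only; it would live next to the operator definitions.
struct ScaleFunctor {
  explicit ScaleFunctor(OperatorBase& op)
      : scale_(op.GetSingleArgument<float>("scale", 1.0f)) {}

  template <typename In, typename Out, typename Context>
  void operator()(int n, const In* in, Out* out, Context* /*context*/) {
    for (int i = 0; i < n; ++i) {
      out[i] = static_cast<Out>(in[i] * scale_);  // apply the "scale" arg
    }
  }

  float scale_;
};

// Hypothetical instantiation:
// using ScaleOp =
//     UnaryElementwiseWithArgsOp<TensorTypes<float>, CPUContext, ScaleFunctor>;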

/**
 * WithDefaultConstructor is a functor that can be used as the functor of a
 * UnaryElementwiseWithArgsOp. It simply forwards the operator() call into
 * another functor that doesn't accept arguments in its constructor.
 */
template <typename Functor>
struct WithDefaultConstructor {
  explicit WithDefaultConstructor(OperatorBase& op) {}

  template <typename In, typename Out, typename Context>
  void operator()(int n, const In* in, Out* out, Context* c) {
    Functor()(n, in, out, c);
  }
};

/**
 * UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the
 * difference that it takes a functor with default constructor, e.g. one that
 * does not need to take into consideration any arguments during operator
 * creation.
 */
template <
    typename InputTypes,
    class Context,
    class Functor,
    class OutputType = SameTypeAsInput>
using UnaryElementwiseOp = UnaryElementwiseWithArgsOp<
    InputTypes,
    Context,
    WithDefaultConstructor<Functor>,
    OutputType>;

/**
 * ForEach is a unary functor that forwards each element of the input array
 * into the elementwise Functor provided, and gathers the results of each
 * call into the resulting array. Use it as an adaptor if you want to create
 * a UnaryElementwiseOp that acts on each element of the tensor per function
 * call -- this is reasonable for complex types where vectorization wouldn't
 * be much of a gain, performance-wise.
 */
template <typename Functor>
struct ForEach {
  explicit ForEach(OperatorBase& op) : functor(op) {}

  template <typename In, typename Out, typename Context>
  void operator()(int n, const In* in, Out* out, Context* c) {
    for (int i = 0; i < n; ++i) {
      out[i] = functor(in[i]);
    }
  }
  Functor functor;
};
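The adaptor pattern in isolation (a standalone sketch, independent of the Caffe2 types; ForEachSketch and StrLen are illustrative names): a per-element functor is lifted to an array functor, here mapping strings to their lengths, which is exactly the non-vectorizable, type-changing case ForEach targets.

#include <iostream>
#include <string>
#include <vector>

template <typename Functor>
struct ForEachSketch {
  Functor functor;
  template <typename In, typename Out>
  void operator()(int n, const In* in, Out* out) {
    for (int i = 0; i < n; ++i) {
      out[i] = functor(in[i]);  // one functor call per element
    }
  }
};

struct StrLen {
  int operator()(const std::string& s) { return (int)s.size(); }
};

int main() {
  std::vector<std::string> in = {"alex", "bob"};
  std::vector<int> out(in.size());
  ForEachSketch<StrLen>{}((int)in.size(), in.data(), out.data());
  std::cout << out[0] << " " << out[1] << "\n";  // prints 4 3
}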

/**

@ -113,7 +183,7 @@ class BinaryElementwiseOp : public Operator<Context> {
      return false;
    }
    for (int i = 0; i < b.size(); ++i) {
      if (a[a.size()-1-i] != b[b.size()-1-i]) {
      if (a[a.size() - 1 - i] != b[b.size() - 1 - i]) {
        return false;
      }
    }
@ -192,8 +262,12 @@ class DivGradientOp final : public Operator<Context> {
    }                                                    \
  };                                                     \
  template <class DC>                                    \
  using name##Op = BinaryElementwiseOp<NumericTypes, DC, \
      name##Functor, SameTypeAsInput, true>
  using name##Op = BinaryElementwiseOp<                  \
      NumericTypes,                                      \
      DC,                                                \
      name##Functor,                                     \
      SameTypeAsInput,                                   \
      true>

CAFFE2_BINARY_FUNCTOR_WRAPPER(Add);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Sub);
@ -225,8 +299,8 @@ CAFFE2_BINARY_FUNCTOR_WRAPPER(Div);
    }                                                    \
  };                                                     \
  template <class DC>                                    \
  using name##Op = BinaryElementwiseOp<                  \
      NumericTypes, DC, name##Functor, bool, true>
  using name##Op =                                       \
      BinaryElementwiseOp<NumericTypes, DC, name##Functor, bool, true>

CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(LT);
CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(LE);
@ -18,6 +18,7 @@ REGISTER_CPU_OPERATOR(UniformFill, UniformFillOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(UniformIntFill, UniformFillOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(ConstantFill, ConstantFillOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(ConstantIntFill, ConstantFillOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(ConstantBoolFill, ConstantFillOp<bool, CPUContext>);
REGISTER_CPU_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(GivenTensorIntFill, GivenTensorFillOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(GaussianFill, GaussianFillOp<float, CPUContext>);

@ -30,6 +31,10 @@ OPERATOR_SCHEMA(UniformFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}}
OPERATOR_SCHEMA(UniformIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ConstantFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ConstantIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ConstantBoolFill)
    .NumInputs(0, 1)
    .NumOutputs(1)
    .AllowInplace({{0, 0}});
OPERATOR_SCHEMA(GivenTensorFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(GivenTensorIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(GaussianFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
@ -56,14 +56,12 @@ class FullyConnectedOp final : public Operator<Context> {
    CAFFE_ENFORCE(N == b.dim32(0), dimErrorString());
    CAFFE_ENFORCE(N == b.size(), dimErrorString());

    // Create the Y shape (without allocation)
    static thread_local vector<TIndex> Y_shape;
    Y_shape = X.dims();
    Y_shape_cache_ = X.dims();
    // This is an invariant of canonical_axis, so we can DCHECK.
    DCHECK_LE(canonical_axis + 1, Y_shape.size());
    Y_shape.resize(canonical_axis + 1);
    Y_shape[canonical_axis] = N;
    Y->Resize(Y_shape);
    DCHECK_LE(canonical_axis + 1, Y_shape_cache_.size());
    Y_shape_cache_.resize(canonical_axis + 1);
    Y_shape_cache_[canonical_axis] = N;
    Y->Resize(Y_shape_cache_);
    CAFFE_ENFORCE(M * N == Y->size(), dimErrorString());

    // W * x

@ -88,6 +86,9 @@ class FullyConnectedOp final : public Operator<Context> {

 protected:
  size_t axis_{1};
  // A local vector to cache the output shape so we don't need to recreate
  // a vector object every time we run Run().
  vector<TIndex> Y_shape_cache_;
  Tensor<Context> bias_multiplier_;
};
@ -7,7 +7,6 @@
#include <limits>

namespace caffe2 {

namespace {
using IndexKeyTypes = TensorTypes<int32_t, int64_t, std::string>;
using TIndexValue = int64_t;
@ -22,12 +21,17 @@ struct IndexBase {
  void Freeze() { frozen_ = true; }
  virtual ~IndexBase() {}
  const TypeMeta& Type() const { return meta_; }
  TIndexValue Size() {
    std::lock_guard<std::mutex> guard(dictMutex_);
    return nextId_;
  }

 protected:
  int64_t maxElements_;
  TypeMeta meta_;
  TIndexValue nextId_{1}; // guarded by dictMutex_
  std::atomic<bool> frozen_{false};
  std::mutex dictMutex_;
};

template<typename T>
@ -96,7 +100,6 @@ struct Index: IndexBase {
  }

  std::unordered_map<T, TIndexValue> dict_;
  std::mutex dictMutex_;
};

template<class T>
@ -142,7 +145,9 @@ class IndexGetOp: public Operator<CPUContext> {
class IndexLoadOp: public Operator<CPUContext> {
 public:
  IndexLoadOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws) {}
      : Operator(operator_def, ws),
        skipFirstEntry_(
            OperatorBase::GetSingleArgument<int>("skip_first_entry", 0)) {}

  bool RunOnDevice() override {
    return DispatchHelper<IndexKeyTypes>::call(this, Input(1));

@ -153,8 +158,18 @@ class IndexLoadOp: public Operator<CPUContext> {
    auto* dict = dynamic_cast_if_rtti<Index<T>*>(base.get());
    CAFFE_ENFORCE(dict, "Wrong dictionary type given input keys.");
    const auto& keys = Input(1);
    return dict->Load(keys.data<T>(), keys.size());
    const auto* keys_data = keys.data<T>();
    auto keys_size = keys.size();
    if (skipFirstEntry_) {
      CAFFE_ENFORCE(keys.size() > 0);
      ++keys_data;
      --keys_size;
    }
    return dict->Load(keys_data, keys_size);
  }

 private:
  bool skipFirstEntry_;
};

class IndexStoreOp: public Operator<CPUContext> {
@ -188,6 +203,19 @@ class IndexFreezeOp: public Operator<CPUContext> {
  }
};

class IndexSizeOp : public Operator<CPUContext> {
 public:
  IndexSizeOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator(operator_def, ws) {}

  bool RunOnDevice() override {
    auto& base = OperatorBase::Input<std::unique_ptr<IndexBase>>(0);
    auto* out = Output(0);
    out->Resize(std::vector<TIndex>{});
    *out->mutable_data<TIndexValue>() = base->Size();
    return true;
  }
};

REGISTER_CPU_OPERATOR(IntIndexCreate, IndexCreateOp<int32_t>);
REGISTER_CPU_OPERATOR(LongIndexCreate, IndexCreateOp<int64_t>);
@ -197,6 +225,7 @@ REGISTER_CPU_OPERATOR(IndexGet, IndexGetOp);
REGISTER_CPU_OPERATOR(IndexLoad, IndexLoadOp);
REGISTER_CPU_OPERATOR(IndexStore, IndexStoreOp);
REGISTER_CPU_OPERATOR(IndexFreeze, IndexFreezeOp);
REGISTER_CPU_OPERATOR(IndexSize, IndexSizeOp);

OPERATOR_SCHEMA(IntIndexCreate)
    .NumInputs(0)

@ -250,7 +279,6 @@ Should not be called concurrently with IndexGet.
)DOC")
    .Input(0, "handle", "Pointer to an Index instance.");

OPERATOR_SCHEMA(IndexLoad)
    .NumInputs(2)
    .NumOutputs(0)
@ -259,7 +287,12 @@ Loads the index from the given 1-D tensor. Elements in the tensor will be given
consecutive indexes starting at 1. Fails if tensor contains repeated elements.
)DOC")
    .Input(0, "handle", "Pointer to an Index instance.")
    .Input(1, "items", "1-D tensor with elements starting with index 1.");
    .Input(1, "items", "1-D tensor with elements starting with index 1.")
    .Arg(
        "skip_first_entry",
        "If set, skips the first entry of the tensor. This makes it "
        "possible to load tensors that are aligned with an embedding, "
        "where the first entry corresponds to the default 0 index entry.");

OPERATOR_SCHEMA(IndexStore)
    .NumInputs(1)

@ -271,6 +304,15 @@ for unknowns, the first element of the output tensor will be element of index 1.
    .Input(0, "handle", "Pointer to an Index instance.")
    .Output(0, "items", "1-D tensor with elements starting with index 1.");

OPERATOR_SCHEMA(IndexSize)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Returns the number of entries currently present in the index.
)DOC")
    .Input(0, "handle", "Pointer to an Index instance.")
    .Output(0, "items", "Scalar int64 tensor with number of entries.");

NO_GRADIENT(IndexGetOp);
NO_GRADIENT(IntIndexCreate);
NO_GRADIENT(LongIndexCreate);

@ -278,5 +320,5 @@ NO_GRADIENT(StringIndexCreate);
SHOULD_NOT_DO_GRADIENT(IndexFreeze);
SHOULD_NOT_DO_GRADIENT(IndexLoad);
SHOULD_NOT_DO_GRADIENT(IndexStore);

SHOULD_NOT_DO_GRADIENT(IndexSize);
}  // namespace caffe2
@ -3,6 +3,7 @@

#include <cstdio>
#include <map>
#include <unordered_set>

#include "caffe2/core/context.h"
#include "caffe2/core/db.h"
@ -65,12 +66,18 @@ class LoadOp final : public Operator<Context> {
    // chunks. This way we can make sure that all chunks were loaded in the end.
    // This is a map from output index to current size of the blob
    std::map<int, size_t> blobSizes;

    std::unordered_set<string> loaded;
    for (; cursor->Valid(); cursor->Next()) {
      const string& key = cursor->key();
      if (!output_indices_.count(key)) {
        VLOG(1) << "Key " << key << " not used. Skipping.";
      } else {
        CAFFE_ENFORCE(
            loaded.count(key) == 0,
            "Multiple copies of blob ",
            key,
            " found in the db.");

        VLOG(2) << "Deserializing blob " << key;
        BlobProto proto;
        CHECK(proto.ParseFromString(cursor->value()));
@ -101,6 +108,15 @@ class LoadOp final : public Operator<Context> {
          blobSize.first->second = blob->Get<Tensor<Context>>().size();
        }
      }

      if (!proto.has_tensor() ||
          blobSize.first->second >= blob->Get<Tensor<Context>>().size()) {
        loaded.insert(key);
      }

      if (loaded.size() >= OutputSize()) {
        break;
      }
    }
  }
@ -116,6 +132,8 @@ class LoadOp final : public Operator<Context> {
          blobSize.second);
      }
    }

    CHECK_EQ(loaded.size(), OutputSize());
  }

 private:
caffe2/operators/one_hot_ops.cc (new file, 107 lines)

@ -0,0 +1,107 @@
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
class OneHotOp : public Operator<CPUContext> {
|
||||
public:
|
||||
OneHotOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& indices = Input(0);
|
||||
auto& index_size_tensor = Input(1);
|
||||
CAFFE_ENFORCE(indices.ndim() == 1);
|
||||
CAFFE_ENFORCE(index_size_tensor.size() == 1);
|
||||
auto batch_size = indices.size();
|
||||
auto index_size = *index_size_tensor.data<int64_t>();
|
||||
|
||||
auto* indices_ptr = indices.data<int64_t>();
|
||||
auto* one_hots = Output(0);
|
||||
one_hots->Resize(std::vector<TIndex>{batch_size, index_size});
|
||||
if (one_hots->size() == 0) {
|
||||
return true;
|
||||
}
|
||||
auto* one_hots_ptr = one_hots->mutable_data<float>();
|
||||
memset(one_hots_ptr, 0, one_hots->nbytes());
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
auto label_idx = indices_ptr[i];
|
||||
DCHECK((0 <= label_idx) && (label_idx < index_size));
|
||||
one_hots_ptr[label_idx] = 1.0;
|
||||
one_hots_ptr += index_size;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
class SegmentOneHotOp : public Operator<CPUContext> {
|
||||
public:
|
||||
SegmentOneHotOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& lengths = Input(0);
|
||||
auto& indices = Input(1);
|
||||
auto& index_size_tensor = Input(2);
|
||||
CAFFE_ENFORCE(lengths.ndim() == 1);
|
||||
CAFFE_ENFORCE(indices.ndim() == 1);
|
||||
CAFFE_ENFORCE(index_size_tensor.size() == 1);
|
||||
auto batch_size = lengths.size();
|
||||
auto index_size = *index_size_tensor.data<int64_t>();
|
||||
CAFFE_ENFORCE(index_size > 0);
|
||||
|
||||
auto* lengths_ptr = lengths.data<int32_t>();
|
||||
auto* indices_ptr = indices.data<int64_t>();
|
||||
auto* one_hots = Output(0);
|
||||
one_hots->Resize(std::vector<TIndex>{batch_size, index_size});
|
||||
auto* one_hots_ptr = one_hots->mutable_data<float>();
|
||||
if (one_hots->size() == 0) {
|
||||
return true;
|
||||
}
|
||||
memset(one_hots_ptr, 0, one_hots->nbytes());
|
||||
int el_idx = 0;
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
for (int j = 0; j < lengths_ptr[i]; ++j) {
|
||||
DCHECK(el_idx < indices.size());
|
||||
auto label_idx = indices_ptr[el_idx++];
|
||||
DCHECK((0 <= label_idx) && (label_idx < index_size));
|
||||
one_hots_ptr[label_idx] = 1.0;
|
||||
}
|
||||
one_hots_ptr += index_size;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(OneHot, OneHotOp);
REGISTER_CPU_OPERATOR(SegmentOneHot, SegmentOneHotOp);

OPERATOR_SCHEMA(OneHot)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given a sequence of indices, one for each example in a batch, returns a matrix
where each inner dimension has the size of the index and has 1.0 in the index
active in the given example, and 0.0 everywhere else.
)DOC")
    .Input(0, "indices", "The active index for each example in the batch.")
    .Input(1, "index_size_tensor", "Scalar with the size of the index.")
    .Output(0, "one_hots", "Matrix of size len(indices) x index_size");

OPERATOR_SCHEMA(SegmentOneHot)
    .NumInputs(3)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given a sequence of indices, segmented by the lengths tensor, returns a matrix
that has the elements in each sequence set to 1.0, and 0.0 everywhere else.
)DOC")
    .Input(0, "lengths", "Size of each segment.")
    .Input(1, "indices", "Active indices, of size sum(lengths)")
    .Input(2, "index_size_tensor", "Size of the index")
    .Output(0, "one_hots", "Matrix of size len(lengths) x index_size");

NO_GRADIENT(OneHot);
NO_GRADIENT(SegmentOneHot);
}  // namespace
}  // namespace caffe2
@ -124,6 +124,67 @@ struct LogSumExpRangeReducerDef {
      "input slices. Operation doesn't change the shape of individual blocks.";
};

template <typename T, class Context>
class LogMeanExpRangeReducer;
template <typename T, class Context>
class LogMeanExpRangeReducerGradient;

template <typename T>
class LogMeanExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* context) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      scaled_exp_sum /= blocks;
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
};

template <typename T, class Context>
class LogMeanExpRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* context) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks;
      }
    }
  }
};

struct LogMeanExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogMeanExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogMeanExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogMeanExp";
  static constexpr const char* doc =
      "LogMeanExp computes the element-wise log of the mean of exponentials "
      "of input slices. Operation doesn't change the shape of individual "
      "blocks.";
};

template <typename T, class Context>
class MeanRangeReducer;
|
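
The reducer uses the standard max-shift for numerical stability: log(mean(exp(x))) = max(x) + log(mean(exp(x - max(x)))). A numpy sketch of the same reduction over the segment axis (illustrative, not from the commit):

import numpy as np

def log_mean_exp(x):
    # x: (blocks, block_size); reduce over blocks, as the CPU reducer does.
    m = x.max(axis=0)
    return m + np.log(np.mean(np.exp(x - m), axis=0))

x = np.random.randn(4, 3)
assert np.allclose(log_mean_exp(x), np.log(np.exp(x).mean(axis=0)))
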
@ -180,10 +241,78 @@ struct MeanRangeReducerDef {
  static constexpr const char* doc =
      "Mean computation is done element-wise, so that each element of the "
      "output slice corresponds to the average value of the respective "
      "elements in the input slives. Operation doesn't change the shape of "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks.";
};

template <typename T, class Context>
class MaxRangeReducer;
template <typename T, class Context>
class MaxRangeReducerGradient;

template <typename T>
class MaxRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* context) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      *(out++) = max_value;
    }
  }
};

template <typename T, class Context>
class MaxRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* context) {
    std::memset(
        static_cast<void*>(data_grad), 0, blocks * block_size * sizeof(T));
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T out = data_out[j];
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        if (out == data_in[idx]) {
          data_grad[idx] = out_grad;
          break;
        }
      }
    }
  }
};

struct MaxRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MaxRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MaxRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Max";
  static constexpr const char* doc =
      "Max computation is done element-wise, so that each element of the "
      "output slice corresponds to the max value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks. This implementation imitates torch nn.Max operator. "
      "If the maximum value occurs more than once, the operator will return "
      "the first occurrence of the value. When computing the gradient using "
      "the backward propagation, the gradient input corresponding to the "
      "first occurrence of the maximum value will be used.";
};

////////////////////////////////////////////////////////////////////////////////
// Incremental reducers: consume elements one by one
////////////////////////////////////////////////////////////////////////////////
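
The gradient routes each output gradient entirely to the first block that attains the maximum, mirroring torch's nn.Max. In numpy terms (a sketch, not from the commit):

import numpy as np

def max_reducer_grad(x, out_grad):
    # x: (blocks, block_size); out_grad: (block_size,)
    grad = np.zeros_like(x)
    first = np.argmax(x == x.max(axis=0), axis=0)  # first occurrence per column
    grad[first, np.arange(x.shape[1])] = out_grad
    return grad
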
@ -32,12 +32,16 @@ class CuDNNReluOp final : public Operator<CUDAContext> {
    if (X.dims() != cudnn_input_dims_) {
      VLOG(1) << "Setting descriptors.";
      cudnn_input_dims_ = X.dims();
      int C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
      int H = 1;
      int W = 1;
      int C = 1, H = 1, W = 1;
      if (X.ndim() == 4) {
        // Normal 4-dimensional tensors for images.
        C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
        H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1));
        W = (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2));
      } else {
        // If X is not 4-dimensional, we will simply use H = 1 and W = 1
        // and wrap everything into C.
        C = X.size() / X.dim32(0);
      }
      CUDNN_CHECK(cudnnSetTensor4dDescriptor(
          data_desc_, GetCudnnTensorFormat(order_),

@ -93,12 +97,16 @@ class CuDNNReluGradientOp final : public Operator<CUDAContext> {
    if (Y.dims() != cudnn_input_dims_) {
      VLOG(1) << "Setting descriptors.";
      cudnn_input_dims_ = Y.dims();
      int C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3));
      int H = 1;
      int W = 1;
      int C = 1, H = 1, W = 1;
      if (Y.ndim() == 4) {
        // Normal 4-dimensional tensors for images.
        C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3));
        H = (order_ == StorageOrder::NCHW ? Y.dim32(2) : Y.dim32(1));
        W = (order_ == StorageOrder::NCHW ? Y.dim32(3) : Y.dim32(2));
      } else {
        // If Y is not 4-dimensional, we will simply use H = 1 and W = 1
        // and wrap everything into C.
        C = Y.size() / Y.dim32(0);
      }
      CUDNN_CHECK(cudnnSetTensor4dDescriptor(
          data_desc_, GetCudnnTensorFormat(order_),
caffe2/operators/reverse_packed_segs_op.cc (new file, 35 lines)
@ -0,0 +1,35 @@
#include "caffe2/operators/reverse_packed_segs_op.h"

namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(ReversePackedSegs, ReversePackedSegsOp<CPUContext>);

OPERATOR_SCHEMA(ReversePackedSegs)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Reverse segments in a 3-D tensor (lengths, segments, embeddings,), leaving
paddings unchanged. This operator is used to reverse input of a recurrent neural
network to make it a BRNN.
)DOC")
    .Input(0, "data", "a 3-D (lengths, segments, embeddings,) tensor.")
    .Input(1, "lengths", "length of each segment.")
    .Output(
        0,
        "reversed data",
        "a (lengths, segments, embeddings,) tensor with each segment reversed "
        "and paddings unchanged.");

class GetReversePackedSegsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "ReversePackedSegs",
        "",
        vector<string>{GO(0), I(1)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(ReversePackedSegs, GetReversePackedSegsGradient);
} // namespace
} // namespace caffe2
caffe2/operators/reverse_packed_segs_op.h (new file, 84 lines)
@ -0,0 +1,84 @@
#ifndef CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_
#define CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"

namespace caffe2 {

template <class Context>
class ReversePackedSegsOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(ReversePackedSegsOp);
  USE_DISPATCH_HELPER;

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float, double, int, long, bool>>::call(
        this, Input(DATA));
  }

  template <typename T>
  bool DoRunWithType() {
    if (Input(LENGTHS).template IsType<int>()) {
      DoRunWithLengthType<T, int>();
    } else {
      DoRunWithLengthType<T, long>();
    }
    return true;
  }

 private:
  INPUT_TAGS(DATA, LENGTHS);

  template <typename T, typename LengthType>
  void DoRunWithLengthType() {
    const auto& data = Input(DATA);
    const auto& lengths = Input(LENGTHS);

    CAFFE_ENFORCE(
        data.ndim() == 3,
        "DATA should be 3-D tensor <lengths, "
        "segments, embeddings>");
    CAFFE_ENFORCE(lengths.ndim() == 1, "LENGTH should be 1-D");

    auto* output = Output(0);
    const auto& shape = data.dims();
    output->Resize(shape);

    const auto& max_length = data.dims()[0];
    const auto& batch_size = data.dims()[1];
    const auto& block_size = data.dims()[2];
    CAFFE_ENFORCE(
        lengths.dims()[0] == batch_size,
        "lengths size should be"
        " equal to batch size");

    const T* data_ptr = data.template data<T>();
    const LengthType* lengths_ptr = lengths.template data<LengthType>();
    T* rev_data_ptr = output->template mutable_data<T>();
    for (TIndex i = 0; i < batch_size; i++) {
      const auto& seg_length = lengths_ptr[i];
      CHECK_LE(seg_length, max_length);
      TIndex j = 0;
      for (; j < seg_length; j++) {
        const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size;
        T* rev_data_block_ptr =
            rev_data_ptr + ((seg_length - 1 - j) * batch_size + i) * block_size;
        context_.template Copy<T, Context, Context>(
            block_size, data_block_ptr, rev_data_block_ptr);
      }
      for (; j < max_length; j++) {
        const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size;
        T* rev_data_block_ptr =
            rev_data_ptr + (j * batch_size + i) * block_size;
        context_.template Copy<T, Context, Context>(
            block_size, data_block_ptr, rev_data_block_ptr);
      }
    }
  }
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_
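
As a rough illustration of the semantics (blob names hypothetical): with max_length 3, batch_size 2 and block_size 1, a sequence of length 2 reverses its two valid steps and keeps its padding step in place:

from caffe2.python import core, workspace
import numpy as np

data = np.arange(6, dtype=np.float32).reshape(3, 2, 1)  # (lengths, segments, embeddings)
lengths = np.array([3, 2], dtype=np.int32)
workspace.FeedBlob('data', data)
workspace.FeedBlob('lengths', lengths)
workspace.RunOperatorOnce(
    core.CreateOperator('ReversePackedSegs', ['data', 'lengths'], ['reversed']))
# Sequence 0 ([0, 2, 4]) becomes [4, 2, 0]; sequence 1 ([1, 3, 5]) becomes
# [3, 1, 5], its padding step 5 untouched.
print(workspace.FetchBlob('reversed'))
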
@ -1066,9 +1066,15 @@ REGISTER_SEGMENT_DEF(
REGISTER_SEGMENT_DEF(
    AbstractSortedSegmentRangeDef<float, int, CPUContext,
                                  LogSumExpRangeReducerDef>);
REGISTER_SEGMENT_DEF(AbstractSortedSegmentRangeDef<
                     float,
                     int,
                     CPUContext,
                     LogMeanExpRangeReducerDef>);
REGISTER_SEGMENT_DEF(
    AbstractSortedSegmentRangeDef<float, int, CPUContext,
                                  MeanRangeReducerDef>);
    AbstractSortedSegmentRangeDef<float, int, CPUContext, MeanRangeReducerDef>);
REGISTER_SEGMENT_DEF(
    AbstractSortedSegmentRangeDef<float, int, CPUContext, MaxRangeReducerDef>);

#define REGISTER_REDUCER_WITH_ALL_OPS(reducer_def) \
  REGISTER_SEGMENT_DEF( \
caffe2/operators/sparse_to_dense_mask_op.cc (new file, 152 lines)
@ -0,0 +1,152 @@
#include <algorithm>
#include <unordered_map>
#include <vector>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {

using ValueTypes = TensorTypes<int32_t, int64_t, float, double, string, bool>;

class SparseToDenseMaskOp : public Operator<CPUContext> {
 public:
  SparseToDenseMaskOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws) {
    std::vector<int> mask = GetRepeatedArgument<int>("mask");
    featuresCount_ = mask.size();
    auto biggest = *std::max_element(mask.begin(), mask.end());
    dense_.assign(std::min(kMaxDenseSize, biggest + 1), -1);
    for (int i = 0; i < mask.size(); i++) {
      int id = mask[i];
      CAFFE_ENFORCE(id >= 0, "Only non-negative IDs are allowed.");
      if (id >= kMaxDenseSize) {
        sparse_[id] = i;
      } else {
        dense_[id] = i;
      }
    }
  }

  bool RunOnDevice() override {
    const TypeMeta& meta = Input(INDICES).meta();
    if (meta.Match<int32_t>()) {
      return DoRunWithIndexType<int32_t>();
    } else if (meta.Match<int64_t>()) {
      return DoRunWithIndexType<int64_t>();
    } else {
      CAFFE_THROW("Unsupported type of tensor: ", meta.name());
      return false;
    }
  }

  template <typename TInd>
  bool DoRunWithIndexType() {
    if (InputSize() < 4) {
      return DoRunWithLengthType<TInd, int32_t>();
    } else {
      const TypeMeta& meta = Input(LENGTHS).meta();
      if (meta.Match<int32_t>()) {
        return DoRunWithLengthType<TInd, int32_t>();
      } else if (meta.Match<int64_t>()) {
        return DoRunWithLengthType<TInd, int64_t>();
      } else {
        CAFFE_THROW("Unsupported type of tensor: ", meta.name());
        return false;
      }
    }
  }

  template <typename TInd, typename TLen>
  bool DoRunWithLengthType() {
    return DispatchHelper<ValueTypes, TInd, TLen>::call(this, Input(VALUES));
  }

  template <typename TInd, typename TLen, typename TVal>
  bool DoRunWithType() {
    auto& sparse_indices = Input(INDICES);
    CAFFE_ENFORCE(sparse_indices.ndim() == 1);
    auto& sparse_values = Input(VALUES);
    CAFFE_ENFORCE(sparse_values.ndim() == 1);
    CAFFE_ENFORCE(sparse_indices.size() == sparse_values.size());
    auto& default_value = Input(DEFAULT);
    CAFFE_ENFORCE(default_value.size() == 1);

    const TInd* sparse_indices_vec = sparse_indices.data<TInd>();
    const TVal* sparse_values_vec = sparse_values.template data<TVal>();
    const TVal* default_val = default_value.template data<TVal>();

    int cols = featuresCount_;
    int rows = 0;
    TLen default_length = sparse_indices.dim32(0);
    const TLen* lengths_vec = nullptr;
    auto* output = Output(0);
    if (InputSize() == 4) {
      auto& lengths = Input(LENGTHS);
      CAFFE_ENFORCE(lengths.ndim() == 1);
      lengths_vec = lengths.data<TLen>();
      rows = lengths.dim32(0);
      output->Resize(rows, cols);
    }
    if (rows == 0) {
      // if LENGTHS is not set, or is empty, the output will be a vector
      rows = 1;
      lengths_vec = &default_length;
      output->Resize(cols);
    }

    // init
    TVal* output_data = output->template mutable_data<TVal>();
    for (int i = 0; i < cols * rows; i++) {
      output_data[i] = default_val[0];
    }

    TLen offset = 0;
    for (int r = 0; r < rows; r++) {
      for (int c = 0; c < lengths_vec[r]; c++) {
        int idx = getFeatureIdx(sparse_indices_vec[offset + c]);
        if (idx != -1) {
          output_data[r * cols + idx] = sparse_values_vec[offset + c];
        }
      }
      offset += lengths_vec[r];
    }

    return true;
  }

 private:
  const int kMaxDenseSize = 1024 * 128;

  std::unordered_map<int, int> sparse_;
  std::vector<int> dense_;
  int featuresCount_;

  inline int getFeatureIdx(int id) const {
    if (id >= kMaxDenseSize) {
      const auto& iter = sparse_.find(id);
      if (iter == sparse_.end()) {
        return -1;
      } else {
        return iter->second;
      }
    } else {
      return (id >= dense_.size()) ? -1 : dense_[id];
    }
  }

  INPUT_TAGS(INDICES, VALUES, DEFAULT, LENGTHS);
};

namespace {
REGISTER_CPU_OPERATOR(SparseToDenseMask, SparseToDenseMaskOp);

OPERATOR_SCHEMA(SparseToDenseMask)
    .NumInputs(3, 4)
    .NumOutputs(1)
    .SetDoc("Convert sparse representations to dense with given indices.")
    .Output(0, "output", "1-D or 2-D dense tensor.");

NO_GRADIENT(SparseToDenseMask);
} // namespace
} // namespace caffe2
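
A hypothetical use from Python: with mask=[11, 2, 30], feature id 2 maps to column 1 and id 30 to column 2; ids outside the mask are dropped and untouched columns keep the default value (blob names illustrative):

from caffe2.python import core, workspace
import numpy as np

workspace.FeedBlob('indices', np.array([2, 30, 7], dtype=np.int32))
workspace.FeedBlob('values', np.array([1.5, 2.5, 9.9], dtype=np.float32))
workspace.FeedBlob('default', np.array([0.0], dtype=np.float32))
workspace.RunOperatorOnce(core.CreateOperator(
    'SparseToDenseMask', ['indices', 'values', 'default'], ['dense'],
    mask=[11, 2, 30]))
print(workspace.FetchBlob('dense'))  # expected: [0.0, 1.5, 2.5]; id 7 is ignored
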
caffe2/operators/string_ops.cc (new file, 124 lines)
@ -0,0 +1,124 @@
#include "caffe2/caffe2/operators/string_ops.h"
#include "caffe2/core/operator.h"

namespace caffe2 {
namespace {

struct StartsWith {
  explicit StartsWith(OperatorBase& op)
      : prefix_(op.GetSingleArgument<std::string>("prefix", "")) {}
  bool operator()(const std::string& str) {
    return std::mismatch(prefix_.begin(), prefix_.end(), str.begin()).first ==
        prefix_.end();
  }

 private:
  std::string prefix_;
};

struct EndsWith {
  explicit EndsWith(OperatorBase& op)
      : suffix_(op.GetSingleArgument<std::string>("suffix", "")) {}
  bool operator()(const std::string& str) {
    return std::mismatch(suffix_.rbegin(), suffix_.rend(), str.rbegin())
        .first == suffix_.rend();
  }

 private:
  std::string suffix_;
};

struct Prefix {
  explicit Prefix(OperatorBase& op)
      : length_(op.GetSingleArgument<int>("length", 3)) {}
  std::string operator()(const std::string& str) {
    return std::string(str.begin(), std::min(str.end(), str.begin() + length_));
  }

 private:
  int length_;
};

struct Suffix {
  explicit Suffix(OperatorBase& op)
      : length_(op.GetSingleArgument<int>("length", 3)) {}
  std::string operator()(const std::string& str) {
    return std::string(std::max(str.begin(), str.end() - length_), str.end());
  }

 private:
  int length_;
};

template <typename ScalarFunctor, typename OutputType = std::string>
using StringElementwiseOp = UnaryElementwiseWithArgsOp<
    TensorTypes<std::string>,
    CPUContext,
    ForEach<ScalarFunctor>,
    OutputType>;

REGISTER_CPU_OPERATOR(StringPrefix, StringElementwiseOp<Prefix>);
REGISTER_CPU_OPERATOR(StringSuffix, StringElementwiseOp<Suffix>);
REGISTER_CPU_OPERATOR(StringStartsWith, StringElementwiseOp<StartsWith, bool>);
REGISTER_CPU_OPERATOR(StringEndsWith, StringElementwiseOp<EndsWith, bool>);

OPERATOR_SCHEMA(StringPrefix)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Computes the element-wise string prefix of the string tensor.
Input strings that are shorter than prefix length will be returned unchanged.
NOTE: Prefix is computed on number of bytes, which may lead to wrong behavior
and potentially invalid strings for variable-length encodings such as utf-8.
)DOC")
    .Arg("length", "Maximum size of the prefix, in bytes.")
    .Input(0, "strings", "Tensor of std::string.")
    .Output(
        0,
        "prefixes",
        "Tensor of std::string containing prefixes for each input.");

OPERATOR_SCHEMA(StringSuffix)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Computes the element-wise string suffix of the string tensor.
Input strings that are shorter than suffix length will be returned unchanged.
NOTE: Suffix is computed on number of bytes, which may lead to wrong behavior
and potentially invalid strings for variable-length encodings such as utf-8.
)DOC")
    .Input(0, "strings", "Tensor of std::string.")
    .Output(
        0,
        "suffixes",
        "Tensor of std::string containing suffixes for each input.")
    .Arg("length", "Maximum size of the suffix, in bytes.");

OPERATOR_SCHEMA(StringStartsWith)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Performs the starts-with check on each string in the input tensor.
Returns a tensor of booleans of the same dimension as the input.
)DOC")
    .Arg("prefix", "The prefix to check input strings against.")
    .Input(0, "strings", "Tensor of std::string.")
    .Output(0, "bools", "Tensor of bools of same shape as input.");

OPERATOR_SCHEMA(StringEndsWith)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Performs the ends-with check on each string in the input tensor.
Returns a tensor of booleans of the same dimension as the input.
)DOC")
    .Arg("suffix", "The suffix to check input strings against.")
    .Input(0, "strings", "Tensor of std::string.")
    .Output(0, "bools", "Tensor of bools of same shape as input.");

SHOULD_NOT_DO_GRADIENT(StringPrefix);
SHOULD_NOT_DO_GRADIENT(StringSuffix);
SHOULD_NOT_DO_GRADIENT(StringStartsWith);
SHOULD_NOT_DO_GRADIENT(StringEndsWith);
}
} // namespace caffe2
caffe2/operators/string_ops.h (new file, 13 lines)
@ -0,0 +1,13 @@
#pragma once
#include "caffe2/core/operator.h"
#include "caffe2/operators/elementwise_op.h"

namespace caffe2 {

template <typename ScalarFunctor, typename OutputType = std::string>
using StringElementwiseOp = UnaryElementwiseWithArgsOp<
    TensorTypes<std::string>,
    CPUContext,
    ForEach<ScalarFunctor>,
    OutputType>;
}
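
A small hypothetical round trip through these operators (blob names illustrative; Python 2 strings feed as a numpy object array):

from caffe2.python import core, workspace
import numpy as np

workspace.FeedBlob('strings', np.array(['caffe2', 'cafe', 'net'], dtype=object))
workspace.RunOperatorOnce(
    core.CreateOperator('StringPrefix', ['strings'], ['prefixes'], length=4))
workspace.RunOperatorOnce(
    core.CreateOperator('StringStartsWith', ['strings'], ['bools'], prefix='caf'))
print(workspace.FetchBlob('prefixes'))  # expected: ['caff', 'cafe', 'net']
print(workspace.FetchBlob('bools'))     # expected: [True, True, False]
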
@ -10,24 +10,25 @@ namespace caffe2 {
#define COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS 5

namespace {
// TODO(jiayq): one possible optimization is to copy the buffer into a shared memory
// location to speed up access.
// TODO(jiayq): one possible optimization is to copy the buffer into a shared
// memory location to speed up access.
template <typename Dtype>
__global__ void transpose_gpu(const int nthreads, const Dtype* from_data,
    Dtype* to_data, const int* buffer, const int num_axes) {
  int from_inds[COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS];
  const int* from_counts = buffer;
  const int* to_counts = buffer + num_axes;
  const int* map = buffer + num_axes * 2;
  const int* axes = buffer + num_axes * 2;
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int from_index = index, to_index = 0;
    for (int i = 0; i < num_axes; i++) {
      from_inds[i] = from_index / from_counts[i];
      from_index = from_index % from_counts[i];
    for (int i = num_axes - 1; i >= 0; --i) {
      from_inds[i] = from_index % from_counts[i];
      from_index = from_index / from_counts[i];
    }
    for (int i = 0; i < num_axes; i++) {
      to_index += from_inds[map[i]] * to_counts[i];
    for (int i = 0; i < num_axes - 1; i++) {
      to_index = (to_index + from_inds[axes[i]]) * to_counts[i + 1];
    }
    to_index += from_inds[axes[num_axes - 1]];
    to_data[to_index] = from_data[index];
  }
}

@ -42,7 +43,7 @@ bool TransposeOp<CUDAContext>::DoRunWithType() {
  int ndim = input.ndim();
  CAFFE_ENFORCE(count < std::numeric_limits<int>::max(),
                "Transpose op on GPU only supports int32");
  CAFFE_ENFORCE(count < COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS,
  CAFFE_ENFORCE(ndim < COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS,
                "Input ndim exceeds compile time max.");
  // Buffer contains the following data:
  // (1) the dimensions of the inputs
|
|||
}
|
||||
Y->Resize(new_dims_);
|
||||
// Do the actual transpose, which is implemented in DoRunWithType().
|
||||
return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
|
||||
return DispatchHelper<TensorTypes<float, double, int, long>>::call(
|
||||
this, Input(0));
|
||||
}
|
||||
|
||||
protected:
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(Copy, CopyOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(Shape, ShapeOp<CPUContext>);
REGISTER_CPU_OPERATOR(HasElements, HasElementsOp<CPUContext>);
REGISTER_CPU_OPERATOR(IsEmpty, IsEmptyOp<CPUContext>);
REGISTER_CPU_OPERATOR(Gather, GatherOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(Unique, UniqueOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp<CPUContext>);

@ -24,6 +25,7 @@ REGISTER_CPU_OPERATOR(SegmentIdsToLengths, SegmentIdsToLengthsOp<CPUContext>);
REGISTER_CPU_OPERATOR(Slice, SliceOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(Squeeze, SqueezeOp<CPUContext>);
REGISTER_CPU_OPERATOR(ExpandDims, ExpandDimsOp<CPUContext>);
REGISTER_CPU_OPERATOR(And, AndOp<CPUContext>);

OPERATOR_SCHEMA(Print)
    .NumInputs(1)

@ -209,6 +211,13 @@ OPERATOR_SCHEMA(HasElements)
        "has_elements",
        "Scalar bool tensor. True if input is not empty.");

OPERATOR_SCHEMA(IsEmpty)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc("Returns true iff the input tensor has size == 0")
    .Input(0, "tensor", "Tensor of any type.")
    .Output(0, "is_empty", "Scalar bool tensor. True if input is empty.");

OPERATOR_SCHEMA(Gather)
    .NumInputs(2)
    .NumOutputs(1)

@ -340,9 +349,21 @@ If the same blob is provided in input and output, the operation is copy-free.
    .Input(0, "data", "Original tensor")
    .Output(0, "expanded", "Reshaped tensor with same data as input.");

OPERATOR_SCHEMA(And)
    .NumInputs(2)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .SetDoc(R"DOC(
Outputs true iff both input blob values are true.
)DOC")
    .Input(0, "input_0", "first boolean input.")
    .Input(1, "input_1", "second boolean input.")
    .Output(0, "output", "input_0 && input_1.");

SHOULD_NOT_DO_GRADIENT(Print);
SHOULD_NOT_DO_GRADIENT(Shape);
SHOULD_NOT_DO_GRADIENT(HasElements);
SHOULD_NOT_DO_GRADIENT(IsEmpty);

class GetSqueezeGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;

@ -433,6 +454,7 @@ SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds);
SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths);
// TODO(azzolini): Add support for slice gradient
SHOULD_NOT_DO_GRADIENT(Slice);
SHOULD_NOT_DO_GRADIENT(And);

} // namespace
@ -675,6 +675,21 @@ class HasElementsOp : public Operator<Context> {
  }
};

template <class Context>
class IsEmptyOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(IsEmptyOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = OperatorBase::Output<TensorCPU>(0);
    output->Resize(std::vector<TIndex>{});
    *output->template mutable_data<bool>() = (input.size() == 0);
    return true;
  }
};

// RecordShapeOp records the shape of the input tensor to a vector of int. You
// mostly don't need this operator explicitly, and it is mostly used in the
// autodiff process.

@ -911,6 +926,23 @@ class UniqueOp : public Operator<Context> {
 public:
  OUTPUT_TAGS(UNIQUE, REMAPPING);
};

template <class Context>
class AndOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  AndOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override {
    const auto* i1 = Input(0).template data<bool>();
    const auto* i2 = Input(1).template data<bool>();
    auto* output = Output(0);
    output->Resize(std::vector<int>{});
    *output->template mutable_data<bool>() = (*i1 && *i2);
    return true;
  }
};
} // namespace caffe2

#endif // CAFFE2_OPERATORS_UTILITY_OPS_H_
@ -199,7 +199,7 @@ message ExecutionStep {
  // Criteria network specifies a single output (TensorCPU<bool>) of
  // size (1), is run on every iteration by the executor, and
  // execution terminates when the output[0] is `false`.
  optional string criteria_network = 5;
  optional string criteria_network = 5 [deprecated=true];

  // If specified, run report_net asynchronously every `report_interval`
  // seconds. Report_net is guaranteed to run at least once after all

@ -210,6 +210,20 @@ message ExecutionStep {
  // If false or not set, execute sub-steps serially.
  // If true, execute all substeps concurrently, each one in a separate thread.
  optional bool concurrent_substeps = 6;

  // Name of a scalar boolean tensor.
  // ES checks this blob AFTER every substep/subnet.
  // If specified, and the value is true, then ES will skip the rest and return
  // immediately.
  // This means that the report_net and the first step will always be called.
  // Use cases:
  // 1) the first substep stops the rest if the data condition is not met;
  // 2) the first substep decides which of the remaining steps should run;
  // 3) external control.
  //
  // ** It is the user's responsibility to not put this blob in race conditions,
  // ** for example when setting this blob in concurrent substeps.
  optional string should_stop_blob = 9;
}

message PlanDef {
@ -87,7 +87,6 @@ const TypeMeta& NumpyTypeToCaffe(int numpy_type) {
  {NPY_UINT8, TypeMeta::Make<uint8_t>()},
  {NPY_UINT16, TypeMeta::Make<uint16_t>()},
  {NPY_OBJECT, TypeMeta::Make<std::string>()},
  {NPY_STRING, TypeMeta::Make<std::string>()},
  // Note: Add more types here.
};
static TypeMeta unknown_type;

@ -565,11 +564,10 @@ PyObject* FetchBlob(PyObject* self, PyObject* args) {

PyObject* FeedBlob(PyObject* self, PyObject* args) {
  char* name_char;
  PyArrayObject* array = nullptr;
  PyObject* arg = nullptr;
  PyObject* device_option_string = nullptr;
  // TODO(dzhulgakov): implement accepting other types (at least string)
  if (!PyArg_ParseTuple(args, "sO!|O", &name_char, &PyArray_Type, &array,
                        &device_option_string)) {
  if (!PyArg_ParseTuple(
          args, "sO|O", &name_char, &arg, &device_option_string)) {
    PyErr_SetString(PyExc_ValueError, "Incorrect arguments.");
    return nullptr;
  }

@ -584,13 +582,25 @@ PyObject* FeedBlob(PyObject* self, PyObject* args) {
  }
  Blob* blob = gWorkspace->CreateBlob(name);

  if (PyArray_Check(arg)) { // numpy array
    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(arg);
    auto feeder = CreateFeeder(option.device_type());
    if (!feeder) {
      PyErr_SetString(PyExc_TypeError,
                      "Unknown device type encountered in FeedBlob.");
      PyErr_SetString(
          PyExc_TypeError, "Unknown device type encountered in FeedBlob.");
      return nullptr;
    }
    return feeder->Feed(option, array, blob);
  } else if (PyString_Check(arg)) { // string
    *blob->GetMutable<std::string>() = PyBytesToStdString(arg);
    Py_RETURN_TRUE;
  } else {
    PyErr_SetString(
        PyExc_ValueError,
        "Unexpected type of argument - only numpy array or string are "
        "supported for feeding");
    return nullptr;
  }
}

// A simple macro to avoid writing repeated symbols.

@ -620,7 +630,7 @@ PyMethodDef* GetCaffe2PythonMethods() {
    {"cc_RunPlan", RunPlan, METH_VARARGS, ""},
    _PYNAME(CreateBlob),
    _PYNAME(SerializeBlob),
    _PYNAME(FetchBlob),
    {"cc_FetchBlob", FetchBlob, METH_VARARGS, ""},
    {"cc_FeedBlob", FeedBlob, METH_VARARGS, ""},
    {nullptr, nullptr, 0, nullptr}, // end of python methods.
};
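
With this change FeedBlob accepts either a numpy array or a raw string; a sketch of the new behavior as seen through the Python wrapper (assuming workspace.FeedBlob forwards to cc_FeedBlob):

from caffe2.python import workspace
import numpy as np

workspace.FeedBlob('arr', np.ones((2, 2), dtype=np.float32))  # numpy array path
workspace.FeedBlob('msg', 'raw string payload')               # new string path
# Anything else (e.g. a dict) now fails with ValueError instead of a
# TypeError raised inside PyArg_ParseTuple.
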
@ -160,7 +160,10 @@ class TensorFetcher : public BlobFetcherBase {
        Py_DECREF(outObj[j]);
      }
      Py_DECREF(array);
      LOG(FATAL) << "Failed to allocate string for ndarray of strings.";
      PyErr_SetString(
          PyExc_TypeError,
          "Failed to allocate string for ndarray of strings.");
      return nullptr;
    }
  }
  return array;

@ -217,21 +220,14 @@ class TensorFeeder : public BlobFeederBase {
        char* str;
        Py_ssize_t strSize;
        if (PyBytes_AsStringAndSize(input[i], &str, &strSize) == -1) {
          LOG(FATAL) << "Unsupported pyhton object type passed into ndarray.";
          PyErr_SetString(
              PyExc_TypeError,
              "Unsupported python object type passed into ndarray.");
          return nullptr;
        }
        outPtr[i] = std::string(str, strSize);
      }
    } break;
    case NPY_STRING: {
      char* inputData = PyArray_BYTES(array);
      auto* outPtr = tensor->template mutable_data<std::string>();
      auto itemSize = PyArray_ITEMSIZE(array);
      for (int i = 0; i < tensor->size(); ++i) {
        auto start = inputData + i * itemSize;
        auto end = std::find(start, start + itemSize, '\0');
        outPtr[i] = std::string(start, end - start);
      }
    } break;
    default:
      context.template CopyBytes<CPUContext, Context>(
          tensor->size() * meta.itemsize(),
@ -613,6 +613,7 @@ def GetArgumentParser():
    )
    parser.add_argument("--net_type", type=str, default="dag")
    parser.add_argument("--num_workers", type=int, default=2)
    parser.add_argument("--use-nvtx", default=False, action='store_true')
    return parser


@ -624,7 +625,9 @@ if __name__ == '__main__':
    ):
        GetArgumentParser().print_help()

    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    workspace.GlobalInit(
        ['caffe2', '--caffe2_log_level=0'] +
        (['--caffe2_use_nvtx'] if args.use_nvtx else []))
    model_map = {
        'AlexNet': AlexNet,
        'OverFeat': OverFeat,
|
|||
return BlobReference(scope.NAMESCOPE + name, *args, **kwargs)
|
||||
|
||||
|
||||
def _RectifyInputOutput(blobs):
|
||||
def _RectifyInputOutput(blobs, net=None):
|
||||
"""A helper function to rectify the input or output of the CreateOperator
|
||||
interface.
|
||||
"""
|
||||
|
|
@ -154,18 +154,18 @@ def _RectifyInputOutput(blobs):
|
|||
# If blobs is a single string, prepend scope.NAMESCOPE and put it as a
|
||||
# list.
|
||||
# TODO(jiayq): enforce using BlobReference instead of raw strings.
|
||||
return [ScopedBlobReference(blobs)]
|
||||
return [ScopedBlobReference(blobs, net=net)]
|
||||
elif type(blobs) is BlobReference:
|
||||
# If blob is a BlobReference, simply put it as a list.
|
||||
return [BlobReference(str(blobs))]
|
||||
elif type(blobs) is list:
|
||||
return [blobs]
|
||||
elif type(blobs) in (list, tuple):
|
||||
# If blob is a list, we go through it and type check.
|
||||
rectified = []
|
||||
for blob in blobs:
|
||||
if isinstance(blob, basestring):
|
||||
rectified.append(ScopedBlobReference(blob))
|
||||
rectified.append(ScopedBlobReference(blob, net=net))
|
||||
elif type(blob) is BlobReference:
|
||||
rectified.append(BlobReference(str(blob)))
|
||||
rectified.append(blob)
|
||||
else:
|
||||
raise TypeError(
|
||||
"I/O blob #{} of unsupported type: {} of type {}"
|
||||
|
|
@ -670,8 +670,19 @@ def get_op_ids_in_path(ssa, blob_versions, inputs, outputs):


class Net(object):
    _net_names_used = set()
    operator_registry_ = {}

    @staticmethod
    def _get_next_net_name(basename):
        name = basename
        next_idx = 1
        while name in Net._net_names_used:
            name = basename + '_' + str(next_idx)
            next_idx += 1
        Net._net_names_used |= set([name])
        return name

    def __init__(self, name_or_proto):
        """
        Create a Net.
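
A short illustration of the dedup behavior (hypothetical session): creating two nets with the same base name yields distinct proto names.

from caffe2.python import core

a = core.Net('train')
b = core.Net('train')
print(str(a), str(b))  # expected: 'train' and 'train_1'
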
@ -706,29 +717,29 @@ class Net(object):
            else:
                self._next_name_index = 0
        else:
            name = name_or_proto
            self._net = caffe2_pb2.NetDef()
            self._net.name = name
            self._net.name = name_or_proto
            self._next_name_index = 0

        # make sure that this net name hasn't been used before
        self._net.name = Net._get_next_net_name(self._net.name)

    def __str__(self):
        return self._net.name

    def DefinesBlob(self, blob):
    def BlobIsDefined(self, blob):
        """
        Returns true if the given BlobReference is produced as output of
        an operator in this net, or if it is provided as an external input.
        """
        if isinstance(blob, BlobReference):
            assert blob.Net() == self, 'Reference belongs to different net'
        blob_name = str(blob)
        for input in self._net.external_input:
            if input == blob_name:
                return True
        for op in self._net.op:
            for output in op.output:
                if output == blob_name:
                    return True
        for input in self._net.external_input:
            if input == blob_name:
                return True
        return False

    def UsesBlob(self, blob):

@ -753,7 +764,7 @@ class Net(object):
        raises KeyError.
        """
        blob_name = str(blob_name)
        if not self.DefinesBlob(blob_name):
        if not self.BlobIsDefined(blob_name):
            raise KeyError('Net does not define blob %s' % blob_name)
        return BlobReference(blob_name, self)
@ -818,13 +829,16 @@ class Net(object):
            new_outputs: list of BlobReferences corresponding to the
            outputs produced by new_net.
        """
        inputs = inputs if isinstance(inputs, dict) else {i: i for i in inputs}
        input_is_pair_list = isinstance(inputs, list) and all(
            isinstance(i, tuple) and len(i) == 2 for i in inputs)
        inputs = (
            inputs if isinstance(inputs, (dict, OrderedDict)) else
            OrderedDict(inputs) if input_is_pair_list else
            OrderedDict(zip(inputs, inputs)))
        for output in outputs:
            assert self.BlobIsDefined(output)
        input_names = {str(k): str(v) for k, v in inputs.items()}
        output_names = [str(o) for o in outputs]
        for input in inputs.keys():
            assert self.UsesBlob(input)
        for output in outputs:
            assert self.DefinesBlob(output)
        proto = self._net
        ssa, blob_versions = get_ssa(proto)
        used_op_ids = get_op_ids_in_path(ssa, blob_versions, inputs, outputs)
@ -859,9 +873,21 @@ class Net(object):
    def Proto(self):
        return self._net

    def NextName(self):
    def NextName(self, prefix=None, output_id=None):
        """Returns the next name to be used, if you do not want to explicitly
        name your blob."""
        if prefix:
            output_name_base = self._net.name + '/' + prefix
            output_name = output_name_base
            if output_id is not None:
                output_name += ':' + str(output_id)
            index = 2
            while self.BlobIsDefined(output_name):
                output_name = output_name_base + '_' + str(index)
                if output_id is not None:
                    output_name += ':' + str(output_id)
                index += 1
        else:
            output_name = self._net.name + '_blob_' + str(self._next_name_index)
            self._next_name_index += 1
        return str(output_name)
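
With the new prefix/output_id parameters, auto-generated blob names become self-describing; the exact results depend on which blobs the net already defines (sketch):

from caffe2.python import core

net = core.Net('example')
print(net.NextName(prefix='Relu'))               # e.g. 'example/Relu'
print(net.NextName(prefix='Relu', output_id=0))  # e.g. 'example/Relu:0'
print(net.NextName())                            # e.g. 'example_blob_0'
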
@ -900,16 +926,18 @@ class Net(object):
        self._net.op.extend(grad_ops)
        return input_to_grad

    def AddExternalInput(self, input_name):
        input_name = str(input_name)
    def AddExternalInput(self, input):
        input_name = str(input)
        assert input_name not in self._net.external_input, (
            'Net already contains an input named %s' % input_name)
        self._net.external_input.extend([input_name])
        return BlobReference(input_name, self)
        return (
            input if isinstance(input, BlobReference)
            else BlobReference(input_name))

    def AddExternalOutput(self, output):
        assert isinstance(output, BlobReference)
        assert self.DefinesBlob(output)
        assert self.BlobIsDefined(output)
        self.Proto().external_output.extend([str(output)])

    def DeduplicateGradientSlices(self, g):
@ -931,14 +959,22 @@ class Net(object):
    def _CreateAndAddToSelf(self, op_type, inputs, outputs=None, **kwargs):
        """A helper function to create an operator and add it to self.
        """
        inputs = _RectifyInputOutput(inputs)
        for input in inputs:
            if not self.BlobIsDefined(input):
                assert input.Net() != self
                self.AddExternalInput(input)
        if outputs is None:
            # If we do not specify an output, we will assume that this op
            # produces one output in this case.
            outputs = self.NextName()
            outputs = self.NextName(prefix=op_type)
        elif type(outputs) is int:
            # In this case, we will auto-fill the given number of outputs
            # with auto-generated names.
            outputs = [self.NextName() for i in range(outputs)]
            outputs = [
                self.NextName(prefix=op_type, output_id=i)
                for i in range(outputs)]
        outputs = _RectifyInputOutput(outputs, net=self)
        op = CreateOperator(op_type, inputs, outputs, **kwargs)
        self._net.op.extend([op])
        if len(op.output) == 0:
@ -1036,10 +1072,11 @@ class ExecutionStep(object):
        self._assert_can_mutate()
        self._step.num_iter = num_iter

    def SetCriteriaNet(self, criteria_net):
    def SetShouldStopBlob(self, should_stop_blob):
        assert isinstance(should_stop_blob, BlobReference), (
            "expects BlobReference here, got {}".format(type(should_stop_blob)))
        self._assert_can_mutate()
        _add_net_to_dict(self._net_dict, criteria_net)
        self._step.criteria_network = get_net_name(criteria_net)
        self._step.should_stop_blob = str(should_stop_blob)

    def SetReportNet(self, report_net, report_interval):
        self._assert_can_mutate()
@ -1053,7 +1090,7 @@ class ExecutionStep(object):
        if isinstance(substep, ExecutionStep):
            substep._notify_is_used()
            if not substep.HasNets() and not substep.HasSubsteps():
                return
                return self
            for net in substep.Nets():
                _add_net_to_dict(self._net_dict, net)
            self._substeps.append(substep)

@ -1061,6 +1098,7 @@ class ExecutionStep(object):
        else:
            proto = substep
        self._step.substep.add().CopyFrom(proto)
        return self

    def SetConcurrentSubsteps(self, concurrent_substeps):
        self._assert_can_mutate()

@ -1073,6 +1111,7 @@ class ExecutionStep(object):
        assert isinstance(net, Net)
        _add_net_to_dict(self._net_dict, net)
        self._step.network.extend([get_net_name(net)])
        return self


class Plan(object):
@ -1107,11 +1146,11 @@ class Plan(object):

def execution_step(default_name,
                   steps_or_nets,
                   criteria=None,
                   num_iter=None,
                   report_net=None,
                   report_interval=None,
                   concurrent_substeps=None):
                   concurrent_substeps=None,
                   should_stop_blob=None):
    """
    Helper for creating an ExecutionStep.
    - steps_or_nets can be:

@ -1120,18 +1159,20 @@ def execution_step(default_name,
      - ExecutionStep
      - list<Net>
      - list<ExecutionStep>
    - criteria is either None or a Net
    - if no criteria or num_iter is provided, defaults to num_iter=1
    - should_stop_blob is either None or a scalar boolean blob.
      - This blob is checked AFTER every substep/subnet.
      - If specified and true, then this step will return immediately.
      - Be sure to handle race conditions if setting from concurrent threads.
    - if no should_stop_blob or num_iter is provided, defaults to num_iter=1
    """
    assert criteria is None or isinstance(criteria, Net)
    assert criteria is None or num_iter is None, (
        'Cannot set both criteria and num_iter.')
    if criteria is None and num_iter is None:
    assert should_stop_blob is None or num_iter is None, (
        'Cannot set both should_stop_blob and num_iter.')
    if should_stop_blob is None and num_iter is None:
        num_iter = 1

    def set_criteria(step):
        if criteria is not None:
            step.SetCriteriaNet(criteria)
    def set_step_attr(step):
        if should_stop_blob is not None:
            step.SetShouldStopBlob(should_stop_blob)
        else:
            step.SetIter(num_iter)
        if concurrent_substeps is not None:

@ -1144,18 +1185,20 @@ def execution_step(default_name,
    if not steps_or_nets:
        return ExecutionStep(default_name)
    if isinstance(steps_or_nets, ExecutionStep):
        return set_criteria(steps_or_nets)
        step = set_step_attr(ExecutionStep(default_name))
        step.AddSubstep(steps_or_nets)
        return step
    elif isinstance(steps_or_nets, Net):
        step = set_criteria(ExecutionStep(default_name))
        step = set_step_attr(ExecutionStep(default_name))
        step.AddNet(steps_or_nets)
        return step
    elif isinstance(steps_or_nets, list):
        if isinstance(steps_or_nets[0], Net):
            step = set_criteria(ExecutionStep(default_name))
            step = set_step_attr(ExecutionStep(default_name))
            map(step.AddNet, steps_or_nets)
            return step
        elif isinstance(steps_or_nets[0], ExecutionStep):
            step = set_criteria(ExecutionStep(default_name))
            step = set_step_attr(ExecutionStep(default_name))
            map(step.AddSubstep, steps_or_nets)
            return step
        else:
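
A hedged sketch of the new early-stopping control flow (names hypothetical; in a real plan some op in the loop body would eventually flip the stop blob to True):

from caffe2.python import core

net = core.Net('loop_body')
counter = net.ConstantFill([], 'counter', shape=[], value=0.0)
done = net.IsEmpty([counter], 'done')  # scalar bool blob checked after each pass
step = core.execution_step('loop', net, should_stop_blob=done)
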
caffe2/python/dataset.py (new file, 276 lines)
@ -0,0 +1,276 @@
"""
Implementation of an in-memory dataset with structured schema.

Use this to store and iterate through datasets with complex schema that
fit in memory.

Iterating through entries of this dataset is very fast since the dataset
is stored as a set of native Caffe2 tensors, thus no type conversion or
deserialization is necessary.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, workspace
from caffe2.python.io import Reader, Writer
from caffe2.python.schema import Struct
import numpy as np


class _DatasetReader(Reader):
    def __init__(self, field_names, field_blobs, cursor, name):
        """Don't call this directly. Instead, use dataset.reader()"""
        self.field_names = field_names
        self.field_blobs = field_blobs
        self.cursor = cursor
        self.name = name

    def read(self, read_net, batch_size=1):
        with core.NameScope(read_net.NextName(self.name)):
            fields = read_net.ReadNextBatch(
                [self.cursor] + self.field_blobs,
                self.field_names,
                batch_size=batch_size)
        return (read_net.IsEmpty([fields[0]]), fields)

    def reset(self, net):
        net.ResetCursor([self.cursor], [])


class _DatasetRandomReader(Reader):
    def __init__(self, field_names, field_blobs, cursor, name, indices):
        """Don't call this directly. Instead, use dataset.random_reader()"""
        self.field_names = field_names
        self.field_blobs = field_blobs
        self.cursor = cursor
        self.name = name
        self.indices = indices

    def reset(self, net):
        net.ResetCursor([self.cursor], [])

    def computeoffset(self, net):
        self.reset(net)
        offsets = net.ComputeOffset(
            [self.cursor] + self.field_blobs,
            'offsets')
        self.offsets = offsets

    def read(self, read_net, batch_size=1):
        fields = read_net.ReadRandomBatch(
            [self.cursor, self.indices, self.offsets] + self.field_blobs,
            self.field_names,
            batch_size=batch_size)
        return (read_net.IsEmpty([fields[0]]), fields)


class _DatasetWriter(Writer):
    def __init__(self, fields, field_blobs, init_net):
        """Don't call this directly. Use dataset.writer() instead."""
        self.fields = fields
        self.field_blobs = field_blobs
        self.mutex = init_net.CreateMutex([])

    def write(self, writer_net, fields):
        """
        Add operations to `net` that append the blobs in `fields` to the end
        of the dataset. An additional operator will also be added that checks
        the consistency of the data in `fields` against the dataset schema.

        Args:
            writer_net: The net that will contain the Append operators.
            fields: A list of BlobReference to be appended to this dataset.
        """
        assert len(fields) == len(self.fields), (
            'Expected %s fields, got %s.' % (len(self.fields), len(fields)))
        writer_net.CheckDatasetConsistency(fields, [], fields=self.fields)
        writer_net.AtomicAppend(
            [self.mutex] + list(self.field_blobs) + list(fields),
            self.field_blobs)

    def commit(self, finish_net):
        """Commit is a no-op for an in-memory dataset."""
        pass


def to_ndarray_list(values, schema):
    """
    Given a list of values and a dataset schema, produce a list of ndarrays in
    the right format.

    This function will perform some checks to make sure that the arrays
    produced have the right dtype and rank.
    """
    assert isinstance(schema, Struct), 'schema must be a Struct.'
    names = schema.field_names()
    types = schema.field_types()
    assert len(types) == len(values), (
        'Values must have %d elements, got %d' % (len(types), len(values)))

    arrays = []
    for value, dtype, name in zip(values, types, names):
        array = np.array(value, dtype=dtype.base)
        # if array is empty we may need to reshape a little
        if array.size == 0:
            array = array.reshape((0,) + dtype.shape)
        # check that the inner dimensions match the schema
        assert (array.shape[1:] == dtype.shape), (
            'Invalid array shape for field %s. Expected (%s), got (%s).' % (
                name,
                ', '.join(['_'] + map(str, dtype.shape)),
                ', '.join(map(str, array.shape))))
        arrays.append(array)
    return arrays


def Const(net, value, dtype=None, name=None):
    """
    Create a 'constant' by first creating an external input in the given
    net, and then feeding the corresponding blob with its provided value
    in the current workspace. The name is automatically generated in order
    to avoid clashes with existing blob names.
    """
    assert isinstance(net, core.Net), 'net must be a core.Net instance.'
    value = np.array(value, dtype=dtype)
    blob = net.AddExternalInput(net.NextName(prefix=name))
    workspace.FeedBlob(str(blob), value)
    return blob


class Dataset(object):
    """Represents an in-memory dataset with fixed schema.

    Use this to store and iterate through datasets with complex schema that
    fit in memory.

    Iterating through entries of this dataset is very fast since the dataset
    is stored as a set of native Caffe2 tensors, thus no type conversion or
    deserialization is necessary.
    """

    def __init__(self, fields, name=None):
        """Create an un-initialized dataset with schema provided by `fields`.

        Before this dataset can be used, it must be initialized, either by
        `init_empty` or `init_from_dataframe`.

        Args:
            fields: either a schema.Struct or a list of field names in a
                format compatible with the one described in schema.py.
            name: optional name to prepend to blobs that will store the data.
        """
        assert isinstance(fields, list) or isinstance(fields, Struct), (
            'fields must be either a Struct or a list of raw field names.')
        self.schema = fields
        self.fields = (
            fields.field_names() if isinstance(fields, Struct) else fields)
        self.field_types = (
            fields.field_types() if isinstance(fields, Struct) else
            [np.dtype(np.void)] * len(self.fields))
        self.name = name or 'dataset'
        self.field_blobs = None

    def init_empty(self, init_net):
        """Initialize the blobs for this dataset with empty values.

        Empty arrays will be immediately fed into the current workspace,
        and `init_net` will take those blobs as external inputs.
        """
        self.field_blobs = [Const(init_net, [], name=f) for f in self.fields]

    def init_from_dataframe(self, net, dataframe):
        """Initialize the blobs for this dataset from a Pandas dataframe.

        Each column of the dataframe will be immediately fed into the current
        workspace, and the `net` will take these blobs as external inputs.
        """
        assert len(self.fields) == len(dataframe.columns)
        self.field_blobs = [
            Const(net, dataframe.as_matrix([col]).flatten(), name=field)
            for col, field in enumerate(self.fields)]

    def get_blobs(self):
        """
        Return the list of BlobReference pointing to the blobs that contain
        the data for this dataset.
        """
        assert self
        return self.field_blobs

    def field_names(self):
        """Return the list of field names for this dataset."""
        return self.fields

    def field_types(self):
        """
        Return the list of field dtypes for this dataset.

        If a list of strings, not a schema.Struct, was passed to the
        constructor, this will return a list of dtype(np.void).
        """
        return self.field_types

    def reader(self, init_net, cursor_name=None):
        """Create a Reader object that is used to iterate through the dataset.

        This will append operations to `init_net` that create a TreeCursor,
        used to iterate through the data.

        NOTE: Currently, it is not safe to append to a dataset while reading.

        Args:
            init_net: net that will be run once to create the cursor.
            cursor_name: optional name for the blob containing a pointer
                to the cursor.

        Returns:
            A _DatasetReader that can be used to create operators that will
            iterate through the dataset.
        """
        assert self.field_blobs, 'Dataset not initialized.'
        cursor_name = cursor_name or (self.name + '_cursor')
        cursor = init_net.CreateTreeCursor(
            [],
            [cursor_name],
            fields=self.fields)
        return _DatasetReader(
            self.fields, self.field_blobs, cursor, cursor_name)

    def random_reader(self, init_net, indices, cursor_name=None):
        """Create a Reader object that is used to iterate through the dataset.

        NOTE: The reader order depends on the order in indices.

        Args:
            Similar to reader()
            indices: blob containing the reading order

        Returns:
            A _DatasetRandomReader that can be used to create operators that
            will iterate through the dataset according to indices.
        """
        assert self.field_blobs, 'Dataset not initialized.'
        cursor_name = cursor_name or (self.name + '_cursor')
        cursor = init_net.CreateTreeCursor(
            [],
            [cursor_name],
            fields=self.fields)
        return _DatasetRandomReader(
            self.fields, self.field_blobs, cursor, cursor_name, indices)

    def writer(self, init_net):
        """Create a Writer that can be used to append entries into the dataset.

        NOTE: Currently, it is not safe to append to a dataset
        while reading from it.
        NOTE: The current implementation of the writer is not thread safe.
        TODO: fix me

        Args:
            init_net: net that will be run once in order to create the writer.
                (currently not used)
        """
        assert self.field_blobs, 'Dataset not initialized.'
        return _DatasetWriter(self.fields, self.field_blobs, init_net)
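
A hedged end-to-end sketch of the intended workflow, mirroring how the module itself wires the mutex and cursor (field names and values illustrative, untested):

import numpy as np
from caffe2.python import core, workspace
from caffe2.python.dataset import Const, Dataset

ds = Dataset(['label', 'weight'], name='mini')
init_net = core.Net('init')
ds.init_empty(init_net)
reader = ds.reader(init_net)   # adds CreateTreeCursor to init_net
writer = ds.writer(init_net)   # adds CreateMutex to init_net
workspace.RunNetOnce(init_net)

# Append one batch of two rows, then stream the first row back.
append_net = core.Net('append')
writer.write(append_net, [
    Const(append_net, np.array([0, 1], dtype=np.int32)),
    Const(append_net, np.array([0.5, 1.5], dtype=np.float32))])
workspace.RunNetOnce(append_net)

read_net = core.Net('read')
should_stop, fields = reader.read(read_net, batch_size=1)
workspace.RunNetOnce(read_net)  # fetch the `fields` blobs to see the row
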
@@ -1,6 +1,6 @@
import numpy as np
import copy
from caffe2.python import core, workspace
from caffe2.python import workspace


class DeviceChecker(object):

@@ -41,9 +41,8 @@ class DeviceChecker(object):
            op.device_option.CopyFrom(device_option)
            workspace.RunOperatorOnce(op)
            results.append(
                [workspace.FetchBlob(op.output[idx]) for idx in outputs_to_check
                 ]
                )
                [workspace.FetchBlob(op.output[idx])
                 for idx in outputs_to_check])
            # Everything is done, reset the workspace.
            workspace.ResetWorkspace()
        # After running on all devices, check correctness

@@ -61,15 +60,15 @@ class DeviceChecker(object):
                    print(y.flatten())
                    print(np.max(np.abs(x - y)))
                    success = False
                #else:
                # else:
                #     print ('Passed device pair (0, %d), %s %s' %
                #            (i, outputs_to_check[j], y.shape))
        workspace.SwitchWorkspace(old_ws_name)
        return success

    def CheckNet(self, net, inputs={}, blobs_to_check=None, ignore=set()):
        """Checks a network by inspecting all of its intermediate results, and see
        if things match.
        """Checks a network by inspecting all of its intermediate results, and
        see if things match.
        """
        old_ws_name = workspace.CurrentWorkspace()
        results = []

@@ -78,8 +77,8 @@ class DeviceChecker(object):
        blobs_to_check = [b for b in blobs_to_check if b not in ignore]
        workspace.SwitchWorkspace("_device_check_", True)
        for i, device_option in enumerate(self._device_options):
            for name, arr in inputs.iteritems():
                #print 'feeding', name
            for name, arr in inputs.items():
                # print 'feeding', name
                workspace.FeedBlob(name, arr, device_option)
            for op in net.op:
                op.device_option.CopyFrom(device_option)

@@ -93,15 +92,18 @@ class DeviceChecker(object):
            for j in range(len(blobs_to_check)):
                x = results[i][j]
                y = results[0][j]
                if np.any(np.abs(x - y) > self._threshold):
                if not np.allclose(x, y,
                                   atol=self._threshold, rtol=self._threshold):
                    print('Failure in checking device option {}'
                          ' and output {}. The outputs are:'
                          .format(i, blobs_to_check[j]))
                    print(x.flatten())
                    print(y.flatten())
                    print(np.max(np.abs(x - y)))
                    success = False
                #else:
                # else:
                #     print ('Passed device pair (%d, %d), %s %s: %s' %
                #            (i, j, blobs_to_check[j], y.shape, str(y.flatten())))
                #            (i, j, blobs_to_check[j], y.shape,
                #             str(y.flatten())))
        workspace.SwitchWorkspace(old_ws_name)
        return success
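The switch from a plain elementwise threshold to np.allclose changes the acceptance test from |x - y| <= t to |x - y| <= atol + rtol * |y|, which is more forgiving for large-magnitude outputs. A small self-contained illustration (values chosen only for the example):

    import numpy as np

    t = 0.01
    x = np.array([100.0, 0.500])
    y = np.array([100.5, 0.505])

    print(not np.any(np.abs(x - y) > t))      # False: 0.5 exceeds the absolute threshold
    print(np.allclose(x, y, atol=t, rtol=t))  # True: 0.5 <= atol + rtol * 100.5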
@@ -178,7 +178,12 @@ class TestOperators(hu.HypothesisTestCase):
            return st.sampled_from([np.float32, np.float64])

        _test_binary(
            "Div", ref, filter_=non_zero, test_gradient=True, dtypes=div_dtypes
            "Div", ref, filter_=non_zero, test_gradient=True,
            dtypes=div_dtypes, gcs=hu.gcs_cpu_only
        )(self)
        _test_binary(
            "Div", ref, filter_=non_zero, test_gradient=False,
            dtypes=div_dtypes
        )(self)
        _test_binary_broadcast(
            "Div", ref, filter_=non_zero, dtypes=div_dtypes)(self)

@@ -269,8 +274,8 @@ class TestOperators(hu.HypothesisTestCase):
        for param, _ in enumerate(inputs):
            self.assertGradientChecks(gc, op, inputs, param, [0])

    @unittest.skipIf(True,
                     "Recurrent only works on CUDA 7.5 and above")
    @unittest.skipIf(not workspace.has_gpu_support,
                     "Skipping test due to no gpu present.")
    @given(hidden_size=st.integers(min_value=1, max_value=3),
           num_layers=st.integers(min_value=1, max_value=3),
           bidirectional=st.booleans(),

@@ -371,10 +376,10 @@ class TestOperators(hu.HypothesisTestCase):
           pad_l=st.integers(0, 3),
           pad_b=st.integers(0, 3),
           pad_r=st.integers(0, 3),
           kernel=st.integers(1, 5),
           size=st.integers(7, 10),
           input_channels=st.integers(1, 8),
           output_channels=st.integers(1, 8),
           kernel=st.integers(3, 5),
           size=st.integers(8, 8),
           input_channels=st.integers(1, 3),
           output_channels=st.integers(1, 3),
           batch_size=st.integers(1, 3),
           order=st.sampled_from(["NCHW", "NHWC"]),
           engine=st.sampled_from([""]),

@@ -661,7 +666,8 @@ class TestOperators(hu.HypothesisTestCase):
           output_channels=st.integers(1, 8),
           batch_size=st.integers(1, 3),
           order=st.sampled_from(["NCHW", "NHWC"]),
           engine=st.sampled_from([""]), **hu.gcs)
           engine=st.sampled_from(["", "CUDNN"]), **hu.gcs)
    @settings(max_examples=2, timeout=100)
    def test_convolution_transpose_gradients(self, stride, pad, kernel,
                                             size, input_channels,
                                             output_channels, batch_size,

@@ -698,7 +704,7 @@ class TestOperators(hu.HypothesisTestCase):
           input_channels=st.integers(1, 8),
           output_channels=st.integers(1, 8),
           batch_size=st.integers(1, 3),
           engine=st.sampled_from([""]), **hu.gcs)
           engine=st.sampled_from(["", "CUDNN"]), **hu.gcs)
    def test_convolution_transpose_layout(self, stride, pad, kernel,
                                          size, input_channels,
                                          output_channels, batch_size,

@@ -1049,7 +1055,7 @@ class TestOperators(hu.HypothesisTestCase):
    @given(target_probabilities=hu.arrays(
        dims=[10], elements=st.floats(allow_nan=False,
                                      allow_infinity=False,
                                      min_value=0,
                                      min_value=0.01,
                                      max_value=1)),
           **hu.gcs)
    def test_perplexity(self, target_probabilities, gc, dc):

@@ -1477,9 +1483,13 @@ class TestOperators(hu.HypothesisTestCase):
        op = core.CreateOperator("HasElements", ["data"], ["has_elements"])
        self.assertReferenceChecks(gc, op, [data], lambda x: (len(x) > 0, ))

        op = core.CreateOperator("IsEmpty", ["data"], ["is_empty"])
        self.assertReferenceChecks(gc, op, [data], lambda x: (len(x) == 0, ))

    @given(initial_iters=st.integers(0, 100),
           max_iters=st.integers(0, 100))
    def test_criteria_net_with_execution_step(self, initial_iters, max_iters):
    def test_should_stop_as_criteria_net_execution_step(
            self, initial_iters, max_iters):
        net = core.Net("net")
        net.Iter(["iter"], ["iter"])
        workspace.FeedBlob(
@@ -1487,16 +1497,87 @@ class TestOperators(hu.HypothesisTestCase):
        workspace.FeedBlob(
            "num_iters", np.asarray([max_iters]).astype(np.int32))
        criteria_net = core.Net("criteria")
        criteria_net.LT(["iter", "num_iters"], ["continue"])
        criteria_net.Proto().external_output.extend(["continue"])
        criteria_net.GE(["iter", "num_iters"], ["stop"])
        criteria_net.Proto().external_output.extend(["stop"])

        plan = core.Plan('plan')
        plan.AddStep(core.execution_step('step', net, criteria=criteria_net))
        plan.AddStep(core.execution_step(
            'step', [criteria_net, net],
            should_stop_blob=core.BlobReference("stop")))
        workspace.RunPlan(plan)
        iters = workspace.FetchBlob("iter")
        self.assertEqual(iters.dtype, np.int32)
        self.assertEqual(iters[0], max(initial_iters, max_iters))

    def test_disabled_execution_step(self):
        def createNets(i, disabled):
            should_stop = 'should_stop_{}'.format(i)
            output = 'output_{}'.format(i)

            # init content and stop signal
            init = core.Net("init_{}".format(i))
            init.ConstantFill(
                [],
                [output],
                shape=[1],
                value=0.0
            )
            init.Cast([output], [should_stop], to='bool')

            # decide if disabled or not
            criterion = core.Net("criterion_{}".format(i))
            tmp = criterion.ConstantFill(
                [],
                shape=[1],
                value=1.0 if disabled else 0.0
            )
            criterion.Cast([tmp], [should_stop], to='bool')
            criterion.Proto().external_output.extend([should_stop])

            # the body net is just to turn a 0 blob to 1
            net = core.Net("net_{}".format(i))
            net.ConstantFill(
                [],
                [output],
                shape=[1],
                value=1.0
            )

            # always end the loop
            ender = core.Net("ender_{}".format(i))
            tmp = ender.ConstantFill(
                [],
                shape=[1],
                value=1.0
            )
            ender.Cast([tmp], [should_stop], to='bool')
            ender.Proto().external_output.extend([should_stop])

            return [init, criterion, net, ender]

        nets = [createNets(1, False),
                createNets(2, True),
                createNets(3, False)]
        steps = [
            core.execution_step(
                'step_1', nets[0],
                should_stop_blob=core.BlobReference('should_stop_1')),
            core.execution_step(
                'step_2', nets[1],
                should_stop_blob=core.BlobReference('should_stop_2')),
            core.execution_step('step_3', nets[2])
        ]
        expected = [1.0, 0.0, 1.0]

        plan = core.Plan('plan')
        plan.AddStep(core.execution_step('all_steps', steps, num_iter=3))
        workspace.RunPlan(plan)

        for i, net in enumerate(nets):
            self.assertEqual(
                workspace.FetchBlob('output_{}'.format(i + 1))[0],
                expected[i])

    @given(initial_iters=st.integers(0, 100),
           num_iters=st.integers(0, 100))
    def test_iter_count_with_execution_step(self, initial_iters, num_iters):

@@ -1523,6 +1604,13 @@ class TestOperators(hu.HypothesisTestCase):
    def test_cast(self, a, src, dst, use_name, gc, dc):
        a = a.astype(src)

        # Casting from a float type outside the range of the integral
        # type is UB.
        ftypes = [np.float32, np.float64]
        if src in ftypes and dst not in ftypes and dst is not np.bool:
            info = np.iinfo(dst)
            a = np.clip(a, info.min, info.max)

        def ref(data):
            return [data.astype(dst)]

@@ -1571,7 +1659,8 @@ class TestOperators(hu.HypothesisTestCase):
        self.assertDeviceChecks(dc, op, [X], [0])
        self.assertGradientChecks(gc, op, [X], 0, [0])

    @given(X=hu.tensor(), seed=st.integers(min_value=0, max_value=65536),
    @given(X=_dtypes().flatmap(lambda dtype: hu.tensor(dtype=dtype)),
           seed=st.integers(min_value=0, max_value=65536),
           null_axes=st.booleans(),
           **hu.gcs)
    def test_transpose(self, X, seed, null_axes, gc, dc):

@@ -1589,6 +1678,7 @@ class TestOperators(hu.HypothesisTestCase):

        self.assertReferenceChecks(gc, op, [X, axes],
                                   transpose_ref)
        if X.dtype != np.int32 and X.dtype != np.int64:
            self.assertGradientChecks(gc, op, [X], 0, [0])

    @given(n=st.integers(1, 3),
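The clipping added in test_cast above guards a real hazard: converting a float that does not fit the target integer type is undefined behavior in C++, and numpy merely reports whatever the hardware does. A quick illustration (values chosen for the example):

    import numpy as np

    f = np.float64(3e9)                    # does not fit in int32
    info = np.iinfo(np.int32)
    safe = np.clip(f, info.min, info.max)  # clipped to 2147483647.0
    print(np.int32(safe))                  # well-defined: 2147483647
    # np.int32(f) without the clip is platform-dependent (C++ UB on conversion)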
@@ -263,7 +263,11 @@ class HypothesisTestCase(test_util.TestCase):
        outs = []
        for (n, ref) in zip(op.output, reference_outputs):
            output = workspace.FetchBlob(n)
            np.testing.assert_allclose(output, ref, atol=1e-4, rtol=1e-4)
            if output.dtype.kind in ('S', 'O'):
                np.testing.assert_array_equal(output, ref)
            else:
                np.testing.assert_allclose(
                    output, ref, atol=1e-4, rtol=1e-4)
            outs.append(output)
        if grad_reference and output_to_grad:
            self._assertGradReferenceChecks(
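The dtype.kind branch is needed because np.testing.assert_allclose only makes sense for numeric arrays; byte-string ('S') and object ('O') arrays must be compared for exact equality instead. A minimal illustration:

    import numpy as np

    a = np.array([b'foo', b'bar'])       # a.dtype.kind == 'S'
    np.testing.assert_array_equal(a, a)  # fine
    # np.testing.assert_allclose(a, a)   # raises: not supported for non-numeric dtypes
    print(a.dtype.kind, np.array([1.0]).dtype.kind)  # 'S' 'f'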
138  caffe2/python/io.py  Normal file
@@ -0,0 +1,138 @@
"""
Defines the base interface for reading and writing operations.

Readers/Writers are objects that produce operations that read/write sequences
of data. Each operation reads or writes a list of BlobReferences.

Readers and Writers must be implemented such that read and write operations
are atomic and thread safe.

Examples of possible Readers and Writers:
    HiveReader, HiveWriter,
    QueueReader, QueueWriter,
    DatasetReader, DatasetWriter,
    DBReader, DBWriter,

See `dataset.py` for an example of implementation.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core


class Reader(object):
    """
    Reader is an abstract class to be implemented in order to provide
    operations capable of iterating through a dataset or stream of data.

    A Reader must implement at least one operation, `read`, which
    adds operations to a net that read the next batch of data. Readers can
    optionally support the `reset` operation, which is useful when multiple
    passes over the data are required.
    """
    def read(self, read_net, batch_size=1, *args):
        """
        Add operations to read_net that will read the next batch of data
        and return a list of BlobReference representing the blobs that will
        contain the batches produced.

        Operations added to `read_net` must be thread safe and atomic, that is,
        it should be possible to clone `read_net` and run multiple instances of
        it in parallel.

        Args:
            read_net: the net that will be appended with read operations
            batch_size: number of entries to read

        Returns:
            A tuple (should_stop, fields), with:

                should_stop: BlobReference pointing to a boolean scalar
                             blob that indicates whether the read operation
                             was successful or whether the end of data has
                             been reached.
                fields: A tuple of BlobReference containing the latest batch
                        of data that was read.
        """
        raise NotImplementedError('Readers must implement `read`.')

    def reset(self, net):
        """Append operations to `net` that will reset the reader.

        This can be used to read the data multiple times.
        Not all readers support this operation.
        """
        raise NotImplementedError('This reader cannot be reset.')

    def execution_step(self, reader_net_name=None, batch_size=1):
        """Create an execution step with a net containing read operators.

        The execution step will contain a `stop_blob` that knows how to stop
        the execution loop when end of data was reached.

        E.g.:

            read_step, fields = reader.execution_step()
            consume_net = core.Net('consume')
            consume_net.Print(fields[0], [])
            p = core.Plan('reader')
            p.AddStep(read_step.AddNet(consume_net))
            core.RunPlan(p)

        Args:

            reader_net_name: (optional) the name of the reader_net to be
                             created. The execution step will
                             be named accordingly.
            batch_size: the batch size

        Returns:
            A tuple (read_step, fields), with:

                read_step: A newly created execution step containing a net with
                           read operations. The step will have `stop_blob` set,
                           in order to stop the loop on end of data.
                fields: A tuple of BlobReference containing the latest batch
                        of data that was read.
        """
        reader_net = core.Net(reader_net_name or 'reader')
        should_stop, fields = self.read(reader_net, batch_size=batch_size)
        read_step = core.execution_step(
            '{}_step'.format(reader_net_name),
            reader_net,
            should_stop_blob=should_stop)
        return (read_step, fields)


class Writer(object):
    """
    Writer is an abstract class to be implemented in order to provide
    operations capable of feeding a data stream or a dataset.

    A Writer must implement 2 operations:
    `write`, which adds operations to a net that write the next batch of
    data, and `commit`, which adds operations to a net in order to indicate
    that no more data will be written.
    """

    def write(self, writer_net, fields):
        """Add operations to `writer_net` that write the next batch of data.

        Operations added to the net must be thread-safe and unique, that is:
        multiple writers must be able to write to the dataset in parallel.

        Args:
            fields: a tuple of BlobReference containing the batch of data to
                    write.
        """
        raise NotImplementedError('Writers must implement write.')

    def commit(self, finish_net):
        """Add operations to `finish_net` that signal end of data.

        This must be implemented by all Writers, but may be no-op for some
        of them.
        """
        raise NotImplementedError('Writers must implement commit.')
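To give a rough feel for the Reader contract, here is a hypothetical subclass (not part of this commit) that yields the same single-field batch forever; the ConstantFill/Cast idiom for producing a boolean stop blob is borrowed from the execution-step tests elsewhere in this commit:

    from caffe2.python import core
    from caffe2.python.io import Reader

    class ConstantReader(Reader):
        """Hypothetical Reader: emits a constant batch and never stops."""
        def __init__(self, value):
            self._value = value

        def read(self, read_net, batch_size=1, *args):
            data = read_net.ConstantFill(
                [], ['const_data'], shape=[batch_size], value=self._value)
            zero = read_net.ConstantFill([], ['zero'], shape=[1], value=0.0)
            should_stop = read_net.Cast([zero], ['should_stop'], to='bool')
            return should_stop, [data]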
@@ -116,7 +116,7 @@ class TestMiniAlexNet(test_util.TestCase):
        gpu_device = caffe2_pb2.DeviceOption()
        gpu_device.device_type = caffe2_pb2.CUDA

        checker = device_checker.DeviceChecker(1e-2, [cpu_device, gpu_device])
        checker = device_checker.DeviceChecker(0.05, [cpu_device, gpu_device])
        ret = checker.CheckNet(
            model.net.Proto(),
            inputs,

@@ -126,15 +126,16 @@ class TestMiniAlexNet(test_util.TestCase):
        )
        self.assertEqual(ret, True)

    def testMiniAlexNet(self):
    @unittest.skipIf(not workspace.has_gpu_support,
                     "No GPU support. Skipping test.")
    def testMiniAlexNetNCHW(self):
        self._testMiniAlexNet("NCHW")

    @unittest.skipIf(not workspace.has_gpu_support,
                     "No GPU support. Skipping test.")
    def testMiniAlexNetNHWC(self):
        self._testMiniAlexNet("NHWC")


if __name__ == '__main__':
    if not workspace.has_gpu_support:
        print('No GPU support. Skipping gpu test.')
    elif workspace.NumCudaDevices() == 0:
        print('No GPU device. Skipping gpu test.')
    else:
        unittest.main()
@@ -3,6 +3,7 @@ from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import json
from collections import defaultdict
from caffe2.python import utils

@@ -53,6 +54,11 @@ def _rectify_operator_and_name(operators_or_net, name):
    return operators, name


def _escape_label(name):
    # json.dumps is poor man's escaping
    return json.dumps(name)


def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
    operators, name = _rectify_operator_and_name(operators_or_net, name)
    graph = pydot.Dot(name, rankdir=rankdir)

@@ -73,7 +79,7 @@ def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
            if input_name not in pydot_nodes:
                input_node = pydot.Node(
                    input_name + str(pydot_node_counts[input_name]),
                    label=input_name,
                    label=_escape_label(input_name),
                    **BLOB_STYLE
                )
            pydot_nodes[input_name] = input_node

@@ -87,7 +93,7 @@ def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
                pydot_node_counts[output_name] += 1
            output_node = pydot.Node(
                output_name + str(pydot_node_counts[output_name]),
                label=output_name,
                label=_escape_label(output_name),
                **BLOB_STYLE
            )
            pydot_nodes[output_name] = output_node
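json.dumps works as a label escaper here because it wraps the string in double quotes and backslash-escapes any quotes inside it, which is what pydot/graphviz labels need. For instance:

    import json

    print(json.dumps('blob'))          # "blob"
    print(json.dumps('conv1/w "x"'))   # "conv1/w \"x\""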
43  caffe2/python/operator_test/atomic_ops_test.py  Normal file
@@ -0,0 +1,43 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase


class TestAtomicOps(TestCase):
    def test_atomic_ops(self):
        """
        Test that both countdown and checksum are updated atomically, by
        having the countdown count from 20k to 0 across parallel workers
        and adding each fetched value to the checksum. If operations are
        truly atomic, each value from 1 to 20k should be fetched exactly
        once from the countdown, and fed exactly once to the checksum,
        such that at the end the checksum must contain the exact value of
        sum[i=1..20000](i).
        """
        init_net = core.Net('init')
        mutex_countdown = init_net.CreateMutex([])
        mutex_checksum = init_net.CreateMutex([])
        countdown = init_net.ConstantIntFill([], shape=[], value=20000.)
        checksum = init_net.ConstantIntFill([], shape=[], value=0.)
        minus_one = init_net.ConstantIntFill([], shape=[], value=-1.)
        steps = []
        for i in range(0, 100):
            net = core.Net('net:%d' % i)
            _, fetched_count = net.AtomicFetchAdd(
                [mutex_countdown, countdown, minus_one],
                [countdown, 'fetched_count:%d' % i])
            net.AtomicFetchAdd(
                [mutex_checksum, checksum, fetched_count],
                [checksum, 'not_used'])
            steps.append(
                core.execution_step('worker:%d' % i, net, num_iter=200))
        super_step = core.execution_step(
            'parent', steps, concurrent_substeps=True)
        plan = core.Plan('plan')
        plan.AddStep(core.execution_step('init', init_net))
        plan.AddStep(super_step)
        workspace.RunPlan(plan)
        # checksum = sum[i=1..20000](i) = 20000 * 20001 / 2 = 200010000
        self.assertEquals(workspace.FetchBlob(checksum), 200010000)
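The expected checksum is just the closed form of the arithmetic series; a one-liner confirms the constant used in the assertion:

    # sum of 1..n is n * (n + 1) // 2
    n = 20000
    assert n * (n + 1) // 2 == 200010000
    assert sum(range(1, n + 1)) == 200010000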
42  caffe2/python/operator_test/counter_ops_test.py  Normal file
@@ -0,0 +1,42 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase


class TestCounterOps(TestCase):
    def test_counter_ops(self):
        workspace.RunOperatorOnce(core.CreateOperator(
            'CreateCounter', [], ['c'], init_count=1))
        workspace.RunOperatorOnce(core.CreateOperator(
            'CountDown', ['c'], ['t1']))  # 1 -> 0
        assert not workspace.FetchBlob('t1')

        workspace.RunOperatorOnce(core.CreateOperator(
            'CountDown', ['c'], ['t2']))  # 0 -> 0
        assert workspace.FetchBlob('t2')

        workspace.RunOperatorOnce(core.CreateOperator(
            'ResetCounter', ['c'], [], init_count=1))  # -> 1
        workspace.RunOperatorOnce(core.CreateOperator(
            'CountDown', ['c'], ['t3']))  # 1 -> 0
        assert not workspace.FetchBlob('t3')

        workspace.RunOperatorOnce(core.CreateOperator(
            'ConstantBoolFill', [], ['t4'], value=0.0, shape=[]))
        assert workspace.FetchBlob('t4') == workspace.FetchBlob('t1')

        workspace.RunOperatorOnce(core.CreateOperator(
            'ConstantBoolFill', [], ['t5'], value=1.0, shape=[]))
        assert workspace.FetchBlob('t5') == workspace.FetchBlob('t2')

        assert workspace.RunOperatorOnce(core.CreateOperator(
            'And', ['t1', 't2'], ['t6']))
        assert not workspace.FetchBlob('t6')  # True && False

        assert workspace.RunOperatorOnce(core.CreateOperator(
            'And', ['t2', 't5'], ['t7']))
        assert workspace.FetchBlob('t7')  # True && True
71  caffe2/python/operator_test/cross_entropy_ops_test.py  Normal file
@@ -0,0 +1,71 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def sigmoid_cross_entropy_with_logits(x, z):
    return np.maximum(x, 0) - x * z + np.log(1 + np.exp(-np.abs(x)))


def sigmoid_cross_entropy_with_logits_grad(x, z):
    return z - sigmoid(x)


class TestCrossEntropyOps(hu.HypothesisTestCase):
    @given(
        inputs=st.lists(
            elements=st.integers(min_value=1, max_value=5),
            min_size=1,
            max_size=2,
            average_size=2,
        ).flatmap(
            lambda shape: st.tuples(
                hu.arrays(
                    dims=shape,
                    elements=st.one_of(
                        st.floats(min_value=-1.0, max_value=-0.1),
                        st.floats(min_value=0.1, max_value=1.0),
                    )),
                hu.arrays(
                    dims=shape,
                    elements=st.sampled_from([0.0, 1.0]),
                ),
            )
        ),
    )
    def test_sigmoid_cross_entropy_with_logits(self, inputs):
        logits, targets = inputs

        def sigmoid_xentr_logit_ref(logits, targets):
            s = sigmoid_cross_entropy_with_logits(logits, targets)
            m = np.mean(s, axis=len(logits.shape) - 1)
            return (m, )

        def sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs):
            fwd_logits, fwd_targets = fwd_inputs
            inner_size = fwd_logits.shape[-1]
            m = fwd_targets - sigmoid(fwd_logits)
            g_in = -np.expand_dims(g_out, axis=-1) * m / inner_size
            return (g_in, None)

        op = core.CreateOperator(
            'SigmoidCrossEntropyWithLogits',
            ['logits', 'targets'],
            ['xentropy'])
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [logits, targets],
            sigmoid_xentr_logit_ref,
            output_to_grad='xentropy',
            grad_reference=sigmoid_xentr_logit_grad_ref)
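The reference above uses the standard numerically stable rewriting of -z*log(sigmoid(x)) - (1-z)*log(1-sigmoid(x)) as max(x, 0) - x*z + log(1 + exp(-|x|)). A quick numeric sanity check on moderate values (where the naive form is still finite) shows the two agree:

    import numpy as np

    def naive(x, z):
        s = 1.0 / (1.0 + np.exp(-x))
        return -z * np.log(s) - (1 - z) * np.log(1 - s)

    def stable(x, z):
        return np.maximum(x, 0) - x * z + np.log(1 + np.exp(-np.abs(x)))

    x, z = np.array([-3.0, 0.5, 4.0]), np.array([1.0, 0.0, 1.0])
    assert np.allclose(naive(x, z), stable(x, z))
    # for large |x| the naive form overflows or loses precision; the stable one does not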
284  caffe2/python/operator_test/dataset_ops_test.py  Normal file
@@ -0,0 +1,284 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from caffe2.python import core, workspace, dataset
from caffe2.python.dataset import Const
from caffe2.python.schema import List, Struct, Scalar, Map
from caffe2.python.test_util import TestCase


def _assert_arrays_equal(actual, ref, err_msg):
    if ref.dtype.kind in ('S', 'O'):
        np.testing.assert_array_equal(actual, ref, err_msg=err_msg)
    else:
        np.testing.assert_allclose(
            actual, ref, atol=1e-4, rtol=1e-4, err_msg=err_msg)


class TestDatasetOps(TestCase):
    def test_dataset_ops(self):
        """
        1. Defining the schema of our dataset.

        This example schema could represent, for example, a search query log.
        """
        schema = Struct(
            # fixed size vector, which will be stored as a matrix when batched
            ('dense', Scalar((np.float32, 3))),
            # could represent a feature map from feature ID to float value
            ('floats', Map(
                Scalar(np.int32),
                Scalar(np.float32))),
            # could represent a multi-valued categorical feature map
            ('int_lists', Map(
                Scalar(np.int32),
                List(Scalar(np.int64)),
            )),
            # could represent a multi-valued, weighted categorical feature map
            ('id_score_pairs', Map(
                Scalar(np.int32),
                Map(
                    Scalar(np.int64),
                    Scalar(np.float32),
                    keys_name='ids',
                    values_name='scores'),
            )),
            # additional scalar information
            ('metadata', Struct(
                ('user_id', Scalar(np.int64)),
                ('user_embed', Scalar((np.float32, 2))),
                ('query', Scalar(str)),
            )),
        )

        """
        This is what the flattened fields for this schema look like, along
        with their types. Each one of these fields will be stored, read and
        written as a tensor.
        """
        expected_fields = [
            ('dense', (np.float32, 3)),
            ('floats:lengths', np.int32),
            ('floats:values:keys', np.int32),
            ('floats:values:values', np.float32),
            ('int_lists:lengths', np.int32),
            ('int_lists:values:keys', np.int32),
            ('int_lists:values:values:lengths', np.int32),
            ('int_lists:values:values:values', np.int64),
            ('id_score_pairs:lengths', np.int32),
            ('id_score_pairs:values:keys', np.int32),
            ('id_score_pairs:values:values:lengths', np.int32),
            ('id_score_pairs:values:values:values:ids', np.int64),
            ('id_score_pairs:values:values:values:scores', np.float32),
            ('metadata:user_id', np.int64),
            ('metadata:user_embed', (np.float32, 2)),
            ('metadata:query', str),
        ]
        zipped = zip(
            expected_fields,
            schema.field_names(),
            schema.field_types())
        for (ref_name, ref_type), name, dtype in zipped:
            self.assertEquals(ref_name, name)
            self.assertEquals(np.dtype(ref_type), dtype)

        """
        2. The contents of our dataset.

        Contents as defined below could represent, for example, a log of
        search queries along with dense, sparse features and metadata.
        The dataset below has 3 top-level entries.
        """
        contents_raw = [
            # dense
            [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
            # floats
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
            # int lists
            [2, 0, 2],  # len
            [11, 12, 31, 32],  # key
            [2, 4, 3, 1],  # value:len
            [111, 112, 121, 122, 123, 124, 311, 312, 313, 321],  # value:value
            # id score pairs
            [1, 2, 2],  # len
            [11, 21, 22, 31, 32],  # key
            [1, 1, 2, 2, 3],  # value:len
            [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
            [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
            # metadata
            [123, 234, 456],  # user_id
            [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
            ['dog posts', 'friends who like to', 'posts about ca'],  # query
        ]
        # convert the above content to ndarrays, checking against the schema
        contents = dataset.to_ndarray_list(contents_raw, schema)

        """
        3. Creating and appending to the dataset.
        We first create an empty dataset with the given schema.
        Then, a Writer is used to append these entries to the dataset.
        """
        ds = dataset.Dataset(schema)
        net = core.Net('init')
        ds.init_empty(net)

        blobs_to_append = [Const(net, c) for c in contents]
        writer = ds.writer(init_net=net)
        writer.write(net, blobs_to_append)
        workspace.RunNetOnce(net)

        """
        4. Iterating through the dataset contents.

        If we were to iterate through the top level entries of our dataset,
        this is what we should expect to see:
        """
        entries_raw = [
            (
                [[1.1, 1.2, 1.3]],  # dense
                [1], [11], [1.1],  # floats
                [2], [11, 12], [2, 4], [111, 112, 121, 122, 123, 124],  # intlst
                [1], [11], [1], [111], [11.1],  # id score pairs
                [123], [[0.2, 0.8]], ['dog posts'],  # metadata
            ),
            (
                [[2.1, 2.2, 2.3]],  # dense
                [2], [21, 22], [2.1, 2.2],  # floats
                [0], [], [], [],  # int list
                [2], [21, 22], [1, 2], [211, 221, 222], [21.1, 22.1, 22.2],
                [234], [[0.5, 0.5]], ['friends who like to'],  # metadata
            ),
            (
                [[3.1, 3.2, 3.3]],  # dense
                [3], [31, 32, 33], [3.1, 3.2, 3.3],  # floats
                [2], [31, 32], [3, 1], [311, 312, 313, 321],  # int lst
                [2], [31, 32], [2, 3], [311, 312, 321, 322, 323],
                [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
                [456], [[0.7, 0.3]], ['posts about ca'],  # metadata
            ),
            # after the end of the dataset, we will keep getting empty vectors
            ([],) * 16,
            ([],) * 16,
        ]
        entries = [dataset.to_ndarray_list(e, schema) for e in entries_raw]
"""
|
||||
Let's go ahead and create the reading nets.
|
||||
We will run `read` net multiple times and assert that we are reading the
|
||||
entries the way we stated above.
|
||||
"""
|
||||
read_init_net = core.Net('read_init')
|
||||
read_next_net = core.Net('read_next')
|
||||
reader = ds.reader(read_init_net)
|
||||
should_continue, batch_blobs = reader.read(read_next_net)
|
||||
|
||||
workspace.RunNetOnce(read_init_net)
|
||||
|
||||
workspace.CreateNet(read_next_net)
|
||||
read_next_net_name = str(read_next_net)
|
||||
|
||||
for i, entry in enumerate(entries):
|
||||
workspace.RunNet(read_next_net_name)
|
||||
for name, blob, base in zip(ds.field_names(), batch_blobs, entry):
|
||||
data = workspace.FetchBlob(str(blob))
|
||||
_assert_arrays_equal(
|
||||
data, base,
|
||||
err_msg='Mismatch in entry %d, field %s' % (i, name))
|
||||
|
||||
"""
|
||||
5. Reading/writing in a single plan
|
||||
|
||||
If all of operations on the data are expressible as Caffe2 operators,
|
||||
we don't need to load the data to python, iterating through the dataset
|
||||
in a single Plan.
|
||||
|
||||
Where we will process the dataset a little and store it in a second
|
||||
dataset. We can reuse the same Reader since it supports reset.
|
||||
"""
|
||||
reset_net = core.Net('reset_net')
|
||||
reader.reset(reset_net)
|
||||
read_step, fields = reader.execution_step()
|
||||
|
||||
""" We will add the line number * 1000 to the feature ids. """
|
||||
process_net = core.Net('process')
|
||||
line_no = Const(process_net, 0, dtype=np.int32)
|
||||
const_one = Const(process_net, 1000, dtype=np.int32)
|
||||
process_net.Add([line_no, const_one], [line_no])
|
||||
fid = schema.floats.values.keys.id()
|
||||
process_net.Print(fields[fid], [])
|
||||
process_net.Add([fields[fid], line_no], fields[fid], broadcast=1)
|
||||
|
||||
""" Lets create a second dataset and append to it. """
|
||||
ds2 = dataset.Dataset(schema, name='dataset2')
|
||||
ds2.init_empty(reset_net)
|
||||
writer = ds2.writer(reset_net)
|
||||
writer.write(process_net, fields)
|
||||
# commit is not necessary for DatasetWriter but will add it for
|
||||
# generality of the example
|
||||
commit_net = core.Net('commit')
|
||||
writer.commit(commit_net)
|
||||
|
||||
""" Time to create and run a plan which will do the processing """
|
||||
plan = core.Plan('process')
|
||||
plan.AddStep(core.execution_step('reset', reset_net))
|
||||
plan.AddStep(read_step.AddNet(process_net))
|
||||
plan.AddStep(core.execution_step('commit', commit_net))
|
||||
workspace.RunPlan(plan)
|
||||
|
||||
"""
|
||||
Now we should have dataset2 populated.
|
||||
"""
|
||||
ds2blobs = ds2.get_blobs()
|
||||
for i, (name, blob) in enumerate(zip(schema.field_names(), ds2blobs)):
|
||||
data = workspace.FetchBlob(str(blob))
|
||||
content = contents[i]
|
||||
if i == fid:
|
||||
# one of our fields has been added with line numbers * 1000
|
||||
content += [1000, 2000, 2000, 3000, 3000, 3000]
|
||||
_assert_arrays_equal(
|
||||
data, contents[i], err_msg='Mismatch in field %s.' % name)
|
||||
|
||||
"""
|
||||
6. Slicing a dataset
|
||||
|
||||
You can create a new schema from pieces of another schema and reuse
|
||||
the same data.
|
||||
"""
|
||||
subschema = Struct(('top_level', schema.int_lists.values))
|
||||
int_list_contents = contents[schema.int_lists.values.slice()]
|
||||
self.assertEquals(len(subschema.field_names()), len(int_list_contents))
|
||||
|
||||
"""
|
||||
7. Random Access a dataset
|
||||
|
||||
"""
|
||||
read_init_net = core.Net('read_init')
|
||||
read_next_net = core.Net('read_next')
|
||||
|
||||
idx = np.array([2, 1, 0])
|
||||
workspace.FeedBlob('idx', idx)
|
||||
|
||||
reader = ds.random_reader(read_init_net, 'idx')
|
||||
reader.computeoffset(read_init_net)
|
||||
|
||||
should_continue, batch_blobs = reader.read(read_next_net)
|
||||
|
||||
workspace.CreateNet(read_init_net)
|
||||
workspace.RunNetOnce(read_init_net)
|
||||
|
||||
workspace.CreateNet(read_next_net)
|
||||
read_next_net_name = str(read_next_net)
|
||||
|
||||
for i in range(len(entries)):
|
||||
k = idx[i] if i in idx else i
|
||||
entry = entries[k]
|
||||
workspace.RunNet(read_next_net_name)
|
||||
for name, blob, base in zip(ds.field_names(), batch_blobs, entry):
|
||||
data = workspace.FetchBlob(str(blob))
|
||||
_assert_arrays_equal(
|
||||
data, base,
|
||||
err_msg='Mismatch in entry %d, field %s' % (i, name))
|
||||
|
|
@@ -2,9 +2,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase
import numpy as np


class TestIndexOps(TestCase):

@@ -47,6 +47,13 @@ class TestIndexOps(TestCase):
        result2 = workspace.FetchBlob('result2')
        np.testing.assert_array_equal([0, 5, 1, 0, 0], result2)

        workspace.RunOperatorOnce(core.CreateOperator(
            'IndexSize',
            ['index'],
            ['index_size']))
        size = workspace.FetchBlob('index_size')
        self.assertEquals(size, 6)

        workspace.RunOperatorOnce(core.CreateOperator(
            'IndexStore',
            ['index'],

@@ -55,3 +62,21 @@ class TestIndexOps(TestCase):
        new_entries = np.array(['new_entry1', 'new_entry2'], dtype=str)
        np.testing.assert_array_equal(
            np.concatenate((entries, new_entries)), stored_actual)

        workspace.RunOperatorOnce(core.CreateOperator(
            'StringIndexCreate',
            [],
            ['index2']))

        workspace.RunOperatorOnce(core.CreateOperator(
            'IndexLoad',
            ['index2', 'stored_entries'],
            [],
            skip_first_entry=1))

        workspace.RunOperatorOnce(core.CreateOperator(
            'IndexSize',
            ['index2'],
            ['index2_size']))
        index2_size = workspace.FetchBlob('index2_size')
        self.assertEquals(index2_size, 5)
76  caffe2/python/operator_test/one_hot_ops_test.py  Normal file
@@ -0,0 +1,76 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np


def _one_hots():
    index_size = st.integers(min_value=1, max_value=5)
    lengths = st.lists(
        elements=st.integers(min_value=0, max_value=5))
    return st.tuples(index_size, lengths).flatmap(
        lambda x: st.tuples(
            st.just(x[0]),
            st.just(x[1]),
            st.lists(
                elements=st.integers(min_value=0, max_value=x[0] - 1),
                min_size=sum(x[1]),
                max_size=sum(x[1]))))


class TestOneHotOps(hu.HypothesisTestCase):
    @given(
        hot_indices=hu.tensor(
            min_dim=1, max_dim=1, dtype=np.int64,
            elements=st.integers(min_value=0, max_value=42)),
        end_padding=st.integers(min_value=0, max_value=2))
    def test_one_hot(self, hot_indices, end_padding):

        def one_hot_ref(hot_indices, size):
            out = np.zeros([len(hot_indices), size], dtype=float)
            for i, x in enumerate(hot_indices):
                out[i, x] = 1.
            return (out, )

        size = np.array(max(hot_indices) + end_padding + 1, dtype=np.int64)
        if size == 0:
            size = 1
        op = core.CreateOperator('OneHot', ['hot_indices', 'size'], ['output'])
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [hot_indices, size],
            one_hot_ref)

    @given(hot_indices=_one_hots())
    def test_segment_one_hot(self, hot_indices):
        index_size, lengths, indices = hot_indices

        index_size = np.array(index_size, dtype=np.int64)
        lengths = np.array(lengths, dtype=np.int32)
        indices = np.array(indices, dtype=np.int64)

        def segment_one_hot_ref(lengths, hot_indices, size):
            offset = 0
            out = np.zeros([len(lengths), size], dtype=float)
            for i, length in enumerate(lengths):
                for idx in hot_indices[offset:offset + length]:
                    out[i, idx] = 1.
                offset += length
            return (out, )

        op = core.CreateOperator(
            'SegmentOneHot',
            ['lengths', 'hot_indices', 'size'],
            ['output'])
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [lengths, indices, index_size],
            segment_one_hot_ref)
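To make the segment variant concrete, here is the reference logic on a tiny hand-checked input: lengths [2, 1] split hot_indices [0, 2, 1] into segments [0, 2] and [1], so each row marks the indices present in its segment:

    import numpy as np

    lengths = np.array([2, 1])
    hot_indices = np.array([0, 2, 1])
    size = 3

    out = np.zeros([len(lengths), size])
    offset = 0
    for i, length in enumerate(lengths):
        for idx in hot_indices[offset:offset + length]:
            out[i, idx] = 1.
        offset += length
    print(out)  # [[1. 0. 1.], [0. 1. 0.]]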
@@ -83,6 +83,10 @@ def logsumexp_grad(grad_out, outputs, inputs):
                    axis=0) * np.exp(inputs[0])


def logmeanexp(x):
    return np.log(np.mean(np.exp(x), axis=0))


def mean(x):
    return np.mean(x, axis=0)


@@ -94,6 +98,30 @@ def mean_grad(grad_out, outputs, inputs):
                     axis=0)


def max(x):
    return np.amax(x, axis=0)


def max_grad(grad_out, outputs, inputs):
    flat_inputs = inputs[0].flatten()
    flat_outputs = np.array(outputs[0]).flatten()
    flat_grad_in = np.zeros(flat_inputs.shape)
    flat_grad_out = np.array(grad_out).flatten()
    blocks = inputs[0].shape[0]
    block_size = flat_inputs.shape[0] // blocks

    for i in range(block_size):
        out_grad = flat_grad_out[i]
        out = flat_outputs[i]
        for j in range(blocks):
            idx = j * block_size + i
            if out == flat_inputs[idx]:
                flat_grad_in[idx] = out_grad
                break

    return np.resize(flat_grad_in, inputs[0].shape)


REFERENCES_ALL = [
    ('Sum', partial(np.sum, axis=0), sum_grad),
]

@@ -101,7 +129,10 @@ REFERENCES_ALL = [
REFERENCES_SORTED = [
    ('RangeSum', partial(np.sum, axis=0), sum_grad),
    ('RangeLogSumExp', logsumexp, logsumexp_grad),
    # gradient is the same as sum
    ('RangeLogMeanExp', logmeanexp, logsumexp_grad),
    ('RangeMean', mean, mean_grad),
    ('RangeMax', max, max_grad),
]
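max_grad routes each output gradient to the first input element that attains the maximum and gives all others zero, matching the usual subgradient convention for ties. A small hand-checked run of the same routing logic:

    import numpy as np

    x = np.array([[1., 5.], [5., 2.]])   # two blocks of size 2
    out = np.amax(x, axis=0)             # [5., 5.]
    g_out = np.array([10., 20.])

    g_in = np.zeros_like(x)
    for i in range(x.shape[1]):
        for j in range(x.shape[0]):      # first block that matches wins
            if x[j, i] == out[i]:
                g_in[j, i] = g_out[i]
                break
    print(g_in)  # [[ 0. 20.], [10.  0.]]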
@@ -185,3 +185,37 @@ class TestSequenceOps(hu.HypothesisTestCase):
            op,
            [padded_data, padded_lengths],
            partial(_gather_padding_ref, start_pad_width, end_pad_width))

    @given(data=hu.tensor(min_dim=3, max_dim=3, dtype=np.float32,
                          elements=st.floats(min_value=-np.inf,
                                             max_value=np.inf),
                          min_value=1, max_value=10),
           **hu.gcs_cpu_only)
    def test_reverse_packed_segs(self, data, gc, dc):
        max_length = data.shape[0]
        batch_size = data.shape[1]
        lengths = np.random.randint(max_length + 1, size=batch_size)

        op = core.CreateOperator(
            "ReversePackedSegs",
            ["data", "lengths"],
            ["reversed_data"])

        def op_ref(data, lengths):
            rev_data = np.array(data, copy=True)
            for i in range(batch_size):
                seg_length = lengths[i]
                for j in range(seg_length):
                    rev_data[j][i] = data[seg_length - 1 - j][i]
            return (rev_data,)

        def op_grad_ref(grad_out, outputs, inputs):
            return op_ref(grad_out, inputs[1]) + (None,)

        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=[data, lengths],
            reference=op_ref,
            output_to_grad='reversed_data',
            grad_reference=op_grad_ref)
106  caffe2/python/operator_test/string_ops_test.py  Normal file
@@ -0,0 +1,106 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np


def _string_lists(alphabet=None):
    return st.lists(
        elements=st.text(alphabet=alphabet, average_size=3),
        min_size=0,
        max_size=3)


class TestStringOps(hu.HypothesisTestCase):
    @given(strings=_string_lists())
    def test_string_prefix(self, strings):
        length = 3
        # although we are utf-8 encoding below to avoid python exceptions,
        # StringPrefix op deals with byte-length prefixes, which may produce
        # an invalid utf-8 string. The goal here is just to avoid python
        # complaining about the unicode -> str conversion.
        strings = np.array(
            map(lambda a: a.encode('utf-8'), strings), dtype=np.object)

        def string_prefix_ref(strings):
            return (
                np.array(map(lambda a: a[:length], strings), dtype=object), )

        op = core.CreateOperator(
            'StringPrefix',
            ['strings'],
            ['stripped'],
            length=length)
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [strings],
            string_prefix_ref)

    @given(strings=_string_lists())
    def test_string_suffix(self, strings):
        length = 3
        strings = np.array(
            map(lambda a: a.encode('utf-8'), strings), dtype=np.object)

        def string_suffix_ref(strings):
            return (
                np.array(map(lambda a: a[-length:], strings), dtype=object), )

        op = core.CreateOperator(
            'StringSuffix',
            ['strings'],
            ['stripped'],
            length=length)
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [strings],
            string_suffix_ref)

    @given(strings=st.text(alphabet=['a', 'b'], average_size=3))
    def test_string_starts_with(self, strings):
        prefix = 'a'
        strings = np.array(
            map(lambda a: str(a), strings), dtype=np.object)

        def string_starts_with_ref(strings):
            return (np.array(
                map(lambda a: a.startswith(prefix), strings), dtype=bool), )

        op = core.CreateOperator(
            'StringStartsWith',
            ['strings'],
            ['bools'],
            prefix=prefix)
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [strings],
            string_starts_with_ref)

    @given(strings=st.text(alphabet=['a', 'b'], average_size=3))
    def test_string_ends_with(self, strings):
        suffix = 'a'
        strings = np.array(
            map(lambda a: str(a), strings), dtype=np.object)

        def string_ends_with_ref(strings):
            return (np.array(
                map(lambda a: a.endswith(suffix), strings), dtype=bool), )

        op = core.CreateOperator(
            'StringEndsWith',
            ['strings'],
            ['bools'],
            suffix=suffix)
        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            [strings],
            string_ends_with_ref)
348  caffe2/python/schema.py  Normal file
@@ -0,0 +1,348 @@
"""
Defines a minimal set of data types that allow to represent datasets with
arbitrary nested structure, including objects of variable length, such as
maps and lists.

This defines a columnar storage format for such datasets on top of caffe2
tensors. In terms of capacity of representation, it can represent most of
the data types supported by the Parquet, ORC and DWRF file formats.

See comments in operator_test/dataset_ops_test.py for an example and
walkthrough on how to use schema to store and iterate through a structured
in-memory dataset.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from collections import OrderedDict
import logging
import numpy as np

logger = logging.getLogger(__name__)


def _join_field_name(prefix, suffix):
    if prefix and suffix:
        return '{}:{}'.format(prefix, suffix)
    elif prefix:
        return prefix
    elif suffix:
        return suffix
    else:
        return ''


class Field(object):
    """Represents an abstract field type in a dataset.
    """
    def __init__(self, children):
        """Derived classes must call this after their initialization."""
        self._parent = (None, 0)
        offset = 0
        self._field_offsets = []
        for child in children:
            self._field_offsets.append(offset)
            offset += len(child.field_names())
        self._field_offsets.append(offset)

    def field_names(self):
        """Return the children field names for this field."""
        raise NotImplementedError('Field is an abstract class.')

    def field_types(self):
        """Return the numpy.dtype for each of the children fields."""
        raise NotImplementedError('Field is an abstract class.')

    def clone(self):
        """Clone this Field along with its children."""
        raise NotImplementedError('Field is an abstract class.')

    def _set_parent(self, parent, relative_id):
        self._parent = (parent, relative_id)

    def slice(self):
        """
        Returns a slice representing the range of field ids that belong to
        this field. This slice can be used to index a list of fields.

        E.g.:

            >>> s = Struct(
            >>>     ('a', Scalar()),
            >>>     ('b', Struct(
            >>>         ('b1', Scalar()),
            >>>         ('b2', Scalar()),
            >>>     )),
            >>>     ('c', Scalar()),
            >>> )
            >>> field_data = ['da', 'db1', 'db2', 'dc']
            >>> field_data[s.b.slice()]
            ['db1', 'db2']
        """
        base_id = self._child_base_id()
        return slice(base_id, base_id + len(self.field_names()))

    def _child_base_id(self, child_index=None):
        """Get the base id of the given child."""
        p, i = self._parent
        pos = 0 if child_index is None else self._field_offsets[child_index]
        if p:
            pos += p._child_base_id(i)
        return pos

    def __eq__(self, other):
        """Equivalence of two schemas."""
        return ((self.field_names() == other.field_names()) and
                (self.field_types() == other.field_types()))


class List(Field):
    """Represents a variable-length list.

    Values of a list can also be complex fields such as Lists and Structs.
    In addition to the fields exposed by its `values` field, a List exposes an
    additional `lengths` field, which will contain the size of each list under
    the parent domain.
    """
    def __init__(self, values):
        assert isinstance(values, Field)
        self.lengths = Scalar(np.int32)
        self.values = values.clone()
        self.lengths._set_parent(self, 0)
        self.values._set_parent(self, 1)
        Field.__init__(self, [self.lengths, self.values])

    def field_names(self):
        value_fields = self.values.field_names()
        return (
            ['lengths'] +
            [_join_field_name('values', v) for v in value_fields])

    def field_types(self):
        return self.lengths.field_types() + self.values.field_types()

    def clone(self):
        return List(self.values)


class Struct(Field):
    """Represents a named list of fields sharing the same domain.
    """
    def __init__(self, *fields):
        for field in fields:
            assert len(field) == 2
            assert field[0], 'Field names cannot be empty'
            assert field[0] != 'lengths', (
                'Struct cannot contain a field named `lengths`.')
            assert isinstance(field[1], Field)
        fields = [(name, field.clone()) for name, field in fields]
        for id, (name, field) in enumerate(fields):
            field._set_parent(self, id)
        self.fields = OrderedDict(fields)
        Field.__init__(self, self.fields.values())

    def field_names(self):
        names = []
        for name, field in self.fields.items():
            names += [_join_field_name(name, f) for f in field.field_names()]
        return names

    def field_types(self):
        types = []
        for name, field in self.fields.items():
            types += field.field_types()
        return types

    def clone(self):
        return Struct(*self.fields.items())

    def __getattr__(self, item):
        return self.fields[item]
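The offset bookkeeping in Field._child_base_id is what makes slice() work on nested structs: every leaf gets a global position in the flattened field list. Reusing the docstring's example (this assumes schema.py is importable as caffe2.python.schema):

    from caffe2.python.schema import Struct, Scalar

    s = Struct(
        ('a', Scalar()),
        ('b', Struct(('b1', Scalar()), ('b2', Scalar()))),
        ('c', Scalar()),
    )
    field_data = ['da', 'db1', 'db2', 'dc']
    print(s.b.slice())              # slice(1, 3)
    print(field_data[s.b.slice()])  # ['db1', 'db2']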
class Scalar(Field):
|
||||
"""Represents a typed scalar or tensor of fixed shape.
|
||||
|
||||
A Scalar is a leaf in a schema tree, translating to exactly one tensor in
|
||||
the dataset's underlying storage.
|
||||
|
||||
Usually, the tensor storing the actual values of this field is a 1D tensor,
|
||||
representing a series of values in its domain. It is possible however to
|
||||
have higher rank values stored as a Scalar, as long as all entries have
|
||||
the same shape.
|
||||
|
||||
E.g.:
|
||||
|
||||
Scalar(np.float64)
|
||||
|
||||
Scalar field of type float32. Caffe2 will expect readers and
|
||||
datasets to expose it as a 1D tensor of doubles (vector), where
|
||||
the size of the vector is determined by this fields' domain.
|
||||
|
||||
Scalar((np.int32, 5))
|
||||
|
||||
Tensor field of type int32. Caffe2 will expect readers and
|
||||
datasets to implement it as a 2D tensor (matrix) of shape (L, 5),
|
||||
where L is determined by this fields' domain.
|
||||
|
||||
Scalar((str, (10, 20)))
|
||||
|
||||
Tensor field of type str. Caffe2 will expect readers and
|
||||
datasets to implement it as a 3D tensor of shape (L, 10, 20),
|
||||
where L is determined by this fields' domain.
|
||||
|
||||
If the field type is unknown at construction time, call Scalar(), that will
|
||||
default to np.void as its dtype.
|
||||
|
||||
It is an error to pass a structured dtype to Scalar, since it would contain
|
||||
more than one field. Instead, use from_dtype, which will construct
|
||||
a nested `Struct` field reflecting the given dtype's structure.
|
||||
"""
|
||||
def __init__(self, dtype=None):
|
||||
self._original_dtype = dtype
|
||||
self.dtype = np.dtype(dtype or np.void)
|
||||
assert not self.dtype.fields, (
|
||||
'Cannot create Scalar with a structured dtype. ' +
|
||||
'Use from_dtype instead.')
|
||||
Field.__init__(self, [])
|
||||
|
||||
def field_names(self):
|
||||
return ['']
|
||||
|
||||
def field_types(self):
|
||||
return [self.dtype]
|
||||
|
||||
def clone(self):
|
||||
return Scalar(self._original_dtype)
|
||||
|
||||
def id(self):
|
||||
"""
|
||||
Return the zero-indexed position of this scalar field in its schema.
|
||||
Used in order to index into the field_blob list returned by readers or
|
||||
accepted by writers.
|
||||
"""
|
||||
return self._child_base_id()
|
||||
|
||||
|
||||
def Map(keys, values, keys_name='keys', values_name='values'):
|
||||
"""A map is a List of Struct containing keys and values fields.
|
||||
Optionally, you can provide custom name for the key and value fields.
|
||||
"""
|
||||
return List(Struct((keys_name, keys), (values_name, values)))
|
||||
|
||||
|
||||
def from_dtype(dtype, _outer_shape=()):
|
||||
"""Constructs a Caffe2 schema from the given numpy's dtype.
|
||||
|
||||
Numpy supports scalar, array-like and structured datatypes, as long as
|
||||
all the shapes are fixed. This function breaks down the given dtype into
|
||||
a Caffe2 schema containing `Struct` and `Scalar` types.
|
||||
|
||||
Fields containing byte offsets are not currently supported.
|
||||
"""
|
||||
if not isinstance(dtype, np.dtype):
|
||||
# wrap into a ndtype
|
||||
shape = _outer_shape
|
||||
dtype = np.dtype((dtype, _outer_shape))
|
||||
else:
|
||||
# concatenate shapes if necessary
|
||||
shape = _outer_shape + dtype.shape
|
||||
if shape != dtype.shape:
|
||||
dtype = np.dtype((dtype.base, shape))
|
||||
|
||||
if not dtype.fields:
|
||||
return Scalar(dtype)
|
||||
|
||||
struct_fields = []
|
||||
for name, (fdtype, offset) in dtype.fields:
|
||||
assert offset == 0, ('Fields with byte offsets are not supported.')
|
||||
struct_fields += (name, from_dtype(fdtype, _outer_shape=shape))
|
||||
return Struct(*struct_fields)
|
||||
|
||||
|
||||
class _SchemaNode(object):
    """This is a private class used to represent a Schema Node"""
    def __init__(self, name, type_str=''):
        self.name = name
        self.children = []
        self.type_str = type_str
        self.field = None

    def add_child(self, name, type_str=''):
        for child in self.children:
            if child.name == name and child.type_str == type_str:
                return child
        child = _SchemaNode(name, type_str)
        self.children.append(child)
        return child

    def get_field(self):
        list_names = ['lengths', 'values']
        map_names = ['lengths', 'keys', 'values']

        if len(self.children) == 0 or self.field is not None:
            assert self.field is not None
            return self.field

        child_names = []
        for child in self.children:
            child_names.append(child.name)

        if set(child_names) == set(list_names):
            for child in self.children:
                if child.name == 'values':
                    self.field = List(child.get_field())
            self.type_str = "List"
            return self.field

        elif set(child_names) == set(map_names):
            for child in self.children:
                if child.name == 'keys':
                    key_field = child.get_field()
                elif child.name == 'values':
                    values_field = child.get_field()
            self.field = Map(key_field, values_field)
            self.type_str = "Map"
            return self.field

        else:
            struct_fields = []
            for child in self.children:
                if child.field is not None:
                    struct_fields.append((child.name, child.field))
                else:
                    struct_fields.append((child.name, child.get_field()))

            self.field = Struct(*struct_fields)
            self.type_str = "Struct"
            return self.field

    def print_recursively(self):
        for child in self.children:
            child.print_recursively()
        logger.info("Printing node: Name and type")
        logger.info(self.name)
        logger.info(self.type_str)

def from_column_list(column_names, column_types):
    root = _SchemaNode('root', 'Struct')
    for column_name, column_type in zip(column_names, column_types):
        columns = column_name.split(':')
        current = root
        for i in range(len(columns)):
            name = columns[i]
            type_str = ''
            field = None
            if i == len(columns) - 1:
                type_str = column_type
                field = Scalar(column_type)
            next = current.add_child(name, type_str)
            if field is not None:
                next.field = field
            current = next

    return root.get_field()
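
# As a sketch of the colon-separated naming convention this parser expects
# (column names and dtypes below are made up for illustration):
#
#     names = ['doc:id', 'doc:terms:lengths', 'doc:terms:values']
#     types = [np.int64, np.int32, np.float32]
#     schema = from_column_list(names, types)
#     # 'doc' becomes a Struct; its 'terms' child has exactly the
#     # {'lengths', 'values'} children, so get_field() folds it into a List.
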
caffe2/python/sparse_to_dense_mask_test.py (new file, 82 lines)
@ -0,0 +1,82 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase

import numpy as np


class TestSparseToDenseMask(TestCase):

    def test_sparse_to_dense_mask_float(self):
        op = core.CreateOperator(
            'SparseToDenseMask',
            ['indices', 'values', 'default', 'lengths'],
            ['output'],
            mask=[999999999, 2, 6])
        workspace.FeedBlob(
            'indices',
            np.array([2, 4, 6, 1, 2, 999999999, 2], dtype=np.int32))
        workspace.FeedBlob(
            'values',
            np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.float))
        workspace.FeedBlob('default', np.array(-1, dtype=np.float))
        workspace.FeedBlob('lengths', np.array([3, 4], dtype=np.int32))
        workspace.RunOperatorOnce(op)
        output = workspace.FetchBlob('output')
        expected = np.array([[-1, 1, 3], [6, 7, -1]], dtype=np.float)
        self.assertEqual(output.shape, expected.shape)
        self.assertFalse(np.any(output - expected))

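    # Worked example (my reading of the operator from the expected values
    # above, not from its C++ source): mask=[999999999, 2, 6] assigns those
    # sparse ids to dense columns 0, 1, 2, and lengths=[3, 4] splits the
    # (indices, values) pairs into two rows. Row 0 sees (2->1), (4->2),
    # (6->3); id 4 is not in the mask and is dropped, giving [-1, 1, 3].
    # Row 1 sees (1->4), (2->5), (999999999->6), (2->7); id 1 is dropped
    # and the repeated id 2 keeps the last value, giving [6, 7, -1].
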
    def test_sparse_to_dense_mask_string(self):
        op = core.CreateOperator(
            'SparseToDenseMask',
            ['indices', 'values', 'default', 'lengths'],
            ['output'],
            mask=[999999999, 2, 6])
        workspace.FeedBlob(
            'indices',
            np.array([2, 4, 6, 1, 2, 999999999, 2], dtype=np.int32))
        workspace.FeedBlob(
            'values',
            np.array(['1', '2', '3', '4', '5', '6', '7'], dtype=np.str))
        workspace.FeedBlob('default', np.array('-1', dtype=np.str))
        workspace.FeedBlob('lengths', np.array([3, 4], dtype=np.int32))
        workspace.RunOperatorOnce(op)
        output = workspace.FetchBlob('output')
        expected = np.array([['-1', '1', '3'], ['6', '7', '-1']], dtype=np.str)
        self.assertEqual(output.shape, expected.shape)
        self.assertTrue(np.all(np.equal(output, expected)))

    def test_sparse_to_dense_mask_empty_lengths(self):
        op = core.CreateOperator(
            'SparseToDenseMask',
            ['indices', 'values', 'default', 'lengths'],
            ['output'],
            mask=[1, 2, 6])
        workspace.FeedBlob('indices', np.array([2, 4, 6], dtype=np.int32))
        workspace.FeedBlob('values', np.array([1, 2, 3], dtype=np.float))
        workspace.FeedBlob('default', np.array(-1, dtype=np.float))
        workspace.FeedBlob('lengths', np.array([], dtype=np.int32))
        workspace.RunOperatorOnce(op)
        output = workspace.FetchBlob('output')
        expected = np.array([-1, 1, 3], dtype=np.float)
        self.assertEqual(output.shape, expected.shape)
        self.assertFalse(np.any(output - expected))

    def test_sparse_to_dense_mask_no_lengths(self):
        op = core.CreateOperator(
            'SparseToDenseMask',
            ['indices', 'values', 'default'],
            ['output'],
            mask=[1, 2, 6])
        workspace.FeedBlob('indices', np.array([2, 4, 6], dtype=np.int32))
        workspace.FeedBlob('values', np.array([1, 2, 3], dtype=np.float))
        workspace.FeedBlob('default', np.array(-1, dtype=np.float))
        workspace.RunOperatorOnce(op)
        output = workspace.FetchBlob('output')
        expected = np.array([-1, 1, 3], dtype=np.float)
        self.assertEqual(output.shape, expected.shape)
        self.assertFalse(np.any(output - expected))
@ -6,10 +6,18 @@ import shutil
import socket
import tempfile

import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import scope, utils
from ._import_c_extension import *  # noqa

# Python 2 and 3 compatibility: test if basestring exists
try:
    basestring  # NOQA
except NameError:
    # This is python3 so we define basestring.
    basestring = str


def _GetFreeFlaskPort():
    """Get a free flask port."""
@ -86,7 +94,9 @@ def ResetWorkspace(root_folder=None):
    return cc_ResetWorkspace(root_folder)


def CreateNet(net, input_blobs=[]):
def CreateNet(net, input_blobs=None):
    if input_blobs is None:
        input_blobs = []
    for input_blob in input_blobs:
        CreateBlob(input_blob)
    return cc_CreateNet(StringfyProto(net))
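
The switch from input_blobs=[] to input_blobs=None sidesteps Python's
shared-mutable-default pitfall: a default list is created once at function
definition time and reused across every call. A minimal standalone
illustration (not Caffe2-specific):

    def buggy(x, acc=[]):       # one list, shared by every call
        acc.append(x)
        return acc

    buggy(1)                    # [1]
    buggy(2)                    # [1, 2] -- state leaked between calls

    def fixed(x, acc=None):
        if acc is None:
            acc = []            # fresh list per call
        acc.append(x)
        return acc
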
@ -112,6 +122,14 @@ def RunPlan(plan):
    return cc_RunPlan(StringfyProto(plan))


def _StringifyBlobName(name):
    if isinstance(name, basestring):
        return name
    assert type(name).__name__ == 'BlobReference', \
        "Expected a string or BlobReference"
    return str(name)


def FeedBlob(name, arr, device_option=None):
    """Feeds a blob into the workspace.
@ -125,6 +143,10 @@ def FeedBlob(name, arr, device_option=None):
    """
    if type(arr) is caffe2_pb2.TensorProto:
        arr = utils.Caffe2TensorToNumpyArray(arr)
    if type(arr) is np.ndarray and arr.dtype.kind == 'S':
        # Plain NumPy strings are weird, let's use objects instead
        arr = arr.astype(np.object)
    name = _StringifyBlobName(name)
    if device_option is not None:
        return cc_FeedBlob(name, arr, StringfyProto(device_option))
    elif scope.DEVICESCOPE is not None:
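
For context, a small numpy-only illustration of the dtype normalization
applied above (my example; the motivation suggested by the string tests
below is round-tripping strings that contain '\0' bytes):

    import numpy as np

    a = np.array([b'ab', b'cde'])   # dtype kind 'S': fixed-width byte strings
    a = a.astype(object)            # object array of Python byte strings,
                                    # with no fixed-width truncation rules
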
@ -133,6 +155,40 @@ def FeedBlob(name, arr, device_option=None):
    return cc_FeedBlob(name, arr)


def FetchBlob(name):
    """Fetches a blob from the workspace.

    Inputs:
        name: the name of the blob - a string or a BlobReference
    Returns:
        Fetched blob (numpy array or string) if successful
    """
    name = _StringifyBlobName(name)
    return cc_FetchBlob(name)


class _BlobDict(object):
    """Provides a Python dict compatible way to do fetching and feeding"""

    def __getitem__(self, key):
        return FetchBlob(key)

    def __setitem__(self, key, value):
        return FeedBlob(key, value)

    def __len__(self):
        return len(Blobs())

    def __iter__(self):
        return Blobs().__iter__()

    def __contains__(self, item):
        return HasBlob(item)


blobs = _BlobDict()


class Model(object):
    def __init__(self, net, parameters, inputs, outputs, device_option=None):
        """Initializes a model.
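
A sketch of the dict-style access this enables (blob name illustrative;
this mirrors the testFetchFeedViaBlobDict test added below):

    workspace.blobs['x'] = np.ones((2, 3), dtype=np.float32)  # FeedBlob
    x = workspace.blobs['x']                                  # FetchBlob
    'x' in workspace.blobs                                    # HasBlob
    len(workspace.blobs)                                      # len(Blobs())
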
@ -8,7 +8,8 @@ from caffe2.python import core, test_util, workspace
class TestWorkspace(unittest.TestCase):
    def setUp(self):
        self.net = core.Net("test-net")
        self.net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0)
        self.testblob_ref = self.net.ConstantFill(
            [], "testblob", shape=[1, 2, 3, 4], value=1.0)
        workspace.ResetWorkspace()

    def testRootFolder(self):
@ -64,6 +65,20 @@ class TestWorkspace(unittest.TestCase):
        self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
        np.testing.assert_array_equal(fetched_again, 2.0)

    def testFetchFeedBlobViaBlobReference(self):
        self.assertEqual(
            workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
        fetched = workspace.FetchBlob(self.testblob_ref)
        # check if fetched is correct.
        self.assertEqual(fetched.shape, (1, 2, 3, 4))
        np.testing.assert_array_equal(fetched, 1.0)
        fetched[:] = 2.0
        self.assertEqual(workspace.FeedBlob(self.testblob_ref, fetched), True)
        fetched_again = workspace.FetchBlob("testblob")  # fetch by name now
        self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
        np.testing.assert_array_equal(fetched_again, 2.0)

    def testFetchFeedBlobTypes(self):
        for dtype in [np.float16, np.float32, np.float64, np.bool,
                      np.int8, np.int16, np.int32, np.int64,
@ -101,7 +116,8 @@ class TestWorkspace(unittest.TestCase):
        strs = np.array([
            ' '.join(10 * ['long string']),
            ' '.join(128 * ['very long string']),
            'small string'])
            'small \0\1\2 string',
            "Hello, world! I have special \0 symbols \1!"])
        workspace.FeedBlob('my_str_tensor', strs)
        strs2 = workspace.FetchBlob('my_str_tensor')
        self.assertEqual(strs.shape, strs2.shape)
@ -117,6 +133,32 @@ class TestWorkspace(unittest.TestCase):
        for i in range(0, strs.shape[0]):
            self.assertEqual(strs[i], strs2[i])

    def testFetchFeedPlainString(self):
        # this is an actual string, not a tensor of strings
        s = "Hello, world! I have special \0 symbols \1!"
        workspace.FeedBlob('my_plain_string', s)
        s2 = workspace.FetchBlob('my_plain_string')
        self.assertEqual(s, s2)

    def testFetchFeedViaBlobDict(self):
        self.assertEqual(
            workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
        fetched = workspace.blobs["testblob"]
        # check if fetched is correct.
        self.assertEqual(fetched.shape, (1, 2, 3, 4))
        np.testing.assert_array_equal(fetched, 1.0)
        fetched[:] = 2.0
        workspace.blobs["testblob"] = fetched
        fetched_again = workspace.blobs["testblob"]
        self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
        np.testing.assert_array_equal(fetched_again, 2.0)

        self.assertTrue("testblob" in workspace.blobs)
        self.assertFalse("non_existent" in workspace.blobs)
        self.assertEqual(len(workspace.blobs), 1)
        for key in workspace.blobs:
            self.assertEqual(key, "testblob")


class TestMultiWorkspaces(unittest.TestCase):
    def setUp(self):
@ -14,7 +14,8 @@ void adagrad_update(
    float epsilon,
    const float* lr,
    Context* context) {
#pragma omp parallel for
  // TODO(cxj): use OMP when it is reliable
  // #pragma omp parallel for
  for (auto i = 0; i < N; ++i) {
    float gi = g[i];
    float hi = nh[i] = h[i] + gi * gi;
@ -78,8 +79,8 @@ class SparseAdagradOp final : public Operator<Context> {
    const auto* momentIn = Input(MOMENT_1).template data<T>();
    auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data<T>();
    auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data<T>();

#pragma omp parallel for
    // TODO(cxj): use OMP when it is reliable
    // #pragma omp parallel for
    for (auto i = 0; i < n; ++i) {
      auto idx = indices[i];
      if (block_size == 1) {
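
For reference, the AdaGrad step these kernels implement, sketched in Python
(my transcription: the h-update matches the visible `nh[i] = h[i] + gi * gi`;
the weight update and sign convention follow the standard algorithm and
should be checked against the full source):

    import numpy as np

    def adagrad_reference(w, g, h, lr, epsilon):
        h_new = h + g * g                                # accumulated squared gradient
        w_new = w + lr * g / (np.sqrt(h_new) + epsilon)  # per-coordinate scaled step
        return w_new, h_new
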
@ -41,7 +41,8 @@ void ftrl_update(
    T* new_nz,
    const FtrlParams<T>& params,
    Context* context) {
#pragma omp parallel for
  // TODO(cxj): use OMP when it is reliable
  // #pragma omp parallel for
  for (auto i = 0; i < N; ++i) {
    ftrl_compute(
        w[i],

@ -93,7 +94,9 @@ void SparseFtrlOp<T>::DoRun() {
  T* nz = n_z->template mutable_data<T>();
  const SIndex* idxs = indices.template data<SIndex>();
  const T* g = grad.template data<T>();
#pragma omp parallel for

  // TODO(cxj): use OMP when it is reliable
  // #pragma omp parallel for
  for (TIndex i = 0; i < K; ++i) {
    SIndex idx = idxs[i];
    DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
@ -11,9 +11,11 @@
// platforms, it allows one to quickly port Caffe2 to different platforms
// where BLAS may not be present.

#include <random>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include <atomic>
#include <random>

#ifdef CAFFE2_USE_MKL
#include <mkl.h>

@ -486,6 +488,7 @@ void Set<T, CPUContext>(const int N, const T alpha, T *Y, \
CAFFE2_SPECIALIZED_SET(float);
CAFFE2_SPECIALIZED_SET(double);
CAFFE2_SPECIALIZED_SET(int);
CAFFE2_SPECIALIZED_SET(bool);
#undef CAFFE2_SPECIALIZED_SET

#define CAFFE2_INSTANTIATE_BINARY_OP(name, op, T) \
caffe2/utils/string_utils.cc (new file, 14 lines)
@ -0,0 +1,14 @@
#include "caffe2/utils/string_utils.h"

namespace caffe2 {

std::vector<std::string> split(char separator, const std::string& string) {
  std::vector<std::string> pieces;
  std::stringstream ss(string);
  std::string item;
  while (getline(ss, item, separator)) {
    pieces.push_back(std::move(item));
  }
  return pieces;
}
}
caffe2/utils/string_utils.h (new file, 10 lines)
@ -0,0 +1,10 @@
#pragma once

#include <sstream>
#include <string>
#include <vector>

namespace caffe2 {

std::vector<std::string> split(char separator, const std::string& string);
}