diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc
index 359dac0665c..253132ef043 100644
--- a/caffe2/core/blob_serialization.cc
+++ b/caffe2/core/blob_serialization.cc
@@ -11,12 +11,70 @@ CAFFE2_DEFINE_int(
     "Chunk size to split tensor data into");

 namespace caffe2 {
+namespace {
+/**
+ * @brief StringSerializer is the serializer for std::string.
+ *
+ * StringSerializer takes in a blob that contains a std::string, and
+ * serializes it into a BlobProto protocol buffer.
+ */
+class StringSerializer : public BlobSerializerBase {
+ public:
+  StringSerializer() {}
+  ~StringSerializer() {}
+  /**
+   * Serializes a Blob. Note that this blob has to contain a std::string,
+   * otherwise this function produces a fatal error.
+   */
+  void Serialize(
+      const Blob& blob,
+      const string& name,
+      SerializationAcceptor acceptor) override {
+    CHECK(blob.IsType<std::string>());
+
+    BlobProto blob_proto;
+    blob_proto.set_name(name);
+    blob_proto.set_type("std::string");
+    blob_proto.set_content(blob.template Get<std::string>());
+    acceptor(name, blob_proto.SerializeAsString());
+  }
+};
+
+/**
+ * @brief StringDeserializer is the deserializer for std::string.
+ */
+class StringDeserializer : public BlobDeserializerBase {
+ public:
+  bool Deserialize(const BlobProto& proto, Blob* blob) override {
+    *blob->GetMutable<std::string>() = proto.content();
+    return true;
+  }
+};
+}
+
+namespace {
+
+// We can't use DeviceType_Name because of a protobuf-lite constraint.
+std::string tensorDeviceTypeName(const DeviceType& d) {
+  switch (d) {
+    case CPU:
+      return "TensorCPU";
+    case CUDA:
+      return "TensorCUDA";
+    default:
+      CAFFE_THROW("Unknown device: ", d);
+      return "";
+  }
+}
+}

 // The blob serialization member function implementation.
 void Blob::Serialize(
     const string& name,
     BlobSerializerBase::SerializationAcceptor acceptor) const {
   std::unique_ptr<BlobSerializerBase> serializer(CreateSerializer(meta_.id()));
+  CAFFE_ENFORCE(serializer, "No known serializer for ", meta_.name());
   serializer->Serialize(*this, name, acceptor);
 }

@@ -33,7 +91,6 @@ std::string Blob::Serialize(const string& name) const {
   return data.str();
 }

-
 // Specialization for StoreDeviceDetail for CPU - nothing needs to be done.
 template <>
 void TensorSerializer<CPUContext>::StoreDeviceDetail(
@@ -60,9 +117,8 @@ bool Blob::Deserialize(const BlobProto& blob_proto) {
   if (blob_proto.has_tensor()) {
     // This is a tensor object. Depending on the device type, we will
     // use the corresponding TensorDeserializer.
-    auto deserializer = CreateDeserializer(
-        "Tensor" +
-        DeviceType_Name(blob_proto.tensor().device_detail().device_type()));
+    auto deserializer = CreateDeserializer(tensorDeviceTypeName(
+        blob_proto.tensor().device_detail().device_type()));
     // Tensor's deserializer should always be registered, but we will double
     // check if it is not null anyway.
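    // (Usage sketch rather than code from this change; it assumes the
    // string-taking Blob::Deserialize overload declared in blob.h:
    //   Blob b;
    //   *b.GetMutable<std::string>() = "abc";
    //   std::string bytes = b.Serialize("my_blob");  // BlobProto wire format
    //   Blob b2;
    //   b2.Deserialize(bytes);  // b2 now holds "abc"
    // )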
return CHECK_NOTNULL(deserializer.get())->Deserialize(blob_proto, this); @@ -82,5 +138,8 @@ REGISTER_BLOB_SERIALIZER( (TypeMeta::Id()), TensorSerializer); REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer); +// Serialize std::string +REGISTER_BLOB_SERIALIZER((TypeMeta::Id()), StringSerializer); +REGISTER_BLOB_DESERIALIZER(std::string, StringDeserializer); } // namespace } // namespace caffe2 diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 0b3efdbdea2..c9002d11ad6 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -69,6 +69,20 @@ TEST(BlobTest, BlobWrongType) { ASSERT_THROW(blob.Get(), EnforceNotMet); } +TEST(BlobTest, StringSerialization) { + const std::string kTestString = "Hello world?"; + Blob blob; + *blob.GetMutable() = kTestString; + + string serialized = blob.Serialize("test"); + BlobProto proto; + CHECK(proto.ParseFromString(serialized)); + EXPECT_EQ(proto.name(), "test"); + EXPECT_EQ(proto.type(), "std::string"); + EXPECT_FALSE(proto.has_tensor()); + EXPECT_EQ(proto.content(), kTestString); +} + TEST(TensorNonTypedTest, TensorChangeType) { vector dims(3); dims[0] = 2; diff --git a/caffe2/core/context.h b/caffe2/core/context.h index deb6076b3bd..662a68c3bd4 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -5,8 +5,9 @@ #include #include -#include "caffe2/proto/caffe2.pb.h" #include "caffe2/core/logging.h" +#include "caffe2/core/typeid.h" +#include "caffe2/proto/caffe2.pb.h" #include "caffe2/utils/math.h" namespace caffe2 { @@ -103,6 +104,7 @@ class CPUContext final { // Two copy functions that deals with cross-device copies. template inline void CopyBytes(size_t nbytes, const void* src, void* dst); + template inline void Copy(size_t n, const T* src, T* dst) { if (std::is_fundamental::value) { @@ -116,6 +118,16 @@ class CPUContext final { } } + template + inline void + CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) { + if (meta.copy()) { + meta.copy()(src, dst, n); + } else { + CopyBytes(n * meta.itemsize(), src, dst); + } + } + protected: // TODO(jiayq): instead of hard-coding a generator, make it more flexible. int random_seed_{1701}; diff --git a/caffe2/core/context_gpu.cc b/caffe2/core/context_gpu.cc index e69c0153159..0e55d255270 100644 --- a/caffe2/core/context_gpu.cc +++ b/caffe2/core/context_gpu.cc @@ -7,6 +7,16 @@ thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_; namespace { bool Caffe2UsePinnedCPUAllocator(int*, char***) { +#ifdef __SANITIZE_ADDRESS__ + // Note(jiayq): for more details, see + // https://github.com/google/sanitizers/issues/629 + LOG(WARNING) << "There are known issues between address sanitizer and " + "cudaMallocHost. As a result, caffe2 will not enable pinned " + "memory allocation in asan mode. If you are expecting any " + "behavior that depends on asan, be advised that it is not " + "turned on."; + return true; +#else if (!HasCudaGPU()) { VLOG(1) << "No GPU present. 
I won't use pinned allocator then."; return true; @@ -14,6 +24,7 @@ bool Caffe2UsePinnedCPUAllocator(int*, char***) { VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator."; SetCPUAllocator(new PinnedCPUAllocator()); return true; +#endif } REGISTER_CAFFE2_INIT_FUNCTION(Caffe2UsePinnedCPUAllocator, diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 7765ec953bb..da16b6ba6ff 100644 --- a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -116,7 +116,9 @@ CAFFE_DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); */ inline unique_ptr CreateDB( const string& db_type, const string& source, Mode mode) { - return Caffe2DBRegistry()->Create(db_type, source, mode); + auto result = Caffe2DBRegistry()->Create(db_type, source, mode); + VLOG(1) << ((!result) ? "not found db " : "found db ") << db_type; + return result; } /** diff --git a/caffe2/core/init.h b/caffe2/core/init.h index 94be34a143d..ca7f979e04d 100644 --- a/caffe2/core/init.h +++ b/caffe2/core/init.h @@ -68,13 +68,13 @@ class InitRegisterer { #define REGISTER_CAFFE2_INIT_FUNCTION(name, function, description) \ namespace { \ - ::caffe2::InitRegisterer g_caffe2_initregisterer_name( \ + ::caffe2::InitRegisterer g_caffe2_initregisterer_##name( \ function, false, description); \ } // namespace #define REGISTER_CAFFE2_EARLY_INIT_FUNCTION(name, function, description) \ namespace { \ - ::caffe2::InitRegisterer g_caffe2_initregisterer_name( \ + ::caffe2::InitRegisterer g_caffe2_initregisterer_##name( \ function, true, description); \ } // namespace diff --git a/caffe2/core/net_gpu.cc b/caffe2/core/net_gpu.cc index 6b7bab65898..49dd6844cb3 100644 --- a/caffe2/core/net_gpu.cc +++ b/caffe2/core/net_gpu.cc @@ -1,14 +1,70 @@ #include "caffe2/core/net.h" + #include "caffe2/core/context_gpu.h" +#include "caffe2/core/flags.h" #include "caffe2/core/operator.h" #include "caffe2/core/timer.h" #include "caffe2/proto/caffe2.pb.h" +#ifdef CAFFE2_USE_NVTX +#include +#endif + +CAFFE2_DEFINE_bool(caffe2_use_nvtx, false, "Use NVTX ranges for profiling"); + namespace caffe2 { namespace { +using Color = int32_t; +constexpr Color kRunColor = 0x0000CCFF; // blue +constexpr Color kRecordColor = 0x00FF3300; // red +constexpr Color kWaitColor = 0x0066FF33; // green + +#ifdef CAFFE2_USE_NVTX + +class ProfiledRange { + public: + ProfiledRange(const OperatorDef& def, Color color) { + if (!FLAGS_caffe2_use_nvtx) { + return; + } + nvtxEventAttributes_t eventAttrib = {0}; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = color; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = def.type().c_str(); + range_ = nvtxRangeStartEx(&eventAttrib); + CHECK(range_); + } + + ~ProfiledRange() { + if (!FLAGS_caffe2_use_nvtx) { + return; + } + nvtxRangeEnd(range_); + } + + private: + nvtxRangeId_t range_ = 0; + DISABLE_COPY_AND_ASSIGN(ProfiledRange); +}; + +#else + +class ProfiledRange { + public: + ProfiledRange(const OperatorDef& def, Color color) {} + + private: + DISABLE_COPY_AND_ASSIGN(ProfiledRange); +}; + +#endif // ifdef CAFFE2_USE_NVTX + struct Stream; struct Event { @@ -69,6 +125,7 @@ struct Stream { int gpu_id_{-1}; cudaStream_t stream_{nullptr}; + private: DISABLE_COPY_AND_ASSIGN(Stream); }; @@ -128,18 +185,24 @@ class AsyncDAGNet : public DAGNetBase { })); for (auto source_parent_idx : operator_nodes_[source_idx].parents_) { + ProfiledRange r( + operator_nodes_[source_parent_idx].operator_->def(), kWaitColor); 
stream.wait(events_[source_parent_idx].get()); } // We've waited on all our parent indices. bool success = true; - for (auto idx: chain) { + for (auto idx : chain) { + ProfiledRange r(operator_nodes_[idx].operator_->def(), kRunColor); success &= operator_nodes_[idx].operator_->RunAsync(); } // Record an event for the sink of the chain. const auto& sink_idx = chain.back(); - events_[sink_idx]->record(stream); + { + ProfiledRange r(operator_nodes_[sink_idx].operator_->def(), kRecordColor); + events_[sink_idx]->record(stream); + } CHECK(!eventRecorded_[sink_idx]); eventRecorded_[sink_idx] = 1; return success; @@ -157,9 +220,11 @@ class AsyncDAGNet : public DAGNetBase { Stream stream{device_option}; // Potential optimization: we can pre-compute outstanding events. - for (auto& event : events_) { + for (auto i = 0; i < events_.size(); ++i) { + auto& event = events_[i]; if (event->outstanding_) { VLOG(2) << "Synchronizing host on outstanding event"; + ProfiledRange r(operator_nodes_[i].operator_->def(), kWaitColor); stream.wait(event.get()); } } diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 7a37af57e11..7d0a358477c 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -2,17 +2,6 @@ #include "caffe2/core/flags.h" CAFFE2_DEFINE_bool( - caffe2_keep_on_shrink, false, + caffe2_keep_on_shrink, + true, "If set, keeps memory when a tensor is shrinking its size."); - -namespace caffe2 { - -namespace detail { - -vector& shape(size_t n) { - static thread_local vector r; - r.resize(n); - return r; -} -} -} diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 994afd38a5f..dc0588aa11a 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -150,6 +150,40 @@ class Tensor { virtual ~Tensor() {} + /** + * @brief Extends the outer-most dimension of this tensor by num elements, + * preserving the existing data. + * + * The underlying data may be reallocated in order to accommodate the new + * elements, in which case this tensors' capacity is grown at a factor of + * growthPct. This ensures that Extend runs on an amortized O(1) time + * complexity. + */ + template + void Extend(TIndex num, int growthPct, ContextForCopy* context) { + CHECK_GE(dims_.size(), 1); + auto oldSize = size_; + auto newDims = dims_; + newDims[0] += num; + if (!data_) { + Resize(newDims); + return; + } + auto newSize = std::accumulate( + newDims.begin(), newDims.end(), 1, std::multiplies()); + if (newSize * meta_.itemsize() > capacity_) { + auto newCapacity = dims_; + newCapacity[0] = std::max(newDims[0], dims_[0] * (growthPct + 100) / 100); + auto oldData = std::move(data_); + Resize(newCapacity); + auto* newData = raw_mutable_data(meta_); + context->template CopyItems( + meta_, oldSize, oldData.get(), newData); + } + dims_ = newDims; + size_ = newSize; + } + /** * @brief Resizes a tensor. * @@ -297,9 +331,12 @@ class Tensor { CHECK(data_.get() || size_ == 0) << "The tensor is uninitialized. 
You probably need to call "
         << "Resize() and mutable_data() first.";
-    CHECK(IsType<T>())
-        << "Tensor type mistmatch, caller expects elements to be "
-        << TypeMeta::Name<T>() << " while tensor contains " << meta_.name();
+    CAFFE_ENFORCE(
+        IsType<T>(),
+        "Tensor type mismatch, caller expects elements to be ",
+        TypeMeta::Name<T>(),
+        " while tensor contains ",
+        meta_.name());
     return static_cast<T*>(data_.get());
   }
diff --git a/caffe2/core/workspace.cc b/caffe2/core/workspace.cc
index 571f1868768..9525340fdc0 100644
--- a/caffe2/core/workspace.cc
+++ b/caffe2/core/workspace.cc
@@ -12,34 +12,39 @@ namespace caffe2 {
 namespace {
-// Returns a function that returns `true` if we should continue
-// iterating, given the current iteration count.
-std::function<bool(int)> getContinuationTest(
-    Workspace* ws,
-    const ExecutionStep& step) {
-  if (step.has_criteria_network()) {
-    CHECK(!step.has_num_iter())
-        << "Must not specify num_iter if critera_network is set";
+// try to get the should_stop signal, a scalar bool blob value.
+// if the blob doesn't exist or is not initialized, return false
+const bool getShouldStop(const Blob* b) {
+  if (!b || !b->meta().id()) { // not exist or uninitialized
+    return false;
+  }
-  if (!step.has_criteria_network()) {
-    int iterations = step.has_num_iter() ? step.num_iter() : 1;
-    VLOG(1) << "Executing step for " << iterations << " iterations.";
-    return [=](int i) { return i < iterations; };
+  const auto& t = b->Get<TensorCPU>();
+  CAFFE_ENFORCE(t.IsType<bool>() && t.size() == 1, "expects a scalar boolean");
+  return *(t.template data<bool>());
+}
+
+// Returns a function that returns `true` if we should continue
+// iterating, given the current iteration count.
+std::function<bool(int64_t)> getContinuationTest(
+    Workspace* ws,
+    const ExecutionStep& step) {
+  if (step.has_should_stop_blob()) {
+    CAFFE_ENFORCE(
+        !step.has_num_iter(),
+        "Must not specify num_iter if should_stop_blob is set");
+  }
+
+  if (!step.has_should_stop_blob()) {
+    int64_t iterations = step.has_num_iter() ?
step.num_iter() : 1; + VLOG(1) << "Will execute step " << step.name() << " for " << iterations + << " iterations."; + return [=](int64_t i) { return i < iterations; }; + } else { + VLOG(1) << "Will execute step " << step.name() << " until stopped by blob " + << step.should_stop_blob(); + return [](int64_t i) { return true; }; } - auto* criteria_network = ws->GetNet(step.criteria_network()); - CHECK_NOTNULL(criteria_network); - CHECK_EQ(criteria_network->external_output().size(), 1); - const auto& criteria_output = criteria_network->external_output().front(); - VLOG(1) << "Executing step controlled by criteria output: " - << criteria_output; - return [=](int) { - criteria_network->Run(); - const auto& blob = ws->GetBlob(criteria_output)->Get(); - CHECK_EQ(blob.size(), 1); - CHECK(blob.IsType()); - return blob.template data()[0] > 0; - }; }; } // namespace @@ -229,10 +234,17 @@ struct Reporter { } +#define CHECK_SHOULD_STOP(shouldStop) \ + if (getShouldStop(shouldStop)) { \ + VLOG(1) << "Execution stopped by should_stop_blob"; \ + return true; \ + } + bool Workspace::ExecuteStepRecursive( const ExecutionStep& step, ShouldContinue externalShouldContinue) { - LOG(INFO) << "Running execution step " << step.name(); + VLOG(1) << "Running execution step " << step.name(); + if (!(step.substep_size() == 0 || step.network_size() == 0)) { LOG(ERROR) << "An ExecutionStep should either have substep or networks " << "but not both."; @@ -247,49 +259,67 @@ bool Workspace::ExecuteStepRecursive( if (net_map_.count(step.report_net()) == 0) { LOG(ERROR) << "Report net " << step.report_net() << " not found."; } + VLOG(1) << "Starting reporter net"; reporter.start(net_map_[step.report_net()].get(), step.report_interval()); } + const Blob* shouldStop = nullptr; + if (step.has_should_stop_blob()) { + shouldStop = GetBlob(step.should_stop_blob()); + CAFFE_ENFORCE( + shouldStop, "blob ", step.should_stop_blob(), " does not exist"); + } + const auto netShouldContinue = getContinuationTest(this, step); - const auto shouldContinue = [&](int iter) { + const auto shouldContinue = [&](int64_t iter) { return externalShouldContinue(iter) && netShouldContinue(iter); }; if (step.substep_size()) { - for (int iter = 0; shouldContinue(iter); ++iter) { - // we assume that, if we have substeps, each substep is going to take a - // reasonable amount of time, so logging here is fine - LOG(INFO) << "Execution step " << step.name() - << ": Starting iteration " << iter; - std::atomic next_substep{0}; - std::atomic got_failure{false}; - auto substepShouldContinue = [&, externalShouldContinue](int iter) { - return !got_failure && externalShouldContinue(iter); - }; - auto worker = [&]() { - while (true) { - int substep_id = next_substep++; - if (got_failure || (substep_id >= step.substep().size())) { - break; - } - if (!ExecuteStepRecursive(step.substep().Get(substep_id), - substepShouldContinue)) { - got_failure = true; - } - } - }; + for (int64_t iter = 0; shouldContinue(iter); ++iter) { + VLOG(1) << "Execution step " << step.name() << ": iteration " << iter; + if (!step.concurrent_substeps() || step.substep().size() <= 1) { - worker(); + auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) { + return externalShouldContinue(iter); + }; + + for (auto& ss : step.substep()) { + if (!ExecuteStepRecursive(ss, substepShouldContinue)) { + return false; + } + CHECK_SHOULD_STOP(shouldStop); + } } else { + std::atomic next_substep{0}; + std::atomic got_failure{false}; + auto substepShouldContinue = [&, 
externalShouldContinue](int64_t iter) { + return !got_failure && externalShouldContinue(iter); + }; + auto worker = [&]() { + while (true) { + int substep_id = next_substep++; + if (got_failure || (substep_id >= step.substep().size())) { + break; + } + if (!ExecuteStepRecursive( + step.substep().Get(substep_id), substepShouldContinue)) { + got_failure = true; + } + } + }; + std::vector threads; - for (int i = 0; i < step.substep().size(); ++i) { + for (int64_t i = 0; i < step.substep().size(); ++i) { threads.emplace_back(worker); } for (auto& thread: threads) { thread.join(); } - } - if (got_failure) { - return false; + if (got_failure) { + return false; + } + // concurrent substeps should be careful about setting should_stop_blob + CHECK_SHOULD_STOP(shouldStop); } } return true; @@ -305,16 +335,19 @@ bool Workspace::ExecuteStepRecursive( VLOG(1) << "Going to execute network " << network_name; networks.push_back(net_map_[network_name].get()); } - for (int iter = 0; shouldContinue(iter); ++iter) { + for (int64_t iter = 0; shouldContinue(iter); ++iter) { VLOG(1) << "Executing network iteration " << iter; for (NetBase* network : networks) { if (!network->Run()) { return false; } + CHECK_SHOULD_STOP(shouldStop); } } } return true; } +#undef CHECK_SHOULD_STOP + } // namespace caffe2 diff --git a/caffe2/operators/atomic_ops.cc b/caffe2/operators/atomic_ops.cc new file mode 100644 index 00000000000..e8d35b1ba63 --- /dev/null +++ b/caffe2/operators/atomic_ops.cc @@ -0,0 +1,73 @@ +#include +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { +namespace fb { +namespace { + +class CreateMutexOp final : public Operator { + public: + CreateMutexOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + *OperatorBase::Output>(0) = + std::unique_ptr(new std::mutex); + return true; + } +}; + +class AtomicFetchAddOp final : public Operator { + public: + AtomicFetchAddOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + auto& mutex = OperatorBase::Input>(0); + auto& a = Input(1); + auto& b = Input(2); + auto* c = Output(0); + auto* d = Output(1); + c->Resize(std::vector()); + d->Resize(std::vector()); + auto* aPtr = a.data(); + auto* bPtr = b.data(); + auto* cPtr = c->mutable_data(); + auto* dPtr = d->mutable_data(); + std::lock_guard lg(*mutex); + *dPtr = *aPtr; + *cPtr = *aPtr + *bPtr; + return true; + } +}; + +REGISTER_CPU_OPERATOR(CreateMutex, CreateMutexOp); +REGISTER_CPU_OPERATOR(AtomicFetchAdd, AtomicFetchAddOp); + +OPERATOR_SCHEMA(CreateMutex) + .NumInputs(0) + .NumOutputs(1) + .SetDoc("Creates an unlocked mutex and returns it in a unique_ptr blob.") + .Output(0, "mutex_ptr", "Blob containing a std::unique_ptr."); + +OPERATOR_SCHEMA(AtomicFetchAdd) + .NumInputs(3) + .NumOutputs(2) + .SetDoc(R"DOC( +Given a mutex and two int32 scalar tensors, performs an atomic fetch add +by mutating the first argument and adding it to the second input +argument. Returns the updated integer and the value prior to the update. +)DOC") + .Input(0, "mutex_ptr", "Blob containing to a unique_ptr") + .Input(1, "mut_value", "Value to be mutated after the sum.") + .Input(2, "increment", "Value to add to the first operand.") + .Output(0, "mut_value", "Mutated value after sum. 
Usually same as input 1.") + .Output(1, "fetched_value", "Value of the first operand before sum.") + .AllowInplace({{1, 0}}); + +SHOULD_NOT_DO_GRADIENT(CreateMutex); +SHOULD_NOT_DO_GRADIENT(AtomicFetchAdd); +} +} +} diff --git a/caffe2/operators/boolean_mask_ops.cc b/caffe2/operators/boolean_mask_ops.cc new file mode 100644 index 00000000000..ddc10fe78bb --- /dev/null +++ b/caffe2/operators/boolean_mask_ops.cc @@ -0,0 +1,143 @@ +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" + +namespace caffe2 { +namespace { + +template +class BooleanMaskLengthsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + BooleanMaskLengthsOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + auto& lengths = Input(0); + auto& mask = Input(1); + auto* lengthsOut = Output(0); + CAFFE_ENFORCE(lengths.ndim() == 1); + CAFFE_ENFORCE(mask.ndim() == 1); + const auto* lengthsPtr = lengths.template data(); + const auto* maskPtr = mask.template data(); + auto totalLength = + std::accumulate(lengthsPtr, lengthsPtr + lengths.size(), 0); + CAFFE_ENFORCE(mask.size() == totalLength); + lengthsOut->ResizeLike(lengths); + auto* lengthsOutPtr = lengthsOut->template mutable_data(); + int p = 0; + for (int i = 0; i < lengths.size(); ++i) { + T lengthOut = 0; + for (int j = 0; j < lengthsPtr[i]; ++j) { + if (maskPtr[p++]) { + ++lengthOut; + } + } + lengthsOutPtr[i] = lengthOut; + } + return true; + } +}; + +template +class BooleanMaskOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + BooleanMaskOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + auto& data = Input(0); + auto& mask = Input(1); + auto* dataOut = Output(0); + CAFFE_ENFORCE(data.ndim() >= 1); + CAFFE_ENFORCE(mask.ndim(), 1); + CAFFE_ENFORCE(data.dims()[0] == mask.dims()[0]); + + const auto* maskPtr = mask.template data(); + int numOutputs = 0; + int outerSize = mask.size(); + for (int i = 0; i < outerSize; ++i) { + if (maskPtr[i]) { + ++numOutputs; + } + } + std::vector outShape; + outShape.push_back(numOutputs); + outShape.insert(outShape.end(), data.dims().begin() + 1, data.dims().end()); + dataOut->Resize(outShape); + if (numOutputs == 0) { + return true; + } + auto innerSizeBytes = std::accumulate( + data.dims().begin() + 1, + data.dims().end(), + 1, + std::multiplies()) * + data.meta().itemsize(); + TIndex lastStart = -1; + const auto* inPtr = (char*)data.raw_data(); + auto* outPtr = (char*)dataOut->raw_mutable_data(data.meta()); + TIndex outStart = 0; + for (TIndex i = 0;; ++i) { + // mask was true and either a) became false, or b) sequence finished + if (lastStart != -1 && ((i >= outerSize) || !maskPtr[i])) { + const auto* src = inPtr + lastStart * innerSizeBytes; + auto* dst = outPtr + outStart * innerSizeBytes; + int numItems = i - lastStart; + if (data.meta().copy()) { + data.meta().copy()(src, dst, numItems); + } else { + context_.template CopyBytes( + numItems * data.meta().itemsize(), src, dst); + } + outStart += numItems; + lastStart = -1; + } + if (i >= outerSize) { + break; + } + // mask was false and became true + if (lastStart == -1 && maskPtr[i]) { + lastStart = i; + } + } + return true; + } +}; + +REGISTER_CPU_OPERATOR(BooleanMask, BooleanMaskOp); +REGISTER_CPU_OPERATOR(BooleanMaskLengths, BooleanMaskLengthsOp); + +OPERATOR_SCHEMA(BooleanMask) + .NumInputs(2) + 
.NumOutputs(1) + .SetDoc(R"DOC( +Given a data 1D tensor and a mask (boolean) tensor of same shape, returns a +tensor containing only the elements corresponding to positions where the mask +is true. +)DOC") + .Input(0, "data", "The 1D, original data tensor.") + .Input(1, "mask", "A tensor of bools of same shape as `data`.") + .Output(0, "masked_data", "A tensor of same type as `data`."); + +OPERATOR_SCHEMA(BooleanMaskLengths) + .NumInputs(2) + .NumOutputs(1) + .SetDoc(R"DOC( +Given a tensor of int32 segment lengths and a mask (boolean) tensor, return +the segment lengths of a corresponding segmented tensor after BooleanMask is +applied. +)DOC") + .Input(0, "lengths", "A 1D int32 tensor representing segment lengths.") + .Input(1, "mask", "A 1D bool tensor of values to keep.") + .Output(0, "masked_lengths", "Segment lengths of a masked tensor."); + +NO_GRADIENT(BooleanMask) +NO_GRADIENT(BooleanMaskLengths); +} +} diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc index 5133183c5d6..c1bd6823a40 100644 --- a/caffe2/operators/concat_split_op.cc +++ b/caffe2/operators/concat_split_op.cc @@ -10,8 +10,14 @@ OPERATOR_SCHEMA(Concat).NumInputs(1, INT_MAX).NumOutputs(2); // Backward compatibility names. REGISTER_CPU_OPERATOR(DepthSplit, SplitOp); REGISTER_CPU_OPERATOR(DepthConcat, ConcatOp); -OPERATOR_SCHEMA(DepthSplit).NumInputs(1, 2).NumOutputs(1, INT_MAX); -OPERATOR_SCHEMA(DepthConcat).NumInputs(1, INT_MAX).NumOutputs(2); +OPERATOR_SCHEMA(DepthSplit) + .NumInputs(1, 2) + .NumOutputs(1, INT_MAX) + .SetDoc("Backward compatible operator name for Split."); +OPERATOR_SCHEMA(DepthConcat) + .NumInputs(1, INT_MAX) + .NumOutputs(2) + .SetDoc("Backward compatible operator name for Concat."); class GetSplitGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; diff --git a/caffe2/operators/concat_split_op.h b/caffe2/operators/concat_split_op.h index 265078d2c8e..ffc71c8c37f 100644 --- a/caffe2/operators/concat_split_op.h +++ b/caffe2/operators/concat_split_op.h @@ -31,7 +31,7 @@ class SplitOp final : public Operator { : Operator(operator_def, ws), split_(OperatorBase::GetRepeatedArgument("split")) { CHECK(OperatorBase::HasArgument("axis") ^ - OperatorBase::HasArgument("order")) + OperatorBase::HasArgument("order")) << "You should either specify the dim to split, or the order " "in the case of 4-D images."; if (OperatorBase::HasArgument("axis")) { @@ -85,7 +85,9 @@ class ConcatOp final : public Operator { template bool SplitOp::RunOnDevice() { auto& input = Input(0); + const int input_channels = input.dim32(axis_); const int* axis_data; + vector equal_split; if (InputSize() == 2) { // We obtain split from the input tensor. CHECK_EQ(split_.size(), 0) @@ -94,13 +96,21 @@ bool SplitOp::RunOnDevice() { auto& split_tensor = OperatorBase::Input(1); CHECK_EQ(split_tensor.size(), OutputSize()); axis_data = split_tensor.template data(); + } else if (split_.size() == 0) { + CAFFE_ENFORCE(input_channels % OutputSize() == 0, + "If you did not specify split explicitly, the number of " + "input channels should be divisible by the output size."); + equal_split.resize(OutputSize(), input_channels / OutputSize()); + axis_data = equal_split.data(); } else { // We obtain split from the parameters. 
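+    // (Illustration: with a 9-channel input and OutputSize() == 3, omitting
+    // "split" gives the implicit equal split {3, 3, 3}; an explicit
+    // split={2, 3, 4} is kept as-is. Either way the sizes must sum to the
+    // input channels, which is checked right below.)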
- CHECK_EQ(split_.size(), OutputSize()); + CAFFE_ENFORCE(split_.size() == OutputSize(), + "The number of splits specified should be equal to the " + "number of outputs."); axis_data = split_.data(); } CHECK_LT(axis_, input.ndim()); - const int input_channels = input.dim32(axis_); + CHECK_EQ(std::accumulate(axis_data, axis_data + OutputSize(), 0), input_channels) << "Sum of split dimensions do not match: should be " << input_channels; diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc index 6ebd6b01490..090a59b668c 100644 --- a/caffe2/operators/conv_op_cudnn.cc +++ b/caffe2/operators/conv_op_cudnn.cc @@ -22,11 +22,11 @@ template inline void LogCuDNNPerfStats( const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat, int returned_algo_count) { - LOG(INFO) << "Perf result: (algo: stat, time, memory)"; + VLOG(1) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { const auto& stat = perf_stat[i]; - LOG(INFO) << stat.algo << ": " << stat.status - << " " << stat.time << " " << stat.memory; + VLOG(1) << stat.algo << ": " << stat.status << " " << stat.time << " " + << stat.memory; } } } // namespace @@ -193,7 +193,7 @@ bool CudnnConvOp::RunOnDevice() { if (deterministic_) { algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } else if (exhaustive_search_) { - LOG(INFO) << "CUDNN Convolution: doing exhaustive search."; + VLOG(1) << "CUDNN Convolution: doing exhaustive search."; // When we do an exhaustive search, we will ignore the workspace size // limit and simply go for the fastest algorithm. If you happen to run // out of memory later, you will be on your own... @@ -229,8 +229,8 @@ bool CudnnConvOp::RunOnDevice() { cudnn_wrapper_.inline_cudnn_handle(), bottom_desc_, filter_desc_, conv_desc_, top_desc_, algo_, &cudnn_ws_nbytes_)); - LOG(INFO) << "CuDNN algorithm: " << algo_; - LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_; + VLOG(1) << "CuDNN algorithm: " << algo_; + VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_; } // Now, actually run the computation. @@ -346,7 +346,7 @@ bool CudnnConvGradientOp::RunOnDevice() { bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else if (exhaustive_search_) { - LOG(INFO) << "CUDNN Convolution bwd: doing exhaustive search."; + VLOG(1) << "CUDNN Convolution bwd: doing exhaustive search."; // When we do an exhaustive search, we will ignore the workspace size // limit and simply go for the fastest algorithm. If you happen to run // out of memory later, you will be on your own... @@ -416,9 +416,9 @@ bool CudnnConvGradientOp::RunOnDevice() { bwd_data_algo_, &bwd_data_ws_size)); cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size); - LOG(INFO) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", " - << bwd_data_algo_; - LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_; + VLOG(1) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", " + << bwd_data_algo_; + VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_; } // Now, actually run the computation. 
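The next file implements ConvTranspose with cuDNN. Background for reading it (a standard identity, not code from this change): a transposed convolution's forward pass is the data-gradient pass of a regular convolution, which is why the forward op below holds a cudnnConvolutionBwdDataAlgo_t while the gradient op holds a forward algorithm for dX. The output spatial size follows the usual formula; a minimal sketch with a hypothetical helper name:

```cpp
// Standard transposed-convolution output size (dilation 1, no extra
// output adjustment): out = (in - 1) * stride - pad_begin - pad_end + kernel
inline int convTransposeOutSize(
    int in, int kernel, int stride, int pad_begin, int pad_end) {
  return (in - 1) * stride - pad_begin - pad_end + kernel;
}
// e.g. in=7, kernel=4, stride=2, pads=1/1 -> (7-1)*2 - 2 + 4 = 14
```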
diff --git a/caffe2/operators/conv_transpose_op_cudnn.cc b/caffe2/operators/conv_transpose_op_cudnn.cc new file mode 100644 index 00000000000..8e533c088bf --- /dev/null +++ b/caffe2/operators/conv_transpose_op_cudnn.cc @@ -0,0 +1,579 @@ +#include "caffe2/core/common_cudnn.h" +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/conv_transpose_op.h" + +namespace caffe2 { + +// Earlier in the days Caffe sets the default cudnn workspace to 8MB. We bump +// it up to 64MB in Caffe2, as this enables the use of Winograd in many cases, +// something very beneficial to more recent CNN models. +static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024; + +// Manually specified number of algorithms implemented in CuDNN. +// This does not have any performance implications, as we will always find the +// fastest algorithm; setting them to the right number of algorithms will enable +// us to best report the statistics when doing an exhaustive search, though. +static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; +static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; +static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; + +namespace { +template +inline void LogCuDNNPerfStats( + const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat, + int returned_algo_count) { + LOG(INFO) << "Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = perf_stat[i]; + LOG(INFO) << stat.algo << ": " << stat.status << " " << stat.time << " " + << stat.memory; + } +} +} // namespace + +class CudnnConvTransposeOpBase : public ConvTransposeUnpoolBase { + public: + CudnnConvTransposeOpBase(const OperatorDef& operator_def, Workspace* ws) + : ConvTransposeUnpoolBase(operator_def, ws), + cudnn_wrapper_(&context_), + cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument( + "ws_nbytes_limit", + kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)), + exhaustive_search_( + OperatorBase::GetSingleArgument("exhaustive_search", 0)), + deterministic_( + OperatorBase::GetSingleArgument("deterministic", 0)), + cudnn_state_(OperatorBase::GetSingleArgument("cudnn_state", 0)) { + CHECK(!deterministic_ || !exhaustive_search_); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_)); + CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&top_desc_)); + CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_)); + } + + ~CudnnConvTransposeOpBase() { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(bottom_desc_)); + CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CHECK(cudnnDestroyTensorDescriptor(top_desc_)); + CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_)); + } + + protected: + vector cudnn_input_dims_; + vector cudnn_filter_dims_; + + CuDNNWrapper cudnn_wrapper_; + cudnnTensorDescriptor_t bottom_desc_; + cudnnFilterDescriptor_t filter_desc_; + cudnnTensorDescriptor_t bias_desc_; + cudnnTensorDescriptor_t top_desc_; + cudnnConvolutionDescriptor_t conv_desc_; + const size_t cudnn_ws_nbytes_limit_; + size_t cudnn_ws_nbytes_; + bool exhaustive_search_; + bool deterministic_; + size_t cudnn_state_; +}; + +template +class CudnnConvTransposeOp final : public CudnnConvTransposeOpBase { + public: + CudnnConvTransposeOp(const OperatorDef& operator_def, Workspace* ws) + : CudnnConvTransposeOpBase(operator_def, ws) {} + + ~CudnnConvTransposeOp() {} + + bool RunOnDevice() override; + + private: + 
cudnnConvolutionBwdDataAlgo_t bwd_data_algo_; + // Input: X, W, b + // Output: Y + INPUT_TAGS(INPUT, FILTER, BIAS); +}; + +template +class CudnnConvTransposeGradientOp final : public CudnnConvTransposeOpBase { + public: + CudnnConvTransposeGradientOp(const OperatorDef& operator_def, Workspace* ws) + : CudnnConvTransposeOpBase(operator_def, ws) {} + + ~CudnnConvTransposeGradientOp() {} + + bool RunOnDevice() override; + + private: + cudnnConvolutionFwdAlgo_t algo_; + cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_; + // input: X, W, dY + // output: dW, db, and optionally dX + INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD); + OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD); +}; + +//////////////////////////////////////////////////////////////////////////////// +// Implementations +//////////////////////////////////////////////////////////////////////////////// + +template +bool CudnnConvTransposeOp::RunOnDevice() { + auto& X = Input(INPUT); + auto& filter = Input(FILTER); + auto& bias = Input(BIAS); + auto* Y = Output(0); + int C = 0; + switch (order_) { + case StorageOrder::NHWC: + C = filter.dim32(3); + break; + case StorageOrder::NCHW: + C = filter.dim32(1); + break; + default: + LOG(FATAL) << "Unknown storage order: " << order_; + } + ConvTransposeUnpoolBase::SetOutputSize(X, Y, C); + + int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0; + switch (order_) { + case StorageOrder::NHWC: + N = X.dim32(0); + H = X.dim32(1); + W = X.dim32(2); + M = X.dim32(3); + H_out = Y->dim32(1); + W_out = Y->dim32(2); + DCHECK_EQ(filter.dim32(1), kernel_h_); + DCHECK_EQ(filter.dim32(1), kernel_h_); + DCHECK_EQ(filter.dim32(2), kernel_w_); + DCHECK_EQ(filter.dim32(3), C); + break; + case StorageOrder::NCHW: + N = X.dim32(0); + M = X.dim32(1); + H = X.dim32(2); + W = X.dim32(3); + H_out = Y->dim32(2); + W_out = Y->dim32(3); + DCHECK_EQ(filter.dim32(1), C); + DCHECK_EQ(filter.dim32(2), kernel_h_); + DCHECK_EQ(filter.dim32(3), kernel_w_); + break; + default: + LOG(FATAL) << "Unknown storage order: " << order_; + } + + DCHECK_EQ(bias.ndim(), 1); + DCHECK_EQ(bias.dim32(0), C); + + // Set up the cudnn algorithms & workspace if necessary + bool input_changed = (X.dims() != cudnn_input_dims_); + bool filter_changed = (filter.dims() != cudnn_filter_dims_); + + if (input_changed || filter_changed) { + VLOG(1) << "Changing the cudnn descriptor configurations."; + if (input_changed) { + cudnn_input_dims_ = X.dims(); + CUDNN_CHECK(cudnnSetTensor4dDescriptor( + bottom_desc_, + GetCudnnTensorFormat(order_), + cudnnTypeWrapper::type, + N, + M, + H, + W)); + } + if (filter_changed) { + cudnn_filter_dims_ = filter.dims(); + CUDNN_CHECK(cudnnSetFilter4dDescriptor( + filter_desc_, + cudnnTypeWrapper::type, + GetCudnnTensorFormat(order_), + M, + C, + kernel_h_, + kernel_w_)); + CUDNN_CHECK(cudnnSetTensor4dDescriptor( + bias_desc_, + GetCudnnTensorFormat(order_), + cudnnTypeWrapper::type, + 1, + C, + 1, + 1)); + } + // Set the output + CUDNN_CHECK(cudnnSetTensor4dDescriptor( + top_desc_, + GetCudnnTensorFormat(order_), + cudnnTypeWrapper::type, + N, + C, + H_out, + W_out)); + // Set the convolution descriptor + CHECK_EQ(pad_t_, pad_b_) + << "The current padding scheme leads to unequal padding on the top and " + "bottom, which is not supported by cudnn."; + CHECK_EQ(pad_l_, pad_r_) + << "The current padding scheme leads to unequal padding on the left " + "and right, which is not supported by cudnn."; + CUDNN_CHECK(cudnnSetConvolution2dDescriptor( + conv_desc_, + pad_t_, + pad_l_, + stride_h_, + stride_w_, + 1, + 1, + 
CUDNN_CROSS_CORRELATION)); + if (deterministic_) { + bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } else if (exhaustive_search_) { + int returned_algo_count; + std::array + data_perf_stat; + cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { + state->workspace().reset(); + CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm( + state->cudnn_handle(), + filter_desc_, + bottom_desc_, + conv_desc_, + top_desc_, + kNUM_CUDNN_BWD_DATA_ALGS, + &returned_algo_count, + data_perf_stat.data())); + }); + + LogCuDNNPerfStats(data_perf_stat, returned_algo_count); + bwd_data_algo_ = data_perf_stat[0].algo; + } else { + CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( + cudnn_wrapper_.inline_cudnn_handle(), + filter_desc_, + bottom_desc_, + conv_desc_, + top_desc_, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + cudnn_ws_nbytes_limit_, + &bwd_data_algo_)); + } + + size_t bwd_data_ws_size; + CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnn_wrapper_.inline_cudnn_handle(), + filter_desc_, + bottom_desc_, + conv_desc_, + top_desc_, + bwd_data_algo_, + &bwd_data_ws_size)); + cudnn_ws_nbytes_ = bwd_data_ws_size; + LOG(INFO) << "CuDNN algorithm: " << bwd_data_algo_; + LOG(INFO) << "CuDNN workspace size: " << bwd_data_ws_size; + } + + // Now, actually run the computation. + // Filter + cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { + CUDNN_CHECK(cudnnConvolutionBackwardData( + state->cudnn_handle(), + cudnnTypeWrapper::kOne(), + filter_desc_, + filter.template data(), + bottom_desc_, + X.template data(), + conv_desc_, + bwd_data_algo_, + state->workspace().get(cudnn_ws_nbytes_), + cudnn_ws_nbytes_, + cudnnTypeWrapper::kZero(), + top_desc_, + Y->template mutable_data())); + }); + // Bias + CUDNN_CHECK(cudnnAddTensor( + cudnn_wrapper_.inline_cudnn_handle(), + cudnnTypeWrapper::kOne(), + bias_desc_, + bias.template data(), + cudnnTypeWrapper::kOne(), + top_desc_, + Y->template mutable_data())); + // Done. + return true; +} + +// TODO(Yangqing): a lot of the function contents are very similar. Consider +// consolidating them. +template +bool CudnnConvTransposeGradientOp::RunOnDevice() { + auto& X = Input(INPUT); + auto& filter = Input(FILTER); + auto& dY = Input(OUTPUT_GRAD); + auto* dfilter = Output(FILTER_GRAD); + auto* dbias = Output(BIAS_GRAD); + DCHECK_EQ(X.ndim(), 4); + DCHECK_EQ(filter.ndim(), 4); + auto* Y = Output(0); + int C = 0; + switch (order_) { + case StorageOrder::NHWC: + C = filter.dim32(3); + break; + case StorageOrder::NCHW: + C = filter.dim32(1); + break; + default: + LOG(FATAL) << "Unknown storage order: " << order_; + } + ConvTransposeUnpoolBase::SetOutputSize(X, Y, C); + + int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0; + switch (order_) { + case StorageOrder::NHWC: + N = X.dim32(0); + H = X.dim32(1); + W = X.dim32(2); + M = X.dim32(3); + H_out = dY.dim32(1); + W_out = dY.dim32(2); + DCHECK_EQ(filter.dim32(1), kernel_h_); + DCHECK_EQ(filter.dim32(1), kernel_h_); + DCHECK_EQ(filter.dim32(2), kernel_w_); + DCHECK_EQ(filter.dim32(3), C); + break; + case StorageOrder::NCHW: + N = X.dim32(0); + M = X.dim32(1); + H = X.dim32(2); + W = X.dim32(3); + H_out = dY.dim32(2); + W_out = dY.dim32(3); + DCHECK_EQ(filter.dim32(1), C); + DCHECK_EQ(filter.dim32(2), kernel_h_); + DCHECK_EQ(filter.dim32(3), kernel_w_); + break; + default: + LOG(FATAL) << "Unknown storage order: " << order_; + } + // Since we only handle LegacyPadding::NOTSET, we don't need to + // compute padding. 
+ dfilter->ResizeLike(filter); + dbias->Resize(C); + + // Set up the cudnn algorithms & workspace if necessary + bool input_changed = (X.dims() != cudnn_input_dims_); + bool filter_changed = (filter.dims() != cudnn_filter_dims_); + if (input_changed || filter_changed) { + VLOG(1) << "Changing the cudnn descriptor configurations."; + if (input_changed) { + cudnn_input_dims_ = X.dims(); + CUDNN_CHECK(cudnnSetTensor4dDescriptor( + bottom_desc_, + GetCudnnTensorFormat(order_), + cudnnTypeWrapper::type, + N, + M, + H, + W)); + } + if (filter_changed) { + cudnn_filter_dims_ = filter.dims(); + CUDNN_CHECK(cudnnSetFilter4dDescriptor( + filter_desc_, + cudnnTypeWrapper::type, + GetCudnnTensorFormat(order_), + M, + C, + kernel_h_, + kernel_w_)); + CUDNN_CHECK(cudnnSetTensor4dDescriptor( + bias_desc_, + GetCudnnTensorFormat(order_), + cudnnTypeWrapper::type, + 1, + C, + 1, + 1)); + } + // Set the output + CUDNN_CHECK(cudnnSetTensor4dDescriptor( + top_desc_, + GetCudnnTensorFormat(order_), + cudnnTypeWrapper::type, + N, + C, + H_out, + W_out)); + // Set the convolution descriptor + CHECK_EQ(pad_t_, pad_b_) + << "The current padding scheme leads to unequal padding on the top and " + "bottom, which is not supported by cudnn."; + CHECK_EQ(pad_l_, pad_r_) + << "The current padding scheme leads to unequal padding on the left " + "and right, which is not supported by cudnn."; + CUDNN_CHECK(cudnnSetConvolution2dDescriptor( + conv_desc_, + pad_t_, + pad_l_, + stride_h_, + stride_w_, + 1, + 1, + CUDNN_CROSS_CORRELATION)); + // Set the workspace + + size_t bwd_filter_ws_size, fwd_ws_size; + + if (deterministic_) { + algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + } else if (exhaustive_search_) { + LOG(INFO) << "CUDNN Convolution bwd: doing exhaustive search."; + // When we do an exhaustive search, we will ignore the workspace size + // limit and simply go for the fastest algorithm. If you happen to run + // out of memory later, you will be on your own... + int returned_algo_count; + // We clean up the current workspace memory so that the forward algorithm + // is free to allocate memory. + // Actually run the search. 
+ std:: + array + filter_perf_stat; + + cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { + state->workspace().reset(); + CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm( + state->cudnn_handle(), + top_desc_, + bottom_desc_, + conv_desc_, + filter_desc_, + kNUM_CUDNN_BWD_FILTER_ALGS, + &returned_algo_count, + filter_perf_stat.data())); + }); + LogCuDNNPerfStats(filter_perf_stat, returned_algo_count); + bwd_filter_algo_ = filter_perf_stat[0].algo; + + std::array + fwd_perf_stat; + cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { + state->workspace().reset(); + CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm( + state->cudnn_handle(), + top_desc_, + filter_desc_, + conv_desc_, + bottom_desc_, + kNUM_CUDNN_BWD_DATA_ALGS, + &returned_algo_count, + fwd_perf_stat.data())); + }); + + LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count); + algo_ = fwd_perf_stat[0].algo; + } else { + // choose backward algorithm for filter + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( + cudnn_wrapper_.inline_cudnn_handle(), + top_desc_, + bottom_desc_, + conv_desc_, + filter_desc_, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + cudnn_ws_nbytes_limit_, + &bwd_filter_algo_)); + // choose backward algo for data + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( + cudnn_wrapper_.inline_cudnn_handle(), + top_desc_, + filter_desc_, + conv_desc_, + bottom_desc_, + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + cudnn_ws_nbytes_limit_, + &algo_)); + } + // get workspace for backwards filter algorithm + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( + cudnn_wrapper_.inline_cudnn_handle(), + top_desc_, + bottom_desc_, + conv_desc_, + filter_desc_, + bwd_filter_algo_, + &bwd_filter_ws_size)); + // get workspace for backwards data algorithm + CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + cudnn_wrapper_.inline_cudnn_handle(), + top_desc_, + filter_desc_, + conv_desc_, + bottom_desc_, + algo_, + &fwd_ws_size)); + cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, fwd_ws_size); + + LOG(INFO) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", " << algo_; + LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_; + } + + // Now, actually run the computation. + CUDNN_CHECK(cudnnConvolutionBackwardBias( + cudnn_wrapper_.inline_cudnn_handle(), + cudnnTypeWrapper::kOne(), + top_desc_, + dY.template data(), + cudnnTypeWrapper::kZero(), + bias_desc_, + dbias->template mutable_data())); + + cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { + CUDNN_CHECK(cudnnConvolutionBackwardFilter( + state->cudnn_handle(), + cudnnTypeWrapper::kOne(), + top_desc_, + dY.template data(), + bottom_desc_, + X.template data(), + conv_desc_, + bwd_filter_algo_, + state->workspace().get(cudnn_ws_nbytes_), + cudnn_ws_nbytes_, + cudnnTypeWrapper::kZero(), + filter_desc_, + dfilter->template mutable_data())); + if (OutputSize() == 3) { + // Compute the gradient w.r.t. the input. 
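+    // (Note: since ConvTranspose's forward pass is the data-gradient of a
+    // regular convolution, the input gradient here is, symmetrically, a
+    // regular forward convolution of dY with the filter, hence the
+    // cudnnConvolutionFwdAlgo_t chosen above.)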
+ auto* dX = Output(INPUT_GRAD); + dX->ResizeLike(X); + CUDNN_CHECK(cudnnConvolutionForward( + state->cudnn_handle(), + cudnnTypeWrapper::kOne(), + top_desc_, + dY.template data(), + filter_desc_, + filter.template data(), + conv_desc_, + algo_, + state->workspace().get(cudnn_ws_nbytes_), + cudnn_ws_nbytes_, + cudnnTypeWrapper::kZero(), + bottom_desc_, + dX->template mutable_data())); + } + }); + return true; +} + +REGISTER_CUDNN_OPERATOR(ConvTranspose, CudnnConvTransposeOp); +REGISTER_CUDNN_OPERATOR( + ConvTransposeGradient, + CudnnConvTransposeGradientOp); + +} // namespace caffe2 diff --git a/caffe2/operators/conv_transpose_unpool_op_base.h b/caffe2/operators/conv_transpose_unpool_op_base.h index b4b52c37007..c2c67ba773e 100644 --- a/caffe2/operators/conv_transpose_unpool_op_base.h +++ b/caffe2/operators/conv_transpose_unpool_op_base.h @@ -118,9 +118,13 @@ class ConvTransposeUnpoolBase : public Operator { return true; } - virtual bool RunOnDeviceWithOrderNCHW() = 0; + virtual bool RunOnDeviceWithOrderNCHW() { + CAFFE_THROW("Not implemented"); + } - virtual bool RunOnDeviceWithOrderNHWC() = 0; + virtual bool RunOnDeviceWithOrderNHWC() { + CAFFE_THROW("Not implemented"); + } virtual ~ConvTransposeUnpoolBase() {} diff --git a/caffe2/operators/counter_ops.cc b/caffe2/operators/counter_ops.cc new file mode 100644 index 00000000000..931d8a102b0 --- /dev/null +++ b/caffe2/operators/counter_ops.cc @@ -0,0 +1,46 @@ +#include "counter_ops.h" + +namespace caffe2 { +namespace { + +REGISTER_CPU_OPERATOR(CreateCounter, CreateCounterOp); +REGISTER_CPU_OPERATOR(ResetCounter, ResetCounterOp); +REGISTER_CPU_OPERATOR(CountDown, CountDownOp); + +OPERATOR_SCHEMA(CreateCounter) + .NumInputs(0) + .NumOutputs(1) + .SetDoc(R"DOC( +Creates a count-down counter with initial value specified by the 'init_count' +argument. +)DOC") + .Output(0, "counter", "A blob pointing to an instance of a new counter.") + .Arg("init_count", "Initial count for the counter, must be >= 0."); + +OPERATOR_SCHEMA(ResetCounter) + .NumInputs(1) + .NumOutputs(0) + .SetDoc(R"DOC( +Resets a count-down counter with initial value specified by the 'init_count' +argument. +)DOC") + .Input(0, "counter", "A blob pointing to an instance of a new counter.") + .Arg("init_count", "Resets counter to this value, must be >= 0."); + +OPERATOR_SCHEMA(CountDown) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +If the internal count value > 0, decreases count value by 1 and outputs false, +otherwise outputs true. 
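+For example, a counter created with init_count=3 outputs false on its first
+three CountDown runs and true on every run after that.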
+)DOC") + .Input(0, "counter", "A blob pointing to an instance of a counter.") + .Output(0, "should_stop", "false unless the internal count is zero."); + +SHOULD_NOT_DO_GRADIENT(CreateCounter); +SHOULD_NOT_DO_GRADIENT(ResetCounter); +SHOULD_NOT_DO_GRADIENT(CountDown); + +} // namespace + +} // namespace caffe2 diff --git a/caffe2/operators/counter_ops.h b/caffe2/operators/counter_ops.h new file mode 100644 index 00000000000..4baa1314596 --- /dev/null +++ b/caffe2/operators/counter_ops.h @@ -0,0 +1,89 @@ +#ifndef CAFFE2_OPERATORS_COUNTER_OPS_H +#define CAFFE2_OPERATORS_COUNTER_OPS_H + +#include + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { +namespace { +template +class Counter { + public: + explicit Counter(T count) : count_(count) {} + bool CountDown() { + if (count_ > 0) { + --count_; + return false; + } + return true; + } + + void reset(T init_count) { + count_ = init_count; + } + + private: + std::atomic count_; +}; +} + +template +class CreateCounterOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + CreateCounterOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + init_count_(OperatorBase::GetSingleArgument("init_count", 0)) { + CHECK_LE(0, init_count_) << "negative init_count is not permitted."; + } + + bool RunOnDevice() override { + *OperatorBase::Output>>(0) = + std::unique_ptr>(new Counter(init_count_)); + return true; + } + + private: + T init_count_ = 0; +}; + +template +class ResetCounterOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + ResetCounterOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + init_count_(OperatorBase::GetSingleArgument("init_count", 0)) { + CHECK_LE(0, init_count_) << "negative init_count is not permitted."; + } + + bool RunOnDevice() override { + auto& counterPtr = OperatorBase::Input>>(0); + counterPtr->reset(init_count_); + return true; + } + + private: + T init_count_; +}; + +template +class CountDownOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + CountDownOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + auto& counterPtr = OperatorBase::Input>>(0); + auto* output = Output(0); + output->Resize(std::vector{}); + *output->template mutable_data() = counterPtr->CountDown(); + return true; + } +}; +} // namespace caffe2 +#endif // CAFFE2_OPERATORS_COUNTER_OPS_H_ diff --git a/caffe2/operators/cross_entropy_op.cc b/caffe2/operators/cross_entropy_op.cc index bae0528b218..d2316b811ac 100644 --- a/caffe2/operators/cross_entropy_op.cc +++ b/caffe2/operators/cross_entropy_op.cc @@ -2,6 +2,17 @@ namespace caffe2 { +namespace { + +inline float sigmoid_xent_forward(float lgt, float tgt) { + return lgt * (tgt - (lgt >= 0)) - log(1 + exp(lgt - 2 * lgt * (lgt >= 0))); +} + +inline float sigmoid_xent_backward(float lgt, float tgt) { + return tgt - 1. / (1. + exp(-lgt)); +} +} + template <> bool LabelCrossEntropyOp::RunOnDevice() { auto& X = Input(0); @@ -26,6 +37,68 @@ bool LabelCrossEntropyOp::RunOnDevice() { return true; } +template <> +bool SigmoidCrossEntropyWithLogitsOp::RunOnDevice() { + auto& logits = Input(0); + auto& targets = Input(1); + CAFFE_ENFORCE(logits.dims() == targets.dims()); + const auto inner_size = logits.ndim() > 0 ? 
logits.dims().back() : 1;
+  const auto outer_size = logits.size() / inner_size;
+
+  auto* out = Output(0);
+  if (logits.ndim() == 0) {
+    out->Resize(std::vector<TIndex>{});
+  } else {
+    std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
+    out->Resize(dims);
+  }
+  auto* out_ptr = out->mutable_data<float>();
+
+  auto* logits_ptr = logits.data<float>();
+  auto* targets_ptr = targets.data<float>();
+
+  auto in_idx = 0;
+  for (int i = 0; i < outer_size; ++i) {
+    float value = 0;
+    for (int j = 0; j < inner_size; ++j) {
+      value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]);
+      ++in_idx;
+    }
+    out_ptr[i] = -value / inner_size;
+  }
+  return true;
+}
+
+template <>
+bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::
+    RunOnDevice() {
+  auto& g = Input(0);
+  auto& logits = Input(1);
+  auto& targets = Input(2);
+  CAFFE_ENFORCE(logits.dims() == targets.dims());
+  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
+  const auto outer_size = logits.size() / inner_size;
+  CAFFE_ENFORCE(g.size() == outer_size);
+
+  auto* out = Output(0);
+  out->ResizeLike(logits);
+  auto* out_ptr = out->mutable_data<float>();
+
+  auto* logits_ptr = logits.data<float>();
+  auto* targets_ptr = targets.data<float>();
+  auto* g_ptr = g.data<float>();
+
+  auto in_idx = 0;
+  for (int i = 0; i < outer_size; ++i) {
+    auto g_factor = -g_ptr[i] / inner_size;
+    for (int j = 0; j < inner_size; ++j) {
+      out_ptr[in_idx] = g_factor *
+          sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]);
+      ++in_idx;
+    }
+  }
+  return true;
+}
+
 template <>
 bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
   auto& X = Input(0);
@@ -129,6 +202,13 @@ REGISTER_CPU_OPERATOR(MakeTwoClass,
 REGISTER_CPU_OPERATOR(MakeTwoClassGradient,
     MakeTwoClassGradientOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(
+    SigmoidCrossEntropyWithLogits,
+    SigmoidCrossEntropyWithLogitsOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(
+    SigmoidCrossEntropyWithLogitsGradient,
+    SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>);
+
 OPERATOR_SCHEMA(MakeTwoClass)
     .NumInputs(1)
     .NumOutputs(1)
@@ -145,6 +225,22 @@ OPERATOR_SCHEMA(MakeTwoClassGradient)
     .NumInputs(1)
     .NumOutputs(1);

+OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogits)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Given two matrices, logits and targets, of the same shape
+(batch_size, num_classes), computes the sigmoid cross entropy between the two.
+Returns a tensor of shape (batch_size,) of losses for each example.
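+Concretely, with logits l and targets t, the output is
+  xentropy[i] = (1/num_classes) * sum_j (log(1 + exp(l[i][j])) - t[i][j] * l[i][j]),
+evaluated in a numerically stable form.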
+)DOC") + .Input(0, "logits", "matrix of logits for each example and class.") + .Input(1, "targets", "matrix of targets, same shape as logits.") + .Output(0, "xentropy", "Vector with the total xentropy for each example."); + +OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogitsGradient) + .NumInputs(3) + .NumOutputs(1); + struct GetMakeTwoClassGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; vector GetGradientDefs() override { @@ -156,5 +252,20 @@ struct GetMakeTwoClassGradient : public GradientMakerBase { } }; REGISTER_GRADIENT(MakeTwoClass, GetMakeTwoClassGradient); + +struct GetSigmoidCrossEntropyWithLogitsGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + return SingleGradientDef( + "SigmoidCrossEntropyWithLogitsGradient", + "", + vector{GO(0), I(0), I(1)}, + vector{GI(0)}); + } +}; +REGISTER_GRADIENT( + SigmoidCrossEntropyWithLogits, + GetSigmoidCrossEntropyWithLogitsGradient); + } // namespace } // namespace caffe2 diff --git a/caffe2/operators/cross_entropy_op.h b/caffe2/operators/cross_entropy_op.h index aac20c5a519..08f687fa279 100644 --- a/caffe2/operators/cross_entropy_op.h +++ b/caffe2/operators/cross_entropy_op.h @@ -62,6 +62,22 @@ class MakeTwoClassGradientOp final // Ouptut: dX }; +template +class SigmoidCrossEntropyWithLogitsOp final : public Operator { + public: + USE_SIMPLE_CTOR_DTOR(SigmoidCrossEntropyWithLogitsOp); + USE_OPERATOR_CONTEXT_FUNCTIONS; + bool RunOnDevice() override; +}; + +template +class SigmoidCrossEntropyWithLogitsGradientOp final : public Operator { + public: + USE_SIMPLE_CTOR_DTOR(SigmoidCrossEntropyWithLogitsGradientOp); + USE_OPERATOR_CONTEXT_FUNCTIONS; + bool RunOnDevice() override; +}; + } // namespace caffe2 #endif // CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_ diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc new file mode 100644 index 00000000000..c91c4467c5e --- /dev/null +++ b/caffe2/operators/dataset_ops.cc @@ -0,0 +1,734 @@ +#include +#include +#include +#include +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/utils/string_utils.h" + +namespace caffe2 { +namespace { + +const char kDatasetFieldSeparator = ':'; +const char* kDatasetLengthField = "lengths"; + +// how much percent to grow the dataset when needed +const int kDatasetGrowthPct = 40; + +// used for lengths tensors in the dataset +using TLength = int32_t; +// used for all internal dataset operations (offsets, sizes to read, etc.) +using TOffset = int64_t; + +/** + * Provides functionality to iterate across a list of tensors where some + * of those tensors represent lengths in a hierarchical structure. 
+ */ +class TreeIterator { + public: + struct FieldDesc { + int id; + int lengthFieldId = -1; + std::string name; + }; + + explicit TreeIterator(const std::vector& fields) { + // populate field vector and split field names + fields_.resize(fields.size()); + std::vector> nameParts(fields_.size()); + for (int i = 0; i < fields.size(); ++i) { + auto& field = fields_.at(i); + field.name = fields[i]; + field.id = i; + field.lengthFieldId = -1; + nameParts.at(i) = split(kDatasetFieldSeparator, field.name); + } + + // populate lengthFields + for (const auto& field : fields_) { + const auto& parts = nameParts.at(field.id); + if (!parts.empty() && parts.back() == kDatasetLengthField) { + lengthFieldIds_.push_back(field.id); + } + } + + // find length-field with maximum prefix matching for each field + for (auto& field : fields_) { + // by default, we are matching against the root domain + int maxMatchLevel = 1; + int maxMatchLengthFieldId = -1; + for (int j = 0; j < numLengthFields(); ++j) { + const auto& lenField = lengthField(j); + // a length field can't have itself as its length field + if (field.id == lenField.id) { + continue; + } + auto lf = nameParts.at(lenField.id); + auto lfEnd = lf.end() - 1; + // check whether this lengthField is a prefix for this field name + if (std::mismatch(lf.begin(), lfEnd, nameParts.at(field.id).begin()) + .first != lfEnd) { + continue; + } + if (lf.size() > maxMatchLevel) { + maxMatchLevel = lf.size(); + maxMatchLengthFieldId = j; + } + } + field.lengthFieldId = maxMatchLengthFieldId; + } + + // check that fields are topologically sorted + // (no length field depends on a length defined afterwards) + for (const auto& field : fields_) { + const auto* lengthField = lengthFieldFor(field); + CAFFE_ENFORCE( + (lengthField == nullptr) || (lengthField->id < field.id), + "Error: Field ", + field.id, + " (", + field.name, + ") ", + "depends on a field defined afterwards: ", + lengthField->id, + " (", + lengthField->name, + ")."); + } + } + + void advance( + const std::vector& lengths, + std::vector& offsets, + std::vector& sizes, + std::vector& limits, + TOffset num) { + thread_local std::vector newOffsets; + CHECK_EQ(lengths.size(), numLengthFields()); + CHECK_EQ(offsets.size(), numOffsetFields()); + sizes.resize(offsets.size()); + newOffsets.resize(offsets.size()); + // first index, top level + { + auto limit = limits[0]; + auto offset = offsets[0]; + CAFFE_ENFORCE(limit >= offset, "Tried to advance past end of cursor."); + TOffset total = std::min(limit - offset, num); + sizes[0] = total; + newOffsets[0] = offset + total; + } + // child indices + for (int j = 1; j < numOffsetFields(); ++j) { + TOffset total = 0; + int parentOffsetId = offsetFieldIdFor(lengthField(j - 1)); + const TLength* length = lengths[j - 1] + offsets[parentOffsetId]; + for (int k = 0; k < sizes[parentOffsetId]; ++k) { + total += *(length++); + } + auto offset = offsets[j]; + CAFFE_ENFORCE( + offset + total <= limits[j], + "Inconsistent field length: ", + "tried to advance past the end of field ", + j); + sizes[j] = total; + newOffsets[j] = offset + total; + } + offsets = newOffsets; + } + + // Corresponds to the number of fields that have "lengths" as their last + // name part + int numLengthFields() const { + return lengthFieldIds_.size(); + } + + // Corresponds to the number of length fields + 1 (for the top-level domain) + int numOffsetFields() const { + return numLengthFields() + 1; + } + + // Get lengthField description for the given field + const FieldDesc* lengthFieldFor(const FieldDesc& desc) { + 
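// A lengthFieldId of -1 marks a field in the root domain: it has no + // parent length field, hence the nullptr below. + 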
return (desc.lengthFieldId == -1) + ? nullptr + : &fields_.at(lengthFieldIds_.at(desc.lengthFieldId)); + } + + // Get lengthField description for the given lengthFieldId, where + // 0 <= lengthFieldId < numLengthFields() + const FieldDesc& lengthField(int lengthFieldId) { + return fields_.at(lengthFieldIds_.at(lengthFieldId)); + } + + // Returns the index into the 'offset' vector for the given field. + int offsetFieldIdFor(const FieldDesc& fieldDesc) { + return fieldDesc.lengthFieldId + 1; + } + + // Returns the field description for all fields. + const std::vector& fields() { + return fields_; + } + + private: + // Description of each field + std::vector fields_; + // Index into fields_ above for the fields that are lengths. + std::vector lengthFieldIds_; +}; + +class TreeCursor { + public: + explicit TreeCursor(const TreeIterator& iterator) : it(iterator) {} + std::vector offsets; + std::mutex mutex_; + TreeIterator it; +}; + +class CreateTreeCursorOp : public Operator { + public: + CreateTreeCursorOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + fields_(OperatorBase::GetRepeatedArgument("fields")) {} + + bool RunOnDevice() override { + *OperatorBase::Output>(0) = + std::unique_ptr(new TreeCursor(TreeIterator(fields_))); + return true; + } + + private: + std::vector fields_; +}; + +class ResetCursorOp : public Operator { + public: + ResetCursorOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + auto& cursor = OperatorBase::Input>(0); + std::lock_guard lock(cursor->mutex_); + cursor->offsets.clear(); + return true; + } +}; + +class CheckDatasetConsistencyOp : public Operator { + public: + CheckDatasetConsistencyOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + iterator_(OperatorBase::GetRepeatedArgument("fields")) {} + + bool RunOnDevice() override { + thread_local std::vector lengths; + thread_local std::vector limits; + thread_local std::vector sizes; + thread_local std::vector offsets; + CAFFE_ENFORCE( + InputSize() == iterator_.fields().size(), + "Invalid number of fields. 
Expected ", + iterator_.fields().size(), + ", got ", + InputSize()); + sizes.resize(iterator_.numOffsetFields()); + // gather length data + lengths.resize(iterator_.numLengthFields()); + for (int i = 0; i < lengths.size(); ++i) { + lengths[i] = Input(iterator_.lengthField(i).id).data(); + } + // gather size limits + limits.assign(sizes.size(), std::numeric_limits::max()); + for (int i = 0; i < iterator_.fields().size(); ++i) { + int lengthIdx = iterator_.fields()[i].lengthFieldId + 1; + TOffset size = (TOffset)Input(i).dims()[0]; + if (limits[lengthIdx] == std::numeric_limits::max()) { + limits[lengthIdx] = size; + } else { + CAFFE_ENFORCE( + limits[lengthIdx] == size, + "Inconsistent sizes for fields belonging to same domain.", + " Field: ", + i, + " (", + iterator_.fields()[i].name, + "); Length field index: ", + lengthIdx, + "); Previous size: ", + limits[lengthIdx], + "; New size: ", + size); + } + } + // advance to the end + offsets.assign(sizes.size(), 0); + iterator_.advance(lengths, offsets, sizes, limits, limits[0]); + for (int i = 0; i < limits.size(); ++i) { + CAFFE_ENFORCE(limits[i] == offsets[i]); + } + return true; + } + + private: + TreeIterator iterator_; +}; + +class ReadNextBatchOp : public Operator { + public: + ReadNextBatchOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + batchSize_(OperatorBase::GetSingleArgument("batch_size", 1)) {} + + bool RunOnDevice() override { + auto& cursor = OperatorBase::Input>(0); + CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1); + thread_local std::vector lengths; + thread_local std::vector limits; + thread_local std::vector sizes; + thread_local std::vector offsets; + sizes.resize(cursor->it.numOffsetFields()); + // gather length data + lengths.resize(cursor->it.numLengthFields()); + for (int i = 0; i < lengths.size(); ++i) { + lengths[i] = Input(cursor->it.lengthField(i).id + 1).data(); + } + // gather size limits + limits.assign(sizes.size(), std::numeric_limits::max()); + for (int i = 0; i < cursor->it.fields().size(); ++i) { + int lengthFieldIdx = cursor->it.fields()[i].lengthFieldId + 1; + limits[lengthFieldIdx] = + std::min(limits[lengthFieldIdx], (TOffset)Input(i + 1).dims()[0]); + } + // advance cursor + { + std::lock_guard lock(cursor->mutex_); + if (cursor->offsets.empty()) { + cursor->offsets.assign(sizes.size(), 0); + } + offsets = cursor->offsets; + cursor->it.advance(lengths, cursor->offsets, sizes, limits, batchSize_); + } + // gather data + thread_local std::vector outDim; + for (int i = 0; i < cursor->it.fields().size(); ++i) { + auto lengthIdx = cursor->it.fields()[i].lengthFieldId + 1; + auto size = sizes[lengthIdx]; + auto offset = offsets[lengthIdx]; + auto& in = Input(i + 1); + auto innerSize = in.size_from_dim(1); + outDim = in.dims(); + outDim[0] = size; + auto* out = Output(i); + out->Resize(outDim); + if (out->size() == 0) { + continue; + } + void* src = + (char*)in.raw_data() + offset * innerSize * in.meta().itemsize(); + void* dst = out->raw_mutable_data(in.meta()); + context_.template CopyItems( + in.meta(), out->size(), src, dst); + } + return true; + } + int batchSize_; +}; + +class ComputeOffsetOp : public Operator { + public: + ComputeOffsetOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + auto& cursor = OperatorBase::Input>(0); + CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1); + auto* out = Output(0); + std::vector lengths; + std::vector limits; + std::vector sizes; + 
std::vector offsets; + sizes.resize(cursor->it.numOffsetFields()); + // gather length data + lengths.resize(cursor->it.numLengthFields()); + for (int i = 0; i < lengths.size(); ++i) { + lengths[i] = Input(cursor->it.lengthField(i).id + 1).data(); + } + // gather size limits + limits.assign(sizes.size(), std::numeric_limits::max()); + for (int i = 0; i < cursor->it.fields().size(); ++i) { + int lengthFieldIdx = cursor->it.fields()[i].lengthFieldId + 1; + limits[lengthFieldIdx] = + std::min(limits[lengthFieldIdx], (TOffset)Input(i + 1).dims()[0]); + } + out->Resize(limits.at(0) + 1, sizes.size()); + auto* out_data = out->mutable_data(); + for (int k = 0; k <= limits.at(0); k++) { + // advance cursor + if (cursor->offsets.empty()) { + cursor->offsets.assign(sizes.size(), 0); + } + // write output + std::copy(cursor->offsets.begin(), cursor->offsets.end(), out_data); + out_data += sizes.size(); + cursor->it.advance(lengths, cursor->offsets, sizes, limits, 1); + } + cursor->offsets.assign(sizes.size(), 0); // reset after reading meta info + return true; + } +}; + +class ReadRandomBatchOp : public Operator { + public: + ReadRandomBatchOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + batchSize_(OperatorBase::GetSingleArgument("batch_size", 1)) {} + bool RunOnDevice() override { + auto& cursor = OperatorBase::Input>(0); + auto& idxblob = Input(1); + auto& offsetsmat = Input(2); + CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 3); + auto idxvec = idxblob.template data(); + auto& offsetdim = offsetsmat.dims(); + // gather data + thread_local std::vector outDim; + int64_t idx; + { + std::lock_guard lock(cursor->mutex_); + cursor->offsets.resize(1); + idx = cursor->offsets.at(0); + cursor->offsets.at(0) += batchSize_; + } + + for (int i = 0; i < cursor->it.fields().size(); ++i) { + auto lengthIdx = cursor->it.fields()[i].lengthFieldId + 1; + auto& in = Input(i + 3); + outDim = in.dims(); + outDim.at(0) = 0; + auto idxbegin = idx; + for (int j = 0; j < batchSize_; ++j) { + if (idx >= idxblob.size()) { + break; + } + CAFFE_ENFORCE( + (idxvec[idx] + 1) * offsetdim[1] + lengthIdx < offsetsmat.size(), + "Out of bounds when trying to read from the offsets matrix"); + auto offsetptr = offsetsmat.template data() + + idxvec[idx] * offsetdim[1] + lengthIdx; + auto offset = *offsetptr; + auto size = *(offsetptr + offsetdim[1]) - offset; + outDim.at(0) += size; // accumulate over the batch + idx++; + } + idx = idxbegin; // reset to the start of the batch + auto* out = Output(i); + out->Resize(outDim); + if (out->size() == 0) { + continue; + } + auto innerSize = in.size_from_dim(1); + auto dst = static_cast(out->raw_mutable_data(in.meta())); + int block_size = in.size() / in.dim(0); + int block_bytesize = in.nbytes() / in.dim(0); + int start = 0; + for (int j = 0; j < batchSize_; ++j) { + if (idx >= idxblob.size()) { + break; + } + auto offsetptr = offsetsmat.template data() + + idxvec[idx] * offsetdim[1] + lengthIdx; + auto offset = *offsetptr; + auto size = *(offsetptr + offsetdim[1]) - offset; + // copy data + void* src = + (char*)in.raw_data() + offset * innerSize * in.meta().itemsize(); + context_.template CopyItems( + in.meta(), size * block_size, src, dst + start * block_bytesize); + start += size; + idx++; + } + idx = idxbegin; // reset for the next field + } + return true; + } + int batchSize_; +}; + +template +class AppendOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + AppendOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() 
override { + auto& a = Input(0); + auto& b = Input(1); + auto* c = Output(0); + CAFFE_ENFORCE(b.ndim() >= 1); + if (a.size() == 0) { + c->CopyFrom(b); + return true; + } + CAFFE_ENFORCE(&a == c, "First argument must be in-place."); + CAFFE_ENFORCE(c->ndim() == b.ndim()); + CAFFE_ENFORCE(a.meta() == b.meta()); + for (int i = 1; i < a.ndim(); ++i) { + CAFFE_ENFORCE(a.dims()[i] == b.dims()[i]); + } + auto oldSize = c->size(); + c->Extend(b.dims()[0], kDatasetGrowthPct, &context_); + auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize(); + context_.template CopyItems( + b.meta(), b.size(), b.raw_data(), dst); + return true; + } +}; + +template +class AtomicAppendOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + AtomicAppendOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + auto& mutex = OperatorBase::Input>(0); + const auto numFields = (InputSize() - 1) / 2; + CAFFE_ENFORCE(OutputSize() == numFields); + + std::lock_guard guard(*mutex); + + // 1: checks + for (int i = 0; i < numFields; ++i) { + auto& a = Input(1 + i); + auto& b = Input(1 + i + numFields); + auto* c = Output(i); + CAFFE_ENFORCE(b.ndim() >= 1); + if (a.size() == 0) { + continue; + } + CAFFE_ENFORCE( + (void*)&a == (void*)c, "Appended-to arguments must be in-place."); + CAFFE_ENFORCE(c->ndim() == b.ndim()); + CAFFE_ENFORCE(a.meta() == b.meta()); + for (int j = 1; j < a.ndim(); ++j) { + CAFFE_ENFORCE(a.dims()[j] == b.dims()[j]); + } + } + + // 2: copies + for (int i = 0; i < numFields; ++i) { + auto& a = Input(1 + i); + auto& b = Input(1 + i + numFields); + auto* c = Output(i); + if (a.size() == 0) { + c->CopyFrom(b); + continue; + } + auto oldSize = c->size(); + c->Extend(b.dims()[0], kDatasetGrowthPct, &context_); + auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize(); + context_.template CopyItems( + b.meta(), b.size(), b.raw_data(), dst); + } + return true; + } +}; + +REGISTER_CPU_OPERATOR(CreateTreeCursor, CreateTreeCursorOp); +REGISTER_CPU_OPERATOR(ResetCursor, ResetCursorOp); +REGISTER_CPU_OPERATOR(ReadNextBatch, ReadNextBatchOp); +REGISTER_CPU_OPERATOR(ComputeOffset, ComputeOffsetOp); +REGISTER_CPU_OPERATOR(ReadRandomBatch, ReadRandomBatchOp); +REGISTER_CPU_OPERATOR(CheckDatasetConsistency, CheckDatasetConsistencyOp); +REGISTER_CPU_OPERATOR(Append, AppendOp); +REGISTER_CPU_OPERATOR(AtomicAppend, AtomicAppendOp); + +OPERATOR_SCHEMA(CreateTreeCursor) + .NumInputs(0) + .NumOutputs(1) + .SetDoc(R"DOC( +Creates a cursor to iterate through a list of tensors, where some of those +tensors contain the lengths in a nested schema. The schema is determined by +the `fields` argument.
+ +For example, to represent the following schema: + + Struct( + a=Int(), + b=List(List(Int)), + c=List( + Struct( + c1=String, + c2=List(Int), + ), + ), + ) + +the field list will be: + [ + "a", + "b:lengths", + "b:values:lengths", + "b:values:values", + "c:lengths", + "c:c1", + "c:c2:lengths", + "c:c2:values", + ] + +And for the following instance of the struct: + + Struct( + a=3, + b=[[4, 5], [6, 7, 8], [], [9]], + c=[ + Struct(c1='alex', c2=[10, 11]), + Struct(c1='bob', c2=[12]), + ], + ) + +The values of the fields will be: + { + "a": [3], + "b:lengths": [4], + "b:values:lengths": [2, 3, 0, 1], + "b:values:values": [4, 5, 6, 7, 8, 9], + "c:lengths": [2], + "c:c1": ["alex", "bob"], + "c:c2:lengths": [2, 1], + "c:c2:values": [10, 11, 12], + } + +In general, every field name in the format "{prefix}:lengths" defines a domain +"{prefix}", and every subsequent field in the format "{prefix}:{field}" will +be in that domain, and the length of the domain is provided for each entry of +the parent domain. In the example, "b:lengths" defines a domain of length 4, so +every field under domain "b" will have 4 entries. +The "lengths" field for a given domain must appear before any reference to +that domain. + +Returns a pointer to an instance of the Cursor, which keeps the current offset +on each of the domains defined by `fields`. Cursor also ensures thread-safety +such that ReadNextBatch and ResetCursor can be used safely in parallel. + +A cursor does not contain data per se, so calls to ReadNextBatch actually need +to pass a list of blobs containing the data to read for each one of the fields. +)DOC") + .Output(0, "cursor", "A blob pointing to an instance of a new TreeCursor.") + .Arg( + "fields", + "A list of strings each one representing a field of the dataset."); + +OPERATOR_SCHEMA(ResetCursor) + .NumInputs(1) + .NumOutputs(0) + .SetDoc(R"DOC( +Resets the offsets for the given TreeCursor. This operation is thread safe. +)DOC") + .Input(0, "cursor", "A blob containing a pointer to the cursor."); + +OPERATOR_SCHEMA(ReadNextBatch) + .NumInputs(1, INT_MAX) + .NumOutputs(1, INT_MAX) + .SetDoc(R"DOC( +Read the next batch of examples out of the given cursor and data blobs. + +Input(0) is a blob pointing to a TreeCursor, and +[Input(1),... Input(num_fields)] is a list of tensors containing the data for +each field of the dataset. + +ReadNextBatch is thread safe. +)DOC") + .Input(0, "cursor", "A blob containing a pointer to the cursor.") + .Input(1, "dataset_field_0", "First dataset field") + .Output(0, "field_0", "Tensor containing the next batch for field 0.") + .Arg("batch_size", "Number of top-level entries to read."); + +OPERATOR_SCHEMA(ComputeOffset) + .NumInputs(1, INT_MAX) + .NumOutputs(1) + .SetDoc(R"DOC( +Computes the offsets matrix given the cursor and data blobs. Needs to be run +at the beginning or after resetting the cursor. + +Input(0) is a blob pointing to a TreeCursor, and +[Input(1),... Input(num_fields)] is a list of tensors containing the data for +each field of the dataset. + +ComputeOffset is thread safe. +)DOC") + .Input(0, "cursor", "A blob containing a pointer to the cursor.") + .Input(1, "dataset_field_0", "First dataset field") + .Output(0, "field_0", "Tensor containing offset info for this chunk."); + +OPERATOR_SCHEMA(ReadRandomBatch) + .NumInputs(1, INT_MAX) + .NumOutputs(1, INT_MAX) + .SetDoc(R"DOC( +Read the next batch of examples out of the given cursor, +idx blob, offset matrix and data blobs.
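+ +A typical flow (sketch, inferred from the ComputeOffset doc above): run +ComputeOffset once after ResetCursor to build the offset matrix, shuffle a +vector of top-level entry indices, then call ReadRandomBatch repeatedly.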
+ +Input(0) is a blob pointing to a TreeCursor, +Input(1) is a blob pointing to the shuffled idx, +Input(2) is a blob pointing to the offset matrix and +[Input(3),... Input(num_fields)] is a list of tensors containing the data for +each field of the dataset. + +ReadRandomBatch is thread safe. +)DOC") + .Input(0, "cursor", "A blob containing a pointer to the cursor.") + .Input(1, "idx", "idx with a shuffled order.") + .Input(2, "offsetsmat", "offset matrix containing length offset info.") + .Input(3, "dataset_field_0", "First dataset field") + .Output(0, "field_0", "Tensor containing the next batch for field 0.") + .Arg("batch_size", "Number of top-level entries to read."); + +OPERATOR_SCHEMA(CheckDatasetConsistency) + .NumInputs(1, INT_MAX) + .NumOutputs(0) + .SetDoc(R"DOC( +Checks that the given data fields represent a consistent dataset under +the schema specified by the `fields` argument. Operator fails if the fields +are not consistent. If data is consistent, each field's data can be safely +appended to an existing dataset, keeping it consistent. +)DOC") + .Input(0, "field_0", "Data for field 0.") + .Arg( + "fields", + "List of strings representing the string names in the format " + "specified in the doc for CreateTreeCursor."); + +OPERATOR_SCHEMA(Append) + .NumInputs(2) + .NumOutputs(1) + .EnforceInplace({{0, 0}}) + .SetDoc(R"DOC( +Append input 2 to the end of input 1. +Input 1 must be the same as output, that is, it is required to be in-place. +Input 1 may have to be re-allocated in order to accommodate the new size. +Currently, an exponential growth ratio is used in order to ensure amortized +constant time complexity. +All except the outer-most dimension must be the same between input 1 and 2. +)DOC") + .Input(0, "dataset", "The tensor to be appended to.") + .Input(1, "new_data", "Tensor to append to the end of dataset.") + .Output(0, "dataset", "Same as input 0, representing the mutated tensor."); + +OPERATOR_SCHEMA(AtomicAppend) + .NumInputs(3, INT_MAX) + .NumOutputs(1, INT_MAX) + .AllowInplace([](int in, int out) { return in == out + 1; }); + +SHOULD_NOT_DO_GRADIENT(CreateTreeCursor); +SHOULD_NOT_DO_GRADIENT(ResetCursor); +SHOULD_NOT_DO_GRADIENT(ReadNextBatch); +SHOULD_NOT_DO_GRADIENT(ComputeOffset); +SHOULD_NOT_DO_GRADIENT(ReadRandomBatch); +SHOULD_NOT_DO_GRADIENT(CheckDatasetConsistency); +SHOULD_NOT_DO_GRADIENT(Append); +SHOULD_NOT_DO_GRADIENT(AtomicAppend); +} +} diff --git a/caffe2/operators/elementwise_op.h b/caffe2/operators/elementwise_op.h index 50a4ac65029..b8b1ab623ee 100644 --- a/caffe2/operators/elementwise_op.h +++ b/caffe2/operators/elementwise_op.h @@ -2,16 +2,16 @@ #define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_ #include "caffe2/core/context.h" +#include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" -#include "caffe2/core/logging.h" namespace caffe2 { using NumericTypes = TensorTypes; class SameTypeAsInput {}; -template +template struct TypeForOutput { using value = OutputTemplate; }; @@ -21,12 +21,26 @@ struct TypeForOutput { using value = InputType; }; -template -class UnaryElementwiseOp : public Operator { +/** + * Generic meta-operator that is able to process element-wise operations on + * a single input tensor, returning a tensor of the same shape, either of + * the same type as the input or of a specified result type. + * + * The functor provided must implement operator() as a template on input and + * output types, and on a Context.
Moreover, it needs to provide a constructor + * that takes OperatorBase& as an argument. This is in order to consume + * arguments passed to the operator instance. + */ +template < + typename InputTypes, + class Context, + class Functor, + class OutputType = SameTypeAsInput> +class UnaryElementwiseWithArgsOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_SIMPLE_CTOR_DTOR(UnaryElementwiseOp); + UnaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), functor(*this) {} bool RunOnDevice() override { return DispatchHelper::call(this, Input(0)); @@ -38,11 +52,67 @@ class UnaryElementwiseOp : public Operator { auto* output = Output(0); output->ResizeLike(input); using R = typename TypeForOutput::value; - Functor()(input.size(), input.template data(), - output->template mutable_data(), &context_); + functor( + input.size(), + input.template data(), + output->template mutable_data(), + &context_); return true; } + Functor functor; +}; + +/** + * WithDefaultConstructor is a functor that can be used as the functor of a + * UnaryElementwiseWithArgsOp. It simply forwards the operator() call into + * another functor that doesn't accept arguments in its constructor. + */ +template +struct WithDefaultConstructor { + explicit WithDefaultConstructor(OperatorBase& op) {} + + template + void operator()(int n, const In* in, Out* out, Context* c) { + Functor()(n, in, out, c); + } +}; + +/** + * UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the + * difference that it takes a default-constructible functor, i.e. one that + * does not need to take into consideration any arguments during operator + * creation. + */ +template < + typename InputTypes, + class Context, + class Functor, + class OutputType = SameTypeAsInput> +using UnaryElementwiseOp = UnaryElementwiseWithArgsOp< + InputTypes, + Context, + WithDefaultConstructor, + OutputType>; + +/** + * ForEach is a unary functor that forwards each element of the input array + * into the elementwise Functor provided, and gathers the results of each + * call into the resulting array. Use it as an adaptor if you want to create + * a UnaryElementwiseOp that acts on each element of the tensor per function + * call -- this is reasonable for complex types where vectorization wouldn't + * be much of a gain, performance-wise.
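+ * + * A minimal sketch of a compatible functor (hypothetical, not part of this + * diff): + * + *   struct Negate { + *     explicit Negate(OperatorBase& op) {} // operator args could be read here + *     float operator()(float x) const { return -x; } + *   }; + * + * Wrapped as ForEach and plugged into UnaryElementwiseWithArgsOp, this + * would negate a float tensor one element per call.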
+ */ +template +struct ForEach { + explicit ForEach(OperatorBase& op) : functor(op) {} + + template + void operator()(int n, const In* in, Out* out, Context* c) { + for (int i = 0; i < n; ++i) { + out[i] = functor(in[i]); + } + } + Functor functor; }; /** @@ -113,7 +183,7 @@ class BinaryElementwiseOp : public Operator { return false; } for (int i = 0; i < b.size(); ++i) { - if (a[a.size()-1-i] != b[b.size()-1-i]) { + if (a[a.size() - 1 - i] != b[b.size() - 1 - i]) { return false; } } @@ -169,31 +239,35 @@ class DivGradientOp final : public Operator { bool RunOnDevice() override; }; -#define CAFFE2_BINARY_FUNCTOR_WRAPPER(name) \ - struct name##Functor { \ - template \ - inline void operator()( \ - const int n, \ - const T* x, \ - const T* y, \ - T* output, \ - Context* device_context) { \ - math::name(n, x, y, output, device_context); \ - } \ - template \ - inline void WithBroadcast( \ - const int m, \ - const int n, \ - const T* a, \ - const T* b, \ - T* y, \ - Context* device_context) { \ - math::name##ToRow(m, n, a, b, y, device_context); \ - } \ - }; \ - template \ - using name##Op = BinaryElementwiseOp +#define CAFFE2_BINARY_FUNCTOR_WRAPPER(name) \ + struct name##Functor { \ + template \ + inline void operator()( \ + const int n, \ + const T* x, \ + const T* y, \ + T* output, \ + Context* device_context) { \ + math::name(n, x, y, output, device_context); \ + } \ + template \ + inline void WithBroadcast( \ + const int m, \ + const int n, \ + const T* a, \ + const T* b, \ + T* y, \ + Context* device_context) { \ + math::name##ToRow(m, n, a, b, y, device_context); \ + } \ + }; \ + template \ + using name##Op = BinaryElementwiseOp< \ + NumericTypes, \ + DC, \ + name##Functor, \ + SameTypeAsInput, \ + true> CAFFE2_BINARY_FUNCTOR_WRAPPER(Add); CAFFE2_BINARY_FUNCTOR_WRAPPER(Sub); @@ -202,31 +276,31 @@ CAFFE2_BINARY_FUNCTOR_WRAPPER(Div); #undef CAFFE2_BINARY_FUNCTOR_WRAPPER -#define CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(name) \ - struct name##Functor { \ - template \ - inline void operator()( \ - const int n, \ - const T* x, \ - const T* y, \ - bool* output, \ - Context* device_context) { \ - math::name(n, x, y, output, device_context); \ - } \ - template \ - inline void WithBroadcast( \ - const int m, \ - const int n, \ - const T* a, \ - const T* b, \ - bool* y, \ - Context* device_context) { \ - math::name##ToRow(m, n, a, b, y, device_context); \ - } \ - }; \ - template \ - using name##Op = BinaryElementwiseOp< \ - NumericTypes, DC, name##Functor, bool, true> +#define CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(name) \ + struct name##Functor { \ + template \ + inline void operator()( \ + const int n, \ + const T* x, \ + const T* y, \ + bool* output, \ + Context* device_context) { \ + math::name(n, x, y, output, device_context); \ + } \ + template \ + inline void WithBroadcast( \ + const int m, \ + const int n, \ + const T* a, \ + const T* b, \ + bool* y, \ + Context* device_context) { \ + math::name##ToRow(m, n, a, b, y, device_context); \ + } \ + }; \ + template \ + using name##Op = \ + BinaryElementwiseOp CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(LT); CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(LE); @@ -234,6 +308,6 @@ CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(GT); CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(GE); #undef CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER -} // namespace caffe2 +} // namespace caffe2 -#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_ +#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_ diff --git a/caffe2/operators/filler_op.cc b/caffe2/operators/filler_op.cc index 
8bc4c48ccae..9f7c1f34dbf 100644 --- a/caffe2/operators/filler_op.cc +++ b/caffe2/operators/filler_op.cc @@ -18,6 +18,7 @@ REGISTER_CPU_OPERATOR(UniformFill, UniformFillOp); REGISTER_CPU_OPERATOR(UniformIntFill, UniformFillOp); REGISTER_CPU_OPERATOR(ConstantFill, ConstantFillOp); REGISTER_CPU_OPERATOR(ConstantIntFill, ConstantFillOp); +REGISTER_CPU_OPERATOR(ConstantBoolFill, ConstantFillOp); REGISTER_CPU_OPERATOR(GivenTensorFill, GivenTensorFillOp); REGISTER_CPU_OPERATOR(GivenTensorIntFill, GivenTensorFillOp); REGISTER_CPU_OPERATOR(GaussianFill, GaussianFillOp); @@ -30,6 +31,10 @@ OPERATOR_SCHEMA(UniformFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}} OPERATOR_SCHEMA(UniformIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}}); OPERATOR_SCHEMA(ConstantFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}}); OPERATOR_SCHEMA(ConstantIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}}); +OPERATOR_SCHEMA(ConstantBoolFill) + .NumInputs(0, 1) + .NumOutputs(1) + .AllowInplace({{0, 0}}); OPERATOR_SCHEMA(GivenTensorFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}}); OPERATOR_SCHEMA(GivenTensorIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}}); OPERATOR_SCHEMA(GaussianFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}}); diff --git a/caffe2/operators/fully_connected_op.h b/caffe2/operators/fully_connected_op.h index a24147fc624..d319f521a61 100644 --- a/caffe2/operators/fully_connected_op.h +++ b/caffe2/operators/fully_connected_op.h @@ -56,14 +56,12 @@ class FullyConnectedOp final : public Operator { CAFFE_ENFORCE(N == b.dim32(0), dimErrorString()); CAFFE_ENFORCE(N == b.size(), dimErrorString()); - // Create the Y shape (without allocation) - static thread_local vector Y_shape; - Y_shape = X.dims(); + Y_shape_cache_ = X.dims(); // This is an invariant of canonical_axis, so we can DCHECK. - DCHECK_LE(canonical_axis + 1, Y_shape.size()); - Y_shape.resize(canonical_axis + 1); - Y_shape[canonical_axis] = N; - Y->Resize(Y_shape); + DCHECK_LE(canonical_axis + 1, Y_shape_cache_.size()); + Y_shape_cache_.resize(canonical_axis + 1); + Y_shape_cache_[canonical_axis] = N; + Y->Resize(Y_shape_cache_); CAFFE_ENFORCE(M * N == Y->size(), dimErrorString()); // W * x @@ -88,6 +86,9 @@ class FullyConnectedOp final : public Operator { protected: size_t axis_{1}; + // A local vector to cache the output shape so we don't need to recreate + // a vector object every time we run Run(). 
+ vector Y_shape_cache_; Tensor bias_multiplier_; }; diff --git a/caffe2/operators/index_ops.cc b/caffe2/operators/index_ops.cc index 33f7991c0c5..ed22247da03 100644 --- a/caffe2/operators/index_ops.cc +++ b/caffe2/operators/index_ops.cc @@ -7,7 +7,6 @@ #include namespace caffe2 { - namespace { using IndexKeyTypes = TensorTypes; using TIndexValue = int64_t; @@ -22,12 +21,17 @@ struct IndexBase { void Freeze() { frozen_ = true; } virtual ~IndexBase() {} const TypeMeta& Type() const { return meta_; } + TIndexValue Size() { + std::lock_guard guard(dictMutex_); + return nextId_; + } protected: int64_t maxElements_; TypeMeta meta_; TIndexValue nextId_{1}; // guarded by dictMutex_ std::atomic frozen_{false}; + std::mutex dictMutex_; }; template @@ -96,7 +100,6 @@ struct Index: IndexBase { } std::unordered_map dict_; - std::mutex dictMutex_; }; template @@ -142,7 +145,9 @@ class IndexGetOp: public Operator { class IndexLoadOp: public Operator { public: IndexLoadOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} + : Operator(operator_def, ws), + skipFirstEntry_( + OperatorBase::GetSingleArgument("skip_first_entry", 0)) {} bool RunOnDevice() override { return DispatchHelper::call(this, Input(1)); @@ -153,8 +158,18 @@ class IndexLoadOp: public Operator { auto* dict = dynamic_cast_if_rtti*>(base.get()); CAFFE_ENFORCE(dict, "Wrong dictionary type given input keys."); const auto& keys = Input(1); - return dict->Load(keys.data(), keys.size()); + const auto* keys_data = keys.data(); + auto keys_size = keys.size(); + if (skipFirstEntry_) { + CAFFE_ENFORCE(keys.size() > 0); + ++keys_data; + --keys_size; + } + return dict->Load(keys_data, keys_size); } + + private: + bool skipFirstEntry_; }; class IndexStoreOp: public Operator { @@ -188,6 +203,19 @@ class IndexFreezeOp: public Operator { } }; +class IndexSizeOp : public Operator { + public: + IndexSizeOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + auto& base = OperatorBase::Input>(0); + auto* out = Output(0); + out->Resize(std::vector{}); + *out->mutable_data() = base->Size(); + return true; + } +}; REGISTER_CPU_OPERATOR(IntIndexCreate, IndexCreateOp); REGISTER_CPU_OPERATOR(LongIndexCreate, IndexCreateOp); @@ -197,6 +225,7 @@ REGISTER_CPU_OPERATOR(IndexGet, IndexGetOp); REGISTER_CPU_OPERATOR(IndexLoad, IndexLoadOp); REGISTER_CPU_OPERATOR(IndexStore, IndexStoreOp); REGISTER_CPU_OPERATOR(IndexFreeze, IndexFreezeOp); +REGISTER_CPU_OPERATOR(IndexSize, IndexSizeOp); OPERATOR_SCHEMA(IntIndexCreate) .NumInputs(0) @@ -250,16 +279,20 @@ Should not be called concurrently with IndexGet. )DOC") .Input(0, "handle", "Pointer to an Index instance."); - OPERATOR_SCHEMA(IndexLoad) - .NumInputs(2) - .NumOutputs(0) - .SetDoc(R"DOC( + .NumInputs(2) + .NumOutputs(0) + .SetDoc(R"DOC( Loads the index from the given 1-D tensor. Elements in the tensor will be given consecutive indexes starting at 1. Fails if tensor contains repeated elements. )DOC") - .Input(0, "handle", "Pointer to an Index instance.") - .Input(1, "items", "1-D tensor with elements starting with index 1."); + .Input(0, "handle", "Pointer to an Index instance.") + .Input(1, "items", "1-D tensor with elements starting with index 1.") + .Arg( + "skip_first_entry", + "If set, skips the first entry of the tensor. 
This allows " + "loading tensors that are aligned with an embedding, where the first " + "entry corresponds to the default 0 index entry."); OPERATOR_SCHEMA(IndexStore) .NumInputs(1) @@ -271,6 +304,15 @@ for unknowns, the first element of the output tensor will be element of index 1. .Input(0, "handle", "Pointer to an Index instance.") .Output(0, "items", "1-D tensor with elements starting with index 1."); +OPERATOR_SCHEMA(IndexSize) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +Returns the number of entries currently present in the index. +)DOC") + .Input(0, "handle", "Pointer to an Index instance.") + .Output(0, "items", "Scalar int64 tensor with number of entries."); + NO_GRADIENT(IndexGetOp); NO_GRADIENT(IntIndexCreate); NO_GRADIENT(LongIndexCreate); @@ -278,5 +320,5 @@ NO_GRADIENT(StringIndexCreate); SHOULD_NOT_DO_GRADIENT(IndexFreeze); SHOULD_NOT_DO_GRADIENT(IndexLoad); SHOULD_NOT_DO_GRADIENT(IndexStore); - +SHOULD_NOT_DO_GRADIENT(IndexSize); } // namespace caffe2 diff --git a/caffe2/operators/load_save_op.h b/caffe2/operators/load_save_op.h index 5918d9a5a78..a26ffbdbe99 100644 --- a/caffe2/operators/load_save_op.h +++ b/caffe2/operators/load_save_op.h @@ -3,6 +3,7 @@ #include #include +#include #include "caffe2/core/context.h" #include "caffe2/core/db.h" @@ -65,12 +66,18 @@ class LoadOp final : public Operator { // chunks. This way we can make sure that all chunks were loaded in the end. // This is a map from output index to current size of the blob std::map blobSizes; - + std::unordered_set loaded; for (; cursor->Valid(); cursor->Next()) { const string& key = cursor->key(); if (!output_indices_.count(key)) { VLOG(1) << "Key " << key << " not used. Skipping."; } else { + CAFFE_ENFORCE( + loaded.count(key) == 0, + "Multiple copies of blob ", + key, + " found in the db."); + VLOG(2) << "Deserializing blob " << key; BlobProto proto; CHECK(proto.ParseFromString(cursor->value())); @@ -101,6 +108,15 @@ class LoadOp final : public Operator { blobSize.first->second = blob->Get>().size(); } } + + if (!proto.has_tensor() || + blobSize.first->second >= blob->Get>().size()) { + loaded.insert(key); + } + + if (loaded.size() >= OutputSize()) { + break; + } } } @@ -116,6 +132,8 @@ class LoadOp final : public Operator { blobSize.second); } } + + CHECK_EQ(loaded.size(), OutputSize()); } private: diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc new file mode 100644 index 00000000000..46bea11c343 --- /dev/null +++ b/caffe2/operators/one_hot_ops.cc @@ -0,0 +1,107 @@ +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" + +namespace caffe2 { +namespace { + +class OneHotOp : public Operator { + public: + OneHotOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + auto& indices = Input(0); + auto& index_size_tensor = Input(1); + CAFFE_ENFORCE(indices.ndim() == 1); + CAFFE_ENFORCE(index_size_tensor.size() == 1); + auto batch_size = indices.size(); + auto index_size = *index_size_tensor.data(); + + auto* indices_ptr = indices.data(); + auto* one_hots = Output(0); + one_hots->Resize(std::vector{batch_size, index_size}); + if (one_hots->size() == 0) { + return true; + } + auto* one_hots_ptr = one_hots->mutable_data(); + memset(one_hots_ptr, 0, one_hots->nbytes()); + for (int i = 0; i < batch_size; ++i) { + auto label_idx = indices_ptr[i]; + DCHECK((0 <= label_idx) && (label_idx < index_size)); + one_hots_ptr[label_idx] = 1.0; + one_hots_ptr += index_size; + } + return true; + } +}; + +class 
SegmentOneHotOp : public Operator { + public: + SegmentOneHotOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + auto& lengths = Input(0); + auto& indices = Input(1); + auto& index_size_tensor = Input(2); + CAFFE_ENFORCE(lengths.ndim() == 1); + CAFFE_ENFORCE(indices.ndim() == 1); + CAFFE_ENFORCE(index_size_tensor.size() == 1); + auto batch_size = lengths.size(); + auto index_size = *index_size_tensor.data(); + CAFFE_ENFORCE(index_size > 0); + + auto* lengths_ptr = lengths.data(); + auto* indices_ptr = indices.data(); + auto* one_hots = Output(0); + one_hots->Resize(std::vector{batch_size, index_size}); + auto* one_hots_ptr = one_hots->mutable_data(); + if (one_hots->size() == 0) { + return true; + } + memset(one_hots_ptr, 0, one_hots->nbytes()); + int el_idx = 0; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < lengths_ptr[i]; ++j) { + DCHECK(el_idx < indices.size()); + auto label_idx = indices_ptr[el_idx++]; + DCHECK((0 <= label_idx) && (label_idx < index_size)); + one_hots_ptr[label_idx] = 1.0; + } + one_hots_ptr += index_size; + } + return true; + } +}; + +REGISTER_CPU_OPERATOR(OneHot, OneHotOp); +REGISTER_CPU_OPERATOR(SegmentOneHot, SegmentOneHotOp); + +OPERATOR_SCHEMA(OneHot) + .NumInputs(2) + .NumOutputs(1) + .SetDoc(R"DOC( +Given a sequence of indices, one for each example in a batch, returns a matrix +where each row has index_size columns, with 1.0 at the index active in the +given example and 0.0 everywhere else. +)DOC") + .Input(0, "indices", "The active index for each example in the batch.") + .Input(1, "index_size_tensor", "Scalar with the size of the index.") + .Output(0, "one_hots", "Matrix of size len(indices) x index_size"); + +OPERATOR_SCHEMA(SegmentOneHot) + .NumInputs(3) + .NumOutputs(1) + .SetDoc(R"DOC( +Given a sequence of indices, segmented by the lengths tensor, returns a matrix +that has the elements in each sequence set to 1.0, and 0.0 everywhere else. +)DOC") + .Input(0, "lengths", "Size of each segment.") + .Input(1, "indices", "Active indices, of size sum(lengths)") + .Input(2, "index_size_tensor", "Size of the index") + .Output(0, "one_hots", "Matrix of size len(lengths) x index_size"); + +NO_GRADIENT(OneHot); +NO_GRADIENT(SegmentOneHot); +} +} diff --git a/caffe2/operators/reducer_functors.h b/caffe2/operators/reducer_functors.h index 15c120ae0a5..4919fae1def 100644 --- a/caffe2/operators/reducer_functors.h +++ b/caffe2/operators/reducer_functors.h @@ -124,6 +124,67 @@ struct LogSumExpRangeReducerDef { "input slices.
Operation doesn't change the shape of individual blocks."; }; +template +class LogMeanExpRangeReducer; +template +class LogMeanExpRangeReducerGradient; + +template +class LogMeanExpRangeReducer { + public: + void operator()( + const TIndex block_size, + const TIndex blocks, + const T* in, + T* out, + CPUContext* context) { + for (int j = 0; j < block_size; ++j) { + T max_value = std::numeric_limits::lowest(); + for (int i = 0; i < blocks; ++i) { + max_value = std::max(max_value, in[i * block_size + j]); + } + T scaled_exp_sum = 0; + for (int i = 0; i < blocks; ++i) { + scaled_exp_sum += std::exp(in[i * block_size + j] - max_value); + } + scaled_exp_sum /= blocks; + *(out++) = std::log(scaled_exp_sum) + max_value; + } + } +}; + +template +class LogMeanExpRangeReducerGradient { + public: + void operator()( + const TIndex block_size, + const TIndex blocks, + const T* segment_grad, // GO + T* data_grad, // GI + const T* data_in, // I + const T* data_out, // O + Context* context) { + for (int j = 0; j < block_size; ++j) { + const T out_grad = *(segment_grad++); + const T offset = *(data_out++); + for (int i = 0; i < blocks; ++i) { + auto idx = i * block_size + j; + data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks; + } + } + } +}; + +struct LogMeanExpRangeReducerDef { + template + using Reducer = LogMeanExpRangeReducer; + template + using ReducerGradient = LogMeanExpRangeReducerGradient; + static constexpr const char* name = "LogMeanExp"; + static constexpr const char* doc = + "LogMeanExp computes the element-wise log of the mean of exponentials of " + "input slices. Operation doesn't change the shape of individual blocks."; +}; template class MeanRangeReducer; @@ -180,10 +241,78 @@ struct MeanRangeReducerDef { static constexpr const char* doc = "Mean computation is done element-wise, so that each element of the " "output slice corresponds to the average value of the respective " - "elements in the input slives. Operation doesn't change the shape of " + "elements in the input slices. Operation doesn't change the shape of " "individual blocks."; }; +template +class MaxRangeReducer; +template +class MaxRangeReducerGradient; + +template +class MaxRangeReducer { + public: + void operator()( + const TIndex block_size, + const TIndex blocks, + const T* in, + T* out, + CPUContext* context) { + for (int j = 0; j < block_size; ++j) { + T max_value = std::numeric_limits::lowest(); + for (int i = 0; i < blocks; ++i) { + max_value = std::max(max_value, in[i * block_size + j]); + } + *(out++) = max_value; + } + } +}; + +template +class MaxRangeReducerGradient { + public: + void operator()( + const TIndex block_size, + const TIndex blocks, + const T* segment_grad, // GO + T* data_grad, // GI + const T* data_in, // I + const T* data_out, // O + Context* context) { + std::memset( + static_cast(data_grad), 0, blocks * block_size * sizeof(T)); + for (int j = 0; j < block_size; ++j) { + const T out_grad = *(segment_grad++); + const T out = data_out[j]; + for (int i = 0; i < blocks; ++i) { + auto idx = i * block_size + j; + if (out == data_in[idx]) { + data_grad[idx] = out_grad; + break; + } + } + } + } +}; + +struct MaxRangeReducerDef { + template + using Reducer = MaxRangeReducer; + template + using ReducerGradient = MaxRangeReducerGradient; + static constexpr const char* name = "Max"; + static constexpr const char* doc = + "Max computation is done element-wise, so that each element of the " + "output slice corresponds to the max value of the respective " + "elements in the input slices. 
Operation doesn't change the shape of " "individual blocks. This implementation imitates the torch nn.Max operator. " "If the maximum value occurs more than once, the operator will return " "the first occurrence of the value. When computing the gradient using the " "backward propagation, the gradient input corresponding to the first " "occurrence of the maximum value will be used."; +}; + //////////////////////////////////////////////////////////////////////////////// // Incremental reducers: consume elements one by one //////////////////////////////////////////////////////////////////////////////// diff --git a/caffe2/operators/relu_op_cudnn.cc b/caffe2/operators/relu_op_cudnn.cc index 8d11acea55b..f2d57b55858 100644 --- a/caffe2/operators/relu_op_cudnn.cc +++ b/caffe2/operators/relu_op_cudnn.cc @@ -32,12 +32,16 @@ class CuDNNReluOp final : public Operator { if (X.dims() != cudnn_input_dims_) { VLOG(1) << "Setting descriptors."; cudnn_input_dims_ = X.dims(); - int C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3)); - int H = 1; - int W = 1; + int C = 1, H = 1, W = 1; if (X.ndim() == 4) { + // Normal 4-dimensional tensors for images. + C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3)); H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1)); W = (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2)); + } else { + // If X is not 4-dimensional, we will simply use H = 1 and W = 1 + // and wrap everything into C. + C = X.size() / X.dim32(0); } CUDNN_CHECK(cudnnSetTensor4dDescriptor( data_desc_, GetCudnnTensorFormat(order_), @@ -93,12 +97,16 @@ class CuDNNReluGradientOp final : public Operator { if (Y.dims() != cudnn_input_dims_) { VLOG(1) << "Setting descriptors."; cudnn_input_dims_ = Y.dims(); - int C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3)); - int H = 1; - int W = 1; + int C = 1, H = 1, W = 1; if (Y.ndim() == 4) { + // Normal 4-dimensional tensors for images. + C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3)); H = (order_ == StorageOrder::NCHW ? Y.dim32(2) : Y.dim32(1)); W = (order_ == StorageOrder::NCHW ? Y.dim32(3) : Y.dim32(2)); + } else { + // If Y is not 4-dimensional, we will simply use H = 1 and W = 1 + // and wrap everything into C. + C = Y.size() / Y.dim32(0); } CUDNN_CHECK(cudnnSetTensor4dDescriptor( data_desc_, GetCudnnTensorFormat(order_), diff --git a/caffe2/operators/reverse_packed_segs_op.cc b/caffe2/operators/reverse_packed_segs_op.cc new file mode 100644 index 00000000000..7a1648f334b --- /dev/null +++ b/caffe2/operators/reverse_packed_segs_op.cc @@ -0,0 +1,35 @@ +#include "caffe2/operators/reverse_packed_segs_op.h" + +namespace caffe2 { +namespace { +REGISTER_CPU_OPERATOR(ReversePackedSegs, ReversePackedSegsOp); + +OPERATOR_SCHEMA(ReversePackedSegs) + .NumInputs(2) + .NumOutputs(1) + .SetDoc(R"DOC( +Reverse segments in a 3-D tensor (lengths, segments, embeddings), leaving +paddings unchanged. This operator is used to reverse the input of a recurrent +neural network to make it a BRNN.
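+ +For example (illustrative): with max_length 3, batch_size 1 and lengths = [2], +a segment [r0, r1, pad] becomes [r1, r0, pad]; padding rows past each +segment's length are copied through unchanged.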
+ )DOC") + .Input(0, "data", "a 3-D (lengths, segments, embeddings,) tensor.") + .Input(1, "lengths", "length of each segment.") + .Output( + 0, + "reversed data", + "a (lengths, segments, embeddings,) tensor with each segment reversed" + "and paddings unchanged."); + +class GetReversePackedSegsGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + return SingleGradientDef( + "ReversePackedSegs", + "", + vector{GO(0), I(1)}, + vector{GI(0)}); + } +}; +REGISTER_GRADIENT(ReversePackedSegs, GetReversePackedSegsGradient); +} // namespace +} // namespace caffe2 diff --git a/caffe2/operators/reverse_packed_segs_op.h b/caffe2/operators/reverse_packed_segs_op.h new file mode 100644 index 00000000000..24f55746bc1 --- /dev/null +++ b/caffe2/operators/reverse_packed_segs_op.h @@ -0,0 +1,84 @@ +#ifndef CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_ +#define CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { + +template +class ReversePackedSegsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + USE_SIMPLE_CTOR_DTOR(ReversePackedSegsOp); + USE_DISPATCH_HELPER; + + bool RunOnDevice() override { + return DispatchHelper>::call( + this, Input(DATA)); + } + + template + bool DoRunWithType() { + if (Input(LENGTHS).template IsType()) { + DoRunWithLengthType(); + } else { + DoRunWithLengthType(); + } + return true; + } + + private: + INPUT_TAGS(DATA, LENGTHS); + + template + void DoRunWithLengthType() { + const auto& data = Input(DATA); + const auto& lengths = Input(LENGTHS); + + CAFFE_ENFORCE( + data.ndim() == 3, + "DATA should be 3-D tensor "); + CAFFE_ENFORCE(lengths.ndim() == 1, "LENGTH should be 1-D"); + + auto* output = Output(0); + const auto& shape = data.dims(); + output->Resize(shape); + + const auto& max_length = data.dims()[0]; + const auto& batch_size = data.dims()[1]; + const auto& block_size = data.dims()[2]; + CAFFE_ENFORCE( + lengths.dims()[0] == batch_size, + "lenths size should be" + " equal to batch size"); + + const T* data_ptr = data.template data(); + const LengthType* lengths_ptr = lengths.template data(); + T* rev_data_ptr = output->template mutable_data(); + for (TIndex i = 0; i < batch_size; i++) { + const auto& seg_length = lengths_ptr[i]; + CHECK_LE(seg_length, max_length); + TIndex j = 0; + for (; j < seg_length; j++) { + const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size; + T* rev_data_block_ptr = + rev_data_ptr + ((seg_length - 1 - j) * batch_size + i) * block_size; + context_.template Copy( + block_size, data_block_ptr, rev_data_block_ptr); + } + for (; j < max_length; j++) { + const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size; + T* rev_data_block_ptr = + rev_data_ptr + (j * batch_size + i) * block_size; + context_.template Copy( + block_size, data_block_ptr, rev_data_block_ptr); + } + } + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_ diff --git a/caffe2/operators/segment_reduction_op.cc b/caffe2/operators/segment_reduction_op.cc index 7cbcb150f92..3bc01af523a 100644 --- a/caffe2/operators/segment_reduction_op.cc +++ b/caffe2/operators/segment_reduction_op.cc @@ -1066,9 +1066,15 @@ REGISTER_SEGMENT_DEF( REGISTER_SEGMENT_DEF( AbstractSortedSegmentRangeDef); +REGISTER_SEGMENT_DEF(AbstractSortedSegmentRangeDef< + float, + int, + CPUContext, + LogMeanExpRangeReducerDef>); REGISTER_SEGMENT_DEF( - AbstractSortedSegmentRangeDef); + 
AbstractSortedSegmentRangeDef); +REGISTER_SEGMENT_DEF( + AbstractSortedSegmentRangeDef); #define REGISTER_REDUCER_WITH_ALL_OPS(reducer_def) \ REGISTER_SEGMENT_DEF( \ diff --git a/caffe2/operators/sparse_to_dense_mask_op.cc b/caffe2/operators/sparse_to_dense_mask_op.cc new file mode 100644 index 00000000000..7a7bbab5c19 --- /dev/null +++ b/caffe2/operators/sparse_to_dense_mask_op.cc @@ -0,0 +1,152 @@ +#include +#include +#include +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" + +namespace caffe2 { + +using ValueTypes = TensorTypes; + +class SparseToDenseMaskOp : public Operator { + public: + SparseToDenseMaskOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) { + std::vector mask = GetRepeatedArgument("mask"); + featuresCount_ = mask.size(); + auto biggest = *std::max_element(mask.begin(), mask.end()); + dense_.assign(std::min(kMaxDenseSize, biggest + 1), -1); + for (int i = 0; i < mask.size(); i++) { + int id = mask[i]; + CAFFE_ENFORCE(id >= 0, "Only non-negative IDs are allowed."); + if (id >= kMaxDenseSize) { + sparse_[id] = i; + } else { + dense_[id] = i; + } + } + } + + bool RunOnDevice() override { + const TypeMeta& meta = Input(INDICES).meta(); + if (meta.Match()) { + return DoRunWithIndexType(); + } else if (meta.Match()) { + return DoRunWithIndexType(); + } else { + CAFFE_THROW("Unsupported type of tensor: ", meta.name()); + return false; + } + } + + template + bool DoRunWithIndexType() { + if (InputSize() < 4) { + return DoRunWithLengthType(); + } else { + const TypeMeta& meta = Input(LENGTHS).meta(); + if (meta.Match()) { + return DoRunWithLengthType(); + } else if (meta.Match()) { + return DoRunWithLengthType(); + } else { + CAFFE_THROW("Unsupported type of tensor: ", meta.name()); + return false; + } + } + } + + template + bool DoRunWithLengthType() { + return DispatchHelper::call(this, Input(VALUES)); + } + + template + bool DoRunWithType() { + auto& sparse_indices = Input(INDICES); + CAFFE_ENFORCE(sparse_indices.ndim() == 1); + auto& sparse_values = Input(VALUES); + CAFFE_ENFORCE(sparse_values.ndim() == 1); + CAFFE_ENFORCE(sparse_indices.size() == sparse_values.size()); + auto& default_value = Input(DEFAULT); + CAFFE_ENFORCE(default_value.size() == 1); + + const TInd* sparse_indices_vec = sparse_indices.data(); + const TVal* sparse_values_vec = sparse_values.template data(); + const TVal* default_val = default_value.template data(); + + int cols = featuresCount_; + int rows = 0; + TLen default_length = sparse_indices.dim32(0); + const TLen* lengths_vec = nullptr; + auto* output = Output(0); + if (InputSize() == 4) { + auto& lengths = Input(LENGTHS); + CAFFE_ENFORCE(lengths.ndim() == 1); + lengths_vec = lengths.data(); + rows = lengths.dim32(0); + output->Resize(rows, cols); + } + if (rows == 0) { + // if LENGTHS is not set or is empty, the output will be a vector + rows = 1; + lengths_vec = &default_length; + output->Resize(cols); + } + + // init + TVal* output_data = output->template mutable_data(); + for (int i = 0; i < cols * rows; i++) { + output_data[i] = default_val[0]; + } + + TLen offset = 0; + for (int r = 0; r < rows; r++) { + for (int c = 0; c < lengths_vec[r]; c++) { + int idx = getFeatureIdx(sparse_indices_vec[offset + c]); + if (idx != -1) { + output_data[r * cols + idx] = sparse_values_vec[offset + c]; + } + } + offset += lengths_vec[r]; + } + + return true; + } + + private: + const int kMaxDenseSize = 1024 * 128; + + std::unordered_map sparse_; + std::vector dense_; + int 
featuresCount_; + + inline int getFeatureIdx(int id) const { + if (id >= kMaxDenseSize) { + const auto& iter = sparse_.find(id); + if (iter == sparse_.end()) { + return -1; + } else { + return iter->second; + } + } else { + return (id >= dense_.size()) ? -1 : dense_[id]; + } + } + + INPUT_TAGS(INDICES, VALUES, DEFAULT, LENGTHS); +}; + +namespace { +REGISTER_CPU_OPERATOR(SparseToDenseMask, SparseToDenseMaskOp); + +OPERATOR_SCHEMA(SparseToDenseMask) + .NumInputs(3, 4) + .NumOutputs(1) + .SetDoc("Convert sparse representations to dense with given indices.") + .Output(0, "output", "1-D or 2-D dense tensor."); + +NO_GRADIENT(SparseToDenseMask); +} // namespace +} // namespace caffe2 diff --git a/caffe2/operators/string_ops.cc b/caffe2/operators/string_ops.cc new file mode 100644 index 00000000000..0243a42584c --- /dev/null +++ b/caffe2/operators/string_ops.cc @@ -0,0 +1,124 @@ +#include "caffe2/operators/string_ops.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { +namespace { + +struct StartsWith { + explicit StartsWith(OperatorBase& op) + : prefix_(op.GetSingleArgument("prefix", "")) {} + bool operator()(const std::string& str) { + // Guard against str being shorter than the prefix; std::mismatch would + // otherwise walk past the end of str. + return str.size() >= prefix_.size() && + std::equal(prefix_.begin(), prefix_.end(), str.begin()); + } + + private: + std::string prefix_; +}; + +struct EndsWith { + explicit EndsWith(OperatorBase& op) + : suffix_(op.GetSingleArgument("suffix", "")) {} + bool operator()(const std::string& str) { + // Same bounds guard as StartsWith, walking from the back. + return str.size() >= suffix_.size() && + std::equal(suffix_.rbegin(), suffix_.rend(), str.rbegin()); + } + + private: + std::string suffix_; +}; + +struct Prefix { + explicit Prefix(OperatorBase& op) + : length_(op.GetSingleArgument("length", 3)) {} + std::string operator()(const std::string& str) { + return std::string(str.begin(), std::min(str.end(), str.begin() + length_)); + } + + private: + int length_; +}; + +struct Suffix { + explicit Suffix(OperatorBase& op) + : length_(op.GetSingleArgument("length", 3)) {} + std::string operator()(const std::string& str) { + return std::string(std::max(str.begin(), str.end() - length_), str.end()); + } + + private: + int length_; +}; + +template +using StringElementwiseOp = UnaryElementwiseWithArgsOp< + TensorTypes, + CPUContext, + ForEach, + OutputType>; + +REGISTER_CPU_OPERATOR(StringPrefix, StringElementwiseOp); +REGISTER_CPU_OPERATOR(StringSuffix, StringElementwiseOp); +REGISTER_CPU_OPERATOR(StringStartsWith, StringElementwiseOp); +REGISTER_CPU_OPERATOR(StringEndsWith, StringElementwiseOp); + +OPERATOR_SCHEMA(StringPrefix) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +Computes the element-wise string prefix of the string tensor. +Input strings that are shorter than prefix length will be returned unchanged. +NOTE: The prefix is computed on the number of bytes, which may lead to wrong +behavior and potentially invalid strings for variable-length encodings such +as utf-8. +)DOC") + .Arg("length", "Maximum size of the prefix, in bytes.") + .Input(0, "strings", "Tensor of std::string.") + .Output( + 0, + "prefixes", + "Tensor of std::string containing prefixes for each input."); + +OPERATOR_SCHEMA(StringSuffix) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +Computes the element-wise string suffix of the string tensor. +Input strings that are shorter than suffix length will be returned unchanged. +NOTE: The suffix is computed on the number of bytes, which may lead to wrong +behavior and potentially invalid strings for variable-length encodings such +as utf-8.
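+For example (illustrative), with length = 3 the input ["abcdef", "ab"] yields +["def", "ab"].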
+)DOC") + .Input(0, "strings", "Tensor of std::string.") + .Output( + 0, + "suffixes", + "Tensor of std::string containing suffixes for each output.") + .Arg("length", "Maximum size of the suffix, in bytes."); + +OPERATOR_SCHEMA(StringStartsWith) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +Performs the starts-with check on each string in the input tensor. +Returns tensor of boolean of the same dimension of input. +)DOC") + .Arg("prefix", "The prefix to check input strings against.") + .Input(0, "strings", "Tensor of std::string.") + .Output(0, "bools", "Tensor of bools of same shape as input."); + +OPERATOR_SCHEMA(StringEndsWith) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +Performs the ends-with check on each string in the input tensor. +Returns tensor of boolean of the same dimension of input. +)DOC") + .Arg("suffix", "The suffix to check input strings against.") + .Input(0, "strings", "Tensor of std::string.") + .Output(0, "bools", "Tensor of bools of same shape as input."); + +SHOULD_NOT_DO_GRADIENT(StringPrefix); +SHOULD_NOT_DO_GRADIENT(StringSuffix); +SHOULD_NOT_DO_GRADIENT(StringStartsWith); +SHOULD_NOT_DO_GRADIENT(StringEndsWith); +} +} // namespace caffe2 diff --git a/caffe2/operators/string_ops.h b/caffe2/operators/string_ops.h new file mode 100644 index 00000000000..3a3b3b10a75 --- /dev/null +++ b/caffe2/operators/string_ops.h @@ -0,0 +1,13 @@ +#pragma once +#include "caffe2/core/operator.h" +#include "caffe2/operators/elementwise_op.h" + +namespace caffe2 { + +template +using StringElementwiseOp = UnaryElementwiseWithArgsOp< + TensorTypes, + CPUContext, + ForEach, + OutputType>; +} diff --git a/caffe2/operators/transpose_op.cu b/caffe2/operators/transpose_op.cu index 537abe4c0b8..417954b961f 100644 --- a/caffe2/operators/transpose_op.cu +++ b/caffe2/operators/transpose_op.cu @@ -10,24 +10,25 @@ namespace caffe2 { #define COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS 5 namespace { -// TODO(jiayq): one possible optimization is to copy the buffer into a shared memory -// location to speed up access. +// TODO(jiayq): one possible optimization is to copy the buffer into a shared +// memory location to speed up access. 
template __global__ void transpose_gpu(const int nthreads, const Dtype* from_data, Dtype* to_data, const int* buffer, const int num_axes) { int from_inds[COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS]; const int* from_counts = buffer; const int* to_counts = buffer + num_axes; - const int* map = buffer + num_axes * 2; + const int* axes = buffer + num_axes * 2; CUDA_1D_KERNEL_LOOP(index, nthreads) { int from_index = index, to_index = 0; - for (int i = 0; i < num_axes; i++) { - from_inds[i] = from_index / from_counts[i]; - from_index = from_index % from_counts[i]; + for (int i = num_axes - 1; i >= 0; --i) { + from_inds[i] = from_index % from_counts[i]; + from_index = from_index / from_counts[i]; } - for (int i = 0; i < num_axes; i++) { - to_index += from_inds[map[i]] * to_counts[i]; + for (int i = 0; i < num_axes - 1; i++) { + to_index = (to_index + from_inds[axes[i]]) * to_counts[i + 1]; } + to_index += from_inds[axes[num_axes - 1]]; to_data[to_index] = from_data[index]; } } @@ -42,7 +43,7 @@ bool TransposeOp::DoRunWithType() { int ndim = input.ndim(); CAFFE_ENFORCE(count < std::numeric_limits::max(), "Transpose op on GPU only supports int32"); - CAFFE_ENFORCE(count < COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS, + CAFFE_ENFORCE(ndim < COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS, "Input ndim exceeds compile time max."); // Buffer contains the following data: // (1) the dimenions of the inputs diff --git a/caffe2/operators/transpose_op.h b/caffe2/operators/transpose_op.h index a430a98177c..565fc0cd649 100644 --- a/caffe2/operators/transpose_op.h +++ b/caffe2/operators/transpose_op.h @@ -45,7 +45,8 @@ class TransposeOp final : public Operator { } Y->Resize(new_dims_); // Do the actual transpose, which is implemented in DoRunWithType(). - return DispatchHelper>::call(this, Input(0)); + return DispatchHelper>::call( + this, Input(0)); } protected: diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index f8473422fd0..dff1d76eda4 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -17,6 +17,7 @@ REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp); REGISTER_CPU_OPERATOR(Copy, CopyOp); REGISTER_CPU_OPERATOR(Shape, ShapeOp); REGISTER_CPU_OPERATOR(HasElements, HasElementsOp); +REGISTER_CPU_OPERATOR(IsEmpty, IsEmptyOp); REGISTER_CPU_OPERATOR(Gather, GatherOp); REGISTER_CPU_OPERATOR(Unique, UniqueOp); REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp); @@ -24,6 +25,7 @@ REGISTER_CPU_OPERATOR(SegmentIdsToLengths, SegmentIdsToLengthsOp); REGISTER_CPU_OPERATOR(Slice, SliceOp); REGISTER_CPU_OPERATOR(Squeeze, SqueezeOp); REGISTER_CPU_OPERATOR(ExpandDims, ExpandDimsOp); +REGISTER_CPU_OPERATOR(And, AndOp); OPERATOR_SCHEMA(Print) .NumInputs(1) @@ -209,6 +211,13 @@ OPERATOR_SCHEMA(HasElements) "has_elements", "Scalar bool tensor. True if input is not empty."); +OPERATOR_SCHEMA(IsEmpty) + .NumInputs(1) + .NumOutputs(1) + .SetDoc("Returns true iff the input tensor has size == 0") + .Input(0, "tensor", "Tensor of any type.") + .Output(0, "is_empty", "Scalar bool tensor. True if input is empty."); + OPERATOR_SCHEMA(Gather) .NumInputs(2) .NumOutputs(1) @@ -340,9 +349,21 @@ If the same blob is provided in input and output, the operation is copy-free. .Input(0, "data", "Original tensor") .Output(0, "expanded", "Reshaped tensor with same data as input."); +OPERATOR_SCHEMA(And) + .NumInputs(2) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .SetDoc(R"DOC( +Outputs true iff both input blob values are true. 
+)DOC") + .Input(0, "input_0", "first boolean input.") + .Input(1, "input_1", "second boolean input.") + .Output(0, "output", "input_0 && input_1."); + SHOULD_NOT_DO_GRADIENT(Print); SHOULD_NOT_DO_GRADIENT(Shape); SHOULD_NOT_DO_GRADIENT(HasElements); +SHOULD_NOT_DO_GRADIENT(IsEmpty); class GetSqueezeGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; @@ -433,6 +454,7 @@ SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds); SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths); // TODO(azzolini): Add support for slice gradient SHOULD_NOT_DO_GRADIENT(Slice); +SHOULD_NOT_DO_GRADIENT(And); } // namespace diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index 249e9a6302a..7024cde71a5 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -675,6 +675,21 @@ class HasElementsOp : public Operator { } }; +template +class IsEmptyOp : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + USE_SIMPLE_CTOR_DTOR(IsEmptyOp); + + bool RunOnDevice() override { + auto& input = Input(0); + auto* output = OperatorBase::Output(0); + output->Resize(std::vector{}); + *output->template mutable_data() = (input.size() == 0); + return true; + } +}; + // RecordShapeOp records the shape of the input tensor to a vector of int. You // mostly don't need this operator explicitly, and it is mostly used in the // autodiff process. @@ -911,6 +926,23 @@ class UniqueOp : public Operator { public: OUTPUT_TAGS(UNIQUE, REMAPPING); }; + +template +class AndOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + AndOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + const auto* i1 = Input(0).template data(); + const auto* i2 = Input(1).template data(); + auto* output = Output(0); + output->Resize(std::vector{}); + *output->template mutable_data() = (*i1 && *i2); + return true; + } +}; } // namespace caffe2 #endif // CAFFE2_OPERATORS_UTILITY_OPS_H_ diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 71e4f5478e3..b6406085bea 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -199,7 +199,7 @@ message ExecutionStep { // Criteria network specifies a single output (TensorCPU) of // size (1), is run on every iteration by the executor, and // execution terminates when the output[0] is `false`. - optional string criteria_network = 5; + optional string criteria_network = 5 [deprecated=true]; // If specified, run report_net asynchronously every `report_interval` // seconds. Report_net is guaranteed to run at least once after all @@ -210,6 +210,20 @@ message ExecutionStep { // If false or not set, execute sub-steps serially. // If true, execute all substeps concurrently, each one in a separte thread. optional bool concurrent_substeps = 6; + + // Name of a scalar boolean tensor. + // ES checks this blob AFTER every substeps/subnets. + // If specified, and the value is true, then ES will skip the rest and return + // immediately. + // This means that the report_net and the first step will always be called. + // Use cases: + // 1) the first substep stops the rest if data condition not met + // 2) the first substep decide which of the rest of the steps should be run. + // 3) external control + // + // ** It is the user's responsibility to not to put this blob in race conditions. 
+ // ** For example when setting this blob in concurrent substeps + optional string should_stop_blob = 9; } message PlanDef { diff --git a/caffe2/python/caffe2_python.cc b/caffe2/python/caffe2_python.cc index 956d408209a..75092bef94e 100644 --- a/caffe2/python/caffe2_python.cc +++ b/caffe2/python/caffe2_python.cc @@ -87,7 +87,6 @@ const TypeMeta& NumpyTypeToCaffe(int numpy_type) { {NPY_UINT8, TypeMeta::Make()}, {NPY_UINT16, TypeMeta::Make()}, {NPY_OBJECT, TypeMeta::Make()}, - {NPY_STRING, TypeMeta::Make()}, // Note: Add more types here. }; static TypeMeta unknown_type; @@ -565,11 +564,10 @@ PyObject* FetchBlob(PyObject* self, PyObject* args) { PyObject* FeedBlob(PyObject* self, PyObject* args) { char* name_char; - PyArrayObject* array = nullptr; + PyObject* arg = nullptr; PyObject* device_option_string = nullptr; - // TODO(dzhulgakov): implement accepting other types (at least string) - if (!PyArg_ParseTuple(args, "sO!|O", &name_char, &PyArray_Type, &array, - &device_option_string)) { + if (!PyArg_ParseTuple( + args, "sO|O", &name_char, &arg, &device_option_string)) { PyErr_SetString(PyExc_ValueError, "Incorrect arguments."); return nullptr; } @@ -584,45 +582,57 @@ PyObject* FeedBlob(PyObject* self, PyObject* args) { } Blob* blob = gWorkspace->CreateBlob(name); - auto feeder = CreateFeeder(option.device_type()); - if (!feeder) { - PyErr_SetString(PyExc_TypeError, - "Unknown device type encountered in FeedBlob."); + if (PyArray_Check(arg)) { // numpy array + PyArrayObject* array = reinterpret_cast(arg); + auto feeder = CreateFeeder(option.device_type()); + if (!feeder) { + PyErr_SetString( + PyExc_TypeError, "Unknown device type encountered in FeedBlob."); + return nullptr; + } + return feeder->Feed(option, array, blob); + } else if (PyString_Check(arg)) { // string + *blob->GetMutable() = PyBytesToStdString(arg); + Py_RETURN_TRUE; + } else { + PyErr_SetString( + PyExc_ValueError, + "Unexpected type of argument - only numpy array or string are " + "supported for feeding"); return nullptr; } - return feeder->Feed(option, array, blob); } // A simple macro to avoid writing repeated symbols. #define _PYNAME(name) {#name, name, METH_VARARGS, ""} PyMethodDef* GetCaffe2PythonMethods() { static PyMethodDef gCaffe2PythonMethods[] = { - // Note(Yangqing): For any function that we are going to override in the - // python file, we prepend "cc_" here. - _PYNAME(GlobalInit), - _PYNAME(RegisteredOperators), - {"cc_GetGradientDefs", GetGradientDefs, METH_VARARGS, ""}, - _PYNAME(SwitchWorkspace), - _PYNAME(CurrentWorkspace), - _PYNAME(Workspaces), - {"cc_ResetWorkspace", ResetWorkspace, METH_VARARGS, ""}, - _PYNAME(RootFolder), - _PYNAME(OnModuleExit), - _PYNAME(Blobs), - _PYNAME(HasBlob), - {"cc_CreateNet", CreateNet, METH_VARARGS, ""}, - _PYNAME(RunNet), - _PYNAME(BenchmarkNet), - _PYNAME(DeleteNet), - _PYNAME(Nets), - {"cc_RunOperatorOnce", RunOperatorOnce, METH_VARARGS, ""}, - {"cc_RunNetOnce", RunNetOnce, METH_VARARGS, ""}, - {"cc_RunPlan", RunPlan, METH_VARARGS, ""}, - _PYNAME(CreateBlob), - _PYNAME(SerializeBlob), - _PYNAME(FetchBlob), - {"cc_FeedBlob", FeedBlob, METH_VARARGS, ""}, - {nullptr, nullptr, 0, nullptr}, // end of python methods. + // Note(Yangqing): For any function that we are going to override in the + // python file, we prepend "cc_" here. 
+      _PYNAME(GlobalInit),
+      _PYNAME(RegisteredOperators),
+      {"cc_GetGradientDefs", GetGradientDefs, METH_VARARGS, ""},
+      _PYNAME(SwitchWorkspace),
+      _PYNAME(CurrentWorkspace),
+      _PYNAME(Workspaces),
+      {"cc_ResetWorkspace", ResetWorkspace, METH_VARARGS, ""},
+      _PYNAME(RootFolder),
+      _PYNAME(OnModuleExit),
+      _PYNAME(Blobs),
+      _PYNAME(HasBlob),
+      {"cc_CreateNet", CreateNet, METH_VARARGS, ""},
+      _PYNAME(RunNet),
+      _PYNAME(BenchmarkNet),
+      _PYNAME(DeleteNet),
+      _PYNAME(Nets),
+      {"cc_RunOperatorOnce", RunOperatorOnce, METH_VARARGS, ""},
+      {"cc_RunNetOnce", RunNetOnce, METH_VARARGS, ""},
+      {"cc_RunPlan", RunPlan, METH_VARARGS, ""},
+      _PYNAME(CreateBlob),
+      _PYNAME(SerializeBlob),
+      {"cc_FetchBlob", FetchBlob, METH_VARARGS, ""},
+      {"cc_FeedBlob", FeedBlob, METH_VARARGS, ""},
+      {nullptr, nullptr, 0, nullptr},  // end of python methods.
  };
  return gCaffe2PythonMethods;
}
diff --git a/caffe2/python/caffe2_python.h b/caffe2/python/caffe2_python.h
index 57c67455702..9a8f8f2ddd0 100644
--- a/caffe2/python/caffe2_python.h
+++ b/caffe2/python/caffe2_python.h
@@ -160,7 +160,10 @@ class TensorFetcher : public BlobFetcherBase {
           Py_DECREF(outObj[j]);
         }
         Py_DECREF(array);
-        LOG(FATAL) << "Failed to allocate string for ndarray of strings.";
+        PyErr_SetString(
+            PyExc_TypeError,
+            "Failed to allocate string for ndarray of strings.");
+        return nullptr;
       }
     }
     return array;
@@ -217,21 +220,14 @@ class TensorFeeder : public BlobFeederBase {
           char* str;
           Py_ssize_t strSize;
           if (PyBytes_AsStringAndSize(input[i], &str, &strSize) == -1) {
-            LOG(FATAL) << "Unsupported pyhton object type passed into ndarray.";
+            PyErr_SetString(
+                PyExc_TypeError,
+                "Unsupported python object type passed into ndarray.");
+            return nullptr;
           }
           outPtr[i] = std::string(str, strSize);
         }
       } break;
-      case NPY_STRING: {
-        char* inputData = PyArray_BYTES(array);
-        auto* outPtr = tensor->template mutable_data<std::string>();
-        auto itemSize = PyArray_ITEMSIZE(array);
-        for (int i = 0; i < tensor->size(); ++i) {
-          auto start = inputData + i * itemSize;
-          auto end = std::find(start, start + itemSize, '\0');
-          outPtr[i] = std::string(start, end - start);
-        }
-      } break;
      default:
        context.template CopyBytes(
            tensor->size() * meta.itemsize(),
diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py
index 91205cfe4ca..a7e0e27be9c 100644
--- a/caffe2/python/convnet_benchmarks.py
+++ b/caffe2/python/convnet_benchmarks.py
@@ -613,6 +613,7 @@ def GetArgumentParser():
     )
     parser.add_argument("--net_type", type=str, default="dag")
     parser.add_argument("--num_workers", type=int, default=2)
+    parser.add_argument("--use-nvtx", default=False, action='store_true')
     return parser


@@ -624,7 +625,9 @@ if __name__ == '__main__':
     ):
         GetArgumentParser().print_help()

-    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
+    workspace.GlobalInit(
+        ['caffe2', '--caffe2_log_level=0'] +
+        (['--caffe2_use_nvtx'] if args.use_nvtx else []))
     model_map = {
         'AlexNet': AlexNet,
         'OverFeat': OverFeat,
diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index c9f038a1486..f5a742f1e98 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -146,7 +146,7 @@ def ScopedBlobReference(name, *args, **kwargs):
     return BlobReference(scope.NAMESCOPE + name, *args, **kwargs)


-def _RectifyInputOutput(blobs):
+def _RectifyInputOutput(blobs, net=None):
     """A helper function to rectify the input or output of the CreateOperator
     interface.
     """
@@ -154,18 +154,18 @@
         # If blobs is a single string, prepend scope.NAMESCOPE and put it as a
         # list.
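+        # (The new `net` argument is threaded through to ScopedBlobReference
+        # so the returned BlobReference remembers which net it belongs to.)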
# TODO(jiayq): enforce using BlobReference instead of raw strings. - return [ScopedBlobReference(blobs)] + return [ScopedBlobReference(blobs, net=net)] elif type(blobs) is BlobReference: # If blob is a BlobReference, simply put it as a list. - return [BlobReference(str(blobs))] - elif type(blobs) is list: + return [blobs] + elif type(blobs) in (list, tuple): # If blob is a list, we go through it and type check. rectified = [] for blob in blobs: if isinstance(blob, basestring): - rectified.append(ScopedBlobReference(blob)) + rectified.append(ScopedBlobReference(blob, net=net)) elif type(blob) is BlobReference: - rectified.append(BlobReference(str(blob))) + rectified.append(blob) else: raise TypeError( "I/O blob #{} of unsupported type: {} of type {}" @@ -670,8 +670,19 @@ def get_op_ids_in_path(ssa, blob_versions, inputs, outputs): class Net(object): + _net_names_used = set() operator_registry_ = {} + @staticmethod + def _get_next_net_name(basename): + name = basename + next_idx = 1 + while name in Net._net_names_used: + name = basename + '_' + str(next_idx) + next_idx += 1 + Net._net_names_used |= set([name]) + return name + def __init__(self, name_or_proto): """ Create a Net. @@ -706,29 +717,29 @@ class Net(object): else: self._next_name_index = 0 else: - name = name_or_proto self._net = caffe2_pb2.NetDef() - self._net.name = name + self._net.name = name_or_proto self._next_name_index = 0 + # make sure that this net name hasn't been used before + self._net.name = Net._get_next_net_name(self._net.name) + def __str__(self): return self._net.name - def DefinesBlob(self, blob): + def BlobIsDefined(self, blob): """ Returns true if the given BlobReference is produced as output of an operator in this net, or if it is provided as an external input. """ - if isinstance(blob, BlobReference): - assert blob.Net() == self, 'Reference belongs to different net' blob_name = str(blob) + for input in self._net.external_input: + if input == blob_name: + return True for op in self._net.op: for output in op.output: if output == blob_name: return True - for input in self._net.external_input: - if input == blob_name: - return True return False def UsesBlob(self, blob): @@ -753,7 +764,7 @@ class Net(object): raises KeyError. """ blob_name = str(blob_name) - if not self.DefinesBlob(blob_name): + if not self.BlobIsDefined(blob_name): raise KeyError('Net does not define blob %s' % blob_name) return BlobReference(blob_name, self) @@ -818,13 +829,16 @@ class Net(object): new_outputs: list of BlobReferences corresponding to the outputs produced by new_net. 
""" - inputs = inputs if isinstance(inputs, dict) else {i: i for i in inputs} + input_is_pair_list = isinstance(inputs, list) and all( + isinstance(i, tuple) and len(i) == 2 for i in inputs) + inputs = ( + inputs if isinstance(inputs, (dict, OrderedDict)) else + OrderedDict(inputs) if input_is_pair_list else + OrderedDict(zip(inputs, inputs))) + for output in outputs: + assert self.BlobIsDefined(output) input_names = {str(k): str(v) for k, v in inputs.items()} output_names = [str(o) for o in outputs] - for input in inputs.keys(): - assert self.UsesBlob(input) - for output in outputs: - assert self.DefinesBlob(output) proto = self._net ssa, blob_versions = get_ssa(proto) used_op_ids = get_op_ids_in_path(ssa, blob_versions, inputs, outputs) @@ -859,11 +873,23 @@ class Net(object): def Proto(self): return self._net - def NextName(self): + def NextName(self, prefix=None, output_id=None): """Returns the next name to be used, if you do not want to explicitly name your blob.""" - output_name = self._net.name + '_blob_' + str(self._next_name_index) - self._next_name_index += 1 + if prefix: + output_name_base = self._net.name + '/' + prefix + output_name = output_name_base + if output_id is not None: + output_name += ':' + str(output_id) + index = 2 + while self.BlobIsDefined(output_name): + output_name = output_name_base + '_' + str(index) + if output_id is not None: + output_name += ':' + str(output_id) + index += 1 + else: + output_name = self._net.name + '_blob_' + str(self._next_name_index) + self._next_name_index += 1 return str(output_name) def AddGradientOperators(self, ys, skip=0): @@ -900,16 +926,18 @@ class Net(object): self._net.op.extend(grad_ops) return input_to_grad - def AddExternalInput(self, input_name): - input_name = str(input_name) + def AddExternalInput(self, input): + input_name = str(input) assert input_name not in self._net.external_input, ( 'Net already contains an input named %s' % input_name) self._net.external_input.extend([input_name]) - return BlobReference(input_name, self) + return ( + input if isinstance(input, BlobReference) + else BlobReference(input_name)) def AddExternalOutput(self, output): assert isinstance(output, BlobReference) - assert self.DefinesBlob(output) + assert self.BlobIsDefined(output) self.Proto().external_output.extend([str(output)]) def DeduplicateGradientSlices(self, g): @@ -931,14 +959,22 @@ class Net(object): def _CreateAndAddToSelf(self, op_type, inputs, outputs=None, **kwargs): """A helper function to create an operator and add it to self. """ + inputs = _RectifyInputOutput(inputs) + for input in inputs: + if not self.BlobIsDefined(input): + assert input.Net() != self + self.AddExternalInput(input) if outputs is None: # If we do not specify an output, we will assume that this op # produces one output in this case. - outputs = self.NextName() + outputs = self.NextName(prefix=op_type) elif type(outputs) is int: # In this case, we will auto-fill the given number of outputs # with auto-generated names. 
- outputs = [self.NextName() for i in range(outputs)] + outputs = [ + self.NextName(prefix=op_type, output_id=i) + for i in range(outputs)] + outputs = _RectifyInputOutput(outputs, net=self) op = CreateOperator(op_type, inputs, outputs, **kwargs) self._net.op.extend([op]) if len(op.output) == 0: @@ -1036,10 +1072,11 @@ class ExecutionStep(object): self._assert_can_mutate() self._step.num_iter = num_iter - def SetCriteriaNet(self, criteria_net): + def SetShouldStopBlob(self, should_stop_blob): + assert isinstance(should_stop_blob, BlobReference), ( + "expects BlobReference here, got {}".format(type(should_stop_blob))) self._assert_can_mutate() - _add_net_to_dict(self._net_dict, criteria_net) - self._step.criteria_network = get_net_name(criteria_net) + self._step.should_stop_blob = str(should_stop_blob) def SetReportNet(self, report_net, report_interval): self._assert_can_mutate() @@ -1053,7 +1090,7 @@ class ExecutionStep(object): if isinstance(substep, ExecutionStep): substep._notify_is_used() if not substep.HasNets() and not substep.HasSubsteps(): - return + return self for net in substep.Nets(): _add_net_to_dict(self._net_dict, net) self._substeps.append(substep) @@ -1061,6 +1098,7 @@ class ExecutionStep(object): else: proto = substep self._step.substep.add().CopyFrom(proto) + return self def SetConcurrentSubsteps(self, concurrent_substeps): self._assert_can_mutate() @@ -1073,6 +1111,7 @@ class ExecutionStep(object): assert isinstance(net, Net) _add_net_to_dict(self._net_dict, net) self._step.network.extend([get_net_name(net)]) + return self class Plan(object): @@ -1107,11 +1146,11 @@ class Plan(object): def execution_step(default_name, steps_or_nets, - criteria=None, num_iter=None, report_net=None, report_interval=None, - concurrent_substeps=None): + concurrent_substeps=None, + should_stop_blob=None): """ Helper for creating an ExecutionStep. - steps_or_nets can be: @@ -1120,18 +1159,20 @@ def execution_step(default_name, - ExecutionStep - list - list - - criteria is either None or a Net - - if no criteria or num_iter is provided, defaults to num_iter=1 + - should_stop_blob is either None or a scalar boolean blob. + - This blob is checked AFTER every substeps/subnets. + - If specified and true, then this step will return immediately. + - Be sure to handle race conditions if setting from concurrent threads. 
+ - if no should_stop_blob or num_iter is provided, defaults to num_iter=1 """ - assert criteria is None or isinstance(criteria, Net) - assert criteria is None or num_iter is None, ( - 'Cannot set both criteria and num_iter.') - if criteria is None and num_iter is None: + assert should_stop_blob is None or num_iter is None, ( + 'Cannot set both should_stop_blob and num_iter.') + if should_stop_blob is None and num_iter is None: num_iter = 1 - def set_criteria(step): - if criteria is not None: - step.SetCriteriaNet(criteria) + def set_step_attr(step): + if should_stop_blob is not None: + step.SetShouldStopBlob(should_stop_blob) else: step.SetIter(num_iter) if concurrent_substeps is not None: @@ -1144,18 +1185,20 @@ def execution_step(default_name, if not steps_or_nets: return ExecutionStep(default_name) if isinstance(steps_or_nets, ExecutionStep): - return set_criteria(steps_or_nets) + step = set_step_attr(ExecutionStep(default_name)) + step.AddSubstep(steps_or_nets) + return step elif isinstance(steps_or_nets, Net): - step = set_criteria(ExecutionStep(default_name)) + step = set_step_attr(ExecutionStep(default_name)) step.AddNet(steps_or_nets) return step elif isinstance(steps_or_nets, list): if isinstance(steps_or_nets[0], Net): - step = set_criteria(ExecutionStep(default_name)) + step = set_step_attr(ExecutionStep(default_name)) map(step.AddNet, steps_or_nets) return step elif isinstance(steps_or_nets[0], ExecutionStep): - step = set_criteria(ExecutionStep(default_name)) + step = set_step_attr(ExecutionStep(default_name)) map(step.AddSubstep, steps_or_nets) return step else: diff --git a/caffe2/python/dataset.py b/caffe2/python/dataset.py new file mode 100644 index 00000000000..260a4eb3de2 --- /dev/null +++ b/caffe2/python/dataset.py @@ -0,0 +1,276 @@ +""" +Implementation of an in-memory dataset with structured schema. + +Use this to store and iterate through datasets with complex schema that +fit in memory. + +Iterating through entries of this dataset is very fast since the dataset +is stored as a set of native Caffe2 tensors, thus no type conversion or +deserialization is necessary. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import core, workspace +from caffe2.python.io import Reader, Writer +from caffe2.python.schema import Struct +import numpy as np + + +class _DatasetReader(Reader): + def __init__(self, field_names, field_blobs, cursor, name): + """Don't call this directly. Instead, use dataset.reader()""" + self.field_names = field_names + self.field_blobs = field_blobs + self.cursor = cursor + self.name = name + + def read(self, read_net, batch_size=1): + with core.NameScope(read_net.NextName(self.name)): + fields = read_net.ReadNextBatch( + [self.cursor] + self.field_blobs, + self.field_names, + batch_size=batch_size) + return (read_net.IsEmpty([fields[0]]), fields) + + def reset(self, net): + net.ResetCursor([self.cursor], []) + + +class _DatasetRandomReader(Reader): + def __init__(self, field_names, field_blobs, cursor, name, indices): + """Don't call this directly. 
Instead, use dataset.random_reader()""" + self.field_names = field_names + self.field_blobs = field_blobs + self.cursor = cursor + self.name = name + self.indices = indices + + def reset(self, net): + net.ResetCursor([self.cursor], []) + + def computeoffset(self, net): + self.reset(net) + offsets = net.ComputeOffset( + [self.cursor] + self.field_blobs, + 'offsets') + self.offsets = offsets + + def read(self, read_net, batch_size=1): + fields = read_net.ReadRandomBatch( + [self.cursor, self.indices, self.offsets] + self.field_blobs, + self.field_names, + batch_size=batch_size) + return (read_net.IsEmpty([fields[0]]), fields) + + +class _DatasetWriter(Writer): + def __init__(self, fields, field_blobs, init_net): + """Don't call this directly. Use dataset.writer() instead.""" + self.fields = fields + self.field_blobs = field_blobs + self.mutex = init_net.CreateMutex([]) + + def write(self, writer_net, fields): + """ + Add operations to `net` that append the blobs in `fields` to the end + of the dataset. An additional operator will also be added that checks + the consistency of the data in `fields` against the dataset schema. + + Args: + writer_net: The net that will contain the Append operators. + fields: A list of BlobReference to be appeneded to this dataset. + """ + assert len(fields) == len(self.fields), ( + 'Expected %s fields, got %s.' % (len(self.fields), len(fields))) + writer_net.CheckDatasetConsistency(fields, [], fields=self.fields) + writer_net.AtomicAppend( + [self.mutex] + list(self.field_blobs) + list(fields), + self.field_blobs) + + def commit(self, finish_net): + """Commit is a no-op for an in-memory dataset.""" + pass + + +def to_ndarray_list(values, schema): + """ + Given a list of values and a dataset schema, produce list of ndarray in the + right format. + + This function will perform some checks to make sure that the arrays + produced have the right dtype and rank. + """ + assert isinstance(schema, Struct), 'schema must be a Struct.' + names = schema.field_names() + types = schema.field_types() + assert len(types) == len(values), ( + 'Values must have %d elements, got %d' % (len(types), len(values))) + + arrays = [] + for value, dtype, name in zip(values, types, names): + array = np.array(value, dtype=dtype.base) + # if array is empty we may need to reshape a little + if array.size == 0: + array = array.reshape((0,) + dtype.shape) + # check that the inner dimensions match the schema + assert (array.shape[1:] == dtype.shape), ( + 'Invalid array shape for field %s. Expected (%s), got (%s).' % ( + name, + ', '.join(['_'] + map(str, dtype.shape)), + ', '.join(map(str, array.shape)))) + arrays.append(array) + return arrays + + +def Const(net, value, dtype=None, name=None): + """ + Create a 'constant' by first creating an external input in the given + net, and then feeding the corresponding blob with its provided value + in the current workspace. The name is automatically generated in order + to avoid clashes with existing blob names. + """ + assert isinstance(net, core.Net), 'net must be a core.Net instance.' + value = np.array(value, dtype=dtype) + blob = net.AddExternalInput(net.NextName(prefix=name)) + workspace.FeedBlob(str(blob), value) + return blob + + +class Dataset(object): + """Represents an in-memory dataset with fixed schema. + + Use this to store and iterate through datasets with complex schema that + fit in memory. 
+ + Iterating through entries of this dataset is very fast since the dataset + is stored as a set of native Caffe2 tensors, thus no type conversion or + deserialization is necessary. + """ + + def __init__(self, fields, name=None): + """Create an un-initialized dataset with schema provided by `fields`. + + Before this dataset can be used, it must be initialized, either by + `init_empty` or `init_from_dataframe`. + + Args: + fields: either a schema.Struct or a list of field names in a format + compatible with the one described in schema.py. + name: optional name to prepend to blobs that will store the data. + """ + assert isinstance(fields, list) or isinstance(fields, Struct), ( + 'fields must be either a Struct or a list of raw field names.') + self.schema = fields + self.fields = ( + fields.field_names() if isinstance(fields, Struct) else fields) + self.field_types = ( + fields.field_types() if isinstance(fields, Struct) else + [np.dtype(np.void)] * len(self.fields)) + self.name = name or 'dataset' + self.field_blobs = None + + def init_empty(self, init_net): + """Initialize the blobs for this dataset with empty values. + + Empty arrays will be immediately fed into the current workspace, + and `init_net` will take those blobs as external inputs. + """ + self.field_blobs = [Const(init_net, [], name=f) for f in self.fields] + + def init_from_dataframe(self, net, dataframe): + """Initialize the blobs for this dataset from a Pandas dataframe. + + Each column of the dataframe will be immediately fed into the current + workspace, and the `net` will take this blobs as external inputs. + """ + assert len(self.fields) == len(dataframe.columns) + self.field_blobs = [ + Const(net, dataframe.as_matrix([col]).flatten(), name=field) + for col, field in enumerate(self.fields)] + + def get_blobs(self): + """ + Return the list of BlobReference pointing to the blobs that contain + the data for this dataset. + """ + assert self + return self.field_blobs + + def field_names(self): + """Return the list of field names for this dataset.""" + return self.fields + + def field_types(self): + """ + Return the list of field dtypes for this dataset. + + If a list of strings, not a schema.Struct, was passed to the + constructor, this will return a list of dtype(np.void). + """ + return self.field_types + + def reader(self, init_net, cursor_name=None): + """Create a Reader object that is used to iterate through the dataset. + + This will append operations to `init_net` that create a TreeCursor, + used to iterate through the data. + + NOTE: Currently, it is not safe to append to a dataset while reading. + + Args: + init_net: net that will be run once to create the cursor. + cursor_name: optional name for the blob containing a pointer + to the cursor. + + Returns: + A _DatasetReader that can be used to create operators that will + iterate through the dataset. + """ + assert self.field_blobs, 'Dataset not initialized.' + cursor_name = cursor_name or (self.name + '_cursor') + cursor = init_net.CreateTreeCursor( + [], + [cursor_name], + fields=self.fields) + return _DatasetReader( + self.fields, self.field_blobs, cursor, cursor_name) + + def random_reader(self, init_net, indices, cursor_name=None): + """Create a Reader object that is used to iterate through the dataset. + + NOTE: The reader order depends on the order in indices. + + Args: + Similar to reader + indices: blob of reading order + + Returns: + A DatasetReader that can be used to create operators that will + iterate through the dataset according to indices. 
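+
+            Example (a sketch; the index values and net names are
+            illustrative):
+
+                indices = Const(init_net, [2, 0, 1], dtype=np.int64)
+                reader = ds.random_reader(init_net, indices)
+                reader.computeoffset(init_net)
+                should_stop, fields = reader.read(read_net, batch_size=1)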
+ """ + assert self.field_blobs, 'Dataset not initialized.' + cursor_name = cursor_name or (self.name + '_cursor') + cursor = init_net.CreateTreeCursor( + [], + [cursor_name], + fields=self.fields) + return _DatasetRandomReader( + self.fields, self.field_blobs, cursor, cursor_name, indices) + + def writer(self, init_net): + """Create a Writer that can be used to append entries into the dataset. + + NOTE: Currently, it is not safe to append to a dataset + while reading from it. + NOTE: Currently implementation of writer is not thread safe. + TODO: fixme + + Args: + init_net: net that will be run once in order to create the writer. + (currently not used) + """ + assert self.field_blobs, 'Dataset not initialized.' + return _DatasetWriter(self.fields, self.field_blobs, init_net) diff --git a/caffe2/python/device_checker.py b/caffe2/python/device_checker.py index e561c6ae74d..77d06e96a97 100644 --- a/caffe2/python/device_checker.py +++ b/caffe2/python/device_checker.py @@ -1,6 +1,6 @@ import numpy as np import copy -from caffe2.python import core, workspace +from caffe2.python import workspace class DeviceChecker(object): @@ -41,9 +41,8 @@ class DeviceChecker(object): op.device_option.CopyFrom(device_option) workspace.RunOperatorOnce(op) results.append( - [workspace.FetchBlob(op.output[idx]) for idx in outputs_to_check - ] - ) + [workspace.FetchBlob(op.output[idx]) + for idx in outputs_to_check]) # Everything is done, reset the workspace. workspace.ResetWorkspace() # After running on all devices, check correctness @@ -61,15 +60,15 @@ class DeviceChecker(object): print(y.flatten()) print(np.max(np.abs(x - y))) success = False - #else: - # print ('Passed device pair (0, %d), %s %s' % - # (i, outputs_to_check[j], y.shape)) + # else: + # print ('Passed device pair (0, %d), %s %s' % + # (i, outputs_to_check[j], y.shape)) workspace.SwitchWorkspace(old_ws_name) return success def CheckNet(self, net, inputs={}, blobs_to_check=None, ignore=set()): - """Checks a network by inspecting all of its intermediate results, and see - if things match. + """Checks a network by inspecting all of its intermediate results, and + see if things match. """ old_ws_name = workspace.CurrentWorkspace() results = [] @@ -78,8 +77,8 @@ class DeviceChecker(object): blobs_to_check = [b for b in blobs_to_check if b not in ignore] workspace.SwitchWorkspace("_device_check_", True) for i, device_option in enumerate(self._device_options): - for name, arr in inputs.iteritems(): - #print 'feeding', name + for name, arr in inputs.items(): + # print 'feeding', name workspace.FeedBlob(name, arr, device_option) for op in net.op: op.device_option.CopyFrom(device_option) @@ -93,15 +92,18 @@ class DeviceChecker(object): for j in range(len(blobs_to_check)): x = results[i][j] y = results[0][j] - if np.any(np.abs(x - y) > self._threshold): + if not np.allclose(x, y, + atol=self._threshold, rtol=self._threshold): print('Failure in checking device option {}' ' and output {}. 
The outputs are:' .format(i, blobs_to_check[j])) print(x.flatten()) print(y.flatten()) + print(np.max(np.abs(x - y))) success = False - #else: - # print ('Passed device pair (%d, %d), %s %s: %s' % - # (i, j, blobs_to_check[j], y.shape, str(y.flatten()))) + # else: + # print ('Passed device pair (%d, %d), %s %s: %s' % + # (i, j, blobs_to_check[j], y.shape, + # str(y.flatten()))) workspace.SwitchWorkspace(old_ws_name) return success diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index a377797fd4f..5580dc2391e 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -178,7 +178,12 @@ class TestOperators(hu.HypothesisTestCase): return st.sampled_from([np.float32, np.float64]) _test_binary( - "Div", ref, filter_=non_zero, test_gradient=True, dtypes=div_dtypes + "Div", ref, filter_=non_zero, test_gradient=True, + dtypes=div_dtypes, gcs=hu.gcs_cpu_only + )(self) + _test_binary( + "Div", ref, filter_=non_zero, test_gradient=False, + dtypes=div_dtypes )(self) _test_binary_broadcast( "Div", ref, filter_=non_zero, dtypes=div_dtypes)(self) @@ -269,8 +274,8 @@ class TestOperators(hu.HypothesisTestCase): for param, _ in enumerate(inputs): self.assertGradientChecks(gc, op, inputs, param, [0]) - @unittest.skipIf(True, - "Recurrent only works on CUDA 7.5 and above") + @unittest.skipIf(not workspace.has_gpu_support, + "Skipping test due to no gpu present.") @given(hidden_size=st.integers(min_value=1, max_value=3), num_layers=st.integers(min_value=1, max_value=3), bidirectional=st.booleans(), @@ -366,27 +371,27 @@ class TestOperators(hu.HypothesisTestCase): # CUDNN does NOT support different padding values and we skip it @given(stride_h=st.integers(1, 3), - stride_w=st.integers(1, 3), - pad_t=st.integers(0, 3), - pad_l=st.integers(0, 3), - pad_b=st.integers(0, 3), - pad_r=st.integers(0, 3), - kernel=st.integers(1, 5), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(1, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from([""]), - **hu.gcs) + stride_w=st.integers(1, 3), + pad_t=st.integers(0, 3), + pad_l=st.integers(0, 3), + pad_b=st.integers(0, 3), + pad_r=st.integers(0, 3), + kernel=st.integers(3, 5), + size=st.integers(8, 8), + input_channels=st.integers(1, 3), + output_channels=st.integers(1, 3), + batch_size=st.integers(1, 3), + order=st.sampled_from(["NCHW", "NHWC"]), + engine=st.sampled_from([""]), + **hu.gcs) @settings(max_examples=2, timeout=100) def test_convolution_separate_stride_pad_gradients(self, stride_h, stride_w, - pad_t, pad_l, pad_b, - pad_r, kernel, size, - input_channels, - output_channels, - batch_size, order, - engine, gc, dc): + pad_t, pad_l, pad_b, + pad_r, kernel, size, + input_channels, + output_channels, + batch_size, order, + engine, gc, dc): assume(stride_h <= kernel) assume(stride_w <= kernel) op = core.CreateOperator( @@ -654,18 +659,19 @@ class TestOperators(hu.HypothesisTestCase): rtol=1e-5) @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(1, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from([""]), **hu.gcs) + pad=st.integers(0, 3), + kernel=st.integers(1, 5), + size=st.integers(7, 10), + input_channels=st.integers(1, 8), + output_channels=st.integers(1, 8), + batch_size=st.integers(1, 3), + order=st.sampled_from(["NCHW", "NHWC"]), + 
engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + @settings(max_examples=2, timeout=100) def test_convolution_transpose_gradients(self, stride, pad, kernel, - size, input_channels, - output_channels, batch_size, - order, engine, gc, dc): + size, input_channels, + output_channels, batch_size, + order, engine, gc, dc): assume(stride <= kernel) X = np.random.rand( batch_size, size, size, input_channels).astype(np.float32) - 0.5 @@ -692,17 +698,17 @@ class TestOperators(hu.HypothesisTestCase): self.assertGradientChecks(gc, op, [X, w, b], i, [0]) @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(1, 3), - engine=st.sampled_from([""]), **hu.gcs) + pad=st.integers(0, 3), + kernel=st.integers(1, 5), + size=st.integers(7, 10), + input_channels=st.integers(1, 8), + output_channels=st.integers(1, 8), + batch_size=st.integers(1, 3), + engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_convolution_transpose_layout(self, stride, pad, kernel, - size, input_channels, - output_channels, batch_size, - engine, gc, dc): + size, input_channels, + output_channels, batch_size, + engine, gc, dc): assume(stride <= kernel) X = np.random.rand( batch_size, size, size, input_channels).astype(np.float32) - 0.5 @@ -1049,7 +1055,7 @@ class TestOperators(hu.HypothesisTestCase): @given(target_probabilities=hu.arrays( dims=[10], elements=st.floats(allow_nan=False, allow_infinity=False, - min_value=0, + min_value=0.01, max_value=1)), **hu.gcs) def test_perplexity(self, target_probabilities, gc, dc): @@ -1477,9 +1483,13 @@ class TestOperators(hu.HypothesisTestCase): op = core.CreateOperator("HasElements", ["data"], ["has_elements"]) self.assertReferenceChecks(gc, op, [data], lambda x: (len(x) > 0, )) + op = core.CreateOperator("IsEmpty", ["data"], ["is_empty"]) + self.assertReferenceChecks(gc, op, [data], lambda x: (len(x) == 0, )) + @given(initial_iters=st.integers(0, 100), max_iters=st.integers(0, 100)) - def test_criteria_net_with_execution_step(self, initial_iters, max_iters): + def test_should_stop_as_criteria_net_execution_step( + self, initial_iters, max_iters): net = core.Net("net") net.Iter(["iter"], ["iter"]) workspace.FeedBlob( @@ -1487,16 +1497,87 @@ class TestOperators(hu.HypothesisTestCase): workspace.FeedBlob( "num_iters", np.asarray([max_iters]).astype(np.int32)) criteria_net = core.Net("criteria") - criteria_net.LT(["iter", "num_iters"], ["continue"]) - criteria_net.Proto().external_output.extend(["continue"]) + criteria_net.GE(["iter", "num_iters"], ["stop"]) + criteria_net.Proto().external_output.extend(["stop"]) plan = core.Plan('plan') - plan.AddStep(core.execution_step('step', net, criteria=criteria_net)) + plan.AddStep(core.execution_step( + 'step', [criteria_net, net], + should_stop_blob=core.BlobReference("stop"))) workspace.RunPlan(plan) iters = workspace.FetchBlob("iter") self.assertEqual(iters.dtype, np.int32) self.assertEqual(iters[0], max(initial_iters, max_iters)) + def test_disabled_execution_step(self): + def createNets(i, disabled): + should_stop = 'should_stop_{}'.format(i) + output = 'output_{}'.format(i) + + # init content and stop signal + init = core.Net("init_{}".format(i)) + init.ConstantFill( + [], + [output], + shape=[1], + value=0.0 + ) + init.Cast([output], [should_stop], to='bool') + + # decide if disabled or not + criterion = core.Net("criterion_{}".format(i)) + tmp = criterion.ConstantFill( + [], + shape=[1], + 
value=1.0 if disabled else 0.0 + ) + criterion.Cast([tmp], [should_stop], to='bool') + criterion.Proto().external_output.extend([should_stop]) + + # the body net is just to turn a 0 blob to 1 + net = core.Net("net_{}".format(i)) + net.ConstantFill( + [], + [output], + shape=[1], + value=1.0 + ) + + # always end the loop + ender = core.Net("ender_{}".format(i)) + tmp = ender.ConstantFill( + [], + shape=[1], + value=1.0 + ) + ender.Cast([tmp], [should_stop], to='bool') + ender.Proto().external_output.extend([should_stop]) + + return [init, criterion, net, ender] + + nets = [createNets(1, False), + createNets(2, True), + createNets(3, False)] + steps = [ + core.execution_step( + 'step_1', nets[0], + should_stop_blob=core.BlobReference('should_stop_1')), + core.execution_step( + 'step_2', nets[1], + should_stop_blob=core.BlobReference('should_stop_2')), + core.execution_step('step_3', nets[2]) + ] + expected = [1.0, 0.0, 1.0] + + plan = core.Plan('plan') + plan.AddStep(core.execution_step('all_steps', steps, num_iter=3)) + workspace.RunPlan(plan) + + for i, net in enumerate(nets): + self.assertEqual( + workspace.FetchBlob('output_{}'.format(i + 1))[0], + expected[i]) + @given(initial_iters=st.integers(0, 100), num_iters=st.integers(0, 100)) def test_iter_count_with_execution_step(self, initial_iters, num_iters): @@ -1523,6 +1604,13 @@ class TestOperators(hu.HypothesisTestCase): def test_cast(self, a, src, dst, use_name, gc, dc): a = a.astype(src) + # Casting from a float type outside the range of the integral + # type is UB. + ftypes = [np.float32, np.float64] + if src in ftypes and dst not in ftypes and dst is not np.bool: + info = np.iinfo(dst) + a = np.clip(a, info.min, info.max) + def ref(data): return [data.astype(dst)] @@ -1571,7 +1659,8 @@ class TestOperators(hu.HypothesisTestCase): self.assertDeviceChecks(dc, op, [X], [0]) self.assertGradientChecks(gc, op, [X], 0, [0]) - @given(X=hu.tensor(), seed=st.integers(min_value=0, max_value=65536), + @given(X=_dtypes().flatmap(lambda dtype: hu.tensor(dtype=dtype)), + seed=st.integers(min_value=0, max_value=65536), null_axes=st.booleans(), **hu.gcs) def test_transpose(self, X, seed, null_axes, gc, dc): @@ -1589,7 +1678,8 @@ class TestOperators(hu.HypothesisTestCase): self.assertReferenceChecks(gc, op, [X, axes], transpose_ref) - self.assertGradientChecks(gc, op, [X], 0, [0]) + if X.dtype != np.int32 and X.dtype != np.int64: + self.assertGradientChecks(gc, op, [X], 0, [0]) @given(n=st.integers(1, 3), dim=st.integers(4, 16), diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 280ceb77635..06c1896c6cb 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -263,7 +263,11 @@ class HypothesisTestCase(test_util.TestCase): outs = [] for (n, ref) in zip(op.output, reference_outputs): output = workspace.FetchBlob(n) - np.testing.assert_allclose(output, ref, atol=1e-4, rtol=1e-4) + if output.dtype.kind in ('S', 'O'): + np.testing.assert_array_equal(output, ref) + else: + np.testing.assert_allclose( + output, ref, atol=1e-4, rtol=1e-4) outs.append(output) if grad_reference and output_to_grad: self._assertGradReferenceChecks( diff --git a/caffe2/python/io.py b/caffe2/python/io.py new file mode 100644 index 00000000000..89a61384abf --- /dev/null +++ b/caffe2/python/io.py @@ -0,0 +1,138 @@ +""" +Defines the base interface for reading and writing operations. + +Readers/Writers are objects that produce operations that read/write sequences +of data. 
Each operation reads or writes a list of BlobReferences.
+
+Readers and Writers must be implemented such that read and write operations
+are atomic and thread safe.
+
+Examples of possible Readers and Writers:
+    HiveReader, HiveWriter,
+    QueueReader, QueueWriter,
+    DatasetReader, DatasetWriter,
+    DBReader, DBWriter,
+
+See `dataset.py` for an example of implementation.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from caffe2.python import core
+
+
+class Reader(object):
+    """
+    Reader is an abstract class to be implemented in order to provide
+    operations capable of iterating through a dataset or stream of data.
+
+    A Reader must implement at least one operation, `read`, which
+    adds operations to a net that read the next batch of data. Readers can
+    optionally support the `reset` operation, which is useful when multiple
+    passes over the data are required.
+    """
+    def read(self, read_net, batch_size=1, *args):
+        """
+        Add operations to read_net that will read the next batch of data
+        and return a list of BlobReference representing the blobs that will
+        contain the batches produced.
+
+        Operations added to `read_net` must be thread safe and atomic, that is,
+        it should be possible to clone `read_net` and run multiple instances of
+        it in parallel.
+
+        Args:
+            read_net: the net that will be appended with read operations
+            batch_size: number of entries to read
+
+        Returns:
+            A tuple (should_stop, fields), with:
+
+                should_stop: BlobReference pointing to a boolean scalar
+                             blob that indicates whether the read operation
+                             was successful or whether the end of data has
+                             been reached.
+                fields: A tuple of BlobReference containing the latest batch
+                        of data that was read.
+        """
+        raise NotImplementedError('Readers must implement `read`.')
+
+    def reset(self, net):
+        """Append operations to `net` that will reset the reader.
+
+        This can be used to read the data multiple times.
+        Not all readers support this operation.
+        """
+        raise NotImplementedError('This reader cannot be reset.')
+
+    def execution_step(self, reader_net_name=None, batch_size=1):
+        """Create an execution step with a net containing read operators.
+
+        The execution step will contain a `stop_blob` that knows how to stop
+        the execution loop when the end of data is reached.
+
+        E.g.:
+
+            read_step, fields = reader.execution_step()
+            consume_net = core.Net('consume')
+            consume_net.Print(fields[0], [])
+            p = core.Plan('reader')
+            p.AddStep(read_step.AddNet(consume_net))
+            core.RunPlan(p)
+
+        Args:
+
+            reader_net_name: (optional) the name of the reader_net to be
+                             created. The execution step will
+                             be named accordingly.
+            batch_size: the batch size
+
+        Returns:
+            A tuple (read_step, fields), with:
+
+                read_step: A newly created execution step containing a net with
+                           read operations. The step will have `stop_blob` set,
+                           in order to stop the loop on end of data.
+                fields: A tuple of BlobReference containing the latest batch
+                        of data that was read.
+        """
+        reader_net = core.Net(reader_net_name or 'reader')
+        should_stop, fields = self.read(reader_net, batch_size=batch_size)
+        read_step = core.execution_step(
+            '{}_step'.format(reader_net_name),
+            reader_net,
+            should_stop_blob=should_stop)
+        return (read_step, fields)
+
+
+class Writer(object):
+    """
+    Writer is an abstract class to be implemented in order to provide
+    operations capable of feeding a data stream or a dataset.
+
+    A Writer must implement 2 operations:
+    `write`, which adds operations to a net that write the next batch of
+    data, and `commit`, which adds operations to a net in order to indicate
+    that no more data will be written.
+    """
+
+    def write(self, writer_net, fields):
+        """Add operations to `writer_net` that write the next batch of data.
+
+        Operations added to the net must be thread safe and atomic, that is:
+        multiple writers must be able to write to the dataset in parallel.
+
+        Args:
+            fields: a tuple of BlobReference containing the batch of data to
+                    write.
+        """
+        raise NotImplementedError('Writers must implement write.')
+
+    def commit(self, finish_net):
+        """Add operations to `finish_net` that signal end of data.
+
+        This must be implemented by all Writers, but may be a no-op for some
+        of them.
+        """
+        raise NotImplementedError('Writers must implement commit.')
diff --git a/caffe2/python/model_device_test.py b/caffe2/python/model_device_test.py
index 90bbfdcc0c0..a1824a225ce 100644
--- a/caffe2/python/model_device_test.py
+++ b/caffe2/python/model_device_test.py
@@ -116,7 +116,7 @@ class TestMiniAlexNet(test_util.TestCase):

         gpu_device = caffe2_pb2.DeviceOption()
         gpu_device.device_type = caffe2_pb2.CUDA

-        checker = device_checker.DeviceChecker(1e-2, [cpu_device, gpu_device])
+        checker = device_checker.DeviceChecker(0.05, [cpu_device, gpu_device])
         ret = checker.CheckNet(
             model.net.Proto(),
             inputs,
@@ -126,15 +126,16 @@
         )
         self.assertEqual(ret, True)

-    def testMiniAlexNet(self):
+    @unittest.skipIf(not workspace.has_gpu_support,
+                     "No GPU support. Skipping test.")
+    def testMiniAlexNetNCHW(self):
         self._testMiniAlexNet("NCHW")
+
+    @unittest.skipIf(not workspace.has_gpu_support,
+                     "No GPU support. Skipping test.")
+    def testMiniAlexNetNHWC(self):
         self._testMiniAlexNet("NHWC")


 if __name__ == '__main__':
-    if not workspace.has_gpu_support:
-        print('No GPU support. Skipping gpu test.')
-    elif workspace.NumCudaDevices() == 0:
-        print('No GPU device. Skipping gpu test.')
-    else:
-        unittest.main()
+    unittest.main()
diff --git a/caffe2/python/net_drawer.py b/caffe2/python/net_drawer.py
index fe0b3ce3da4..1b7333f7b6d 100644
--- a/caffe2/python/net_drawer.py
+++ b/caffe2/python/net_drawer.py
@@ -3,6 +3,7 @@ from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
 import argparse
+import json
 from collections import defaultdict
 from caffe2.python import utils
@@ -53,6 +54,11 @@ def _rectify_operator_and_name(operators_or_net, name):
     return operators, name


+def _escape_label(name):
+    # json.dumps is poor man's escaping
+    return json.dumps(name)
+
+
 def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
     operators, name = _rectify_operator_and_name(operators_or_net, name)
     graph = pydot.Dot(name, rankdir=rankdir)
@@ -73,7 +79,7 @@ def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
         if input_name not in pydot_nodes:
             input_node = pydot.Node(
                 input_name + str(pydot_node_counts[input_name]),
-                label=input_name,
+                label=_escape_label(input_name),
                 **BLOB_STYLE
             )
             pydot_nodes[input_name] = input_node
@@ -87,7 +93,7 @@ def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
             pydot_node_counts[output_name] += 1
             output_node = pydot.Node(
                 output_name + str(pydot_node_counts[output_name]),
-                label=output_name,
+                label=_escape_label(output_name),
                 **BLOB_STYLE
             )
             pydot_nodes[output_name] = output_node
diff --git a/caffe2/python/operator_test/atomic_ops_test.py b/caffe2/python/operator_test/atomic_ops_test.py
new file mode 100644
index 00000000000..37a3541d915
--- /dev/null
+++ b/caffe2/python/operator_test/atomic_ops_test.py
@@ -0,0 +1,43 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from caffe2.python import core, workspace
+from caffe2.python.test_util import TestCase
+
+
+class TestAtomicOps(TestCase):
+    def test_atomic_ops(self):
+        """
+        Test that both countdown and checksum are updated atomically, by
+        having countdown count from 20000 to 0 across parallel workers while
+        updating the checksum with each fetched value. If the operations are
+        truly atomic, each value from 1 to 20000 should be fetched exactly
+        once from the countdown, and fed exactly once to the checksum, such
+        that at the end the checksum must contain the exact value of
+        sum[i=0..20000](i).
+        """
+        init_net = core.Net('init')
+        mutex_countdown = init_net.CreateMutex([])
+        mutex_checksum = init_net.CreateMutex([])
+        countdown = init_net.ConstantIntFill([], shape=[], value=20000.)
+        checksum = init_net.ConstantIntFill([], shape=[], value=0.)
+        minus_one = init_net.ConstantIntFill([], shape=[], value=-1.)
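+        # What follows builds 100 worker nets, each run for 200 iterations by
+        # its execution step, i.e. 20000 AtomicFetchAdd calls in total. Every
+        # call decrements `countdown` under `mutex_countdown` and feeds the
+        # fetched value into `checksum` under `mutex_checksum`, so the final
+        # checksum is independent of how the threads interleave.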
+ steps = [] + for i in range(0, 100): + net = core.Net('net:%d' % i) + _, fetched_count = net.AtomicFetchAdd( + [mutex_countdown, countdown, minus_one], + [countdown, 'fetched_count:%d' % i]) + net.AtomicFetchAdd( + [mutex_checksum, checksum, fetched_count], + [checksum, 'not_used']) + steps.append( + core.execution_step('worker:%d' % i, net, num_iter=200)) + super_step = core.execution_step( + 'parent', steps, concurrent_substeps=True) + plan = core.Plan('plan') + plan.AddStep(core.execution_step('init', init_net)) + plan.AddStep(super_step) + workspace.RunPlan(plan) + # checksum = sum[i=1..20000](i) = 20000 * 20001 / 2 = 200010000 + self.assertEquals(workspace.FetchBlob(checksum), 200010000) diff --git a/caffe2/python/operator_test/counter_ops_test.py b/caffe2/python/operator_test/counter_ops_test.py new file mode 100644 index 00000000000..0ece148d360 --- /dev/null +++ b/caffe2/python/operator_test/counter_ops_test.py @@ -0,0 +1,42 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import core, workspace +from caffe2.python.test_util import TestCase + + +class TestCounterOps(TestCase): + def test_counter_ops(self): + workspace.RunOperatorOnce(core.CreateOperator( + 'CreateCounter', [], ['c'], init_count=1)) + workspace.RunOperatorOnce(core.CreateOperator( + 'CountDown', ['c'], ['t1'])) # 1 -> 0 + assert not workspace.FetchBlob('t1') + + workspace.RunOperatorOnce(core.CreateOperator( + 'CountDown', ['c'], ['t2'])) # 0 -> 0 + assert workspace.FetchBlob('t2') + + workspace.RunOperatorOnce(core.CreateOperator( + 'ResetCounter', ['c'], [], init_count=1)) # -> 1 + workspace.RunOperatorOnce(core.CreateOperator( + 'CountDown', ['c'], ['t3'])) # 1 -> 0 + assert not workspace.FetchBlob('t3') + + workspace.RunOperatorOnce(core.CreateOperator( + 'ConstantBoolFill', [], ['t4'], value=0.0, shape=[])) + assert workspace.FetchBlob('t4') == workspace.FetchBlob('t1') + + workspace.RunOperatorOnce(core.CreateOperator( + 'ConstantBoolFill', [], ['t5'], value=1.0, shape=[])) + assert workspace.FetchBlob('t5') == workspace.FetchBlob('t2') + + assert workspace.RunOperatorOnce(core.CreateOperator( + 'And', ['t1', 't2'], ['t6'])) + assert not workspace.FetchBlob('t6') # True && False + + assert workspace.RunOperatorOnce(core.CreateOperator( + 'And', ['t2', 't5'], ['t7'])) + assert workspace.FetchBlob('t7') # True && True diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py new file mode 100644 index 00000000000..2346676bcb0 --- /dev/null +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -0,0 +1,71 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from caffe2.python import core +from hypothesis import given +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np + + +def sigmoid(x): + return 1.0 / (1.0 + np.exp(-x)) + + +def sigmoid_cross_entropy_with_logits(x, z): + return np.maximum(x, 0) - x * z + np.log(1 + np.exp(-np.abs(x))) + + +def sigmoid_cross_entropy_with_logits_grad(x, z): + return z - sigmoid(x) + + +class TestCrossEntropyOps(hu.HypothesisTestCase): + @given( + inputs=st.lists( + elements=st.integers(min_value=1, max_value=5), + min_size=1, + max_size=2, + average_size=2, + ).flatmap( + lambda shape: st.tuples( + hu.arrays( + dims=shape, + 
elements=st.one_of( + st.floats(min_value=-1.0, max_value=-0.1), + st.floats(min_value=0.1, max_value=1.0), + )), + hu.arrays( + dims=shape, + elements=st.sampled_from([0.0, 1.0]), + ), + ) + ), + ) + def test_sigmoid_cross_entropy_with_logits(self, inputs): + logits, targets = inputs + + def sigmoid_xentr_logit_ref(logits, targets): + s = sigmoid_cross_entropy_with_logits(logits, targets) + m = np.mean(s, axis=len(logits.shape) - 1) + return (m, ) + + def sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs): + fwd_logits, fwd_targets = fwd_inputs + inner_size = fwd_logits.shape[-1] + m = fwd_targets - sigmoid(fwd_logits) + g_in = -np.expand_dims(g_out, axis=-1) * m / inner_size + return (g_in, None) + + op = core.CreateOperator( + 'SigmoidCrossEntropyWithLogits', + ['logits', 'targets'], + ['xentropy']) + self.assertReferenceChecks( + hu.cpu_do, + op, + [logits, targets], + sigmoid_xentr_logit_ref, + output_to_grad='xentropy', + grad_reference=sigmoid_xentr_logit_grad_ref) diff --git a/caffe2/python/operator_test/dataset_ops_test.py b/caffe2/python/operator_test/dataset_ops_test.py new file mode 100644 index 00000000000..d2f20a60c07 --- /dev/null +++ b/caffe2/python/operator_test/dataset_ops_test.py @@ -0,0 +1,284 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import numpy as np +from caffe2.python import core, workspace, dataset +from caffe2.python.dataset import Const +from caffe2.python.schema import List, Struct, Scalar, Map +from caffe2.python.test_util import TestCase + + +def _assert_arrays_equal(actual, ref, err_msg): + if ref.dtype.kind in ('S', 'O'): + np.testing.assert_array_equal(actual, ref, err_msg=err_msg) + else: + np.testing.assert_allclose( + actual, ref, atol=1e-4, rtol=1e-4, err_msg=err_msg) + + +class TestDatasetOps(TestCase): + def test_dataset_ops(self): + """ + 1. Defining the schema of our dataset. + + This example schema could represent, for example, a search query log. + """ + schema = Struct( + # fixed size vector, which will be stored as a matrix when batched + ('dense', Scalar((np.float32, 3))), + # could represent a feature map from feature ID to float value + ('floats', Map( + Scalar(np.int32), + Scalar(np.float32))), + # could represent a multi-valued categorical feature map + ('int_lists', Map( + Scalar(np.int32), + List(Scalar(np.int64)), + )), + # could represent a multi-valued, weighted categorical feature map + ('id_score_pairs', Map( + Scalar(np.int32), + Map( + Scalar(np.int64), + Scalar(np.float32), + keys_name='ids', + values_name='scores'), + )), + # additional scalar information + ('metadata', Struct( + ('user_id', Scalar(np.int64)), + ('user_embed', Scalar((np.float32, 2))), + ('query', Scalar(str)), + )), + ) + + """ + This is what the flattened fields for this schema look like, along + with their types. Each one of these fields will be stored, read and + written as a tensor.
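To make the flattening rule concrete, here is a small editorial sketch (using only names defined in this test) of how one nested field expands. A `Map` is sugar for `List(Struct(keys, values))`, and a `List` contributes a `lengths` column plus a `values:` prefix for its payload:

    from caffe2.python.schema import Map, Scalar
    import numpy as np

    floats = Map(Scalar(np.int32), Scalar(np.float32))
    print(floats.field_names())
    # ['lengths', 'values:keys', 'values:values']
    # Nested under the name 'floats' in the Struct above, these become
    # 'floats:lengths', 'floats:values:keys', 'floats:values:values'.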
+ """ + expected_fields = [ + ('dense', (np.float32, 3)), + ('floats:lengths', np.int32), + ('floats:values:keys', np.int32), + ('floats:values:values', np.float32), + ('int_lists:lengths', np.int32), + ('int_lists:values:keys', np.int32), + ('int_lists:values:values:lengths', np.int32), + ('int_lists:values:values:values', np.int64), + ('id_score_pairs:lengths', np.int32), + ('id_score_pairs:values:keys', np.int32), + ('id_score_pairs:values:values:lengths', np.int32), + ('id_score_pairs:values:values:values:ids', np.int64), + ('id_score_pairs:values:values:values:scores', np.float32), + ('metadata:user_id', np.int64), + ('metadata:user_embed', (np.float32, 2)), + ('metadata:query', str), + ] + zipped = zip( + expected_fields, + schema.field_names(), + schema.field_types()) + for (ref_name, ref_type), name, dtype in zipped: + self.assertEquals(ref_name, name) + self.assertEquals(np.dtype(ref_type), dtype) + + """ + 2. The contents of our dataset. + + Contents as defined below could represent, for example, a log of + search queries along with dense, sparse features and metadata. + The datset below has 3 top-level entries. + """ + contents_raw = [ + # dense + [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]], + # floats + [1, 2, 3], # len + [11, 21, 22, 31, 32, 33], # key + [1.1, 2.1, 2.2, 3.1, 3.2, 3.3], # value + # int lists + [2, 0, 2], # len + [11, 12, 31, 32], # key + [2, 4, 3, 1], # value:len + [111, 112, 121, 122, 123, 124, 311, 312, 313, 321], # value:value + # id score pairs + [1, 2, 2], # len + [11, 21, 22, 31, 32], # key + [1, 1, 2, 2, 3], # value:len + [111, 211, 221, 222, 311, 312, 321, 322, 323], # value:ids + [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3], # val:score + # metadata + [123, 234, 456], # user_id + [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]], # user_embed + ['dog posts', 'friends who like to', 'posts about ca'], # query + ] + # convert the above content to ndarrays, checking against the schema + contents = dataset.to_ndarray_list(contents_raw, schema) + + """ + 3. Creating and appending to the dataset. + We first create an empty dataset with the given schema. + Then, a Writer is used to append these entries to the dataset. + """ + ds = dataset.Dataset(schema) + net = core.Net('init') + ds.init_empty(net) + + blobs_to_append = [Const(net, c) for c in contents] + writer = ds.writer(init_net=net) + writer.write(net, blobs_to_append) + workspace.RunNetOnce(net) + + """ + 4. Iterating through the dataset contents. 
+ + If we were to iterate through the top level entries of our dataset, + this is what we should expect to see: + """ + entries_raw = [ + ( + [[1.1, 1.2, 1.3]], # dense + [1], [11], [1.1], # floats + [2], [11, 12], [2, 4], [111, 112, 121, 122, 123, 124], # intlst + [1], [11], [1], [111], [11.1], # id score pairs + [123], [[0.2, 0.8]], ['dog posts'], # metadata + ), + ( + [[2.1, 2.2, 2.3]], # dense + [2], [21, 22], [2.1, 2.2], # floats + [0], [], [], [], # int list + [2], [21, 22], [1, 2], [211, 221, 222], [21.1, 22.1, 22.2], + [234], [[0.5, 0.5]], ['friends who like to'], # metadata + ), + ( + [[3.1, 3.2, 3.3]], # dense + [3], [31, 32, 33], [3.1, 3.2, 3.3], # floats + [2], [31, 32], [3, 1], [311, 312, 313, 321], # int lst + [2], [31, 32], [2, 3], [311, 312, 321, 322, 323], + [31.1, 31.2, 32.1, 32.2, 32.3], # id score list + [456], [[0.7, 0.3]], ['posts about ca'], # metadata + ), + # after the end of the dataset, we will keep getting empty vectors + ([],) * 16, + ([],) * 16, + ] + entries = [dataset.to_ndarray_list(e, schema) for e in entries_raw] + + """ + Let's go ahead and create the reading nets. + We will run the `read` net multiple times and assert that we are reading + the entries the way we stated above. + """ + read_init_net = core.Net('read_init') + read_next_net = core.Net('read_next') + reader = ds.reader(read_init_net) + should_continue, batch_blobs = reader.read(read_next_net) + + workspace.RunNetOnce(read_init_net) + + workspace.CreateNet(read_next_net) + read_next_net_name = str(read_next_net) + + for i, entry in enumerate(entries): + workspace.RunNet(read_next_net_name) + for name, blob, base in zip(ds.field_names(), batch_blobs, entry): + data = workspace.FetchBlob(str(blob)) + _assert_arrays_equal( + data, base, + err_msg='Mismatch in entry %d, field %s' % (i, name)) + + """ + 5. Reading/writing in a single plan + + If all of the operations on the data are expressible as Caffe2 + operators, we don't need to load the data into Python; we can iterate + through the dataset in a single Plan. + + Here we will process the dataset a little and store it in a second + dataset. We can reuse the same Reader since it supports reset. + """ + reset_net = core.Net('reset_net') + reader.reset(reset_net) + read_step, fields = reader.execution_step() + + """ We will add the line number * 1000 to the feature ids. """ + process_net = core.Net('process') + line_no = Const(process_net, 0, dtype=np.int32) + const_1000 = Const(process_net, 1000, dtype=np.int32) + process_net.Add([line_no, const_1000], [line_no]) + fid = schema.floats.values.keys.id() + process_net.Print(fields[fid], []) + process_net.Add([fields[fid], line_no], fields[fid], broadcast=1) + + """ Let's create a second dataset and append to it. """ + ds2 = dataset.Dataset(schema, name='dataset2') + ds2.init_empty(reset_net) + writer = ds2.writer(reset_net) + writer.write(process_net, fields) + # commit is not necessary for DatasetWriter but we will add it for + # generality of the example + commit_net = core.Net('commit') + writer.commit(commit_net) + + """ Time to create and run a plan which will do the processing """ + plan = core.Plan('process') + plan.AddStep(core.execution_step('reset', reset_net)) + plan.AddStep(read_step.AddNet(process_net)) + plan.AddStep(core.execution_step('commit', commit_net)) + workspace.RunPlan(plan) + + """ + Now we should have dataset2 populated.
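As a hand check (an editorial sketch, not part of the test), the effect on the 'floats:values:keys' column can be computed with plain numpy; the first entry is processed with line_no == 1000, the second with 2000, and so on:

    import numpy as np

    keys = np.array([11, 21, 22, 31, 32, 33])
    lengths = np.array([1, 2, 3])  # floats:lengths, one count per entry
    offsets = np.repeat(np.arange(1, 4) * 1000, lengths)
    print(keys + offsets)          # [1011 2021 2022 3031 3032 3033]

This matches the offsets added to the expected contents in the verification below.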
+ """ + ds2blobs = ds2.get_blobs() + for i, (name, blob) in enumerate(zip(schema.field_names(), ds2blobs)): + data = workspace.FetchBlob(str(blob)) + content = contents[i] + if i == fid: + # one of our fields has been added with line numbers * 1000 + content += [1000, 2000, 2000, 3000, 3000, 3000] + _assert_arrays_equal( + data, contents[i], err_msg='Mismatch in field %s.' % name) + + """ + 6. Slicing a dataset + + You can create a new schema from pieces of another schema and reuse + the same data. + """ + subschema = Struct(('top_level', schema.int_lists.values)) + int_list_contents = contents[schema.int_lists.values.slice()] + self.assertEquals(len(subschema.field_names()), len(int_list_contents)) + + """ + 7. Random Access a dataset + + """ + read_init_net = core.Net('read_init') + read_next_net = core.Net('read_next') + + idx = np.array([2, 1, 0]) + workspace.FeedBlob('idx', idx) + + reader = ds.random_reader(read_init_net, 'idx') + reader.computeoffset(read_init_net) + + should_continue, batch_blobs = reader.read(read_next_net) + + workspace.CreateNet(read_init_net) + workspace.RunNetOnce(read_init_net) + + workspace.CreateNet(read_next_net) + read_next_net_name = str(read_next_net) + + for i in range(len(entries)): + k = idx[i] if i in idx else i + entry = entries[k] + workspace.RunNet(read_next_net_name) + for name, blob, base in zip(ds.field_names(), batch_blobs, entry): + data = workspace.FetchBlob(str(blob)) + _assert_arrays_equal( + data, base, + err_msg='Mismatch in entry %d, field %s' % (i, name)) diff --git a/caffe2/python/operator_test/index_ops_test.py b/caffe2/python/operator_test/index_ops_test.py index c09ba63589d..27bf3f2c865 100644 --- a/caffe2/python/operator_test/index_ops_test.py +++ b/caffe2/python/operator_test/index_ops_test.py @@ -2,9 +2,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals -import numpy as np from caffe2.python import core, workspace from caffe2.python.test_util import TestCase +import numpy as np class TestIndexOps(TestCase): @@ -47,6 +47,13 @@ class TestIndexOps(TestCase): result2 = workspace.FetchBlob('result2') np.testing.assert_array_equal([0, 5, 1, 0, 0], result2) + workspace.RunOperatorOnce(core.CreateOperator( + 'IndexSize', + ['index'], + ['index_size'])) + size = workspace.FetchBlob('index_size') + self.assertEquals(size, 6) + workspace.RunOperatorOnce(core.CreateOperator( 'IndexStore', ['index'], @@ -55,3 +62,21 @@ class TestIndexOps(TestCase): new_entries = np.array(['new_entry1', 'new_entry2'], dtype=str) np.testing.assert_array_equal( np.concatenate((entries, new_entries)), stored_actual) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StringIndexCreate', + [], + ['index2'])) + + workspace.RunOperatorOnce(core.CreateOperator( + 'IndexLoad', + ['index2', 'stored_entries'], + [], + skip_first_entry=1)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'IndexSize', + ['index2'], + ['index2_size'])) + index2_size = workspace.FetchBlob('index2_size') + self.assertEquals(index2_size, 5) diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py new file mode 100644 index 00000000000..0585dfc50e6 --- /dev/null +++ b/caffe2/python/operator_test/one_hot_ops_test.py @@ -0,0 +1,76 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from caffe2.python import core +from hypothesis 
import given +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np + + +def _one_hots(): + index_size = st.integers(min_value=1, max_value=5) + lengths = st.lists( + elements=st.integers(min_value=0, max_value=5)) + return st.tuples(index_size, lengths).flatmap( + lambda x: st.tuples( + st.just(x[0]), + st.just(x[1]), + st.lists( + elements=st.integers(min_value=0, max_value=x[0] - 1), + min_size=sum(x[1]), + max_size=sum(x[1])))) + + +class TestOneHotOps(hu.HypothesisTestCase): + @given( + hot_indices=hu.tensor( + min_dim=1, max_dim=1, dtype=np.int64, + elements=st.integers(min_value=0, max_value=42)), + end_padding=st.integers(min_value=0, max_value=2)) + def test_one_hot(self, hot_indices, end_padding): + + def one_hot_ref(hot_indices, size): + out = np.zeros([len(hot_indices), size], dtype=float) + for i, x in enumerate(hot_indices): + out[i, x] = 1. + return (out, ) + + size = np.array(max(hot_indices) + end_padding + 1, dtype=np.int64) + if size == 0: + size = 1 + op = core.CreateOperator('OneHot', ['hot_indices', 'size'], ['output']) + self.assertReferenceChecks( + hu.cpu_do, + op, + [hot_indices, size], + one_hot_ref) + + @given(hot_indices=_one_hots()) + def test_segment_one_hot(self, hot_indices): + index_size, lengths, indices = hot_indices + + index_size = np.array(index_size, dtype=np.int64) + lengths = np.array(lengths, dtype=np.int32) + indices = np.array(indices, dtype=np.int64) + + def segment_one_hot_ref(lengths, hot_indices, size): + offset = 0 + out = np.zeros([len(lengths), size], dtype=float) + for i, length in enumerate(lengths): + for idx in hot_indices[offset:offset + length]: + out[i, idx] = 1. + offset += length + return (out, ) + + op = core.CreateOperator( + 'SegmentOneHot', + ['lengths', 'hot_indices', 'size'], + ['output']) + self.assertReferenceChecks( + hu.cpu_do, + op, + [lengths, indices, index_size], + segment_one_hot_ref) diff --git a/caffe2/python/operator_test/segment_ops_test.py b/caffe2/python/operator_test/segment_ops_test.py index 6b567a6b984..d98c0bb942d 100644 --- a/caffe2/python/operator_test/segment_ops_test.py +++ b/caffe2/python/operator_test/segment_ops_test.py @@ -83,6 +83,10 @@ def logsumexp_grad(grad_out, outputs, inputs): axis=0) * np.exp(inputs[0]) +def logmeanexp(x): + return np.log(np.mean(np.exp(x), axis=0)) + + def mean(x): return np.mean(x, axis=0) @@ -94,6 +98,30 @@ def mean_grad(grad_out, outputs, inputs): axis=0) +def max(x): + return np.amax(x, axis=0) + + +def max_grad(grad_out, outputs, inputs): + flat_inputs = inputs[0].flatten() + flat_outputs = np.array(outputs[0]).flatten() + flat_grad_in = np.zeros(flat_inputs.shape) + flat_grad_out = np.array(grad_out).flatten() + blocks = inputs[0].shape[0] + block_size = flat_inputs.shape[0] // blocks + + for i in range(block_size): + out_grad = flat_grad_out[i] + out = flat_outputs[i] + for j in range(blocks): + idx = j * block_size + i + if out == flat_inputs[idx]: + flat_grad_in[idx] = out_grad + break + + return np.resize(flat_grad_in, inputs[0].shape) + + REFERENCES_ALL = [ ('Sum', partial(np.sum, axis=0), sum_grad), ] @@ -101,7 +129,10 @@ REFERENCES_ALL = [ REFERENCES_SORTED = [ ('RangeSum', partial(np.sum, axis=0), sum_grad), ('RangeLogSumExp', logsumexp, logsumexp_grad), + # gradient is the same as for logsumexp + ('RangeLogMeanExp', logmeanexp, logsumexp_grad), + ('RangeMean', mean, mean_grad), + ('RangeMax', max, max_grad), ] diff --git a/caffe2/python/operator_test/sequence_ops_test.py
b/caffe2/python/operator_test/sequence_ops_test.py index 56288c72e51..7afca6bdc05 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -185,3 +185,37 @@ class TestSequenceOps(hu.HypothesisTestCase): op, [padded_data, padded_lengths], partial(_gather_padding_ref, start_pad_width, end_pad_width)) + + @given(data=hu.tensor(min_dim=3, max_dim=3, dtype=np.float32, + elements=st.floats(min_value=-np.inf, + max_value=np.inf), + min_value=1, max_value=10), + **hu.gcs_cpu_only) + def test_reverse_packed_segs(self, data, gc, dc): + max_length = data.shape[0] + batch_size = data.shape[1] + lengths = np.random.randint(max_length + 1, size=batch_size) + + op = core.CreateOperator( + "ReversePackedSegs", + ["data", "lengths"], + ["reversed_data"]) + + def op_ref(data, lengths): + rev_data = np.array(data, copy=True) + for i in range(batch_size): + seg_length = lengths[i] + for j in range(seg_length): + rev_data[j][i] = data[seg_length - 1 - j][i] + return (rev_data,) + + def op_grad_ref(grad_out, outputs, inputs): + return op_ref(grad_out, inputs[1]) + (None,) + + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[data, lengths], + reference=op_ref, + output_to_grad='reversed_data', + grad_reference=op_grad_ref) diff --git a/caffe2/python/operator_test/string_ops_test.py b/caffe2/python/operator_test/string_ops_test.py new file mode 100644 index 00000000000..69abbe70590 --- /dev/null +++ b/caffe2/python/operator_test/string_ops_test.py @@ -0,0 +1,106 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from caffe2.python import core +from hypothesis import given +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np + + +def _string_lists(alphabet=None): + return st.lists( + elements=st.text(alphabet=alphabet, average_size=3), + min_size=0, + max_size=3) + + +class TestStringOps(hu.HypothesisTestCase): + @given(strings=_string_lists()) + def test_string_prefix(self, strings): + length = 3 + # although we are utf-8 encoding below to avoid python exceptions, + # StringPrefix op deals with byte-length prefixes, which may produce + # an invalid utf-8 string. The goal here is just to avoid python + # complaining about the unicode -> str conversion. 
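    # (Editorial illustration, not part of the original test: a byte-length
    # prefix can end inside a multi-byte utf-8 sequence. For u'd\u00e9j\u00e0'
    # ("deja" with accents), which encodes to b'd\xc3\xa9j\xc3\xa0':
    #   encoded[:3] == b'd\xc3\xa9'  # happens to end on a code-point boundary
    #   encoded[:2] == b'd\xc3'      # dangling lead byte: not valid utf-8
    # StringPrefix may therefore return bytes that do not decode cleanly.)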
+ strings = np.array( + map(lambda a: a.encode('utf-8'), strings), dtype=np.object) + + def string_prefix_ref(strings): + return ( + np.array(map(lambda a: a[:length], strings), dtype=object), ) + + op = core.CreateOperator( + 'StringPrefix', + ['strings'], + ['stripped'], + length=length) + self.assertReferenceChecks( + hu.cpu_do, + op, + [strings], + string_prefix_ref) + + @given(strings=_string_lists()) + def test_string_suffix(self, strings): + length = 3 + strings = np.array( + map(lambda a: a.encode('utf-8'), strings), dtype=np.object) + + def string_suffix_ref(strings): + return ( + np.array(map(lambda a: a[-length:], strings), dtype=object), ) + + op = core.CreateOperator( + 'StringSuffix', + ['strings'], + ['stripped'], + length=length) + self.assertReferenceChecks( + hu.cpu_do, + op, + [strings], + string_suffix_ref) + + @given(strings=st.text(alphabet=['a', 'b'], average_size=3)) + def test_string_starts_with(self, strings): + prefix = 'a' + strings = np.array( + map(lambda a: str(strings), strings), dtype=np.object) + + def string_starts_with_ref(strings): + return (np.array( + map(lambda a: a.startswith(prefix), strings), dtype=bool), ) + + op = core.CreateOperator( + 'StringStartsWith', + ['strings'], + ['bools'], + prefix=prefix) + self.assertReferenceChecks( + hu.cpu_do, + op, + [strings], + string_starts_with_ref) + + @given(strings=st.text(alphabet=['a', 'b'], average_size=3)) + def test_string_ends_with(self, strings): + suffix = 'a' + strings = np.array( + map(lambda a: str(strings), strings), dtype=np.object) + + def string_ends_with_ref(strings): + return (np.array( + map(lambda a: a.endswith(suffix), strings), dtype=bool), ) + + op = core.CreateOperator( + 'StringEndsWith', + ['strings'], + ['bools'], + suffix=suffix) + self.assertReferenceChecks( + hu.cpu_do, + op, + [strings], + string_ends_with_ref) diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py new file mode 100644 index 00000000000..2c5be9d37de --- /dev/null +++ b/caffe2/python/schema.py @@ -0,0 +1,348 @@ +""" +Defines a minimal set of data types that make it possible to represent +datasets with arbitrary nested structure, including objects of variable +length, such as maps and lists. + +This defines a columnar storage format for such datasets on top of caffe2 +tensors. In terms of representational capacity, it can represent most of +the data types supported by the Parquet, ORC, and DWRF file formats. + +See comments in operator_test/dataset_ops_test.py for an example and a +walkthrough of how to use a schema to store and iterate through a structured +in-memory dataset. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from collections import OrderedDict +import logging +import numpy as np + +logger = logging.getLogger(__name__) + + +def _join_field_name(prefix, suffix): + if prefix and suffix: + return '{}:{}'.format(prefix, suffix) + elif prefix: + return prefix + elif suffix: + return suffix + else: + return '' + + +class Field(object): + """Represents an abstract field type in a dataset.
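A sketch of the shared interface, using the concrete `List` subclass defined below (illustrative only):

    >>> l = List(Scalar(np.int64))
    >>> l.field_names()
    ['lengths', 'values']
    >>> l.field_types()
    [dtype('int32'), dtype('int64')]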
+ """ + def __init__(self, children): + """Derived classes must call this after their initialization.""" + self._parent = (None, 0) + offset = 0 + self._field_offsets = [] + for child in children: + self._field_offsets.append(offset) + offset += len(child.field_names()) + self._field_offsets.append(offset) + + def field_names(self): + """Return the children field names for this field.""" + raise NotImplementedError('Field is an abstract class.') + + def field_types(self): + """Return the numpy.dtype for each of the children fields.""" + raise NotImplementedError('Field is an abstract class.') + + def clone(self): + """Clone this Field along with its children.""" + raise NotImplementedError('Field is an abstract class.') + + def _set_parent(self, parent, relative_id): + self._parent = (parent, relative_id) + + def slice(self): + """ + Returns a slice representing the range of field ids that belong to + this field. This slice can be used to index a list of fields. + + E.g.: + + >>> s = Struct( + >>> ('a', Scalar()), + >>> ('b', Struct( + >>> ('b1', Scalar()), + >>> ('b2', Scalar()), + >>> )), + >>> ('c', Scalar()), + >>> ) + >>> field_data = ['da', 'db1', 'db2', 'dc'] + >>> field_data[s.b.split()] + ['db1', 'db2'] + """ + base_id = self._child_base_id() + return slice(base_id, base_id + len(self.field_names())) + + def _child_base_id(self, child_index=None): + """Get the base id of the given child""" + p, i = self._parent + pos = 0 if child_index is None else self._field_offsets[child_index] + if p: + pos += p._child_base_id(i) + return pos + + def __eq__(self, other): + """Equivalance of two schemas""" + return ((self.field_names() == other.field_names()) and + (self.field_types() == other.field_types())) + +class List(Field): + """Represents a variable-length list. + + Values of a list can also be complex fields such as Lists and Structs. + In addition to the fields exposed by its `values` field, a List exposes an + additional `lengths` field, which will contain the size of each list under + the parent domain. + """ + def __init__(self, values): + assert isinstance(values, Field) + self.lengths = Scalar(np.int32) + self.values = values.clone() + self.lengths._set_parent(self, 0) + self.values._set_parent(self, 1) + Field.__init__(self, [self.lengths, self.values]) + + def field_names(self): + value_fields = self.values.field_names() + return ( + ['lengths'] + + [_join_field_name('values', v) for v in value_fields]) + + def field_types(self): + return self.lengths.field_types() + self.values.field_types() + + def clone(self): + return List(self.values) + + +class Struct(Field): + """Represents a named list of fields sharing the same domain. 
+ """ + def __init__(self, *fields): + for field in fields: + assert len(field) == 2 + assert field[0], 'Field names cannot be empty' + assert field[0] != 'lengths', ( + 'Struct cannot contain a field named `lengths`.') + assert isinstance(field[1], Field) + fields = [(name, field.clone()) for name, field in fields] + for id, (name, field) in enumerate(fields): + field._set_parent(self, id) + self.fields = OrderedDict(fields) + Field.__init__(self, self.fields.values()) + + def field_names(self): + names = [] + for name, field in self.fields.items(): + names += [_join_field_name(name, f) for f in field.field_names()] + return names + + def field_types(self): + types = [] + for name, field in self.fields.items(): + types += field.field_types() + return types + + def clone(self): + return Struct(*self.fields.items()) + + def __getattr__(self, item): + return self.fields[item] + + +class Scalar(Field): + """Represents a typed scalar or tensor of fixed shape. + + A Scalar is a leaf in a schema tree, translating to exactly one tensor in + the dataset's underlying storage. + + Usually, the tensor storing the actual values of this field is a 1D tensor, + representing a series of values in its domain. It is possible however to + have higher rank values stored as a Scalar, as long as all entries have + the same shape. + + E.g.: + + Scalar(np.float64) + + Scalar field of type float32. Caffe2 will expect readers and + datasets to expose it as a 1D tensor of doubles (vector), where + the size of the vector is determined by this fields' domain. + + Scalar((np.int32, 5)) + + Tensor field of type int32. Caffe2 will expect readers and + datasets to implement it as a 2D tensor (matrix) of shape (L, 5), + where L is determined by this fields' domain. + + Scalar((str, (10, 20))) + + Tensor field of type str. Caffe2 will expect readers and + datasets to implement it as a 3D tensor of shape (L, 10, 20), + where L is determined by this fields' domain. + + If the field type is unknown at construction time, call Scalar(), that will + default to np.void as its dtype. + + It is an error to pass a structured dtype to Scalar, since it would contain + more than one field. Instead, use from_dtype, which will construct + a nested `Struct` field reflecting the given dtype's structure. + """ + def __init__(self, dtype=None): + self._original_dtype = dtype + self.dtype = np.dtype(dtype or np.void) + assert not self.dtype.fields, ( + 'Cannot create Scalar with a structured dtype. ' + + 'Use from_dtype instead.') + Field.__init__(self, []) + + def field_names(self): + return [''] + + def field_types(self): + return [self.dtype] + + def clone(self): + return Scalar(self._original_dtype) + + def id(self): + """ + Return the zero-indexed position of this scalar field in its schema. + Used in order to index into the field_blob list returned by readers or + accepted by writers. + """ + return self._child_base_id() + + +def Map(keys, values, keys_name='keys', values_name='values'): + """A map is a List of Struct containing keys and values fields. + Optionally, you can provide custom name for the key and value fields. + """ + return List(Struct((keys_name, keys), (values_name, values))) + + +def from_dtype(dtype, _outer_shape=()): + """Constructs a Caffe2 schema from the given numpy's dtype. + + Numpy supports scalar, array-like and structured datatypes, as long as + all the shapes are fixed. This function breaks down the given dtype into + a Caffe2 schema containing `Struct` and `Scalar` types. 
+ + Fields containing byte offsets are not currently supported. + """ + if not isinstance(dtype, np.dtype): + # wrap into an np.dtype + shape = _outer_shape + dtype = np.dtype((dtype, _outer_shape)) + else: + # concatenate shapes if necessary + shape = _outer_shape + dtype.shape + if shape != dtype.shape: + dtype = np.dtype((dtype.base, shape)) + + if not dtype.fields: + return Scalar(dtype) + + struct_fields = [] + for name, (fdtype, offset) in dtype.fields.items(): + assert offset == 0, ('Fields with byte offsets are not supported.') + struct_fields.append((name, from_dtype(fdtype, _outer_shape=shape))) + return Struct(*struct_fields) + + +class _SchemaNode(object): + """This is a private class used to represent a Schema Node""" + def __init__(self, name, type_str=''): + self.name = name + self.children = [] + self.type_str = type_str + self.field = None + + def add_child(self, name, type_str=''): + for child in self.children: + if child.name == name and child.type_str == type_str: + return child + child = _SchemaNode(name, type_str) + self.children.append(child) + return child + + def get_field(self): + + list_names = ['lengths', 'values'] + map_names = ['lengths', 'keys', 'values'] + + if len(self.children) == 0 or self.field is not None: + assert self.field is not None + return self.field + + child_names = [] + for child in self.children: + child_names.append(child.name) + + if (set(child_names) == set(list_names)): + for child in self.children: + if child.name == 'values': + self.field = List(child.get_field()) + self.type_str = "List" + return self.field + + elif (set(child_names) == set(map_names)): + for child in self.children: + if child.name == 'keys': + key_field = child.get_field() + elif child.name == 'values': + values_field = child.get_field() + self.field = Map(key_field, values_field) + self.type_str = "Map" + return self.field + + else: + struct_fields = [] + for child in self.children: + if child.field is not None: + struct_fields.append((child.name, child.field)) + else: + struct_fields.append((child.name, child.get_field())) + + self.field = Struct(*struct_fields) + self.type_str = "Struct" + return self.field + + def print_recursively(self): + for child in self.children: + child.print_recursively() + logger.info("Printing node: Name and type") + logger.info(self.name) + logger.info(self.type_str) + + +def from_column_list(column_names, column_types): + + root = _SchemaNode('root', 'Struct') + for column_name, column_type in zip(column_names, column_types): + columns = column_name.split(':') + current = root + for i in range(len(columns)): + name = columns[i] + type_str = '' + field = None + if i == len(columns) - 1: + type_str = column_type + field = Scalar(column_type) + next = current.add_child(name, type_str) + if field is not None: + next.field = field + current = next + + return root.get_field() diff --git a/caffe2/python/sparse_to_dense_mask_test.py b/caffe2/python/sparse_to_dense_mask_test.py new file mode 100644 index 00000000000..2da4fbee48a --- /dev/null +++ b/caffe2/python/sparse_to_dense_mask_test.py @@ -0,0 +1,82 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from caffe2.python import core, workspace +from caffe2.python.test_util import TestCase + +import numpy as np + + +class TestSparseToDenseMask(TestCase): + + def test_sparse_to_dense_mask_float(self): + op = core.CreateOperator( + 'SparseToDenseMask', + ['indices', 'values', 'default', 'lengths'],
['output'], + mask=[999999999, 2, 6]) + workspace.FeedBlob( + 'indices', + np.array([2, 4, 6, 1, 2, 999999999, 2], dtype=np.int32)) + workspace.FeedBlob( + 'values', + np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.float)) + workspace.FeedBlob('default', np.array(-1, dtype=np.float)) + workspace.FeedBlob('lengths', np.array([3, 4], dtype=np.int32)) + workspace.RunOperatorOnce(op) + output = workspace.FetchBlob('output') + expected = np.array([[-1, 1, 3], [6, 7, -1]], dtype=np.float) + self.assertEqual(output.shape, expected.shape) + self.assertFalse(np.any(output - expected)) + + def test_sparse_to_dense_mask_string(self): + op = core.CreateOperator( + 'SparseToDenseMask', + ['indices', 'values', 'default', 'lengths'], + ['output'], + mask=[999999999, 2, 6]) + workspace.FeedBlob( + 'indices', + np.array([2, 4, 6, 1, 2, 999999999, 2], dtype=np.int32)) + workspace.FeedBlob( + 'values', + np.array(['1', '2', '3', '4', '5', '6', '7'], dtype=np.str)) + workspace.FeedBlob('default', np.array('-1', dtype=np.str)) + workspace.FeedBlob('lengths', np.array([3, 4], dtype=np.int32)) + workspace.RunOperatorOnce(op) + output = workspace.FetchBlob('output') + expected = np.array([['-1', '1', '3'], ['6', '7', '-1']], dtype=np.str) + self.assertEqual(output.shape, expected.shape) + self.assertTrue(np.all(np.equal(output, expected))) + + def test_sparse_to_dense_mask_empty_lengths(self): + op = core.CreateOperator( + 'SparseToDenseMask', + ['indices', 'values', 'default', 'lengths'], + ['output'], + mask=[1, 2, 6]) + workspace.FeedBlob('indices', np.array([2, 4, 6], dtype=np.int32)) + workspace.FeedBlob('values', np.array([1, 2, 3], dtype=np.float)) + workspace.FeedBlob('default', np.array(-1, dtype=np.float)) + workspace.FeedBlob('lengths', np.array([], dtype=np.int32)) + workspace.RunOperatorOnce(op) + output = workspace.FetchBlob('output') + expected = np.array([-1, 1, 3], dtype=np.float) + self.assertEqual(output.shape, expected.shape) + self.assertFalse(np.any(output - expected)) + + def test_sparse_to_dense_mask_no_lengths(self): + op = core.CreateOperator( + 'SparseToDenseMask', + ['indices', 'values', 'default'], + ['output'], + mask=[1, 2, 6]) + workspace.FeedBlob('indices', np.array([2, 4, 6], dtype=np.int32)) + workspace.FeedBlob('values', np.array([1, 2, 3], dtype=np.float)) + workspace.FeedBlob('default', np.array(-1, dtype=np.float)) + workspace.RunOperatorOnce(op) + output = workspace.FetchBlob('output') + expected = np.array([-1, 1, 3], dtype=np.float) + self.assertEqual(output.shape, expected.shape) + self.assertFalse(np.any(output - expected)) diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index bc1af5d6bb6..146a5c2f541 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -6,10 +6,18 @@ import shutil import socket import tempfile +import numpy as np from caffe2.proto import caffe2_pb2 from caffe2.python import scope, utils from ._import_c_extension import * # noqa +# Python 2 and 3 compatibility: test if basestring exists +try: + basestring # NOQA +except NameError: + # This is python3 so we define basestring. 
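    # (An equivalent editorial sketch of this shim, for reference only:
    #   import sys
    #   string_types = (str,) if sys.version_info[0] >= 3 else (basestring,)
    # caffe2 instead reuses the `basestring` name directly.)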
+ basestring = str + def _GetFreeFlaskPort(): """Get a free flask port.""" @@ -86,7 +94,9 @@ def ResetWorkspace(root_folder=None): return cc_ResetWorkspace(root_folder) -def CreateNet(net, input_blobs=[]): +def CreateNet(net, input_blobs=None): + if input_blobs is None: + input_blobs = [] for input_blob in input_blobs: CreateBlob(input_blob) return cc_CreateNet(StringfyProto(net)) @@ -112,6 +122,14 @@ def RunPlan(plan): return cc_RunPlan(StringfyProto(plan)) +def _StringifyBlobName(name): + if isinstance(name, basestring): + return name + assert type(name).__name__ == 'BlobReference', \ + "Expected a string or BlobReference" + return str(name) + + def FeedBlob(name, arr, device_option=None): """Feeds a blob into the workspace. @@ -125,6 +143,10 @@ def FeedBlob(name, arr, device_option=None): """ if type(arr) is caffe2_pb2.TensorProto: arr = utils.Caffe2TensorToNumpyArray(arr) + if type(arr) is np.ndarray and arr.dtype.kind == 'S': + # Plain NumPy strings are weird, let's use objects instead + arr = arr.astype(np.object) + name = _StringifyBlobName(name) if device_option is not None: return cc_FeedBlob(name, arr, StringfyProto(device_option)) elif scope.DEVICESCOPE is not None: @@ -133,6 +155,40 @@ def FeedBlob(name, arr, device_option=None): return cc_FeedBlob(name, arr) +def FetchBlob(name): + """Fetches a blob from the workspace. + + Inputs: + name: the name of the blob - a string or a BlobReference + Returns: + Fetched blob (numpy array or string) if successful + """ + name = _StringifyBlobName(name) + return cc_FetchBlob(name) + + +class _BlobDict(object): + """Provides python dict compatible way to do fetching and feeding""" + + def __getitem__(self, key): + return FetchBlob(key) + + def __setitem__(self, key, value): + return FeedBlob(key, value) + + def __len__(self): + return len(Blobs()) + + def __iter__(self): + return Blobs().__iter__() + + def __contains__(self, item): + return HasBlob(item) + + +blobs = _BlobDict() + + class Model(object): def __init__(self, net, parameters, inputs, outputs, device_option=None): """Initializes a model. diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 39cf2712cb4..c93fdee0c93 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -8,7 +8,8 @@ from caffe2.python import core, test_util, workspace class TestWorkspace(unittest.TestCase): def setUp(self): self.net = core.Net("test-net") - self.net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0) + self.testblob_ref = self.net.ConstantFill( + [], "testblob", shape=[1, 2, 3, 4], value=1.0) workspace.ResetWorkspace() def testRootFolder(self): @@ -64,6 +65,20 @@ class TestWorkspace(unittest.TestCase): self.assertEqual(fetched_again.shape, (1, 2, 3, 4)) np.testing.assert_array_equal(fetched_again, 2.0) + def testFetchFeedBlobViaBlobReference(self): + self.assertEqual( + workspace.RunNetOnce(self.net.Proto().SerializeToString()), True) + fetched = workspace.FetchBlob(self.testblob_ref) + # check if fetched is correct. 
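    # (Editorial note: FetchBlob accepts the BlobReference itself because
    # _StringifyBlobName falls back to str(name); the call above is
    # equivalent to workspace.FetchBlob(str(self.testblob_ref)), i.e.
    # workspace.FetchBlob("testblob").)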
+ self.assertEqual(fetched.shape, (1, 2, 3, 4)) + np.testing.assert_array_equal(fetched, 1.0) + fetched[:] = 2.0 + self.assertEqual(workspace.FeedBlob(self.testblob_ref, fetched), True) + fetched_again = workspace.FetchBlob("testblob") # fetch by name now + self.assertEqual(fetched_again.shape, (1, 2, 3, 4)) + np.testing.assert_array_equal(fetched_again, 2.0) + + def testFetchFeedBlobTypes(self): for dtype in [np.float16, np.float32, np.float64, np.bool, np.int8, np.int16, np.int32, np.int64, @@ -101,7 +116,8 @@ class TestWorkspace(unittest.TestCase): strs = np.array([ ' '.join(10 * ['long string']), ' '.join(128 * ['very long string']), - 'small string']) + 'small \0\1\2 string', + "Hello, world! I have special \0 symbols \1!"]) workspace.FeedBlob('my_str_tensor', strs) strs2 = workspace.FetchBlob('my_str_tensor') self.assertEqual(strs.shape, strs2.shape) @@ -117,6 +133,32 @@ class TestWorkspace(unittest.TestCase): for i in range(0, strs.shape[0]): self.assertEqual(strs[i], strs2[i]) + def testFetchFeedPlainString(self): + # this is actual string, not a tensor of strings + s = "Hello, world! I have special \0 symbols \1!" + workspace.FeedBlob('my_plain_string', s) + s2 = workspace.FetchBlob('my_plain_string') + self.assertEqual(s, s2) + + def testFetchFeedViaBlobDict(self): + self.assertEqual( + workspace.RunNetOnce(self.net.Proto().SerializeToString()), True) + fetched = workspace.blobs["testblob"] + # check if fetched is correct. + self.assertEqual(fetched.shape, (1, 2, 3, 4)) + np.testing.assert_array_equal(fetched, 1.0) + fetched[:] = 2.0 + workspace.blobs["testblob"] = fetched + fetched_again = workspace.blobs["testblob"] + self.assertEqual(fetched_again.shape, (1, 2, 3, 4)) + np.testing.assert_array_equal(fetched_again, 2.0) + + self.assertTrue("testblob" in workspace.blobs) + self.assertFalse("non_existant" in workspace.blobs) + self.assertEqual(len(workspace.blobs), 1) + for key in workspace.blobs: + self.assertEqual(key, "testblob") + class TestMultiWorkspaces(unittest.TestCase): def setUp(self): diff --git a/caffe2/sgd/adagrad_op.h b/caffe2/sgd/adagrad_op.h index 1b2ba7d4861..20fc11503dc 100644 --- a/caffe2/sgd/adagrad_op.h +++ b/caffe2/sgd/adagrad_op.h @@ -14,7 +14,8 @@ void adagrad_update( float epsilon, const float* lr, Context* context) { -#pragma omp parallel for + // TODO(cxj): use OMP when it is reliable + // #pragma omp parallel for for (auto i = 0; i < N; ++i) { float gi = g[i]; float hi = nh[i] = h[i] + gi * gi; @@ -78,8 +79,8 @@ class SparseAdagradOp final : public Operator { const auto* momentIn = Input(MOMENT_1).template data(); auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data(); - -#pragma omp parallel for + // TODO(cxj): use OMP when it is reliable + // #pragma omp parallel for for (auto i = 0; i < n; ++i) { auto idx = indices[i]; if (block_size == 1) { diff --git a/caffe2/sgd/ftrl_op.cc b/caffe2/sgd/ftrl_op.cc index b7062fb80ab..f11c934c416 100644 --- a/caffe2/sgd/ftrl_op.cc +++ b/caffe2/sgd/ftrl_op.cc @@ -41,7 +41,8 @@ void ftrl_update( T* new_nz, const FtrlParams& params, Context* context) { -#pragma omp parallel for + // TODO(cxj): use OMP when it is reliable + // #pragma omp parallel for for (auto i = 0; i < N; ++i) { ftrl_compute( w[i], @@ -93,11 +94,13 @@ void SparseFtrlOp::DoRun() { T* nz = n_z->template mutable_data(); const SIndex* idxs = indices.template data(); const T* g = grad.template data(); -#pragma omp parallel for + + // TODO(cxj): use OMP when it is reliable + // 
#pragma omp parallel for for (TIndex i = 0; i < K; ++i) { SIndex idx = idxs[i]; DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx - << ", range 0 to " << N; + << ", range 0 to " << N; if (block_size == 1) { ftrl_compute( w[idx], diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index ac35580b83e..195b023fb38 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -11,9 +11,11 @@ // platforms, it allows one to quickly port Caffe2 to different platforms // where BLAS may not be present. -#include #include +#include +#include #include +#include #ifdef CAFFE2_USE_MKL #include @@ -486,6 +488,7 @@ void Set(const int N, const T alpha, T *Y, \ CAFFE2_SPECIALIZED_SET(float); CAFFE2_SPECIALIZED_SET(double); CAFFE2_SPECIALIZED_SET(int); +CAFFE2_SPECIALIZED_SET(bool); #undef CAFFE2_SPECIALIZED_SET #define CAFFE2_INSTANTIATE_BINARY_OP(name, op, T) \ diff --git a/caffe2/utils/string_utils.cc b/caffe2/utils/string_utils.cc new file mode 100644 index 00000000000..540c528d21a --- /dev/null +++ b/caffe2/utils/string_utils.cc @@ -0,0 +1,14 @@ +#include "caffe2/utils/string_utils.h" + +namespace caffe2 { + +std::vector<std::string> split(char separator, const std::string& string) { + std::vector<std::string> pieces; + std::stringstream ss(string); + std::string item; + while (getline(ss, item, separator)) { + pieces.push_back(std::move(item)); + } + return pieces; +} +} diff --git a/caffe2/utils/string_utils.h b/caffe2/utils/string_utils.h new file mode 100644 index 00000000000..f73ff601992 --- /dev/null +++ b/caffe2/utils/string_utils.h @@ -0,0 +1,10 @@ +#pragma once + +#include <sstream> +#include <string> +#include <vector> + +namespace caffe2 { + +std::vector<std::string> split(char separator, const std::string& string); +}
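For readers more fluent in Python, the getline()-based loop above behaves like str.split with a single-character separator, except that it does not emit a trailing empty piece when the string ends with the separator (and yields nothing for an empty string). A minimal Python model of that behavior (an editorial sketch, not caffe2 API):

    def split(separator, string):
        """Mirror caffe2::split(): like str.split, minus one trailing ''."""
        pieces = string.split(separator)
        if pieces and pieces[-1] == '':
            pieces.pop()
        return pieces

    assert split(':', 'a:b') == ['a', 'b']
    assert split(':', 'a:') == ['a']   # getline() stops at EOF
    assert split(':', '') == []        # first getline() fails immediately
    assert split(':', '::') == ['', '']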