This commit is contained in:
Yangqing Jia 2016-07-28 15:06:04 -07:00
parent f01f2063dd
commit bcea409c82
70 changed files with 5124 additions and 383 deletions

View File

@ -11,12 +11,70 @@ CAFFE2_DEFINE_int(
"Chunk size to split tensor data into");
namespace caffe2 {
namespace {
/**
* @brief StringSerializer is the serializer for String.
*
* StringSerializer takes in a blob that contains a String, and serializes it
* into a BlobProto protocol buffer.
*/
class StringSerializer : public BlobSerializerBase {
public:
StringSerializer() {}
~StringSerializer() {}
/**
* Serializes a Blob. Note that this blob has to contain a std::string,
* otherwise this function produces a fatal error.
*/
void Serialize(
const Blob& blob,
const string& name,
SerializationAcceptor acceptor) override {
CHECK(blob.IsType<std::string>());
BlobProto blob_proto;
blob_proto.set_name(name);
blob_proto.set_type("std::string");
blob_proto.set_content(blob.template Get<std::string>());
acceptor(name, blob_proto.SerializeAsString());
}
};
/**
* @brief StringDeserializer is the deserializer for Strings.
*
*/
class StringDeserializer : public BlobDeserializerBase {
public:
bool Deserialize(const BlobProto& proto, Blob* blob) override {
*blob->GetMutable<std::string>() = proto.content();
return true;
}
};
}
namespace {
// We can't use DeviceType_Name because of a protobuf-lite constraint.
std::string tensorDeviceTypeName(const DeviceType& d) {
switch (d) {
case CPU:
return "TensorCPU";
case CUDA:
return "TensorCUDA";
default:
CAFFE_THROW("Unknown device: ", d);
return "";
}
}
}
// The blob serialization member function implementation.
void Blob::Serialize(
const string& name,
BlobSerializerBase::SerializationAcceptor acceptor) const {
std::unique_ptr<BlobSerializerBase> serializer(CreateSerializer(meta_.id()));
CAFFE_ENFORCE(serializer, "No known serializer for ", meta_.name());
serializer->Serialize(*this, name, acceptor);
}
@ -33,7 +91,6 @@ std::string Blob::Serialize(const string& name) const {
return data.str();
}
// Specialization for StoreDeviceDetail for CPU - nothing needs to be done.
template <>
void TensorSerializer<CPUContext>::StoreDeviceDetail(
@ -60,9 +117,8 @@ bool Blob::Deserialize(const BlobProto& blob_proto) {
if (blob_proto.has_tensor()) {
// This is a tensor object. Depending on the device type, we will
// use the corresponding TensorDeserializer.
auto deserializer = CreateDeserializer(
"Tensor" +
DeviceType_Name(blob_proto.tensor().device_detail().device_type()));
auto deserializer = CreateDeserializer(tensorDeviceTypeName(
blob_proto.tensor().device_detail().device_type()));
// Tensor's deserializer should always be registered, but we will double
// check if it is not null anyway.
return CHECK_NOTNULL(deserializer.get())->Deserialize(blob_proto, this);
@ -82,5 +138,8 @@ REGISTER_BLOB_SERIALIZER(
(TypeMeta::Id<TensorCPU>()),
TensorSerializer<CPUContext>);
REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer<CPUContext>);
// Serialize std::string
REGISTER_BLOB_SERIALIZER((TypeMeta::Id<std::string>()), StringSerializer);
REGISTER_BLOB_DESERIALIZER(std::string, StringDeserializer);
} // namespace
} // namespace caffe2

View File

@ -69,6 +69,20 @@ TEST(BlobTest, BlobWrongType) {
ASSERT_THROW(blob.Get<int>(), EnforceNotMet);
}
TEST(BlobTest, StringSerialization) {
const std::string kTestString = "Hello world?";
Blob blob;
*blob.GetMutable<std::string>() = kTestString;
string serialized = blob.Serialize("test");
BlobProto proto;
CHECK(proto.ParseFromString(serialized));
EXPECT_EQ(proto.name(), "test");
EXPECT_EQ(proto.type(), "std::string");
EXPECT_FALSE(proto.has_tensor());
EXPECT_EQ(proto.content(), kTestString);
}
TEST(TensorNonTypedTest, TensorChangeType) {
vector<int> dims(3);
dims[0] = 2;

View File

@ -5,8 +5,9 @@
#include <cstdlib>
#include <random>
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/typeid.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
@ -103,6 +104,7 @@ class CPUContext final {
// Two copy functions that deal with cross-device copies.
template <class SrcContext, class DstContext>
inline void CopyBytes(size_t nbytes, const void* src, void* dst);
template <typename T, class SrcContext, class DstContext>
inline void Copy(size_t n, const T* src, T* dst) {
if (std::is_fundamental<T>::value) {
@ -116,6 +118,16 @@ class CPUContext final {
}
}
template <class SrcContext, class DstContext>
inline void
CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) {
if (meta.copy()) {
meta.copy()(src, dst, n);
} else {
CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
}
}
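// Usage sketch (illustration only, not part of this header): copying n items
// described by `meta` between two CPU buffers. If meta.copy() is set (types
// that need a real copy constructor), the typed copy is used; otherwise the
// call falls back to CopyBytes of n * meta.itemsize() raw bytes.
//
//   CPUContext ctx;
//   ctx.CopyItems<CPUContext, CPUContext>(meta, n, src, dst);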
protected:
// TODO(jiayq): instead of hard-coding a generator, make it more flexible.
int random_seed_{1701};

View File

@ -7,6 +7,16 @@ thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;
namespace {
bool Caffe2UsePinnedCPUAllocator(int*, char***) {
#ifdef __SANITIZE_ADDRESS__
// Note(jiayq): for more details, see
// https://github.com/google/sanitizers/issues/629
LOG(WARNING) << "There are known issues between address sanitizer and "
"cudaMallocHost. As a result, caffe2 will not enable pinned "
"memory allocation in asan mode. If you are expecting any "
"behavior that depends on asan, be advised that it is not "
"turned on.";
return true;
#else
if (!HasCudaGPU()) {
VLOG(1) << "No GPU present. I won't use pinned allocator then.";
return true;
@ -14,6 +24,7 @@ bool Caffe2UsePinnedCPUAllocator(int*, char***) {
VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator.";
SetCPUAllocator(new PinnedCPUAllocator());
return true;
#endif
}
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2UsePinnedCPUAllocator,

View File

@ -116,7 +116,9 @@ CAFFE_DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
*/
inline unique_ptr<DB> CreateDB(
const string& db_type, const string& source, Mode mode) {
return Caffe2DBRegistry()->Create(db_type, source, mode);
auto result = Caffe2DBRegistry()->Create(db_type, source, mode);
VLOG(1) << ((!result) ? "not found db " : "found db ") << db_type;
return result;
}
/**

View File

@ -68,13 +68,13 @@ class InitRegisterer {
#define REGISTER_CAFFE2_INIT_FUNCTION(name, function, description) \
namespace { \
::caffe2::InitRegisterer g_caffe2_initregisterer_name( \
::caffe2::InitRegisterer g_caffe2_initregisterer_##name( \
function, false, description); \
} // namespace
#define REGISTER_CAFFE2_EARLY_INIT_FUNCTION(name, function, description) \
namespace { \
::caffe2::InitRegisterer g_caffe2_initregisterer_name( \
::caffe2::InitRegisterer g_caffe2_initregisterer_##name( \
function, true, description); \
} // namespace
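// Illustration of the fix above (hypothetical name): with token pasting,
// REGISTER_CAFFE2_INIT_FUNCTION(MyInit, ...) expands to a variable named
// g_caffe2_initregisterer_MyInit. The previous version always produced the
// literal identifier g_caffe2_initregisterer_name, so two registrations in
// the same translation unit would collide.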

View File

@ -1,14 +1,70 @@
#include "caffe2/core/net.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/flags.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include "caffe2/proto/caffe2.pb.h"
#ifdef CAFFE2_USE_NVTX
#include <nvToolsExt.h>
#endif
CAFFE2_DEFINE_bool(caffe2_use_nvtx, false, "Use NVTX ranges for profiling");
namespace caffe2 {
namespace {
using Color = int32_t;
constexpr Color kRunColor = 0x0000CCFF; // blue
constexpr Color kRecordColor = 0x00FF3300; // red
constexpr Color kWaitColor = 0x0066FF33; // green
#ifdef CAFFE2_USE_NVTX
class ProfiledRange {
public:
ProfiledRange(const OperatorDef& def, Color color) {
if (!FLAGS_caffe2_use_nvtx) {
return;
}
nvtxEventAttributes_t eventAttrib = {0};
eventAttrib.version = NVTX_VERSION;
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
eventAttrib.colorType = NVTX_COLOR_ARGB;
eventAttrib.color = color;
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
eventAttrib.message.ascii = def.type().c_str();
range_ = nvtxRangeStartEx(&eventAttrib);
CHECK(range_);
}
~ProfiledRange() {
if (!FLAGS_caffe2_use_nvtx) {
return;
}
nvtxRangeEnd(range_);
}
private:
nvtxRangeId_t range_ = 0;
DISABLE_COPY_AND_ASSIGN(ProfiledRange);
};
#else
class ProfiledRange {
public:
ProfiledRange(const OperatorDef& def, Color color) {}
private:
DISABLE_COPY_AND_ASSIGN(ProfiledRange);
};
#endif // ifdef CAFFE2_USE_NVTX
struct Stream;
struct Event {
@ -69,6 +125,7 @@ struct Stream {
int gpu_id_{-1};
cudaStream_t stream_{nullptr};
private:
DISABLE_COPY_AND_ASSIGN(Stream);
};
@ -128,18 +185,24 @@ class AsyncDAGNet : public DAGNetBase {
}));
for (auto source_parent_idx : operator_nodes_[source_idx].parents_) {
ProfiledRange r(
operator_nodes_[source_parent_idx].operator_->def(), kWaitColor);
stream.wait(events_[source_parent_idx].get());
}
// We've waited on all our parent indices.
bool success = true;
for (auto idx: chain) {
for (auto idx : chain) {
ProfiledRange r(operator_nodes_[idx].operator_->def(), kRunColor);
success &= operator_nodes_[idx].operator_->RunAsync();
}
// Record an event for the sink of the chain.
const auto& sink_idx = chain.back();
events_[sink_idx]->record(stream);
{
ProfiledRange r(operator_nodes_[sink_idx].operator_->def(), kRecordColor);
events_[sink_idx]->record(stream);
}
CHECK(!eventRecorded_[sink_idx]);
eventRecorded_[sink_idx] = 1;
return success;
@ -157,9 +220,11 @@ class AsyncDAGNet : public DAGNetBase {
Stream stream{device_option};
// Potential optimization: we can pre-compute outstanding events.
for (auto& event : events_) {
for (auto i = 0; i < events_.size(); ++i) {
auto& event = events_[i];
if (event->outstanding_) {
VLOG(2) << "Synchronizing host on outstanding event";
ProfiledRange r(operator_nodes_[i].operator_->def(), kWaitColor);
stream.wait(event.get());
}
}

View File

@ -2,17 +2,6 @@
#include "caffe2/core/flags.h"
CAFFE2_DEFINE_bool(
caffe2_keep_on_shrink, false,
caffe2_keep_on_shrink,
true,
"If set, keeps memory when a tensor is shrinking its size.");
namespace caffe2 {
namespace detail {
vector<TIndex>& shape(size_t n) {
static thread_local vector<TIndex> r;
r.resize(n);
return r;
}
}
}

View File

@ -150,6 +150,40 @@ class Tensor {
virtual ~Tensor() {}
/**
* @brief Extends the outer-most dimension of this tensor by num elements,
* preserving the existing data.
*
* The underlying data may be reallocated in order to accommodate the new
* elements, in which case this tensor's capacity is grown by growthPct
* percent. This ensures that Extend runs in amortized O(1) time
* complexity.
*/
template <class ContextForCopy>
void Extend(TIndex num, int growthPct, ContextForCopy* context) {
CHECK_GE(dims_.size(), 1);
auto oldSize = size_;
auto newDims = dims_;
newDims[0] += num;
if (!data_) {
Resize(newDims);
return;
}
auto newSize = std::accumulate(
newDims.begin(), newDims.end(), 1, std::multiplies<TIndex>());
if (newSize * meta_.itemsize() > capacity_) {
auto newCapacity = dims_;
newCapacity[0] = std::max(newDims[0], dims_[0] * (growthPct + 100) / 100);
auto oldData = std::move(data_);
Resize(newCapacity);
auto* newData = raw_mutable_data(meta_);
context->template CopyItems<ContextForCopy, ContextForCopy>(
meta_, oldSize, oldData.get(), newData);
}
dims_ = newDims;
size_ = newSize;
}
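// Usage sketch (illustration only, assuming a CPUContext `ctx` and a
// TensorCPU `t` currently holding 4 rows of 3 floats):
//
//   t.Extend(2, /*growthPct=*/40, &ctx);
//
// t.dims() becomes {6, 3}; the first 4 rows are preserved, and when a
// reallocation is needed the capacity grows to at least 1.4x the old outer
// dimension, so repeated Extend calls reallocate only O(log n) times.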
/**
* @brief Resizes a tensor.
*
@ -297,9 +331,12 @@ class Tensor {
CHECK(data_.get() || size_ == 0)
<< "The tensor is uninitialized. You probably need to call "
<< "Resize() and mutable_data() first.";
CHECK(IsType<T>())
<< "Tensor type mistmatch, caller expects elements to be "
<< TypeMeta::Name<T>() << " while tensor contains " << meta_.name();
CAFFE_ENFORCE(
IsType<T>(),
"Tensor type mistmatch, caller expects elements to be ",
TypeMeta::Name<T>(),
" while tensor contains ",
meta_.name());
return static_cast<T*>(data_.get());
}

View File

@ -12,34 +12,39 @@
namespace caffe2 {
namespace {
// Returns a function that returns `true` if we should continue
// iterating, given the current iteration count.
std::function<bool(int)> getContinuationTest(
Workspace* ws,
const ExecutionStep& step) {
if (step.has_criteria_network()) {
CHECK(!step.has_num_iter())
<< "Must not specify num_iter if critera_network is set";
// try to get the should_stop signal, a scalar bool blob value.
// if the blob doesn't exist or is not initialized, return false
const bool getShouldStop(const Blob* b) {
if (!b || !b->meta().id()) { // not exist or uninitialized
return false;
}
if (!step.has_criteria_network()) {
int iterations = step.has_num_iter() ? step.num_iter() : 1;
VLOG(1) << "Executing step for " << iterations << " iterations.";
return [=](int i) { return i < iterations; };
const auto& t = b->Get<TensorCPU>();
CAFFE_ENFORCE(t.IsType<bool>() && t.size() == 1, "expects a scalar boolean");
return *(t.template data<bool>());
}
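// Sketch of a blob that getShouldStop() accepts (illustration only; `ws` is
// an assumed Workspace*). A missing or uninitialized blob means "keep going".
//
//   auto* t = ws->CreateBlob("should_stop")->GetMutable<TensorCPU>();
//   t->Resize(std::vector<TIndex>{});      // scalar shape
//   *t->mutable_data<bool>() = false;      // set to true to stop the step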
// Returns a function that returns `true` if we should continue
// iterating, given the current iteration count.
std::function<bool(int64_t)> getContinuationTest(
Workspace* ws,
const ExecutionStep& step) {
if (step.has_should_stop_blob()) {
CAFFE_ENFORCE(
!step.has_num_iter(),
"Must not specify num_iter if should_stop_blob is set");
}
if (!step.has_should_stop_blob()) {
int64_t iterations = step.has_num_iter() ? step.num_iter() : 1;
VLOG(1) << "Will execute step " << step.name() << " for " << iterations
<< " iterations.";
return [=](int64_t i) { return i < iterations; };
} else {
VLOG(1) << "Will execute step " << step.name() << " until stopped by blob "
<< step.should_stop_blob();
return [](int64_t i) { return true; };
}
auto* criteria_network = ws->GetNet(step.criteria_network());
CHECK_NOTNULL(criteria_network);
CHECK_EQ(criteria_network->external_output().size(), 1);
const auto& criteria_output = criteria_network->external_output().front();
VLOG(1) << "Executing step controlled by criteria output: "
<< criteria_output;
return [=](int) {
criteria_network->Run();
const auto& blob = ws->GetBlob(criteria_output)->Get<TensorCPU>();
CHECK_EQ(blob.size(), 1);
CHECK(blob.IsType<bool>());
return blob.template data<bool>()[0] > 0;
};
};
} // namespace
@ -229,10 +234,17 @@ struct Reporter {
}
#define CHECK_SHOULD_STOP(shouldStop) \
if (getShouldStop(shouldStop)) { \
VLOG(1) << "Execution stopped by should_stop_blob"; \
return true; \
}
bool Workspace::ExecuteStepRecursive(
const ExecutionStep& step,
ShouldContinue externalShouldContinue) {
LOG(INFO) << "Running execution step " << step.name();
VLOG(1) << "Running execution step " << step.name();
if (!(step.substep_size() == 0 || step.network_size() == 0)) {
LOG(ERROR) << "An ExecutionStep should either have substep or networks "
<< "but not both.";
@ -247,49 +259,67 @@ bool Workspace::ExecuteStepRecursive(
if (net_map_.count(step.report_net()) == 0) {
LOG(ERROR) << "Report net " << step.report_net() << " not found.";
}
VLOG(1) << "Starting reporter net";
reporter.start(net_map_[step.report_net()].get(), step.report_interval());
}
const Blob* shouldStop = nullptr;
if (step.has_should_stop_blob()) {
shouldStop = GetBlob(step.should_stop_blob());
CAFFE_ENFORCE(
shouldStop, "blob ", step.should_stop_blob(), " does not exist");
}
const auto netShouldContinue = getContinuationTest(this, step);
const auto shouldContinue = [&](int iter) {
const auto shouldContinue = [&](int64_t iter) {
return externalShouldContinue(iter) && netShouldContinue(iter);
};
if (step.substep_size()) {
for (int iter = 0; shouldContinue(iter); ++iter) {
// we assume that, if we have substeps, each substep is going to take a
// reasonable amount of time, so logging here is fine
LOG(INFO) << "Execution step " << step.name()
<< ": Starting iteration " << iter;
std::atomic<int> next_substep{0};
std::atomic<bool> got_failure{false};
auto substepShouldContinue = [&, externalShouldContinue](int iter) {
return !got_failure && externalShouldContinue(iter);
};
auto worker = [&]() {
while (true) {
int substep_id = next_substep++;
if (got_failure || (substep_id >= step.substep().size())) {
break;
}
if (!ExecuteStepRecursive(step.substep().Get(substep_id),
substepShouldContinue)) {
got_failure = true;
}
}
};
for (int64_t iter = 0; shouldContinue(iter); ++iter) {
VLOG(1) << "Execution step " << step.name() << ": iteration " << iter;
if (!step.concurrent_substeps() || step.substep().size() <= 1) {
worker();
auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
return externalShouldContinue(iter);
};
for (auto& ss : step.substep()) {
if (!ExecuteStepRecursive(ss, substepShouldContinue)) {
return false;
}
CHECK_SHOULD_STOP(shouldStop);
}
} else {
std::atomic<int> next_substep{0};
std::atomic<bool> got_failure{false};
auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
return !got_failure && externalShouldContinue(iter);
};
auto worker = [&]() {
while (true) {
int substep_id = next_substep++;
if (got_failure || (substep_id >= step.substep().size())) {
break;
}
if (!ExecuteStepRecursive(
step.substep().Get(substep_id), substepShouldContinue)) {
got_failure = true;
}
}
};
std::vector<std::thread> threads;
for (int i = 0; i < step.substep().size(); ++i) {
for (int64_t i = 0; i < step.substep().size(); ++i) {
threads.emplace_back(worker);
}
for (auto& thread: threads) {
thread.join();
}
}
if (got_failure) {
return false;
if (got_failure) {
return false;
}
// concurrent substeps should be careful about setting should_stop_blob
CHECK_SHOULD_STOP(shouldStop);
}
}
return true;
@ -305,16 +335,19 @@ bool Workspace::ExecuteStepRecursive(
VLOG(1) << "Going to execute network " << network_name;
networks.push_back(net_map_[network_name].get());
}
for (int iter = 0; shouldContinue(iter); ++iter) {
for (int64_t iter = 0; shouldContinue(iter); ++iter) {
VLOG(1) << "Executing network iteration " << iter;
for (NetBase* network : networks) {
if (!network->Run()) {
return false;
}
CHECK_SHOULD_STOP(shouldStop);
}
}
}
return true;
}
#undef CHECK_SHOULD_STOP
} // namespace caffe2

View File

@ -0,0 +1,73 @@
#include <mutex>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
namespace fb {
namespace {
class CreateMutexOp final : public Operator<CPUContext> {
public:
CreateMutexOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {}
bool RunOnDevice() override {
*OperatorBase::Output<std::unique_ptr<std::mutex>>(0) =
std::unique_ptr<std::mutex>(new std::mutex);
return true;
}
};
class AtomicFetchAddOp final : public Operator<CPUContext> {
public:
AtomicFetchAddOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {}
bool RunOnDevice() override {
auto& mutex = OperatorBase::Input<std::unique_ptr<std::mutex>>(0);
auto& a = Input(1);
auto& b = Input(2);
auto* c = Output(0);
auto* d = Output(1);
c->Resize(std::vector<TIndex>());
d->Resize(std::vector<TIndex>());
auto* aPtr = a.data<int32_t>();
auto* bPtr = b.data<int32_t>();
auto* cPtr = c->mutable_data<int32_t>();
auto* dPtr = d->mutable_data<int32_t>();
std::lock_guard<std::mutex> lg(*mutex);
*dPtr = *aPtr;
*cPtr = *aPtr + *bPtr;
return true;
}
};
REGISTER_CPU_OPERATOR(CreateMutex, CreateMutexOp);
REGISTER_CPU_OPERATOR(AtomicFetchAdd, AtomicFetchAddOp);
OPERATOR_SCHEMA(CreateMutex)
.NumInputs(0)
.NumOutputs(1)
.SetDoc("Creates an unlocked mutex and returns it in a unique_ptr blob.")
.Output(0, "mutex_ptr", "Blob containing a std::unique_ptr<mutex>.");
OPERATOR_SCHEMA(AtomicFetchAdd)
.NumInputs(3)
.NumOutputs(2)
.SetDoc(R"DOC(
Given a mutex and two int32 scalar tensors, performs an atomic fetch-add:
under the mutex, the value tensor is incremented by the increment tensor.
Returns the updated integer and the value prior to the update.
)DOC")
.Input(0, "mutex_ptr", "Blob containing to a unique_ptr<mutex>")
.Input(1, "mut_value", "Value to be mutated after the sum.")
.Input(2, "increment", "Value to add to the first operand.")
.Output(0, "mut_value", "Mutated value after sum. Usually same as input 1.")
.Output(1, "fetched_value", "Value of the first operand before sum.")
.AllowInplace({{1, 0}});
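// Worked example (hypothetical values): with mut_value == 5 and
// increment == 3, AtomicFetchAdd writes 8 to output 0 (mut_value, which may
// be in-place with input 1) and 5 to output 1 (fetched_value). Because the
// update happens under the mutex, concurrent increments from multiple nets
// serialize correctly.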
SHOULD_NOT_DO_GRADIENT(CreateMutex);
SHOULD_NOT_DO_GRADIENT(AtomicFetchAdd);
} // namespace
} // namespace fb
} // namespace caffe2

View File

@ -0,0 +1,143 @@
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
namespace caffe2 {
namespace {
template <class Context>
class BooleanMaskLengthsOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
BooleanMaskLengthsOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {}
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
}
template <typename T>
bool DoRunWithType() {
auto& lengths = Input(0);
auto& mask = Input(1);
auto* lengthsOut = Output(0);
CAFFE_ENFORCE(lengths.ndim() == 1);
CAFFE_ENFORCE(mask.ndim() == 1);
const auto* lengthsPtr = lengths.template data<T>();
const auto* maskPtr = mask.template data<bool>();
auto totalLength =
std::accumulate(lengthsPtr, lengthsPtr + lengths.size(), 0);
CAFFE_ENFORCE(mask.size() == totalLength);
lengthsOut->ResizeLike(lengths);
auto* lengthsOutPtr = lengthsOut->template mutable_data<T>();
int p = 0;
for (int i = 0; i < lengths.size(); ++i) {
T lengthOut = 0;
for (int j = 0; j < lengthsPtr[i]; ++j) {
if (maskPtr[p++]) {
++lengthOut;
}
}
lengthsOutPtr[i] = lengthOut;
}
return true;
}
};
template <class Context>
class BooleanMaskOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
BooleanMaskOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {}
bool RunOnDevice() override {
auto& data = Input(0);
auto& mask = Input(1);
auto* dataOut = Output(0);
CAFFE_ENFORCE(data.ndim() >= 1);
CAFFE_ENFORCE(mask.ndim() == 1);
CAFFE_ENFORCE(data.dims()[0] == mask.dims()[0]);
const auto* maskPtr = mask.template data<bool>();
int numOutputs = 0;
int outerSize = mask.size();
for (int i = 0; i < outerSize; ++i) {
if (maskPtr[i]) {
++numOutputs;
}
}
std::vector<TIndex> outShape;
outShape.push_back(numOutputs);
outShape.insert(outShape.end(), data.dims().begin() + 1, data.dims().end());
dataOut->Resize(outShape);
if (numOutputs == 0) {
return true;
}
auto innerSizeBytes = std::accumulate(
data.dims().begin() + 1,
data.dims().end(),
1,
std::multiplies<TIndex>()) *
data.meta().itemsize();
TIndex lastStart = -1;
const auto* inPtr = (char*)data.raw_data();
auto* outPtr = (char*)dataOut->raw_mutable_data(data.meta());
TIndex outStart = 0;
for (TIndex i = 0;; ++i) {
// mask was true and either a) became false, or b) sequence finished
if (lastStart != -1 && ((i >= outerSize) || !maskPtr[i])) {
const auto* src = inPtr + lastStart * innerSizeBytes;
auto* dst = outPtr + outStart * innerSizeBytes;
int numItems = i - lastStart;
if (data.meta().copy()) {
data.meta().copy()(src, dst, numItems);
} else {
context_.template CopyBytes<CPUContext, CPUContext>(
numItems * data.meta().itemsize(), src, dst);
}
outStart += numItems;
lastStart = -1;
}
if (i >= outerSize) {
break;
}
// mask was false and became true
if (lastStart == -1 && maskPtr[i]) {
lastStart = i;
}
}
return true;
}
};
REGISTER_CPU_OPERATOR(BooleanMask, BooleanMaskOp<CPUContext>);
REGISTER_CPU_OPERATOR(BooleanMaskLengths, BooleanMaskLengthsOp<CPUContext>);
OPERATOR_SCHEMA(BooleanMask)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Given a data 1D tensor and a mask (boolean) tensor of same shape, returns a
tensor containing only the elements corresponding to positions where the mask
is true.
)DOC")
.Input(0, "data", "The 1D, original data tensor.")
.Input(1, "mask", "A tensor of bools of same shape as `data`.")
.Output(0, "masked_data", "A tensor of same type as `data`.");
OPERATOR_SCHEMA(BooleanMaskLengths)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Given a tensor of int32 segment lengths and a mask (boolean) tensor, return
the segment lengths of a corresponding segmented tensor after BooleanMask is
applied.
)DOC")
.Input(0, "lengths", "A 1D int32 tensor representing segment lengths.")
.Input(1, "mask", "A 1D bool tensor of values to keep.")
.Output(0, "masked_lengths", "Segment lengths of a masked tensor.");
NO_GRADIENT(BooleanMask);
NO_GRADIENT(BooleanMaskLengths);
} // namespace
} // namespace caffe2

View File

@ -10,8 +10,14 @@ OPERATOR_SCHEMA(Concat).NumInputs(1, INT_MAX).NumOutputs(2);
// Backward compatibility names.
REGISTER_CPU_OPERATOR(DepthSplit, SplitOp<CPUContext>);
REGISTER_CPU_OPERATOR(DepthConcat, ConcatOp<CPUContext>);
OPERATOR_SCHEMA(DepthSplit).NumInputs(1, 2).NumOutputs(1, INT_MAX);
OPERATOR_SCHEMA(DepthConcat).NumInputs(1, INT_MAX).NumOutputs(2);
OPERATOR_SCHEMA(DepthSplit)
.NumInputs(1, 2)
.NumOutputs(1, INT_MAX)
.SetDoc("Backward compatible operator name for Split.");
OPERATOR_SCHEMA(DepthConcat)
.NumInputs(1, INT_MAX)
.NumOutputs(2)
.SetDoc("Backward compatible operator name for Concat.");
class GetSplitGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;

View File

@ -31,7 +31,7 @@ class SplitOp final : public Operator<Context> {
: Operator<Context>(operator_def, ws),
split_(OperatorBase::GetRepeatedArgument<int>("split")) {
CHECK(OperatorBase::HasArgument("axis") ^
OperatorBase::HasArgument("order"))
OperatorBase::HasArgument("order"))
<< "You should either specify the dim to split, or the order "
"in the case of 4-D images.";
if (OperatorBase::HasArgument("axis")) {
@ -85,7 +85,9 @@ class ConcatOp final : public Operator<Context> {
template <class Context>
bool SplitOp<Context>::RunOnDevice() {
auto& input = Input(0);
const int input_channels = input.dim32(axis_);
const int* axis_data;
vector<int> equal_split;
if (InputSize() == 2) {
// We obtain split from the input tensor.
CHECK_EQ(split_.size(), 0)
@ -94,13 +96,21 @@ bool SplitOp<Context>::RunOnDevice() {
auto& split_tensor = OperatorBase::Input<TensorCPU>(1);
CHECK_EQ(split_tensor.size(), OutputSize());
axis_data = split_tensor.template data<int>();
} else if (split_.size() == 0) {
CAFFE_ENFORCE(input_channels % OutputSize() == 0,
"If you did not specify split explicitly, the number of "
"input channels should be divisible by the output size.");
equal_split.resize(OutputSize(), input_channels / OutputSize());
axis_data = equal_split.data();
} else {
// We obtain split from the parameters.
CHECK_EQ(split_.size(), OutputSize());
CAFFE_ENFORCE(split_.size() == OutputSize(),
"The number of splits specified should be equal to the "
"number of outputs.");
axis_data = split_.data();
}
CHECK_LT(axis_, input.ndim());
const int input_channels = input.dim32(axis_);
CHECK_EQ(std::accumulate(axis_data, axis_data + OutputSize(), 0),
input_channels)
<< "Sum of split dimensions do not match: should be " << input_channels;

View File

@ -22,11 +22,11 @@ template <typename ArrayOfcudnnConvolutionAlgoPerf_t>
inline void LogCuDNNPerfStats(
const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat,
int returned_algo_count) {
LOG(INFO) << "Perf result: (algo: stat, time, memory)";
VLOG(1) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = perf_stat[i];
LOG(INFO) << stat.algo << ": " << stat.status
<< " " << stat.time << " " << stat.memory;
VLOG(1) << stat.algo << ": " << stat.status << " " << stat.time << " "
<< stat.memory;
}
}
} // namespace
@ -193,7 +193,7 @@ bool CudnnConvOp<T>::RunOnDevice() {
if (deterministic_) {
algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (exhaustive_search_) {
LOG(INFO) << "CUDNN Convolution: doing exhaustive search.";
VLOG(1) << "CUDNN Convolution: doing exhaustive search.";
// When we do an exhaustive search, we will ignore the workspace size
// limit and simply go for the fastest algorithm. If you happen to run
// out of memory later, you will be on your own...
@ -229,8 +229,8 @@ bool CudnnConvOp<T>::RunOnDevice() {
cudnn_wrapper_.inline_cudnn_handle(),
bottom_desc_, filter_desc_, conv_desc_, top_desc_,
algo_, &cudnn_ws_nbytes_));
LOG(INFO) << "CuDNN algorithm: " << algo_;
LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
VLOG(1) << "CuDNN algorithm: " << algo_;
VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
}
// Now, actually run the computation.
@ -346,7 +346,7 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else if (exhaustive_search_) {
LOG(INFO) << "CUDNN Convolution bwd: doing exhaustive search.";
VLOG(1) << "CUDNN Convolution bwd: doing exhaustive search.";
// When we do an exhaustive search, we will ignore the workspace size
// limit and simply go for the fastest algorithm. If you happen to run
// out of memory later, you will be on your own...
@ -416,9 +416,9 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
bwd_data_algo_, &bwd_data_ws_size));
cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size);
LOG(INFO) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", "
<< bwd_data_algo_;
LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
VLOG(1) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", "
<< bwd_data_algo_;
VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
}
// Now, actually run the computation.

View File

@ -0,0 +1,579 @@
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/conv_transpose_op.h"
namespace caffe2 {
// In its early days, Caffe set the default cudnn workspace to 8MB. We bump
// it up to 64MB in Caffe2, as this enables the use of Winograd in many cases,
// which is very beneficial to more recent CNN models.
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024;
// Manually specified number of algorithms implemented in CuDNN.
// This does not have any performance implications, as we will always find the
// fastest algorithm; setting these to the right counts simply lets us report
// complete statistics when doing an exhaustive search.
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
namespace {
template <typename ArrayOfcudnnConvolutionAlgoPerf_t>
inline void LogCuDNNPerfStats(
const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat,
int returned_algo_count) {
LOG(INFO) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = perf_stat[i];
LOG(INFO) << stat.algo << ": " << stat.status << " " << stat.time << " "
<< stat.memory;
}
}
} // namespace
class CudnnConvTransposeOpBase : public ConvTransposeUnpoolBase<CUDAContext> {
public:
CudnnConvTransposeOpBase(const OperatorDef& operator_def, Workspace* ws)
: ConvTransposeUnpoolBase<CUDAContext>(operator_def, ws),
cudnn_wrapper_(&context_),
cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
"ws_nbytes_limit",
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
exhaustive_search_(
OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)),
deterministic_(
OperatorBase::GetSingleArgument<int>("deterministic", 0)),
cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)) {
CHECK(!deterministic_ || !exhaustive_search_);
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&top_desc_));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
}
~CudnnConvTransposeOpBase() {
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bottom_desc_));
CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
CUDNN_CHECK(cudnnDestroyTensorDescriptor(top_desc_));
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
}
protected:
vector<TIndex> cudnn_input_dims_;
vector<TIndex> cudnn_filter_dims_;
CuDNNWrapper cudnn_wrapper_;
cudnnTensorDescriptor_t bottom_desc_;
cudnnFilterDescriptor_t filter_desc_;
cudnnTensorDescriptor_t bias_desc_;
cudnnTensorDescriptor_t top_desc_;
cudnnConvolutionDescriptor_t conv_desc_;
const size_t cudnn_ws_nbytes_limit_;
size_t cudnn_ws_nbytes_;
bool exhaustive_search_;
bool deterministic_;
size_t cudnn_state_;
};
template <typename T>
class CudnnConvTransposeOp final : public CudnnConvTransposeOpBase {
public:
CudnnConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
: CudnnConvTransposeOpBase(operator_def, ws) {}
~CudnnConvTransposeOp() {}
bool RunOnDevice() override;
private:
cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
// Input: X, W, b
// Output: Y
INPUT_TAGS(INPUT, FILTER, BIAS);
};
template <typename T>
class CudnnConvTransposeGradientOp final : public CudnnConvTransposeOpBase {
public:
CudnnConvTransposeGradientOp(const OperatorDef& operator_def, Workspace* ws)
: CudnnConvTransposeOpBase(operator_def, ws) {}
~CudnnConvTransposeGradientOp() {}
bool RunOnDevice() override;
private:
cudnnConvolutionFwdAlgo_t algo_;
cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
// input: X, W, dY
// output: dW, db, and optionally dX
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD);
};
////////////////////////////////////////////////////////////////////////////////
// Implementations
////////////////////////////////////////////////////////////////////////////////
template <typename T>
bool CudnnConvTransposeOp<T>::RunOnDevice() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto* Y = Output(0);
int C = 0;
switch (order_) {
case StorageOrder::NHWC:
C = filter.dim32(3);
break;
case StorageOrder::NCHW:
C = filter.dim32(1);
break;
default:
LOG(FATAL) << "Unknown storage order: " << order_;
}
ConvTransposeUnpoolBase<CUDAContext>::SetOutputSize(X, Y, C);
int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0;
switch (order_) {
case StorageOrder::NHWC:
N = X.dim32(0);
H = X.dim32(1);
W = X.dim32(2);
M = X.dim32(3);
H_out = Y->dim32(1);
W_out = Y->dim32(2);
DCHECK_EQ(filter.dim32(1), kernel_h_);
DCHECK_EQ(filter.dim32(2), kernel_w_);
DCHECK_EQ(filter.dim32(3), C);
break;
case StorageOrder::NCHW:
N = X.dim32(0);
M = X.dim32(1);
H = X.dim32(2);
W = X.dim32(3);
H_out = Y->dim32(2);
W_out = Y->dim32(3);
DCHECK_EQ(filter.dim32(1), C);
DCHECK_EQ(filter.dim32(2), kernel_h_);
DCHECK_EQ(filter.dim32(3), kernel_w_);
break;
default:
LOG(FATAL) << "Unknown storage order: " << order_;
}
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim32(0), C);
// Set up the cudnn algorithms & workspace if necessary
bool input_changed = (X.dims() != cudnn_input_dims_);
bool filter_changed = (filter.dims() != cudnn_filter_dims_);
if (input_changed || filter_changed) {
VLOG(1) << "Changing the cudnn descriptor configurations.";
if (input_changed) {
cudnn_input_dims_ = X.dims();
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
bottom_desc_,
GetCudnnTensorFormat(order_),
cudnnTypeWrapper<T>::type,
N,
M,
H,
W));
}
if (filter_changed) {
cudnn_filter_dims_ = filter.dims();
CUDNN_CHECK(cudnnSetFilter4dDescriptor(
filter_desc_,
cudnnTypeWrapper<T>::type,
GetCudnnTensorFormat(order_),
M,
C,
kernel_h_,
kernel_w_));
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
bias_desc_,
GetCudnnTensorFormat(order_),
cudnnTypeWrapper<T>::type,
1,
C,
1,
1));
}
// Set the output
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
top_desc_,
GetCudnnTensorFormat(order_),
cudnnTypeWrapper<T>::type,
N,
C,
H_out,
W_out));
// Set the convolution descriptor
CHECK_EQ(pad_t_, pad_b_)
<< "The current padding scheme leads to unequal padding on the top and "
"bottom, which is not supported by cudnn.";
CHECK_EQ(pad_l_, pad_r_)
<< "The current padding scheme leads to unequal padding on the left "
"and right, which is not supported by cudnn.";
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
pad_l_,
stride_h_,
stride_w_,
1,
1,
CUDNN_CROSS_CORRELATION));
if (deterministic_) {
bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else if (exhaustive_search_) {
int returned_algo_count;
std::array<cudnnConvolutionBwdDataAlgoPerf_t, kNUM_CUDNN_BWD_DATA_ALGS>
data_perf_stat;
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
state->workspace().reset();
CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
state->cudnn_handle(),
filter_desc_,
bottom_desc_,
conv_desc_,
top_desc_,
kNUM_CUDNN_BWD_DATA_ALGS,
&returned_algo_count,
data_perf_stat.data()));
});
LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
bwd_data_algo_ = data_perf_stat[0].algo;
} else {
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
cudnn_wrapper_.inline_cudnn_handle(),
filter_desc_,
bottom_desc_,
conv_desc_,
top_desc_,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
cudnn_ws_nbytes_limit_,
&bwd_data_algo_));
}
size_t bwd_data_ws_size;
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
cudnn_wrapper_.inline_cudnn_handle(),
filter_desc_,
bottom_desc_,
conv_desc_,
top_desc_,
bwd_data_algo_,
&bwd_data_ws_size));
cudnn_ws_nbytes_ = bwd_data_ws_size;
LOG(INFO) << "CuDNN algorithm: " << bwd_data_algo_;
LOG(INFO) << "CuDNN workspace size: " << bwd_data_ws_size;
}
// Now, actually run the computation.
// Filter
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
CUDNN_CHECK(cudnnConvolutionBackwardData(
state->cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
filter_desc_,
filter.template data<T>(),
bottom_desc_,
X.template data<T>(),
conv_desc_,
bwd_data_algo_,
state->workspace().get(cudnn_ws_nbytes_),
cudnn_ws_nbytes_,
cudnnTypeWrapper<T>::kZero(),
top_desc_,
Y->template mutable_data<T>()));
});
// Bias
CUDNN_CHECK(cudnnAddTensor(
cudnn_wrapper_.inline_cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
bias_desc_,
bias.template data<T>(),
cudnnTypeWrapper<T>::kOne(),
top_desc_,
Y->template mutable_data<T>()));
// Done.
return true;
}
// TODO(Yangqing): a lot of the function contents are very similar. Consider
// consolidating them.
template <typename T>
bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
auto* dbias = Output(BIAS_GRAD);
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(filter.ndim(), 4);
auto* Y = Output(0);
int C = 0;
switch (order_) {
case StorageOrder::NHWC:
C = filter.dim32(3);
break;
case StorageOrder::NCHW:
C = filter.dim32(1);
break;
default:
LOG(FATAL) << "Unknown storage order: " << order_;
}
ConvTransposeUnpoolBase<CUDAContext>::SetOutputSize(X, Y, C);
int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0;
switch (order_) {
case StorageOrder::NHWC:
N = X.dim32(0);
H = X.dim32(1);
W = X.dim32(2);
M = X.dim32(3);
H_out = dY.dim32(1);
W_out = dY.dim32(2);
DCHECK_EQ(filter.dim32(1), kernel_h_);
DCHECK_EQ(filter.dim32(2), kernel_w_);
DCHECK_EQ(filter.dim32(3), C);
break;
case StorageOrder::NCHW:
N = X.dim32(0);
M = X.dim32(1);
H = X.dim32(2);
W = X.dim32(3);
H_out = dY.dim32(2);
W_out = dY.dim32(3);
DCHECK_EQ(filter.dim32(1), C);
DCHECK_EQ(filter.dim32(2), kernel_h_);
DCHECK_EQ(filter.dim32(3), kernel_w_);
break;
default:
LOG(FATAL) << "Unknown storage order: " << order_;
}
// Since we only handle LegacyPadding::NOTSET, we don't need to
// compute padding.
dfilter->ResizeLike(filter);
dbias->Resize(C);
// Set up the cudnn algorithms & workspace if necessary
bool input_changed = (X.dims() != cudnn_input_dims_);
bool filter_changed = (filter.dims() != cudnn_filter_dims_);
if (input_changed || filter_changed) {
VLOG(1) << "Changing the cudnn descriptor configurations.";
if (input_changed) {
cudnn_input_dims_ = X.dims();
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
bottom_desc_,
GetCudnnTensorFormat(order_),
cudnnTypeWrapper<T>::type,
N,
M,
H,
W));
}
if (filter_changed) {
cudnn_filter_dims_ = filter.dims();
CUDNN_CHECK(cudnnSetFilter4dDescriptor(
filter_desc_,
cudnnTypeWrapper<T>::type,
GetCudnnTensorFormat(order_),
M,
C,
kernel_h_,
kernel_w_));
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
bias_desc_,
GetCudnnTensorFormat(order_),
cudnnTypeWrapper<T>::type,
1,
C,
1,
1));
}
// Set the output
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
top_desc_,
GetCudnnTensorFormat(order_),
cudnnTypeWrapper<T>::type,
N,
C,
H_out,
W_out));
// Set the convolution descriptor
CHECK_EQ(pad_t_, pad_b_)
<< "The current padding scheme leads to unequal padding on the top and "
"bottom, which is not supported by cudnn.";
CHECK_EQ(pad_l_, pad_r_)
<< "The current padding scheme leads to unequal padding on the left "
"and right, which is not supported by cudnn.";
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
pad_l_,
stride_h_,
stride_w_,
1,
1,
CUDNN_CROSS_CORRELATION));
// Set the workspace
size_t bwd_filter_ws_size, fwd_ws_size;
if (deterministic_) {
algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else if (exhaustive_search_) {
LOG(INFO) << "CUDNN Convolution bwd: doing exhaustive search.";
// When we do an exhaustive search, we will ignore the workspace size
// limit and simply go for the fastest algorithm. If you happen to run
// out of memory later, you will be on your own...
int returned_algo_count;
// We clean up the current workspace memory so that the forward algorithm
// is free to allocate memory.
// Actually run the search.
std::
array<cudnnConvolutionBwdFilterAlgoPerf_t, kNUM_CUDNN_BWD_FILTER_ALGS>
filter_perf_stat;
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
state->workspace().reset();
CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
state->cudnn_handle(),
top_desc_,
bottom_desc_,
conv_desc_,
filter_desc_,
kNUM_CUDNN_BWD_FILTER_ALGS,
&returned_algo_count,
filter_perf_stat.data()));
});
LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
bwd_filter_algo_ = filter_perf_stat[0].algo;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat;
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
state->workspace().reset();
CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
state->cudnn_handle(),
top_desc_,
filter_desc_,
conv_desc_,
bottom_desc_,
kNUM_CUDNN_FWD_ALGS,
&returned_algo_count,
fwd_perf_stat.data()));
});
LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count);
algo_ = fwd_perf_stat[0].algo;
} else {
// choose backward algorithm for filter
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
cudnn_wrapper_.inline_cudnn_handle(),
top_desc_,
bottom_desc_,
conv_desc_,
filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
cudnn_ws_nbytes_limit_,
&bwd_filter_algo_));
// choose forward algo, used here to compute the gradient w.r.t. the input
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
cudnn_wrapper_.inline_cudnn_handle(),
top_desc_,
filter_desc_,
conv_desc_,
bottom_desc_,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
cudnn_ws_nbytes_limit_,
&algo_));
}
// get workspace for backwards filter algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
cudnn_wrapper_.inline_cudnn_handle(),
top_desc_,
bottom_desc_,
conv_desc_,
filter_desc_,
bwd_filter_algo_,
&bwd_filter_ws_size));
// get workspace for the forward algorithm (input gradient path)
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
cudnn_wrapper_.inline_cudnn_handle(),
top_desc_,
filter_desc_,
conv_desc_,
bottom_desc_,
algo_,
&fwd_ws_size));
cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, fwd_ws_size);
LOG(INFO) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", " << algo_;
LOG(INFO) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
}
// Now, actually run the computation.
CUDNN_CHECK(cudnnConvolutionBackwardBias(
cudnn_wrapper_.inline_cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
top_desc_,
dY.template data<T>(),
cudnnTypeWrapper<T>::kZero(),
bias_desc_,
dbias->template mutable_data<T>()));
cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
CUDNN_CHECK(cudnnConvolutionBackwardFilter(
state->cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
top_desc_,
dY.template data<T>(),
bottom_desc_,
X.template data<T>(),
conv_desc_,
bwd_filter_algo_,
state->workspace().get(cudnn_ws_nbytes_),
cudnn_ws_nbytes_,
cudnnTypeWrapper<T>::kZero(),
filter_desc_,
dfilter->template mutable_data<T>()));
if (OutputSize() == 3) {
// Compute the gradient w.r.t. the input.
auto* dX = Output(INPUT_GRAD);
dX->ResizeLike(X);
CUDNN_CHECK(cudnnConvolutionForward(
state->cudnn_handle(),
cudnnTypeWrapper<T>::kOne(),
top_desc_,
dY.template data<T>(),
filter_desc_,
filter.template data<T>(),
conv_desc_,
algo_,
state->workspace().get(cudnn_ws_nbytes_),
cudnn_ws_nbytes_,
cudnnTypeWrapper<T>::kZero(),
bottom_desc_,
dX->template mutable_data<T>()));
}
});
return true;
}
REGISTER_CUDNN_OPERATOR(ConvTranspose, CudnnConvTransposeOp<float>);
REGISTER_CUDNN_OPERATOR(
ConvTransposeGradient,
CudnnConvTransposeGradientOp<float>);
} // namespace caffe2

View File

@ -118,9 +118,13 @@ class ConvTransposeUnpoolBase : public Operator<Context> {
return true;
}
virtual bool RunOnDeviceWithOrderNCHW() = 0;
virtual bool RunOnDeviceWithOrderNCHW() {
CAFFE_THROW("Not implemented");
}
virtual bool RunOnDeviceWithOrderNHWC() = 0;
virtual bool RunOnDeviceWithOrderNHWC() {
CAFFE_THROW("Not implemented");
}
virtual ~ConvTransposeUnpoolBase() {}

View File

@ -0,0 +1,46 @@
#include "counter_ops.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(CreateCounter, CreateCounterOp<int32_t>);
REGISTER_CPU_OPERATOR(ResetCounter, ResetCounterOp<int32_t>);
REGISTER_CPU_OPERATOR(CountDown, CountDownOp<int32_t>);
OPERATOR_SCHEMA(CreateCounter)
.NumInputs(0)
.NumOutputs(1)
.SetDoc(R"DOC(
Creates a count-down counter with initial value specified by the 'init_count'
argument.
)DOC")
.Output(0, "counter", "A blob pointing to an instance of a new counter.")
.Arg("init_count", "Initial count for the counter, must be >= 0.");
OPERATOR_SCHEMA(ResetCounter)
.NumInputs(1)
.NumOutputs(0)
.SetDoc(R"DOC(
Resets a count-down counter with initial value specified by the 'init_count'
argument.
)DOC")
.Input(0, "counter", "A blob pointing to an instance of a new counter.")
.Arg("init_count", "Resets counter to this value, must be >= 0.");
OPERATOR_SCHEMA(CountDown)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
If the internal count value > 0, decreases count value by 1 and outputs false,
otherwise outputs true.
)DOC")
.Input(0, "counter", "A blob pointing to an instance of a counter.")
.Output(0, "should_stop", "false unless the internal count is zero.");
SHOULD_NOT_DO_GRADIENT(CreateCounter);
SHOULD_NOT_DO_GRADIENT(ResetCounter);
SHOULD_NOT_DO_GRADIENT(CountDown);
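// Worked example (hypothetical values): after CreateCounter with
// init_count = 2, three successive CountDown calls output should_stop =
// false (count -> 1), false (count -> 0), then true. ResetCounter with
// init_count = 2 restores the original countdown.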
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,89 @@
#ifndef CAFFE2_OPERATORS_COUNTER_OPS_H
#define CAFFE2_OPERATORS_COUNTER_OPS_H
#include <atomic>
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
namespace {
template <typename T>
class Counter {
public:
explicit Counter(T count) : count_(count) {}
bool CountDown() {
if (count_ > 0) {
--count_;
return false;
}
return true;
}
void reset(T init_count) {
count_ = init_count;
}
private:
std::atomic<T> count_;
};
}
template <typename T, class Context = CPUContext>
class CreateCounterOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
CreateCounterOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
init_count_(OperatorBase::GetSingleArgument<T>("init_count", 0)) {
CHECK_LE(0, init_count_) << "negative init_count is not permitted.";
}
bool RunOnDevice() override {
*OperatorBase::Output<std::unique_ptr<Counter<T>>>(0) =
std::unique_ptr<Counter<T>>(new Counter<T>(init_count_));
return true;
}
private:
T init_count_ = 0;
};
template <typename T, class Context = CPUContext>
class ResetCounterOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
ResetCounterOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
init_count_(OperatorBase::GetSingleArgument<T>("init_count", 0)) {
CHECK_LE(0, init_count_) << "negative init_count is not permitted.";
}
bool RunOnDevice() override {
auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
counterPtr->reset(init_count_);
return true;
}
private:
T init_count_;
};
template <typename T, class Context = CPUContext>
class CountDownOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
CountDownOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {}
bool RunOnDevice() override {
auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
auto* output = Output(0);
output->Resize(std::vector<int>{});
*output->template mutable_data<bool>() = counterPtr->CountDown();
return true;
}
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_COUNTER_OPS_H

View File

@ -2,6 +2,17 @@
namespace caffe2 {
namespace {
inline float sigmoid_xent_forward(float lgt, float tgt) {
return lgt * (tgt - (lgt >= 0)) - log(1 + exp(lgt - 2 * lgt * (lgt >= 0)));
}
inline float sigmoid_xent_backward(float lgt, float tgt) {
return tgt - 1. / (1. + exp(-lgt));
}
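// Derivation sketch for the stable forward expression above: with logit x and
// target t, the per-element log-likelihood is
//   t*log(sigmoid(x)) + (1-t)*log(1-sigmoid(x)) = x*t - max(x,0) - log(1+exp(-|x|)),
// and x*(t - (x>=0)) - log(1+exp(x - 2*x*(x>=0))) is the same quantity written
// with the indicator (x>=0), so exp() never sees a large positive argument.
// The backward expression is its derivative w.r.t. x: t - sigmoid(x).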
}
template <>
bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
@ -26,6 +37,68 @@ bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
return true;
}
template <>
bool SigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
auto& logits = Input(0);
auto& targets = Input(1);
CAFFE_ENFORCE(logits.dims() == targets.dims());
const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
const auto outer_size = logits.size() / inner_size;
auto* out = Output(0);
if (logits.ndim() == 0) {
out->Resize(std::vector<TIndex>{});
} else {
std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
out->Resize(dims);
}
auto* out_ptr = out->mutable_data<float>();
auto* logits_ptr = logits.data<float>();
auto* targets_ptr = targets.data<float>();
auto in_idx = 0;
for (int i = 0; i < outer_size; ++i) {
float value = 0;
for (int j = 0; j < inner_size; ++j) {
value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]);
++in_idx;
}
out_ptr[i] = -value / inner_size;
}
return true;
}
template <>
bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
auto& g = Input(0);
auto& logits = Input(1);
auto& targets = Input(2);
CAFFE_ENFORCE(logits.dims() == targets.dims());
const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
const auto outer_size = logits.size() / inner_size;
CAFFE_ENFORCE(g.size() == outer_size);
auto* out = Output(0);
out->ResizeLike(logits);
auto* out_ptr = out->mutable_data<float>();
auto* logits_ptr = logits.data<float>();
auto* targets_ptr = targets.data<float>();
auto* g_ptr = g.data<float>();
auto in_idx = 0;
for (int i = 0; i < outer_size; ++i) {
auto g_factor = -g_ptr[i] / inner_size;
for (int j = 0; j < inner_size; ++j) {
out_ptr[in_idx] = g_factor *
sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]);
++in_idx;
}
}
return true;
}
template <>
bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
@ -129,6 +202,13 @@ REGISTER_CPU_OPERATOR(MakeTwoClass,
REGISTER_CPU_OPERATOR(MakeTwoClassGradient,
MakeTwoClassGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
SigmoidCrossEntropyWithLogits,
SigmoidCrossEntropyWithLogitsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
SigmoidCrossEntropyWithLogitsGradient,
SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(MakeTwoClass)
.NumInputs(1)
.NumOutputs(1)
@ -145,6 +225,22 @@ OPERATOR_SCHEMA(MakeTwoClassGradient)
.NumInputs(1)
.NumOutputs(1);
OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogits)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Given two matrices logits and targets, of the same shape
(batch_size, num_classes), computes the sigmoid cross entropy between the two.
Returns a tensor of shape (batch_size,) with the per-example loss averaged over classes.
)DOC")
.Input(0, "logits", "matrix of logits for each example and class.")
.Input(1, "targets", "matrix of targets, same shape as logits.")
.Output(0, "xentropy", "Vector with the total xentropy for each example.");
OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogitsGradient)
.NumInputs(3)
.NumOutputs(1);
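// Worked example (hypothetical values): logits = [[0, 0]], targets = [[1, 0]]
// gives a per-class loss of -log(0.5) for each entry, so the output for that
// row is (log 2 + log 2) / 2 = log 2 ~= 0.693; the op averages the per-class
// cross entropy over the inner (class) dimension.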
struct GetMakeTwoClassGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
@ -156,5 +252,20 @@ struct GetMakeTwoClassGradient : public GradientMakerBase {
}
};
REGISTER_GRADIENT(MakeTwoClass, GetMakeTwoClassGradient);
struct GetSigmoidCrossEntropyWithLogitsGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"SigmoidCrossEntropyWithLogitsGradient",
"",
vector<string>{GO(0), I(0), I(1)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(
SigmoidCrossEntropyWithLogits,
GetSigmoidCrossEntropyWithLogitsGradient);
} // namespace
} // namespace caffe2

View File

@ -62,6 +62,22 @@ class MakeTwoClassGradientOp final
// Output: dX
};
template <typename T, class Context>
class SigmoidCrossEntropyWithLogitsOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(SigmoidCrossEntropyWithLogitsOp);
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
};
template <typename T, class Context>
class SigmoidCrossEntropyWithLogitsGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(SigmoidCrossEntropyWithLogitsGradientOp);
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_

View File

@ -0,0 +1,734 @@
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/string_utils.h"
namespace caffe2 {
namespace {
const char kDatasetFieldSeparator = ':';
const char* kDatasetLengthField = "lengths";
// percentage by which to grow the dataset when needed
const int kDatasetGrowthPct = 40;
// used for lengths tensors in the dataset
using TLength = int32_t;
// used for all internal dataset operations (offsets, sizes to read, etc.)
using TOffset = int64_t;
/**
* Provides functionality to iterate across a list of tensors where some
* of those tensors represent lengths in a hierarchical structure.
*/
class TreeIterator {
public:
struct FieldDesc {
int id;
int lengthFieldId = -1;
std::string name;
};
explicit TreeIterator(const std::vector<std::string>& fields) {
// populate field vector and split field names
fields_.resize(fields.size());
std::vector<std::vector<std::string>> nameParts(fields_.size());
for (int i = 0; i < fields.size(); ++i) {
auto& field = fields_.at(i);
field.name = fields[i];
field.id = i;
field.lengthFieldId = -1;
nameParts.at(i) = split(kDatasetFieldSeparator, field.name);
}
// populate lengthFields
for (const auto& field : fields_) {
const auto& parts = nameParts.at(field.id);
if (!parts.empty() && parts.back() == kDatasetLengthField) {
lengthFieldIds_.push_back(field.id);
}
}
// find length-field with maximum prefix matching for each field
for (auto& field : fields_) {
// by default, we are matching against the root domain
int maxMatchLevel = 1;
int maxMatchLengthFieldId = -1;
for (int j = 0; j < numLengthFields(); ++j) {
const auto& lenField = lengthField(j);
// a length field can't have itself as its length field
if (field.id == lenField.id) {
continue;
}
auto lf = nameParts.at(lenField.id);
auto lfEnd = lf.end() - 1;
// check whether this lengthField is a prefix for this field name
if (std::mismatch(lf.begin(), lfEnd, nameParts.at(field.id).begin())
.first != lfEnd) {
continue;
}
if (lf.size() > maxMatchLevel) {
maxMatchLevel = lf.size();
maxMatchLengthFieldId = j;
}
}
field.lengthFieldId = maxMatchLengthFieldId;
}
// check that fields are topologically sorted
// (no length field depends on a length defined afterwards)
for (const auto& field : fields_) {
const auto* lengthField = lengthFieldFor(field);
CAFFE_ENFORCE(
(lengthField == nullptr) || (lengthField->id < field.id),
"Error: Field ",
field.id,
" (",
field.name,
") ",
"depends on a field defined afterwards: ",
lengthField->id,
" (",
lengthField->name,
").");
}
}
void advance(
const std::vector<const TLength*>& lengths,
std::vector<TOffset>& offsets,
std::vector<TOffset>& sizes,
std::vector<TOffset>& limits,
TOffset num) {
thread_local std::vector<TOffset> newOffsets;
CHECK_EQ(lengths.size(), numLengthFields());
CHECK_EQ(offsets.size(), numOffsetFields());
sizes.resize(offsets.size());
newOffsets.resize(offsets.size());
// first index, top level
{
auto limit = limits[0];
auto offset = offsets[0];
CAFFE_ENFORCE(limit >= offset, "Tried to advance past end of cursor.");
TOffset total = std::min(limit - offset, num);
sizes[0] = total;
newOffsets[0] = offset + total;
}
// child indices
for (int j = 1; j < numOffsetFields(); ++j) {
TOffset total = 0;
int parentOffsetId = offsetFieldIdFor(lengthField(j - 1));
const TLength* length = lengths[j - 1] + offsets[parentOffsetId];
for (int k = 0; k < sizes[parentOffsetId]; ++k) {
total += *(length++);
}
auto offset = offsets[j];
CAFFE_ENFORCE(
offset + total <= limits[j],
"Inconsistent field length: ",
"tried to advance past the end of field ",
j);
sizes[j] = total;
newOffsets[j] = offset + total;
}
offsets = newOffsets;
}
// Corresponds to the number of fields that have "lengths" as their last name part
int numLengthFields() const {
return lengthFieldIds_.size();
}
// Corresponds to the number of length fields + 1 (for the top-level domain)
int numOffsetFields() const {
return numLengthFields() + 1;
}
// Get lengthField description for the given field
const FieldDesc* lengthFieldFor(const FieldDesc& desc) {
return (desc.lengthFieldId == -1)
? nullptr
: &fields_.at(lengthFieldIds_.at(desc.lengthFieldId));
}
// Get lengthField description for the given lengthFieldId, where
// 0 <= lengthFieldId < numLengthFields()
const FieldDesc& lengthField(int lengthFieldId) {
return fields_.at(lengthFieldIds_.at(lengthFieldId));
}
// Returns the index into the 'offset' vector for the given field.
int offsetFieldIdFor(const FieldDesc& fieldDesc) {
return fieldDesc.lengthFieldId + 1;
}
// Returns the field description for all fields.
const std::vector<FieldDesc>& fields() {
return fields_;
}
private:
// Description of each field
std::vector<FieldDesc> fields_;
// Index into fields_ above for the fields that are lengths.
std::vector<int> lengthFieldIds_;
};
class TreeCursor {
public:
explicit TreeCursor(const TreeIterator& iterator) : it(iterator) {}
std::vector<TOffset> offsets;
std::mutex mutex_;
TreeIterator it;
};
class CreateTreeCursorOp : public Operator<CPUContext> {
public:
CreateTreeCursorOp(const OperatorDef& operator_def, Workspace* ws)
: Operator(operator_def, ws),
fields_(OperatorBase::GetRepeatedArgument<std::string>("fields")) {}
bool RunOnDevice() override {
*OperatorBase::Output<std::unique_ptr<TreeCursor>>(0) =
std::unique_ptr<TreeCursor>(new TreeCursor(TreeIterator(fields_)));
return true;
}
private:
std::vector<std::string> fields_;
};
class ResetCursorOp : public Operator<CPUContext> {
public:
ResetCursorOp(const OperatorDef& operator_def, Workspace* ws)
: Operator(operator_def, ws) {}
bool RunOnDevice() override {
auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
std::lock_guard<std::mutex> lock(cursor->mutex_);
cursor->offsets.clear();
return true;
}
};
class CheckDatasetConsistencyOp : public Operator<CPUContext> {
public:
CheckDatasetConsistencyOp(const OperatorDef& operator_def, Workspace* ws)
: Operator(operator_def, ws),
iterator_(OperatorBase::GetRepeatedArgument<std::string>("fields")) {}
bool RunOnDevice() override {
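// Consistency is checked in two steps: every field sharing a domain must have
// the same outer dimension, and advancing a scratch offset vector across the
// whole dataset must land exactly on those limits.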
thread_local std::vector<const TLength*> lengths;
thread_local std::vector<TOffset> limits;
thread_local std::vector<TOffset> sizes;
thread_local std::vector<TOffset> offsets;
CAFFE_ENFORCE(
InputSize() == iterator_.fields().size(),
"Invalid number of fields. Expected ",
iterator_.fields().size(),
", got ",
InputSize());
sizes.resize(iterator_.numOffsetFields());
// gather length data
lengths.resize(iterator_.numLengthFields());
for (int i = 0; i < lengths.size(); ++i) {
lengths[i] = Input(iterator_.lengthField(i).id).data<TLength>();
}
// gather size limits
limits.assign(sizes.size(), std::numeric_limits<TOffset>::max());
for (int i = 0; i < iterator_.fields().size(); ++i) {
int lengthIdx = iterator_.fields()[i].lengthFieldId + 1;
TOffset size = (TOffset)Input(i).dims()[0];
if (limits[lengthIdx] == std::numeric_limits<TOffset>::max()) {
limits[lengthIdx] = size;
} else {
CAFFE_ENFORCE(
limits[lengthIdx] == size,
"Inconsistent sizes for fields belonging to same domain.",
" Field: ",
i,
" (",
iterator_.fields()[i].name,
"); Length field index: ",
lengthIdx,
"); Previous size: ",
limits[lengthIdx],
"; New size: ",
size);
}
}
// advance to the end
offsets.assign(sizes.size(), 0);
iterator_.advance(lengths, offsets, sizes, limits, limits[0]);
for (int i = 0; i < limits.size(); ++i) {
CAFFE_ENFORCE(limits[i] == offsets[i]);
}
return true;
}
private:
TreeIterator iterator_;
};
class ReadNextBatchOp : public Operator<CPUContext> {
public:
ReadNextBatchOp(const OperatorDef& operator_def, Workspace* ws)
: Operator(operator_def, ws),
batchSize_(OperatorBase::GetSingleArgument<int>("batch_size", 1)) {}
bool RunOnDevice() override {
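// Advances the shared cursor by up to batch_size top-level entries under the
// cursor mutex, then copies the corresponding [offset, offset + size) rows of
// every input field into the matching output blob.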
auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1);
thread_local std::vector<const TLength*> lengths;
thread_local std::vector<TOffset> limits;
thread_local std::vector<TOffset> sizes;
thread_local std::vector<TOffset> offsets;
sizes.resize(cursor->it.numOffsetFields());
// gather length data
lengths.resize(cursor->it.numLengthFields());
for (int i = 0; i < lengths.size(); ++i) {
lengths[i] = Input(cursor->it.lengthField(i).id + 1).data<int>();
}
// gather size limits
limits.assign(sizes.size(), std::numeric_limits<TOffset>::max());
for (int i = 0; i < cursor->it.fields().size(); ++i) {
int lengthFieldIdx = cursor->it.fields()[i].lengthFieldId + 1;
limits[lengthFieldIdx] =
std::min(limits[lengthFieldIdx], (TOffset)Input(i + 1).dims()[0]);
}
// advance cursor
{
std::lock_guard<std::mutex> lock(cursor->mutex_);
if (cursor->offsets.empty()) {
cursor->offsets.assign(sizes.size(), 0);
}
offsets = cursor->offsets;
cursor->it.advance(lengths, cursor->offsets, sizes, limits, batchSize_);
}
// gather data
thread_local std::vector<TIndex> outDim;
for (int i = 0; i < cursor->it.fields().size(); ++i) {
auto lengthIdx = cursor->it.fields()[i].lengthFieldId + 1;
auto size = sizes[lengthIdx];
auto offset = offsets[lengthIdx];
auto& in = Input(i + 1);
auto innerSize = in.size_from_dim(1);
outDim = in.dims();
outDim[0] = size;
auto* out = Output(i);
out->Resize(outDim);
if (out->size() == 0) {
continue;
}
void* src =
(char*)in.raw_data() + offset * innerSize * in.meta().itemsize();
void* dst = out->raw_mutable_data(in.meta());
context_.template CopyItems<CPUContext, CPUContext>(
in.meta(), out->size(), src, dst);
}
return true;
}
int batchSize_;
};
class ComputeOffsetOp : public Operator<CPUContext> {
public:
ComputeOffsetOp(const OperatorDef& operator_def, Workspace* ws)
: Operator(operator_def, ws) {}
bool RunOnDevice() override {
auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1);
auto* out = Output(0);
std::vector<const TLength*> lengths;
std::vector<TOffset> limits;
std::vector<TOffset> sizes;
std::vector<TOffset> offsets;
sizes.resize(cursor->it.numOffsetFields());
// gather length data
lengths.resize(cursor->it.numLengthFields());
for (int i = 0; i < lengths.size(); ++i) {
lengths[i] = Input(cursor->it.lengthField(i).id + 1).data<int>();
}
// gather size limits
limits.assign(sizes.size(), std::numeric_limits<TOffset>::max());
for (int i = 0; i < cursor->it.fields().size(); ++i) {
int lengthFieldIdx = cursor->it.fields()[i].lengthFieldId + 1;
limits[lengthFieldIdx] =
std::min(limits[lengthFieldIdx], (TOffset)Input(i + 1).dims()[0]);
}
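// The output is a (num_examples + 1) x num_domains matrix of cumulative
// offsets: row k holds the offsets reached after k top-level entries, so
// row k+1 minus row k gives the size of example k in every domain.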
out->Resize(limits.at(0) + 1, sizes.size());
auto* out_data = out->mutable_data<int64_t>();
for (int k = 0; k <= limits.at(0); k++) {
// advance cursor
if (cursor->offsets.empty()) {
cursor->offsets.assign(sizes.size(), 0);
}
// write output
std::copy(cursor->offsets.begin(), cursor->offsets.end(), out_data);
out_data += sizes.size();
cursor->it.advance(lengths, cursor->offsets, sizes, limits, 1);
}
cursor->offsets.assign(sizes.size(), 0); // reset after computing the offset matrix
return true;
}
};
class ReadRandomBatchOp : public Operator<CPUContext> {
public:
ReadRandomBatchOp(const OperatorDef& operator_def, Workspace* ws)
: Operator(operator_def, ws),
batchSize_(OperatorBase::GetSingleArgument<int>("batch_size", 1)) {}
bool RunOnDevice() override {
auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
auto& idxblob = Input(1);
auto& offsetsmat = Input(2);
CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 3);
auto idxvec = idxblob.template data<int64_t>();
auto& offsetdim = offsetsmat.dims();
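// The offsets matrix produced by ComputeOffset has one row per top-level
// entry plus a trailing row; idxvec holds a shuffled order of entries. For
// each requested example we read its row and the next one to recover the
// per-domain offset and size, first to size the outputs, then to copy.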
// gather data
thread_local std::vector<TIndex> outDim;
int64_t idx;
{
std::lock_guard<std::mutex> lock(cursor->mutex_);
cursor->offsets.resize(1);
idx = cursor->offsets.at(0);
cursor->offsets.at(0) += batchSize_;
}
for (int i = 0; i < cursor->it.fields().size(); ++i) {
auto lengthIdx = cursor->it.fields()[i].lengthFieldId + 1;
auto& in = Input(i + 3);
outDim = in.dims();
outDim.at(0) = 0;
auto idxbegin = idx;
for (int j = 0; j < batchSize_; ++j) {
if (idx >= idxblob.size()) {
break;
}
CAFFE_ENFORCE(
(idxvec[idx] + 1) * offsetdim[1] + lengthIdx < offsetsmat.size(),
"Out of bound when trying to get elem from offsetsmat");
auto offsetptr = offsetsmat.template data<TOffset>() +
idxvec[idx] * offsetdim[1] + lengthIdx;
auto offset = *offsetptr;
auto size = *(offsetptr + offsetdim[1]) - offset;
outDim.at(0) += size; // accumulate over the batch
idx++;
}
idx = idxbegin; // rewind before the copy pass below
auto* out = Output(i);
out->Resize(outDim);
if (out->size() == 0) {
continue;
}
auto innerSize = in.size_from_dim(1);
auto dst = static_cast<char*>(out->raw_mutable_data(in.meta()));
int block_size = in.size() / in.dim(0);
int block_bytesize = in.nbytes() / in.dim(0);
int start = 0;
for (int j = 0; j < batchSize_; ++j) {
if (idx >= idxblob.size()) {
break;
}
auto offsetptr = offsetsmat.template data<TOffset>() +
idxvec[idx] * offsetdim[1] + lengthIdx;
auto offset = *offsetptr;
auto size = *(offsetptr + offsetdim[1]) - offset;
// copy data
void* src =
(char*)in.raw_data() + offset * innerSize * in.meta().itemsize();
context_.template CopyItems<CPUContext, CPUContext>(
in.meta(), size * block_size, src, dst + start * block_bytesize);
start += size;
idx++;
}
idx = idxbegin; // rewind so the next field reads the same batch window
}
return true;
}
int batchSize_;
};
template <class Context>
class AppendOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
AppendOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {}
bool RunOnDevice() override {
auto& a = Input(0);
auto& b = Input(1);
auto* c = Output(0);
CAFFE_ENFORCE(b.ndim() >= 1);
if (a.size() == 0) {
c->CopyFrom(b);
return true;
}
CAFFE_ENFORCE(&a == c, "First argument must be in-place.");
CAFFE_ENFORCE(c->ndim() == b.ndim());
CAFFE_ENFORCE(b.ndim() == c->ndim());
CAFFE_ENFORCE(a.meta() == b.meta());
for (int i = 1; i < a.ndim(); ++i) {
CAFFE_ENFORCE(a.dims()[i] == b.dims()[i]);
}
auto oldSize = c->size();
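// Extend grows the outer dimension by b's outer dimension; kDatasetGrowthPct
// provides the exponential over-allocation that keeps repeated appends
// amortized constant time (see the Append schema doc below).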
c->Extend(b.dims()[0], kDatasetGrowthPct, &context_);
auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize();
context_.template CopyItems<Context, Context>(
b.meta(), b.size(), b.raw_data(), dst);
return true;
}
};
template <class Context>
class AtomicAppendOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
AtomicAppendOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {}
bool RunOnDevice() override {
auto& mutex = OperatorBase::Input<std::unique_ptr<std::mutex>>(0);
const auto numFields = (InputSize() - 1) / 2;
CAFFE_ENFORCE(OutputSize() == numFields);
std::lock_guard<std::mutex> guard(*mutex);
// 1: checks
for (int i = 0; i < numFields; ++i) {
auto& a = Input(1 + i);
auto& b = Input(1 + i + numFields);
auto* c = Output(i);
CAFFE_ENFORCE(b.ndim() >= 1);
if (a.size() == 0) {
continue;
}
CAFFE_ENFORCE(
(void*)&a == (void*)c, "Appended-to arguments must be in-place.");
CAFFE_ENFORCE(c->ndim() == b.ndim());
CAFFE_ENFORCE(b.ndim() == c->ndim());
CAFFE_ENFORCE(a.meta() == b.meta());
for (int j = 1; j < a.ndim(); ++j) {
CAFFE_ENFORCE(a.dims()[j] == b.dims()[j]);
}
}
// 2: copies
for (int i = 0; i < numFields; ++i) {
auto& a = Input(1 + i);
auto& b = Input(1 + i + numFields);
auto* c = Output(i);
if (a.size() == 0) {
c->CopyFrom(b);
continue;
}
auto oldSize = c->size();
c->Extend(b.dims()[0], kDatasetGrowthPct, &context_);
auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize();
context_.template CopyItems<Context, Context>(
b.meta(), b.size(), b.raw_data(), dst);
}
return true;
}
};
REGISTER_CPU_OPERATOR(CreateTreeCursor, CreateTreeCursorOp);
REGISTER_CPU_OPERATOR(ResetCursor, ResetCursorOp);
REGISTER_CPU_OPERATOR(ReadNextBatch, ReadNextBatchOp);
REGISTER_CPU_OPERATOR(ComputeOffset, ComputeOffsetOp);
REGISTER_CPU_OPERATOR(ReadRandomBatch, ReadRandomBatchOp);
REGISTER_CPU_OPERATOR(CheckDatasetConsistency, CheckDatasetConsistencyOp);
REGISTER_CPU_OPERATOR(Append, AppendOp<CPUContext>);
REGISTER_CPU_OPERATOR(AtomicAppend, AtomicAppendOp<CPUContext>);
OPERATOR_SCHEMA(CreateTreeCursor)
.NumInputs(0)
.NumOutputs(1)
.SetDoc(R"DOC(
Creates a cursor to iterate through a list of tensors, where some of those
tensors contain the lengths in a nested schema. The schema is determined by
the `fields` argument.
For example, to represent the following schema:
Struct(
a=Int(),
b=List(List(Int)),
c=List(
Struct(
c1=String,
c2=List(Int),
),
),
)
the field list will be:
[
"a",
"b:lengths",
"b:values:lengths",
"b:values:values",
"c:lengths",
"c:c1",
"c:c2:lengths",
"c:c2:values",
]
And for the following instance of the struct:
Struct(
a=3,
b=[[4, 5], [6, 7, 8], [], [9]],
c=[
Struct(c1='alex', c2=[10, 11]),
Struct(c1='bob', c2=[12]),
],
)
The values of the fields will be:
{
"a": [3],
"b:lengths": [4],
"b:values:lengths": [2, 3, 0, 1],
"b:values:values": [4, 5, 6, 7, 8, 9],
"c:lengths": [2],
"c:c1": ["alex", "bob"],
"c:c2:lengths": [2, 1],
"c:c2:values", [10, 11, 12],
}
In general, every field name in the format "{prefix}:lengths" defines a domain
"{prefix}", and every subsequent field in the format "{prefx}:{field}" will
be in that domain, and the length of the domain is provided for each entry of
the parent domain. In the example, "b:lengths" defines a domain of length 4, so
every field under domain "b" will have 4 entries.
The "lengths" field for a given domain must appear before any reference to
that domain.
Returns a pointer to an instance of the Cursor, which keeps the current offset
on each of the domains defined by `fields`. Cursor also ensures thread-safety
such that ReadNextBatch and ResetCursor can be used safely in parallel.
A cursor does not contain data per se, so calls to ReadNextBatch actually need
to pass a list of blobs containing the data to read for each one of the fields.
)DOC")
.Output(0, "cursor", "A blob pointing to an instance of a new TreeCursor.")
.Arg(
"fields",
"A list of strings each one representing a field of the dataset.");
OPERATOR_SCHEMA(ResetCursor)
.NumInputs(1)
.NumOutputs(0)
.SetDoc(R"DOC(
Resets the offsets for the given TreeCursor. This operation is thread safe.
)DOC")
.Input(0, "cursor", "A blob containing a pointer to the cursor.");
OPERATOR_SCHEMA(ReadNextBatch)
.NumInputs(1, INT_MAX)
.NumOutputs(1, INT_MAX)
.SetDoc(R"DOC(
Read the next batch of examples out of the given cursor and data blobs.
Input(0) is a blob pointing to a TreeCursor, and
[Input(1), ..., Input(num_fields)] is a list of tensors containing the data for
each field of the dataset.
ReadNextBatch is thread safe.
)DOC")
.Input(0, "cursor", "A blob containing a pointer to the cursor.")
.Input(1, "dataset_field_0", "First dataset field")
.Output(0, "field_0", "Tensor containing the next batch for field 0.")
.Arg("batch_size", "Number of top-level entries to read.");
OPERATOR_SCHEMA(ComputeOffset)
.NumInputs(1, INT_MAX)
.NumOutputs(1)
.SetDoc(R"DOC(
Computes the offsets matrix given the cursor and data blobs. Needs to be run
at the beginning or after resetting the cursor.
Input(0) is a blob pointing to a TreeCursor, and
[Input(1), ..., Input(num_fields)] is a list of tensors containing the data for
each field of the dataset.
ComputeOffset is thread safe.
)DOC")
.Input(0, "cursor", "A blob containing a pointer to the cursor.")
.Input(1, "dataset_field_0", "First dataset field")
.Output(0, "field_0", "Tensor containing offset info for this chunk.");
OPERATOR_SCHEMA(ReadRandomBatch)
.NumInputs(1, INT_MAX)
.NumOutputs(1, INT_MAX)
.SetDoc(R"DOC(
Read the next batch of examples out of the given cursor,
idx blob, offset matrix and data blobs.
Input(0) is a blob pointing to a TreeCursor,
Input(1) is a blob pointing to the shuffled idx,
Input(2) is a blob pointing to the offset matrix, and
[Input(3), ..., Input(num_fields)] is a list of tensors containing the data for
each field of the dataset.
ReadRandomBatch is thread safe.
)DOC")
.Input(0, "cursor", "A blob containing a pointer to the cursor.")
.Input(1, "idx", "idx with a shuffled order.")
.Input(2, "offsetsmat", "offset matrix containing length offset info.")
.Input(3, "dataset_field_0", "First dataset field")
.Output(0, "field_0", "Tensor containing the next batch for field 0.")
.Arg("batch_size", "Number of top-level entries to read.");
OPERATOR_SCHEMA(CheckDatasetConsistency)
.NumInputs(1, INT_MAX)
.NumOutputs(0)
.SetDoc(R"DOC(
Checks that the given data fields represent a consistent dataset under
the schema specified by the `fields` argument. Operator fails if the fields
are not consistent. If data is consistent, each field's data can be safely
appended to an existing dataset, keeping it consistent.
)DOC")
.Input(0, "field_0", "Data for field 0.")
.Arg(
"fields",
"List of strings representing the string names in the format"
"specified in the doc for CreateTreeCursor.");
OPERATOR_SCHEMA(Append)
.NumInputs(2)
.NumOutputs(1)
.EnforceInplace({{0, 0}})
.SetDoc(R"DOC(
Append input 2 to the end of input 1.
Input 1 must be the same as output, that is, it is required to be in-place.
Input 1 may have to be re-allocated in order to accommodate the new size.
Currently, an exponential growth ratio is used in order to ensure amortized
constant time complexity.
All except the outer-most dimension must be the same between input 1 and 2.
)DOC")
.Input(0, "dataset", "The tensor to be appended to.")
.Input(1, "new_data", "Tensor to append to the end of dataset.")
.Output(0, "dataset", "Same as input 0, representing the mutated tensor.");
OPERATOR_SCHEMA(AtomicAppend)
.NumInputs(3, INT_MAX)
.NumOutputs(1, INT_MAX)
.AllowInplace([](int in, int out) { return in == out + 1; });
SHOULD_NOT_DO_GRADIENT(CreateTreeCursor);
SHOULD_NOT_DO_GRADIENT(ResetCursor);
SHOULD_NOT_DO_GRADIENT(ReadNextBatch);
SHOULD_NOT_DO_GRADIENT(ComputeOffset);
SHOULD_NOT_DO_GRADIENT(ReadRandomBatch);
SHOULD_NOT_DO_GRADIENT(CheckDatasetConsistency);
SHOULD_NOT_DO_GRADIENT(Append);
SHOULD_NOT_DO_GRADIENT(AtomicAppend);
}
}

View File

@ -2,16 +2,16 @@
#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "caffe2/core/logging.h"
namespace caffe2 {
using NumericTypes = TensorTypes<int32_t, int64_t, float, double>;
class SameTypeAsInput {};
template<typename OutputTemplate, typename InputType>
template <typename OutputTemplate, typename InputType>
struct TypeForOutput {
using value = OutputTemplate;
};
@ -21,12 +21,26 @@ struct TypeForOutput<SameTypeAsInput, InputType> {
using value = InputType;
};
template <typename InputTypes, class Context, class Functor,
class OutputType = SameTypeAsInput>
class UnaryElementwiseOp : public Operator<Context> {
/**
* Generic meta-operator that is able to perform element-wise operations on
* a single input tensor, returning a tensor with the same shape, and either of
* the same type as the input or of a specified result type.
*
* The functor provided must implement operator() as a template on input and
* output types, and on a Context. Moreover, it needs to provide a constructor
* that takes OperatorBase& as argument. This is in order to consume arguments
* passed to the operator instance.
*/
template <
typename InputTypes,
class Context,
class Functor,
class OutputType = SameTypeAsInput>
class UnaryElementwiseWithArgsOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(UnaryElementwiseOp);
UnaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws), functor(*this) {}
bool RunOnDevice() override {
return DispatchHelper<InputTypes>::call(this, Input(0));
@ -38,11 +52,67 @@ class UnaryElementwiseOp : public Operator<Context> {
auto* output = Output(0);
output->ResizeLike(input);
using R = typename TypeForOutput<OutputType, T>::value;
Functor()(input.size(), input.template data<T>(),
output->template mutable_data<R>(), &context_);
functor(
input.size(),
input.template data<T>(),
output->template mutable_data<R>(),
&context_);
return true;
}
Functor functor;
};
/**
* WithDefaultConstructor is a functor that can be used as the functor of an
* UnaryElementwiseWithArgsOp. It simply forwards the operator() call into
* another functor that doesn't accept arguments in its constructor.
*/
template <typename Functor>
struct WithDefaultConstructor {
explicit WithDefaultConstructor(OperatorBase& op) {}
template <typename In, typename Out, typename Context>
void operator()(int n, const In* in, Out* out, Context* c) {
Functor()(n, in, out, c);
}
};
/**
* UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the
* difference that it takes a functor with a default constructor, i.e. one that
* does not need to consider any arguments during operator creation.
*/
template <
typename InputTypes,
class Context,
class Functor,
class OutputType = SameTypeAsInput>
using UnaryElementwiseOp = UnaryElementwiseWithArgsOp<
InputTypes,
Context,
WithDefaultConstructor<Functor>,
OutputType>;
/**
* ForEach is a unary functor that forwards each element of the input array
* into the elementwise Functor provided, and gathers the results of each
* call into the resulting array. Use it as an adaptor if you want to create
* a UnaryElementwiseOp that acts on each element of the tensor per function
* call -- this is reasonable for complex types where vectorization wouldn't
* be much of a gain, performance-wise.
*/
template <typename Functor>
struct ForEach {
explicit ForEach(OperatorBase& op) : functor(op) {}
template <typename In, typename Out, typename Context>
void operator()(int n, const In* in, Out* out, Context* c) {
for (int i = 0; i < n; ++i) {
out[i] = functor(in[i]);
}
}
Functor functor;
};
/**
@ -113,7 +183,7 @@ class BinaryElementwiseOp : public Operator<Context> {
return false;
}
for (int i = 0; i < b.size(); ++i) {
if (a[a.size()-1-i] != b[b.size()-1-i]) {
if (a[a.size() - 1 - i] != b[b.size() - 1 - i]) {
return false;
}
}
@ -169,31 +239,35 @@ class DivGradientOp final : public Operator<Context> {
bool RunOnDevice() override;
};
#define CAFFE2_BINARY_FUNCTOR_WRAPPER(name) \
struct name##Functor { \
template <typename T, class Context> \
inline void operator()( \
const int n, \
const T* x, \
const T* y, \
T* output, \
Context* device_context) { \
math::name<T, Context>(n, x, y, output, device_context); \
} \
template <typename T, class Context> \
inline void WithBroadcast( \
const int m, \
const int n, \
const T* a, \
const T* b, \
T* y, \
Context* device_context) { \
math::name##ToRow<T, Context>(m, n, a, b, y, device_context); \
} \
}; \
template <class DC> \
using name##Op = BinaryElementwiseOp<NumericTypes, DC, \
name##Functor, SameTypeAsInput, true>
#define CAFFE2_BINARY_FUNCTOR_WRAPPER(name) \
struct name##Functor { \
template <typename T, class Context> \
inline void operator()( \
const int n, \
const T* x, \
const T* y, \
T* output, \
Context* device_context) { \
math::name<T, Context>(n, x, y, output, device_context); \
} \
template <typename T, class Context> \
inline void WithBroadcast( \
const int m, \
const int n, \
const T* a, \
const T* b, \
T* y, \
Context* device_context) { \
math::name##ToRow<T, Context>(m, n, a, b, y, device_context); \
} \
}; \
template <class DC> \
using name##Op = BinaryElementwiseOp< \
NumericTypes, \
DC, \
name##Functor, \
SameTypeAsInput, \
true>
CAFFE2_BINARY_FUNCTOR_WRAPPER(Add);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Sub);
@ -202,31 +276,31 @@ CAFFE2_BINARY_FUNCTOR_WRAPPER(Div);
#undef CAFFE2_BINARY_FUNCTOR_WRAPPER
#define CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(name) \
struct name##Functor { \
template <typename T, class Context> \
inline void operator()( \
const int n, \
const T* x, \
const T* y, \
bool* output, \
Context* device_context) { \
math::name<T, Context>(n, x, y, output, device_context); \
} \
template <typename T, typename Context> \
inline void WithBroadcast( \
const int m, \
const int n, \
const T* a, \
const T* b, \
bool* y, \
Context* device_context) { \
math::name##ToRow<T, Context>(m, n, a, b, y, device_context); \
} \
}; \
template <class DC> \
using name##Op = BinaryElementwiseOp< \
NumericTypes, DC, name##Functor, bool, true>
#define CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(name) \
struct name##Functor { \
template <typename T, class Context> \
inline void operator()( \
const int n, \
const T* x, \
const T* y, \
bool* output, \
Context* device_context) { \
math::name<T, Context>(n, x, y, output, device_context); \
} \
template <typename T, typename Context> \
inline void WithBroadcast( \
const int m, \
const int n, \
const T* a, \
const T* b, \
bool* y, \
Context* device_context) { \
math::name##ToRow<T, Context>(m, n, a, b, y, device_context); \
} \
}; \
template <class DC> \
using name##Op = \
BinaryElementwiseOp<NumericTypes, DC, name##Functor, bool, true>
CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(LT);
CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(LE);
@ -234,6 +308,6 @@ CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(GT);
CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER(GE);
#undef CAFFE2_BINARY_FUNCTOR_BINARY_RESULT_WRAPPER
} // namespace caffe2
} // namespace caffe2
#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_

View File

@ -18,6 +18,7 @@ REGISTER_CPU_OPERATOR(UniformFill, UniformFillOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(UniformIntFill, UniformFillOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(ConstantFill, ConstantFillOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(ConstantIntFill, ConstantFillOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(ConstantBoolFill, ConstantFillOp<bool, CPUContext>);
REGISTER_CPU_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(GivenTensorIntFill, GivenTensorFillOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(GaussianFill, GaussianFillOp<float, CPUContext>);
@ -30,6 +31,10 @@ OPERATOR_SCHEMA(UniformFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}}
OPERATOR_SCHEMA(UniformIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ConstantFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ConstantIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ConstantBoolFill)
.NumInputs(0, 1)
.NumOutputs(1)
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(GivenTensorFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(GivenTensorIntFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
OPERATOR_SCHEMA(GaussianFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});

View File

@ -56,14 +56,12 @@ class FullyConnectedOp final : public Operator<Context> {
CAFFE_ENFORCE(N == b.dim32(0), dimErrorString());
CAFFE_ENFORCE(N == b.size(), dimErrorString());
// Create the Y shape (without allocation)
static thread_local vector<TIndex> Y_shape;
Y_shape = X.dims();
Y_shape_cache_ = X.dims();
// This is an invariant of canonical_axis, so we can DCHECK.
DCHECK_LE(canonical_axis + 1, Y_shape.size());
Y_shape.resize(canonical_axis + 1);
Y_shape[canonical_axis] = N;
Y->Resize(Y_shape);
DCHECK_LE(canonical_axis + 1, Y_shape_cache_.size());
Y_shape_cache_.resize(canonical_axis + 1);
Y_shape_cache_[canonical_axis] = N;
Y->Resize(Y_shape_cache_);
CAFFE_ENFORCE(M * N == Y->size(), dimErrorString());
// W * x
@ -88,6 +86,9 @@ class FullyConnectedOp final : public Operator<Context> {
protected:
size_t axis_{1};
// A local vector to cache the output shape so we don't need to recreate
// a vector object every time we run Run().
vector<TIndex> Y_shape_cache_;
Tensor<Context> bias_multiplier_;
};

View File

@ -7,7 +7,6 @@
#include <limits>
namespace caffe2 {
namespace {
using IndexKeyTypes = TensorTypes<int32_t, int64_t, std::string>;
using TIndexValue = int64_t;
@ -22,12 +21,17 @@ struct IndexBase {
void Freeze() { frozen_ = true; }
virtual ~IndexBase() {}
const TypeMeta& Type() const { return meta_; }
TIndexValue Size() {
std::lock_guard<std::mutex> guard(dictMutex_);
return nextId_;
}
protected:
int64_t maxElements_;
TypeMeta meta_;
TIndexValue nextId_{1}; // guarded by dictMutex_
std::atomic<bool> frozen_{false};
std::mutex dictMutex_;
};
template<typename T>
@ -96,7 +100,6 @@ struct Index: IndexBase {
}
std::unordered_map<T, TIndexValue> dict_;
std::mutex dictMutex_;
};
template<class T>
@ -142,7 +145,9 @@ class IndexGetOp: public Operator<CPUContext> {
class IndexLoadOp: public Operator<CPUContext> {
public:
IndexLoadOp(const OperatorDef& operator_def, Workspace* ws)
: Operator(operator_def, ws) {}
: Operator(operator_def, ws),
skipFirstEntry_(
OperatorBase::GetSingleArgument<int>("skip_first_entry", 0)) {}
bool RunOnDevice() override {
return DispatchHelper<IndexKeyTypes>::call(this, Input(1));
@ -153,8 +158,18 @@ class IndexLoadOp: public Operator<CPUContext> {
auto* dict = dynamic_cast_if_rtti<Index<T>*>(base.get());
CAFFE_ENFORCE(dict, "Wrong dictionary type given input keys.");
const auto& keys = Input(1);
return dict->Load(keys.data<T>(), keys.size());
const auto* keys_data = keys.data<T>();
auto keys_size = keys.size();
if (skipFirstEntry_) {
CAFFE_ENFORCE(keys.size() > 0);
++keys_data;
--keys_size;
}
return dict->Load(keys_data, keys_size);
}
private:
bool skipFirstEntry_;
};
class IndexStoreOp: public Operator<CPUContext> {
@ -188,6 +203,19 @@ class IndexFreezeOp: public Operator<CPUContext> {
}
};
class IndexSizeOp : public Operator<CPUContext> {
public:
IndexSizeOp(const OperatorDef& operator_def, Workspace* ws)
: Operator(operator_def, ws) {}
bool RunOnDevice() override {
auto& base = OperatorBase::Input<std::unique_ptr<IndexBase>>(0);
auto* out = Output(0);
out->Resize(std::vector<TIndex>{});
*out->mutable_data<TIndexValue>() = base->Size();
return true;
}
};
REGISTER_CPU_OPERATOR(IntIndexCreate, IndexCreateOp<int32_t>);
REGISTER_CPU_OPERATOR(LongIndexCreate, IndexCreateOp<int64_t>);
@ -197,6 +225,7 @@ REGISTER_CPU_OPERATOR(IndexGet, IndexGetOp);
REGISTER_CPU_OPERATOR(IndexLoad, IndexLoadOp);
REGISTER_CPU_OPERATOR(IndexStore, IndexStoreOp);
REGISTER_CPU_OPERATOR(IndexFreeze, IndexFreezeOp);
REGISTER_CPU_OPERATOR(IndexSize, IndexSizeOp);
OPERATOR_SCHEMA(IntIndexCreate)
.NumInputs(0)
@ -250,16 +279,20 @@ Should not be called concurrently with IndexGet.
)DOC")
.Input(0, "handle", "Pointer to an Index instance.");
OPERATOR_SCHEMA(IndexLoad)
.NumInputs(2)
.NumOutputs(0)
.SetDoc(R"DOC(
.NumInputs(2)
.NumOutputs(0)
.SetDoc(R"DOC(
Loads the index from the given 1-D tensor. Elements in the tensor will be given
consecutive indexes starting at 1. Fails if tensor contains repeated elements.
)DOC")
.Input(0, "handle", "Pointer to an Index instance.")
.Input(1, "items", "1-D tensor with elements starting with index 1.");
.Input(0, "handle", "Pointer to an Index instance.")
.Input(1, "items", "1-D tensor with elements starting with index 1.")
.Arg(
"skip_first_entry",
"If set, skips the first entry of the tensor. This allows "
"to load tensors that are aligned with an embedding, where the first "
"entry corresponds to the default 0 index entry.");
OPERATOR_SCHEMA(IndexStore)
.NumInputs(1)
@ -271,6 +304,15 @@ for unknowns, the first element of the output tensor will be element of index 1.
.Input(0, "handle", "Pointer to an Index instance.")
.Output(0, "items", "1-D tensor with elements starting with index 1.");
OPERATOR_SCHEMA(IndexSize)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Returns the number of entries currently present in the index.
)DOC")
.Input(0, "handle", "Pointer to an Index instance.")
.Output(0, "items", "Scalar int64 tensor with number of entries.");
NO_GRADIENT(IndexGetOp);
NO_GRADIENT(IntIndexCreate);
NO_GRADIENT(LongIndexCreate);
@ -278,5 +320,5 @@ NO_GRADIENT(StringIndexCreate);
SHOULD_NOT_DO_GRADIENT(IndexFreeze);
SHOULD_NOT_DO_GRADIENT(IndexLoad);
SHOULD_NOT_DO_GRADIENT(IndexStore);
SHOULD_NOT_DO_GRADIENT(IndexSize);
} // namespace caffe2

View File

@ -3,6 +3,7 @@
#include <cstdio>
#include <map>
#include <unordered_set>
#include "caffe2/core/context.h"
#include "caffe2/core/db.h"
@ -65,12 +66,18 @@ class LoadOp final : public Operator<Context> {
// chunks. This way we can make sure that all chunks were loaded in the end.
// This is a map from output index to current size of the blob
std::map<int, size_t> blobSizes;
std::unordered_set<string> loaded;
for (; cursor->Valid(); cursor->Next()) {
const string& key = cursor->key();
if (!output_indices_.count(key)) {
VLOG(1) << "Key " << key << " not used. Skipping.";
} else {
CAFFE_ENFORCE(
loaded.count(key) == 0,
"Multiple copies of blob ",
key,
" found in the db.");
VLOG(2) << "Deserializing blob " << key;
BlobProto proto;
CHECK(proto.ParseFromString(cursor->value()));
@ -101,6 +108,15 @@ class LoadOp final : public Operator<Context> {
blobSize.first->second = blob->Get<Tensor<Context>>().size();
}
}
if (!proto.has_tensor() ||
blobSize.first->second >= blob->Get<Tensor<Context>>().size()) {
loaded.insert(key);
}
if (loaded.size() >= OutputSize()) {
break;
}
}
}
@ -116,6 +132,8 @@ class LoadOp final : public Operator<Context> {
blobSize.second);
}
}
CHECK_EQ(loaded.size(), OutputSize());
}
private:

View File

@ -0,0 +1,107 @@
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
namespace caffe2 {
namespace {
class OneHotOp : public Operator<CPUContext> {
public:
OneHotOp(const OperatorDef& operator_def, Workspace* ws)
: Operator(operator_def, ws) {}
bool RunOnDevice() override {
auto& indices = Input(0);
auto& index_size_tensor = Input(1);
CAFFE_ENFORCE(indices.ndim() == 1);
CAFFE_ENFORCE(index_size_tensor.size() == 1);
auto batch_size = indices.size();
auto index_size = *index_size_tensor.data<int64_t>();
auto* indices_ptr = indices.data<int64_t>();
auto* one_hots = Output(0);
one_hots->Resize(std::vector<TIndex>{batch_size, index_size});
if (one_hots->size() == 0) {
return true;
}
auto* one_hots_ptr = one_hots->mutable_data<float>();
memset(one_hots_ptr, 0, one_hots->nbytes());
for (int i = 0; i < batch_size; ++i) {
auto label_idx = indices_ptr[i];
DCHECK((0 <= label_idx) && (label_idx < index_size));
one_hots_ptr[label_idx] = 1.0;
one_hots_ptr += index_size;
}
return true;
}
};
class SegmentOneHotOp : public Operator<CPUContext> {
public:
SegmentOneHotOp(const OperatorDef& operator_def, Workspace* ws)
: Operator(operator_def, ws) {}
bool RunOnDevice() override {
auto& lengths = Input(0);
auto& indices = Input(1);
auto& index_size_tensor = Input(2);
CAFFE_ENFORCE(lengths.ndim() == 1);
CAFFE_ENFORCE(indices.ndim() == 1);
CAFFE_ENFORCE(index_size_tensor.size() == 1);
auto batch_size = lengths.size();
auto index_size = *index_size_tensor.data<int64_t>();
CAFFE_ENFORCE(index_size > 0);
auto* lengths_ptr = lengths.data<int32_t>();
auto* indices_ptr = indices.data<int64_t>();
auto* one_hots = Output(0);
one_hots->Resize(std::vector<TIndex>{batch_size, index_size});
auto* one_hots_ptr = one_hots->mutable_data<float>();
if (one_hots->size() == 0) {
return true;
}
memset(one_hots_ptr, 0, one_hots->nbytes());
int el_idx = 0;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < lengths_ptr[i]; ++j) {
DCHECK(el_idx < indices.size());
auto label_idx = indices_ptr[el_idx++];
DCHECK((0 <= label_idx) && (label_idx < index_size));
one_hots_ptr[label_idx] = 1.0;
}
one_hots_ptr += index_size;
}
return true;
}
};
REGISTER_CPU_OPERATOR(OneHot, OneHotOp);
REGISTER_CPU_OPERATOR(SegmentOneHot, SegmentOneHotOp);
OPERATOR_SCHEMA(OneHot)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Given a sequence of indices, one for each example in a batch, returns a matrix
where each row has the size of the index, with 1.0 at the position of the
index active in the given example, and 0.0 everywhere else.
)DOC")
.Input(0, "indices", "The active index for each example in the batch.")
.Input(1, "index_size_tensor", "Scalar with the size of the index.")
.Output(0, "one_hots", "Matrix of size len(indices) x index_size");
OPERATOR_SCHEMA(SegmentOneHot)
.NumInputs(3)
.NumOutputs(1)
.SetDoc(R"DOC(
Given a sequence of indices, segmented by the lengths tensor, returns a matrix
that, for each segment, has 1.0 at the positions of the segment's indices and
0.0 everywhere else.
)DOC")
.Input(0, "lengths", "Size of each segment.")
.Input(1, "indices", "Active indices, of size sum(lengths)")
.Input(2, "index_size_tensor", "Size of the index")
.Output(0, "one_hots", "Matrix of size len(lengths) x index_size");
NO_GRADIENT(OneHot);
NO_GRADIENT(SegmentOneHot);
}
}

View File

@ -124,6 +124,67 @@ struct LogSumExpRangeReducerDef {
"input slices. Operation doesn't change the shape of individual blocks.";
};
template <typename T, class Context>
class LogMeanExpRangeReducer;
template <typename T, class Context>
class LogMeanExpRangeReducerGradient;
template <typename T>
class LogMeanExpRangeReducer<T, CPUContext> {
public:
void operator()(
const TIndex block_size,
const TIndex blocks,
const T* in,
T* out,
CPUContext* context) {
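// Numerically stable log-mean-exp: subtract the per-column max before
// exponentiating, then add it back after taking the log of the mean.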
for (int j = 0; j < block_size; ++j) {
T max_value = std::numeric_limits<T>::lowest();
for (int i = 0; i < blocks; ++i) {
max_value = std::max(max_value, in[i * block_size + j]);
}
T scaled_exp_sum = 0;
for (int i = 0; i < blocks; ++i) {
scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
}
scaled_exp_sum /= blocks;
*(out++) = std::log(scaled_exp_sum) + max_value;
}
}
};
template <typename T, class Context>
class LogMeanExpRangeReducerGradient {
public:
void operator()(
const TIndex block_size,
const TIndex blocks,
const T* segment_grad, // GO
T* data_grad, // GI
const T* data_in, // I
const T* data_out, // O
Context* context) {
for (int j = 0; j < block_size; ++j) {
const T out_grad = *(segment_grad++);
const T offset = *(data_out++);
for (int i = 0; i < blocks; ++i) {
auto idx = i * block_size + j;
data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks;
}
}
}
};
struct LogMeanExpRangeReducerDef {
template <typename T, class Context>
using Reducer = LogMeanExpRangeReducer<T, Context>;
template <typename T, class Context>
using ReducerGradient = LogMeanExpRangeReducerGradient<T, Context>;
static constexpr const char* name = "LogMeanExp";
static constexpr const char* doc =
"LogMeanExp computes the element-wise log of the mean of exponentials of "
"input slices. Operation doesn't change the shape of individual blocks.";
};
template <typename T, class Context>
class MeanRangeReducer;
@ -180,10 +241,78 @@ struct MeanRangeReducerDef {
static constexpr const char* doc =
"Mean computation is done element-wise, so that each element of the "
"output slice corresponds to the average value of the respective "
"elements in the input slives. Operation doesn't change the shape of "
"elements in the input slices. Operation doesn't change the shape of "
"individual blocks.";
};
template <typename T, class Context>
class MaxRangeReducer;
template <typename T, class Context>
class MaxRangeReducerGradient;
template <typename T>
class MaxRangeReducer<T, CPUContext> {
public:
void operator()(
const TIndex block_size,
const TIndex blocks,
const T* in,
T* out,
CPUContext* context) {
for (int j = 0; j < block_size; ++j) {
T max_value = std::numeric_limits<T>::lowest();
for (int i = 0; i < blocks; ++i) {
max_value = std::max(max_value, in[i * block_size + j]);
}
*(out++) = max_value;
}
}
};
template <typename T, class Context>
class MaxRangeReducerGradient {
public:
void operator()(
const TIndex block_size,
const TIndex blocks,
const T* segment_grad, // GO
T* data_grad, // GI
const T* data_in, // I
const T* data_out, // O
Context* context) {
std::memset(
static_cast<void*>(data_grad), 0, blocks * block_size * sizeof(T));
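// Route the gradient only to the first element that equals the column max;
// later ties receive zero gradient, matching the forward doc below.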
for (int j = 0; j < block_size; ++j) {
const T out_grad = *(segment_grad++);
const T out = data_out[j];
for (int i = 0; i < blocks; ++i) {
auto idx = i * block_size + j;
if (out == data_in[idx]) {
data_grad[idx] = out_grad;
break;
}
}
}
}
};
struct MaxRangeReducerDef {
template <typename T, class Context>
using Reducer = MaxRangeReducer<T, Context>;
template <typename T, class Context>
using ReducerGradient = MaxRangeReducerGradient<T, Context>;
static constexpr const char* name = "Max";
static constexpr const char* doc =
"Max computation is done element-wise, so that each element of the "
"output slice corresponds to the max value of the respective "
"elements in the input slices. Operation doesn't change the shape of "
"individual blocks. This implementation imitates torch nn.Max operator. "
"If the maximum value occurs more than once, the operator will return "
"the first occurence of value. When computing the gradient using the "
"backward propagation, the gradient input corresponding to the first "
"occurence of the maximum value will be used.";
};
////////////////////////////////////////////////////////////////////////////////
// Incremental reducers: consume elements one by one
////////////////////////////////////////////////////////////////////////////////

View File

@ -32,12 +32,16 @@ class CuDNNReluOp final : public Operator<CUDAContext> {
if (X.dims() != cudnn_input_dims_) {
VLOG(1) << "Setting descriptors.";
cudnn_input_dims_ = X.dims();
int C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
int H = 1;
int W = 1;
int C = 1, H = 1, W = 1;
if (X.ndim() == 4) {
// Normal 4-dimensional tensors for images.
C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1));
W = (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2));
} else {
// If X is not 4-dimensional, we will simply use H = 1 and W = 1
// and wrap everything into C.
C = X.size() / X.dim32(0);
}
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
data_desc_, GetCudnnTensorFormat(order_),
@ -93,12 +97,16 @@ class CuDNNReluGradientOp final : public Operator<CUDAContext> {
if (Y.dims() != cudnn_input_dims_) {
VLOG(1) << "Setting descriptors.";
cudnn_input_dims_ = Y.dims();
int C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3));
int H = 1;
int W = 1;
int C = 1, H = 1, W = 1;
if (Y.ndim() == 4) {
// Normal 4-dimensional tensors for images.
C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3));
H = (order_ == StorageOrder::NCHW ? Y.dim32(2) : Y.dim32(1));
W = (order_ == StorageOrder::NCHW ? Y.dim32(3) : Y.dim32(2));
} else {
// If Y is not 4-dimensional, we will simply use H = 1 and W = 1
// and wrap everything into C.
C = Y.size() / Y.dim32(0);
}
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
data_desc_, GetCudnnTensorFormat(order_),

View File

@ -0,0 +1,35 @@
#include "caffe2/operators/reverse_packed_segs_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(ReversePackedSegs, ReversePackedSegsOp<CPUContext>);
OPERATOR_SCHEMA(ReversePackedSegs)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Reverse segments in a 3-D tensor (lengths, segments, embeddings), leaving
paddings unchanged. This operator is used to reverse the input of a recurrent
neural network to make it a BRNN.
)DOC")
.Input(0, "data", "a 3-D (lengths, segments, embeddings,) tensor.")
.Input(1, "lengths", "length of each segment.")
.Output(
0,
"reversed data",
"a (lengths, segments, embeddings,) tensor with each segment reversed"
"and paddings unchanged.");
class GetReversePackedSegsGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"ReversePackedSegs",
"",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(ReversePackedSegs, GetReversePackedSegsGradient);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,84 @@
#ifndef CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_
#define CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
template <class Context>
class ReversePackedSegsOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(ReversePackedSegsOp);
USE_DISPATCH_HELPER;
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<float, double, int, long, bool>>::call(
this, Input(DATA));
}
template <typename T>
bool DoRunWithType() {
if (Input(LENGTHS).template IsType<int>()) {
DoRunWithLengthType<T, int>();
} else {
DoRunWithLengthType<T, long>();
}
return true;
}
private:
INPUT_TAGS(DATA, LENGTHS);
template <typename T, typename LengthType>
void DoRunWithLengthType() {
const auto& data = Input(DATA);
const auto& lengths = Input(LENGTHS);
CAFFE_ENFORCE(
data.ndim() == 3,
"DATA should be 3-D tensor <lengths, "
"segments, embeddings>");
CAFFE_ENFORCE(lengths.ndim() == 1, "LENGTH should be 1-D");
auto* output = Output(0);
const auto& shape = data.dims();
output->Resize(shape);
const auto& max_length = data.dims()[0];
const auto& batch_size = data.dims()[1];
const auto& block_size = data.dims()[2];
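// Data is laid out time-major as (max_length, batch_size, block_size).
// For each sequence, the first seg_length time steps are copied in reverse
// order; the remaining padding steps are copied through unchanged.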
CAFFE_ENFORCE(
lengths.dims()[0] == batch_size,
"lenths size should be"
" equal to batch size");
const T* data_ptr = data.template data<T>();
const LengthType* lengths_ptr = lengths.template data<LengthType>();
T* rev_data_ptr = output->template mutable_data<T>();
for (TIndex i = 0; i < batch_size; i++) {
const auto& seg_length = lengths_ptr[i];
CHECK_LE(seg_length, max_length);
TIndex j = 0;
for (; j < seg_length; j++) {
const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size;
T* rev_data_block_ptr =
rev_data_ptr + ((seg_length - 1 - j) * batch_size + i) * block_size;
context_.template Copy<T, Context, Context>(
block_size, data_block_ptr, rev_data_block_ptr);
}
for (; j < max_length; j++) {
const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size;
T* rev_data_block_ptr =
rev_data_ptr + (j * batch_size + i) * block_size;
context_.template Copy<T, Context, Context>(
block_size, data_block_ptr, rev_data_block_ptr);
}
}
}
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_REVERSE_PACKED_SEGS_OP_H_

View File

@ -1066,9 +1066,15 @@ REGISTER_SEGMENT_DEF(
REGISTER_SEGMENT_DEF(
AbstractSortedSegmentRangeDef<float, int, CPUContext,
LogSumExpRangeReducerDef>);
REGISTER_SEGMENT_DEF(AbstractSortedSegmentRangeDef<
float,
int,
CPUContext,
LogMeanExpRangeReducerDef>);
REGISTER_SEGMENT_DEF(
AbstractSortedSegmentRangeDef<float, int, CPUContext,
MeanRangeReducerDef>);
AbstractSortedSegmentRangeDef<float, int, CPUContext, MeanRangeReducerDef>);
REGISTER_SEGMENT_DEF(
AbstractSortedSegmentRangeDef<float, int, CPUContext, MaxRangeReducerDef>);
#define REGISTER_REDUCER_WITH_ALL_OPS(reducer_def) \
REGISTER_SEGMENT_DEF( \

View File

@ -0,0 +1,152 @@
#include <algorithm>
#include <unordered_map>
#include <vector>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
namespace caffe2 {
using ValueTypes = TensorTypes<int32_t, int64_t, float, double, string, bool>;
class SparseToDenseMaskOp : public Operator<CPUContext> {
public:
SparseToDenseMaskOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {
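// IDs below kMaxDenseSize are mapped through a dense lookup vector; larger
// IDs fall back to a hash map, keeping getFeatureIdx cheap for the common
// case of small feature IDs.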
std::vector<int> mask = GetRepeatedArgument<int>("mask");
featuresCount_ = mask.size();
auto biggest = *std::max_element(mask.begin(), mask.end());
dense_.assign(std::min(kMaxDenseSize, biggest + 1), -1);
for (int i = 0; i < mask.size(); i++) {
int id = mask[i];
CAFFE_ENFORCE(id >= 0, "Only non-negative IDs are allowed.");
if (id >= kMaxDenseSize) {
sparse_[id] = i;
} else {
dense_[id] = i;
}
}
}
bool RunOnDevice() override {
const TypeMeta& meta = Input(INDICES).meta();
if (meta.Match<int32_t>()) {
return DoRunWithIndexType<int32_t>();
} else if (meta.Match<int64_t>()) {
return DoRunWithIndexType<int64_t>();
} else {
CAFFE_THROW("Unsupported type of tensor: ", meta.name());
return false;
}
}
template <typename TInd>
bool DoRunWithIndexType() {
if (InputSize() < 4) {
return DoRunWithLengthType<TInd, int32_t>();
} else {
const TypeMeta& meta = Input(LENGTHS).meta();
if (meta.Match<int32_t>()) {
return DoRunWithLengthType<TInd, int32_t>();
} else if (meta.Match<int64_t>()) {
return DoRunWithLengthType<TInd, int64_t>();
} else {
CAFFE_THROW("Unsupported type of tensor: ", meta.name());
return false;
}
}
}
template <typename TInd, typename TLen>
bool DoRunWithLengthType() {
return DispatchHelper<ValueTypes, TInd, TLen>::call(this, Input(VALUES));
}
template <typename TInd, typename TLen, typename TVal>
bool DoRunWithType() {
auto& sparse_indices = Input(INDICES);
CAFFE_ENFORCE(sparse_indices.ndim() == 1);
auto& sparse_values = Input(VALUES);
CAFFE_ENFORCE(sparse_values.ndim() == 1);
CAFFE_ENFORCE(sparse_indices.size() == sparse_values.size());
auto& default_value = Input(DEFAULT);
CAFFE_ENFORCE(default_value.size() == 1);
const TInd* sparse_indices_vec = sparse_indices.data<TInd>();
const TVal* sparse_values_vec = sparse_values.template data<TVal>();
const TVal* default_val = default_value.template data<TVal>();
int cols = featuresCount_;
int rows = 0;
TLen default_length = sparse_indices.dim32(0);
const TLen* lengths_vec = nullptr;
auto* output = Output(0);
if (InputSize() == 4) {
auto& lengths = Input(LENGTHS);
CAFFE_ENFORCE(lengths.ndim() == 1);
lengths_vec = lengths.data<TLen>();
rows = lengths.dim32(0);
output->Resize(rows, cols);
}
if (rows == 0) {
// if the LENGTHS is not set or it is empty, the output will be a vector
rows = 1;
lengths_vec = &default_length;
output->Resize(cols);
}
// init
TVal* output_data = output->template mutable_data<TVal>();
for (int i = 0; i < cols * rows; i++) {
output_data[i] = default_val[0];
}
TLen offset = 0;
for (int r = 0; r < rows; r++) {
for (int c = 0; c < lengths_vec[r]; c++) {
int idx = getFeatureIdx(sparse_indices_vec[offset + c]);
if (idx != -1) {
output_data[r * cols + idx] = sparse_values_vec[offset + c];
}
}
offset += lengths_vec[r];
}
return true;
}
private:
const int kMaxDenseSize = 1024 * 128;
std::unordered_map<int, int> sparse_;
std::vector<int> dense_;
int featuresCount_;
inline int getFeatureIdx(int id) const {
if (id >= kMaxDenseSize) {
const auto& iter = sparse_.find(id);
if (iter == sparse_.end()) {
return -1;
} else {
return iter->second;
}
} else {
return (id >= dense_.size()) ? -1 : dense_[id];
}
}
INPUT_TAGS(INDICES, VALUES, DEFAULT, LENGTHS);
};
namespace {
REGISTER_CPU_OPERATOR(SparseToDenseMask, SparseToDenseMaskOp);
OPERATOR_SCHEMA(SparseToDenseMask)
.NumInputs(3, 4)
.NumOutputs(1)
.SetDoc("Convert sparse representations to dense with given indices.")
.Output(0, "output", "1-D or 2-D dense tensor.");
NO_GRADIENT(SparseToDenseMask);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,124 @@
#include "caffe2/caffe2/operators/string_ops.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
namespace {
struct StartsWith {
explicit StartsWith(OperatorBase& op)
: prefix_(op.GetSingleArgument<std::string>("prefix", "")) {}
bool operator()(const std::string& str) {
return std::mismatch(prefix_.begin(), prefix_.end(), str.begin()).first ==
prefix_.end();
}
private:
std::string prefix_;
};
struct EndsWith {
explicit EndsWith(OperatorBase& op)
: suffix_(op.GetSingleArgument<std::string>("suffix", "")) {}
bool operator()(const std::string& str) {
return std::mismatch(suffix_.rbegin(), suffix_.rend(), str.rbegin())
.first == suffix_.rend();
}
private:
std::string suffix_;
};
struct Prefix {
explicit Prefix(OperatorBase& op)
: length_(op.GetSingleArgument<int>("length", 3)) {}
std::string operator()(const std::string& str) {
return std::string(str.begin(), std::min(str.end(), str.begin() + length_));
}
private:
int length_;
};
struct Suffix {
explicit Suffix(OperatorBase& op)
: length_(op.GetSingleArgument<int>("length", 3)) {}
std::string operator()(const std::string& str) {
return std::string(std::max(str.begin(), str.end() - length_), str.end());
}
private:
int length_;
};
template <typename ScalarFunctor, typename OutputType = std::string>
using StringElementwiseOp = UnaryElementwiseWithArgsOp<
TensorTypes<std::string>,
CPUContext,
ForEach<ScalarFunctor>,
OutputType>;
REGISTER_CPU_OPERATOR(StringPrefix, StringElementwiseOp<Prefix>);
REGISTER_CPU_OPERATOR(StringSuffix, StringElementwiseOp<Suffix>);
REGISTER_CPU_OPERATOR(StringStartsWith, StringElementwiseOp<StartsWith, bool>);
REGISTER_CPU_OPERATOR(StringEndsWith, StringElementwiseOp<EndsWith, bool>);
OPERATOR_SCHEMA(StringPrefix)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Computes the element-wise string prefix of the string tensor.
Input strings that are shorter than prefix length will be returned unchanged.
NOTE: Prefix is computed on number of bytes, which may lead to wrong behavior
and potentially invalid strings for variable-length encodings such as utf-8.
)DOC")
.Arg("length", "Maximum size of the prefix, in bytes.")
.Input(0, "strings", "Tensor of std::string.")
.Output(
0,
"prefixes",
"Tensor of std::string containing prefixes for each input.");
OPERATOR_SCHEMA(StringSuffix)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Computes the element-wise string suffix of the string tensor.
Input strings that are shorter than suffix length will be returned unchanged.
NOTE: Suffix is computed on number of bytes, which may lead to wrong behavior
and potentially invalid strings for variable-length encodings such as utf-8.
)DOC")
.Input(0, "strings", "Tensor of std::string.")
.Output(
0,
"suffixes",
"Tensor of std::string containing suffixes for each output.")
.Arg("length", "Maximum size of the suffix, in bytes.");
OPERATOR_SCHEMA(StringStartsWith)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Performs the starts-with check on each string in the input tensor.
Returns tensor of boolean of the same dimension of input.
)DOC")
.Arg("prefix", "The prefix to check input strings against.")
.Input(0, "strings", "Tensor of std::string.")
.Output(0, "bools", "Tensor of bools of same shape as input.");
OPERATOR_SCHEMA(StringEndsWith)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Performs the ends-with check on each string in the input tensor.
Returns tensor of boolean of the same dimension of input.
)DOC")
.Arg("suffix", "The suffix to check input strings against.")
.Input(0, "strings", "Tensor of std::string.")
.Output(0, "bools", "Tensor of bools of same shape as input.");
SHOULD_NOT_DO_GRADIENT(StringPrefix);
SHOULD_NOT_DO_GRADIENT(StringSuffix);
SHOULD_NOT_DO_GRADIENT(StringStartsWith);
SHOULD_NOT_DO_GRADIENT(StringEndsWith);
}
} // namespace caffe2

View File

@ -0,0 +1,13 @@
#pragma once
#include "caffe2/core/operator.h"
#include "caffe2/operators/elementwise_op.h"
namespace caffe2 {
template <typename ScalarFunctor, typename OutputType = std::string>
using StringElementwiseOp = UnaryElementwiseWithArgsOp<
TensorTypes<std::string>,
CPUContext,
ForEach<ScalarFunctor>,
OutputType>;
}

View File

@ -10,24 +10,25 @@ namespace caffe2 {
#define COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS 5
namespace {
// TODO(jiayq): one possible optimization is to copy the buffer into a shared memory
// location to speed up access.
// TODO(jiayq): one possible optimization is to copy the buffer into a shared
// memory location to speed up access.
template <typename Dtype>
__global__ void transpose_gpu(const int nthreads, const Dtype* from_data,
Dtype* to_data, const int* buffer, const int num_axes) {
int from_inds[COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS];
const int* from_counts = buffer;
const int* to_counts = buffer + num_axes;
const int* map = buffer + num_axes * 2;
const int* axes = buffer + num_axes * 2;
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int from_index = index, to_index = 0;
for (int i = 0; i < num_axes; i++) {
from_inds[i] = from_index / from_counts[i];
from_index = from_index % from_counts[i];
for (int i = num_axes - 1; i >= 0; --i) {
from_inds[i] = from_index % from_counts[i];
from_index = from_index / from_counts[i];
}
for (int i = 0; i < num_axes; i++) {
to_index += from_inds[map[i]] * to_counts[i];
for (int i = 0; i < num_axes - 1; i++) {
to_index = (to_index + from_inds[axes[i]]) * to_counts[i + 1];
}
to_index += from_inds[axes[num_axes - 1]];
to_data[to_index] = from_data[index];
}
}
@ -42,7 +43,7 @@ bool TransposeOp<CUDAContext>::DoRunWithType() {
int ndim = input.ndim();
CAFFE_ENFORCE(count < std::numeric_limits<int>::max(),
"Transpose op on GPU only supports int32");
CAFFE_ENFORCE(count < COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS,
CAFFE_ENFORCE(ndim < COMPILE_TIME_CUDA_MAX_TRANSPOSE_DIMS,
"Input ndim exceeds compile time max.");
// Buffer contains the following data:
// (1) the dimensions of the inputs

View File

@ -45,7 +45,8 @@ class TransposeOp final : public Operator<Context> {
}
Y->Resize(new_dims_);
// Do the actual transpose, which is implemented in DoRunWithType().
return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
return DispatchHelper<TensorTypes<float, double, int, long>>::call(
this, Input(0));
}
protected:

View File

@ -17,6 +17,7 @@ REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(Copy, CopyOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(Shape, ShapeOp<CPUContext>);
REGISTER_CPU_OPERATOR(HasElements, HasElementsOp<CPUContext>);
REGISTER_CPU_OPERATOR(IsEmpty, IsEmptyOp<CPUContext>);
REGISTER_CPU_OPERATOR(Gather, GatherOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(Unique, UniqueOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp<CPUContext>);
@ -24,6 +25,7 @@ REGISTER_CPU_OPERATOR(SegmentIdsToLengths, SegmentIdsToLengthsOp<CPUContext>);
REGISTER_CPU_OPERATOR(Slice, SliceOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(Squeeze, SqueezeOp<CPUContext>);
REGISTER_CPU_OPERATOR(ExpandDims, ExpandDimsOp<CPUContext>);
REGISTER_CPU_OPERATOR(And, AndOp<CPUContext>);
OPERATOR_SCHEMA(Print)
.NumInputs(1)
@ -209,6 +211,13 @@ OPERATOR_SCHEMA(HasElements)
"has_elements",
"Scalar bool tensor. True if input is not empty.");
OPERATOR_SCHEMA(IsEmpty)
.NumInputs(1)
.NumOutputs(1)
.SetDoc("Returns true iff the input tensor has size == 0")
.Input(0, "tensor", "Tensor of any type.")
.Output(0, "is_empty", "Scalar bool tensor. True if input is empty.");
OPERATOR_SCHEMA(Gather)
.NumInputs(2)
.NumOutputs(1)
@ -340,9 +349,21 @@ If the same blob is provided in input and output, the operation is copy-free.
.Input(0, "data", "Original tensor")
.Output(0, "expanded", "Reshaped tensor with same data as input.");
OPERATOR_SCHEMA(And)
.NumInputs(2)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.SetDoc(R"DOC(
Outputs true iff both input blob values are true.
)DOC")
.Input(0, "input_0", "first boolean input.")
.Input(1, "input_1", "second boolean input.")
.Output(0, "output", "input_0 && input_1.");
SHOULD_NOT_DO_GRADIENT(Print);
SHOULD_NOT_DO_GRADIENT(Shape);
SHOULD_NOT_DO_GRADIENT(HasElements);
SHOULD_NOT_DO_GRADIENT(IsEmpty);
class GetSqueezeGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
@ -433,6 +454,7 @@ SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds);
SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths);
// TODO(azzolini): Add support for slice gradient
SHOULD_NOT_DO_GRADIENT(Slice);
SHOULD_NOT_DO_GRADIENT(And);
} // namespace
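A short usage sketch for the two operators registered above; blob names are illustrative, and ConstantBoolFill is used the same way the counter test later in this commit uses it.

import numpy as np
from caffe2.python import core, workspace

# IsEmpty on a tensor with size == 0.
workspace.FeedBlob('data', np.zeros((0,), dtype=np.float32))
workspace.RunOperatorOnce(core.CreateOperator('IsEmpty', ['data'], ['is_empty']))
assert workspace.FetchBlob('is_empty')

# And on two scalar bool blobs.
workspace.RunOperatorOnce(core.CreateOperator(
    'ConstantBoolFill', [], ['t'], value=1.0, shape=[]))
workspace.RunOperatorOnce(core.CreateOperator(
    'ConstantBoolFill', [], ['f'], value=0.0, shape=[]))
workspace.RunOperatorOnce(core.CreateOperator('And', ['t', 'f'], ['t_and_f']))
assert not workspace.FetchBlob('t_and_f')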

View File

@ -675,6 +675,21 @@ class HasElementsOp : public Operator<Context> {
}
};
template <class Context>
class IsEmptyOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(IsEmptyOp);
bool RunOnDevice() override {
auto& input = Input(0);
auto* output = OperatorBase::Output<TensorCPU>(0);
output->Resize(std::vector<TIndex>{});
*output->template mutable_data<bool>() = (input.size() == 0);
return true;
}
};
// RecordShapeOp records the shape of the input tensor to a vector of int. You
// mostly don't need this operator explicitly, and it is mostly used in the
// autodiff process.
@ -911,6 +926,23 @@ class UniqueOp : public Operator<Context> {
public:
OUTPUT_TAGS(UNIQUE, REMAPPING);
};
template <class Context>
class AndOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
AndOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {}
bool RunOnDevice() override {
const auto* i1 = Input(0).template data<bool>();
const auto* i2 = Input(1).template data<bool>();
auto* output = Output(0);
output->Resize(std::vector<int>{});
*output->template mutable_data<bool>() = (*i1 && *i2);
return true;
}
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_UTILITY_OPS_H_

View File

@ -199,7 +199,7 @@ message ExecutionStep {
// Criteria network specifies a single output (TensorCPU<bool>) of
// size (1), is run on every iteration by the executor, and
// execution terminates when the output[0] is `false`.
optional string criteria_network = 5;
optional string criteria_network = 5 [deprecated=true];
// If specified, run report_net asynchronously every `report_interval`
// seconds. Report_net is guaranteed to run at least once after all
@ -210,6 +210,20 @@ message ExecutionStep {
// If false or not set, execute sub-steps serially.
// If true, execute all substeps concurrently, each one in a separate thread.
optional bool concurrent_substeps = 6;
// Name of a scalar boolean tensor.
// The ExecutionStep checks this blob AFTER every substep/subnet.
// If specified, and the value is true, the ExecutionStep will skip the rest
// and return immediately.
// This means that the report_net and the first substep will always run.
// Use cases:
// 1) the first substep stops the rest if a data condition is not met
// 2) the first substep decides which of the remaining steps should be run
// 3) external control
//
// ** It is the user's responsibility to keep this blob free of race
// ** conditions, for example when setting it from concurrent substeps.
optional string should_stop_blob = 9;
}
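To make the semantics concrete, a loop driven by should_stop_blob can be set up from the Python API roughly as follows. This is a minimal sketch mirroring the test added later in this commit; the blob names and the iteration limit are illustrative.

import numpy as np
from caffe2.python import core, workspace

workspace.FeedBlob('iter', np.asarray([0]).astype(np.int32))
workspace.FeedBlob('num_iters', np.asarray([10]).astype(np.int32))

body = core.Net('body')
body.Iter(['iter'], ['iter'])

criteria = core.Net('criteria')
criteria.GE(['iter', 'num_iters'], ['stop'])
criteria.Proto().external_output.extend(['stop'])

plan = core.Plan('plan')
plan.AddStep(core.execution_step(
    'loop', [criteria, body],
    should_stop_blob=core.BlobReference('stop')))
workspace.RunPlan(plan)  # loops until 'stop' becomes true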
message PlanDef {

View File

@ -87,7 +87,6 @@ const TypeMeta& NumpyTypeToCaffe(int numpy_type) {
{NPY_UINT8, TypeMeta::Make<uint8_t>()},
{NPY_UINT16, TypeMeta::Make<uint16_t>()},
{NPY_OBJECT, TypeMeta::Make<std::string>()},
{NPY_STRING, TypeMeta::Make<std::string>()},
// Note: Add more types here.
};
static TypeMeta unknown_type;
@ -565,11 +564,10 @@ PyObject* FetchBlob(PyObject* self, PyObject* args) {
PyObject* FeedBlob(PyObject* self, PyObject* args) {
char* name_char;
PyArrayObject* array = nullptr;
PyObject* arg = nullptr;
PyObject* device_option_string = nullptr;
// TODO(dzhulgakov): implement accepting other types (at least string)
if (!PyArg_ParseTuple(args, "sO!|O", &name_char, &PyArray_Type, &array,
&device_option_string)) {
if (!PyArg_ParseTuple(
args, "sO|O", &name_char, &arg, &device_option_string)) {
PyErr_SetString(PyExc_ValueError, "Incorrect arguments.");
return nullptr;
}
@ -584,45 +582,57 @@ PyObject* FeedBlob(PyObject* self, PyObject* args) {
}
Blob* blob = gWorkspace->CreateBlob(name);
auto feeder = CreateFeeder(option.device_type());
if (!feeder) {
PyErr_SetString(PyExc_TypeError,
"Unknown device type encountered in FeedBlob.");
if (PyArray_Check(arg)) { // numpy array
PyArrayObject* array = reinterpret_cast<PyArrayObject*>(arg);
auto feeder = CreateFeeder(option.device_type());
if (!feeder) {
PyErr_SetString(
PyExc_TypeError, "Unknown device type encountered in FeedBlob.");
return nullptr;
}
return feeder->Feed(option, array, blob);
} else if (PyString_Check(arg)) { // string
*blob->GetMutable<std::string>() = PyBytesToStdString(arg);
Py_RETURN_TRUE;
} else {
PyErr_SetString(
PyExc_ValueError,
"Unexpected type of argument - only numpy array or string are "
"supported for feeding");
return nullptr;
}
return feeder->Feed(option, array, blob);
}
// A simple macro to avoid writing repeated symbols.
#define _PYNAME(name) {#name, name, METH_VARARGS, ""}
PyMethodDef* GetCaffe2PythonMethods() {
static PyMethodDef gCaffe2PythonMethods[] = {
// Note(Yangqing): For any function that we are going to override in the
// python file, we prepend "cc_" here.
_PYNAME(GlobalInit),
_PYNAME(RegisteredOperators),
{"cc_GetGradientDefs", GetGradientDefs, METH_VARARGS, ""},
_PYNAME(SwitchWorkspace),
_PYNAME(CurrentWorkspace),
_PYNAME(Workspaces),
{"cc_ResetWorkspace", ResetWorkspace, METH_VARARGS, ""},
_PYNAME(RootFolder),
_PYNAME(OnModuleExit),
_PYNAME(Blobs),
_PYNAME(HasBlob),
{"cc_CreateNet", CreateNet, METH_VARARGS, ""},
_PYNAME(RunNet),
_PYNAME(BenchmarkNet),
_PYNAME(DeleteNet),
_PYNAME(Nets),
{"cc_RunOperatorOnce", RunOperatorOnce, METH_VARARGS, ""},
{"cc_RunNetOnce", RunNetOnce, METH_VARARGS, ""},
{"cc_RunPlan", RunPlan, METH_VARARGS, ""},
_PYNAME(CreateBlob),
_PYNAME(SerializeBlob),
_PYNAME(FetchBlob),
{"cc_FeedBlob", FeedBlob, METH_VARARGS, ""},
{nullptr, nullptr, 0, nullptr}, // end of python methods.
// Note(Yangqing): For any function that we are going to override in the
// python file, we prepend "cc_" here.
_PYNAME(GlobalInit),
_PYNAME(RegisteredOperators),
{"cc_GetGradientDefs", GetGradientDefs, METH_VARARGS, ""},
_PYNAME(SwitchWorkspace),
_PYNAME(CurrentWorkspace),
_PYNAME(Workspaces),
{"cc_ResetWorkspace", ResetWorkspace, METH_VARARGS, ""},
_PYNAME(RootFolder),
_PYNAME(OnModuleExit),
_PYNAME(Blobs),
_PYNAME(HasBlob),
{"cc_CreateNet", CreateNet, METH_VARARGS, ""},
_PYNAME(RunNet),
_PYNAME(BenchmarkNet),
_PYNAME(DeleteNet),
_PYNAME(Nets),
{"cc_RunOperatorOnce", RunOperatorOnce, METH_VARARGS, ""},
{"cc_RunNetOnce", RunNetOnce, METH_VARARGS, ""},
{"cc_RunPlan", RunPlan, METH_VARARGS, ""},
_PYNAME(CreateBlob),
_PYNAME(SerializeBlob),
{"cc_FetchBlob", FetchBlob, METH_VARARGS, ""},
{"cc_FeedBlob", FeedBlob, METH_VARARGS, ""},
{nullptr, nullptr, 0, nullptr}, // end of python methods.
};
return gCaffe2PythonMethods;
}

View File

@ -160,7 +160,10 @@ class TensorFetcher : public BlobFetcherBase {
Py_DECREF(outObj[j]);
}
Py_DECREF(array);
LOG(FATAL) << "Failed to allocate string for ndarray of strings.";
PyErr_SetString(
PyExc_TypeError,
"Failed to allocate string for ndarray of strings.");
return nullptr;
}
}
return array;
@ -217,21 +220,14 @@ class TensorFeeder : public BlobFeederBase {
char* str;
Py_ssize_t strSize;
if (PyBytes_AsStringAndSize(input[i], &str, &strSize) == -1) {
LOG(FATAL) << "Unsupported pyhton object type passed into ndarray.";
PyErr_SetString(
PyExc_TypeError,
"Unsupported python object type passed into ndarray.");
return nullptr;
}
outPtr[i] = std::string(str, strSize);
}
} break;
case NPY_STRING: {
char* inputData = PyArray_BYTES(array);
auto* outPtr = tensor->template mutable_data<std::string>();
auto itemSize = PyArray_ITEMSIZE(array);
for (int i = 0; i < tensor->size(); ++i) {
auto start = inputData + i * itemSize;
auto end = std::find(start, start + itemSize, '\0');
outPtr[i] = std::string(start, end - start);
}
} break;
default:
context.template CopyBytes<CPUContext, Context>(
tensor->size() * meta.itemsize(),

View File

@ -613,6 +613,7 @@ def GetArgumentParser():
)
parser.add_argument("--net_type", type=str, default="dag")
parser.add_argument("--num_workers", type=int, default=2)
parser.add_argument("--use-nvtx", default=False, action='store_true')
return parser
@ -624,7 +625,9 @@ if __name__ == '__main__':
):
GetArgumentParser().print_help()
workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
workspace.GlobalInit(
    ['caffe2', '--caffe2_log_level=0'] +
    (['--caffe2_use_nvtx'] if args.use_nvtx else []))
model_map = {
'AlexNet': AlexNet,
'OverFeat': OverFeat,

View File

@ -146,7 +146,7 @@ def ScopedBlobReference(name, *args, **kwargs):
return BlobReference(scope.NAMESCOPE + name, *args, **kwargs)
def _RectifyInputOutput(blobs):
def _RectifyInputOutput(blobs, net=None):
"""A helper function to rectify the input or output of the CreateOperator
interface.
"""
@ -154,18 +154,18 @@ def _RectifyInputOutput(blobs):
# If blobs is a single string, prepend scope.NAMESCOPE and put it as a
# list.
# TODO(jiayq): enforce using BlobReference instead of raw strings.
return [ScopedBlobReference(blobs)]
return [ScopedBlobReference(blobs, net=net)]
elif type(blobs) is BlobReference:
# If blob is a BlobReference, simply put it as a list.
return [BlobReference(str(blobs))]
elif type(blobs) is list:
return [blobs]
elif type(blobs) in (list, tuple):
# If blob is a list, we go through it and type check.
rectified = []
for blob in blobs:
if isinstance(blob, basestring):
rectified.append(ScopedBlobReference(blob))
rectified.append(ScopedBlobReference(blob, net=net))
elif type(blob) is BlobReference:
rectified.append(BlobReference(str(blob)))
rectified.append(blob)
else:
raise TypeError(
"I/O blob #{} of unsupported type: {} of type {}"
@ -670,8 +670,19 @@ def get_op_ids_in_path(ssa, blob_versions, inputs, outputs):
class Net(object):
_net_names_used = set()
operator_registry_ = {}
@staticmethod
def _get_next_net_name(basename):
name = basename
next_idx = 1
while name in Net._net_names_used:
name = basename + '_' + str(next_idx)
next_idx += 1
Net._net_names_used |= set([name])
return name
def __init__(self, name_or_proto):
"""
Create a Net.
@ -706,29 +717,29 @@ class Net(object):
else:
self._next_name_index = 0
else:
name = name_or_proto
self._net = caffe2_pb2.NetDef()
self._net.name = name
self._net.name = name_or_proto
self._next_name_index = 0
# make sure that this net name hasn't been used before
self._net.name = Net._get_next_net_name(self._net.name)
def __str__(self):
return self._net.name
def DefinesBlob(self, blob):
def BlobIsDefined(self, blob):
"""
Returns true if the given BlobReference is produced as output of
an operator in this net, or if it is provided as an external input.
"""
if isinstance(blob, BlobReference):
assert blob.Net() == self, 'Reference belongs to different net'
blob_name = str(blob)
for input in self._net.external_input:
if input == blob_name:
return True
for op in self._net.op:
for output in op.output:
if output == blob_name:
return True
for input in self._net.external_input:
if input == blob_name:
return True
return False
def UsesBlob(self, blob):
@ -753,7 +764,7 @@ class Net(object):
raises KeyError.
"""
blob_name = str(blob_name)
if not self.DefinesBlob(blob_name):
if not self.BlobIsDefined(blob_name):
raise KeyError('Net does not define blob %s' % blob_name)
return BlobReference(blob_name, self)
@ -818,13 +829,16 @@ class Net(object):
new_outputs: list of BlobReferences corresponding to the
outputs produced by new_net.
"""
inputs = inputs if isinstance(inputs, dict) else {i: i for i in inputs}
input_is_pair_list = isinstance(inputs, list) and all(
isinstance(i, tuple) and len(i) == 2 for i in inputs)
inputs = (
inputs if isinstance(inputs, (dict, OrderedDict)) else
OrderedDict(inputs) if input_is_pair_list else
OrderedDict(zip(inputs, inputs)))
for output in outputs:
assert self.BlobIsDefined(output)
input_names = {str(k): str(v) for k, v in inputs.items()}
output_names = [str(o) for o in outputs]
for input in inputs.keys():
assert self.UsesBlob(input)
for output in outputs:
assert self.DefinesBlob(output)
proto = self._net
ssa, blob_versions = get_ssa(proto)
used_op_ids = get_op_ids_in_path(ssa, blob_versions, inputs, outputs)
@ -859,11 +873,23 @@ class Net(object):
def Proto(self):
return self._net
def NextName(self):
def NextName(self, prefix=None, output_id=None):
"""Returns the next name to be used, if you do not want to explicitly
name your blob."""
output_name = self._net.name + '_blob_' + str(self._next_name_index)
self._next_name_index += 1
if prefix:
output_name_base = self._net.name + '/' + prefix
output_name = output_name_base
if output_id is not None:
output_name += ':' + str(output_id)
index = 2
while self.BlobIsDefined(output_name):
output_name = output_name_base + '_' + str(index)
if output_id is not None:
output_name += ':' + str(output_id)
index += 1
else:
output_name = self._net.name + '_blob_' + str(self._next_name_index)
self._next_name_index += 1
return str(output_name)
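    # Editorial example: if self._net.name is 'train', NextName(prefix='Relu',
    # output_id=0) yields 'train/Relu:0' and then 'train/Relu_2:0' once that
    # blob is already defined, while plain NextName() keeps yielding
    # 'train_blob_0', 'train_blob_1', ... (names here are illustrative).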
def AddGradientOperators(self, ys, skip=0):
@ -900,16 +926,18 @@ class Net(object):
self._net.op.extend(grad_ops)
return input_to_grad
def AddExternalInput(self, input_name):
input_name = str(input_name)
def AddExternalInput(self, input):
input_name = str(input)
assert input_name not in self._net.external_input, (
'Net already contains an input named %s' % input_name)
self._net.external_input.extend([input_name])
return BlobReference(input_name, self)
return (
input if isinstance(input, BlobReference)
else BlobReference(input_name))
def AddExternalOutput(self, output):
assert isinstance(output, BlobReference)
assert self.DefinesBlob(output)
assert self.BlobIsDefined(output)
self.Proto().external_output.extend([str(output)])
def DeduplicateGradientSlices(self, g):
@ -931,14 +959,22 @@ class Net(object):
def _CreateAndAddToSelf(self, op_type, inputs, outputs=None, **kwargs):
"""A helper function to create an operator and add it to self.
"""
inputs = _RectifyInputOutput(inputs)
for input in inputs:
if not self.BlobIsDefined(input):
assert input.Net() != self
self.AddExternalInput(input)
if outputs is None:
# If we do not specify an output, we will assume that this op
# produces one output in this case.
outputs = self.NextName()
outputs = self.NextName(prefix=op_type)
elif type(outputs) is int:
# In this case, we will auto-fill the given number of outputs
# with auto-generated names.
outputs = [self.NextName() for i in range(outputs)]
outputs = [
self.NextName(prefix=op_type, output_id=i)
for i in range(outputs)]
outputs = _RectifyInputOutput(outputs, net=self)
op = CreateOperator(op_type, inputs, outputs, **kwargs)
self._net.op.extend([op])
if len(op.output) == 0:
@ -1036,10 +1072,11 @@ class ExecutionStep(object):
self._assert_can_mutate()
self._step.num_iter = num_iter
def SetCriteriaNet(self, criteria_net):
def SetShouldStopBlob(self, should_stop_blob):
assert isinstance(should_stop_blob, BlobReference), (
"expects BlobReference here, got {}".format(type(should_stop_blob)))
self._assert_can_mutate()
_add_net_to_dict(self._net_dict, criteria_net)
self._step.criteria_network = get_net_name(criteria_net)
self._step.should_stop_blob = str(should_stop_blob)
def SetReportNet(self, report_net, report_interval):
self._assert_can_mutate()
@ -1053,7 +1090,7 @@ class ExecutionStep(object):
if isinstance(substep, ExecutionStep):
substep._notify_is_used()
if not substep.HasNets() and not substep.HasSubsteps():
return
return self
for net in substep.Nets():
_add_net_to_dict(self._net_dict, net)
self._substeps.append(substep)
@ -1061,6 +1098,7 @@ class ExecutionStep(object):
else:
proto = substep
self._step.substep.add().CopyFrom(proto)
return self
def SetConcurrentSubsteps(self, concurrent_substeps):
self._assert_can_mutate()
@ -1073,6 +1111,7 @@ class ExecutionStep(object):
assert isinstance(net, Net)
_add_net_to_dict(self._net_dict, net)
self._step.network.extend([get_net_name(net)])
return self
class Plan(object):
@ -1107,11 +1146,11 @@ class Plan(object):
def execution_step(default_name,
steps_or_nets,
criteria=None,
num_iter=None,
report_net=None,
report_interval=None,
concurrent_substeps=None):
concurrent_substeps=None,
should_stop_blob=None):
"""
Helper for creating an ExecutionStep.
- steps_or_nets can be:
@ -1120,18 +1159,20 @@ def execution_step(default_name,
- ExecutionStep
- list<Net>
- list<ExecutionStep>
- criteria is either None or a Net
- if no criteria or num_iter is provided, defaults to num_iter=1
- should_stop_blob is either None or a scalar boolean blob.
- This blob is checked AFTER every substep/subnet.
- If specified and true, then this step will return immediately.
- Be sure to handle race conditions if setting from concurrent threads.
- if no should_stop_blob or num_iter is provided, defaults to num_iter=1
"""
assert criteria is None or isinstance(criteria, Net)
assert criteria is None or num_iter is None, (
'Cannot set both criteria and num_iter.')
if criteria is None and num_iter is None:
assert should_stop_blob is None or num_iter is None, (
'Cannot set both should_stop_blob and num_iter.')
if should_stop_blob is None and num_iter is None:
num_iter = 1
def set_criteria(step):
if criteria is not None:
step.SetCriteriaNet(criteria)
def set_step_attr(step):
if should_stop_blob is not None:
step.SetShouldStopBlob(should_stop_blob)
else:
step.SetIter(num_iter)
if concurrent_substeps is not None:
@ -1144,18 +1185,20 @@ def execution_step(default_name,
if not steps_or_nets:
return ExecutionStep(default_name)
if isinstance(steps_or_nets, ExecutionStep):
return set_criteria(steps_or_nets)
step = set_step_attr(ExecutionStep(default_name))
step.AddSubstep(steps_or_nets)
return step
elif isinstance(steps_or_nets, Net):
step = set_criteria(ExecutionStep(default_name))
step = set_step_attr(ExecutionStep(default_name))
step.AddNet(steps_or_nets)
return step
elif isinstance(steps_or_nets, list):
if isinstance(steps_or_nets[0], Net):
step = set_criteria(ExecutionStep(default_name))
step = set_step_attr(ExecutionStep(default_name))
map(step.AddNet, steps_or_nets)
return step
elif isinstance(steps_or_nets[0], ExecutionStep):
step = set_criteria(ExecutionStep(default_name))
step = set_step_attr(ExecutionStep(default_name))
map(step.AddSubstep, steps_or_nets)
return step
else:

caffe2/python/dataset.py (new file, 276 lines)
View File

@ -0,0 +1,276 @@
"""
Implementation of an in-memory dataset with structured schema.
Use this to store and iterate through datasets with complex schema that
fit in memory.
Iterating through entries of this dataset is very fast since the dataset
is stored as a set of native Caffe2 tensors, thus no type conversion or
deserialization is necessary.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.io import Reader, Writer
from caffe2.python.schema import Struct
import numpy as np
class _DatasetReader(Reader):
def __init__(self, field_names, field_blobs, cursor, name):
"""Don't call this directly. Instead, use dataset.reader()"""
self.field_names = field_names
self.field_blobs = field_blobs
self.cursor = cursor
self.name = name
def read(self, read_net, batch_size=1):
with core.NameScope(read_net.NextName(self.name)):
fields = read_net.ReadNextBatch(
[self.cursor] + self.field_blobs,
self.field_names,
batch_size=batch_size)
return (read_net.IsEmpty([fields[0]]), fields)
def reset(self, net):
net.ResetCursor([self.cursor], [])
class _DatasetRandomReader(Reader):
def __init__(self, field_names, field_blobs, cursor, name, indices):
"""Don't call this directly. Instead, use dataset.random_reader()"""
self.field_names = field_names
self.field_blobs = field_blobs
self.cursor = cursor
self.name = name
self.indices = indices
def reset(self, net):
net.ResetCursor([self.cursor], [])
def computeoffset(self, net):
self.reset(net)
offsets = net.ComputeOffset(
[self.cursor] + self.field_blobs,
'offsets')
self.offsets = offsets
def read(self, read_net, batch_size=1):
fields = read_net.ReadRandomBatch(
[self.cursor, self.indices, self.offsets] + self.field_blobs,
self.field_names,
batch_size=batch_size)
return (read_net.IsEmpty([fields[0]]), fields)
class _DatasetWriter(Writer):
def __init__(self, fields, field_blobs, init_net):
"""Don't call this directly. Use dataset.writer() instead."""
self.fields = fields
self.field_blobs = field_blobs
self.mutex = init_net.CreateMutex([])
def write(self, writer_net, fields):
"""
Add operations to `writer_net` that append the blobs in `fields` to the end
of the dataset. An additional operator will also be added that checks
the consistency of the data in `fields` against the dataset schema.
Args:
writer_net: The net that will contain the Append operators.
fields: A list of BlobReference to be appended to this dataset.
"""
assert len(fields) == len(self.fields), (
'Expected %s fields, got %s.' % (len(self.fields), len(fields)))
writer_net.CheckDatasetConsistency(fields, [], fields=self.fields)
writer_net.AtomicAppend(
[self.mutex] + list(self.field_blobs) + list(fields),
self.field_blobs)
def commit(self, finish_net):
"""Commit is a no-op for an in-memory dataset."""
pass
def to_ndarray_list(values, schema):
"""
Given a list of values and a dataset schema, produce list of ndarray in the
right format.
This function will perform some checks to make sure that the arrays
produced have the right dtype and rank.
"""
assert isinstance(schema, Struct), 'schema must be a Struct.'
names = schema.field_names()
types = schema.field_types()
assert len(types) == len(values), (
'Values must have %d elements, got %d' % (len(types), len(values)))
arrays = []
for value, dtype, name in zip(values, types, names):
array = np.array(value, dtype=dtype.base)
# if array is empty we may need to reshape a little
if array.size == 0:
array = array.reshape((0,) + dtype.shape)
# check that the inner dimensions match the schema
assert (array.shape[1:] == dtype.shape), (
'Invalid array shape for field %s. Expected (%s), got (%s).' % (
name,
', '.join(['_'] + map(str, dtype.shape)),
', '.join(map(str, array.shape))))
arrays.append(array)
return arrays
def Const(net, value, dtype=None, name=None):
"""
Create a 'constant' by first creating an external input in the given
net, and then feeding the corresponding blob with its provided value
in the current workspace. The name is automatically generated in order
to avoid clashes with existing blob names.
"""
assert isinstance(net, core.Net), 'net must be a core.Net instance.'
value = np.array(value, dtype=dtype)
blob = net.AddExternalInput(net.NextName(prefix=name))
workspace.FeedBlob(str(blob), value)
return blob
class Dataset(object):
"""Represents an in-memory dataset with fixed schema.
Use this to store and iterate through datasets with complex schema that
fit in memory.
Iterating through entries of this dataset is very fast since the dataset
is stored as a set of native Caffe2 tensors, thus no type conversion or
deserialization is necessary.
"""
def __init__(self, fields, name=None):
"""Create an un-initialized dataset with schema provided by `fields`.
Before this dataset can be used, it must be initialized, either by
`init_empty` or `init_from_dataframe`.
Args:
fields: either a schema.Struct or a list of field names in a format
compatible with the one described in schema.py.
name: optional name to prepend to blobs that will store the data.
"""
assert isinstance(fields, list) or isinstance(fields, Struct), (
'fields must be either a Struct or a list of raw field names.')
self.schema = fields
self.fields = (
fields.field_names() if isinstance(fields, Struct) else fields)
self.field_types = (
fields.field_types() if isinstance(fields, Struct) else
[np.dtype(np.void)] * len(self.fields))
self.name = name or 'dataset'
self.field_blobs = None
def init_empty(self, init_net):
"""Initialize the blobs for this dataset with empty values.
Empty arrays will be immediately fed into the current workspace,
and `init_net` will take those blobs as external inputs.
"""
self.field_blobs = [Const(init_net, [], name=f) for f in self.fields]
def init_from_dataframe(self, net, dataframe):
"""Initialize the blobs for this dataset from a Pandas dataframe.
Each column of the dataframe will be immediately fed into the current
workspace, and the `net` will take these blobs as external inputs.
"""
assert len(self.fields) == len(dataframe.columns)
self.field_blobs = [
Const(net, dataframe.as_matrix([col]).flatten(), name=field)
for col, field in enumerate(self.fields)]
def get_blobs(self):
"""
Return the list of BlobReference pointing to the blobs that contain
the data for this dataset.
"""
assert self.field_blobs, 'Dataset not initialized.'
return self.field_blobs
def field_names(self):
"""Return the list of field names for this dataset."""
return self.fields
def field_types(self):
"""
Return the list of field dtypes for this dataset.
If a list of strings, not a schema.Struct, was passed to the
constructor, this will return a list of dtype(np.void).
"""
return self.field_types
def reader(self, init_net, cursor_name=None):
"""Create a Reader object that is used to iterate through the dataset.
This will append operations to `init_net` that create a TreeCursor,
used to iterate through the data.
NOTE: Currently, it is not safe to append to a dataset while reading.
Args:
init_net: net that will be run once to create the cursor.
cursor_name: optional name for the blob containing a pointer
to the cursor.
Returns:
A _DatasetReader that can be used to create operators that will
iterate through the dataset.
"""
assert self.field_blobs, 'Dataset not initialized.'
cursor_name = cursor_name or (self.name + '_cursor')
cursor = init_net.CreateTreeCursor(
[],
[cursor_name],
fields=self.fields)
return _DatasetReader(
self.fields, self.field_blobs, cursor, cursor_name)
def random_reader(self, init_net, indices, cursor_name=None):
"""Create a Reader object that is used to iterate through the dataset.
NOTE: The reader order depends on the order in indices.
Args:
Same as `reader`, plus:
indices: blob containing the reading order.
Returns:
A DatasetReader that can be used to create operators that will
iterate through the dataset according to indices.
"""
assert self.field_blobs, 'Dataset not initialized.'
cursor_name = cursor_name or (self.name + '_cursor')
cursor = init_net.CreateTreeCursor(
[],
[cursor_name],
fields=self.fields)
return _DatasetRandomReader(
self.fields, self.field_blobs, cursor, cursor_name, indices)
def writer(self, init_net):
"""Create a Writer that can be used to append entries into the dataset.
NOTE: Currently, it is not safe to append to a dataset
while reading from it.
NOTE: The current implementation of the writer is not thread safe.
TODO: fixme
Args:
init_net: net that will be run once in order to create the writer.
(currently not used)
"""
assert self.field_blobs, 'Dataset not initialized.'
return _DatasetWriter(self.fields, self.field_blobs, init_net)
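A condensed usage sketch for the class above; the single-field schema and its values are made up, and dataset_ops_test.py later in this commit is the full walkthrough.

import numpy as np
from caffe2.python import core, workspace, dataset
from caffe2.python.dataset import Const
from caffe2.python.schema import Scalar, Struct

schema = Struct(('label', Scalar(np.int32)))
ds = dataset.Dataset(schema)

init_net = core.Net('init')
ds.init_empty(init_net)
writer = ds.writer(init_net=init_net)
writer.write(init_net, [Const(init_net, [1, 2, 3], dtype=np.int32)])
workspace.RunNetOnce(init_net)

read_init = core.Net('read_init')
read_next = core.Net('read_next')
reader = ds.reader(read_init)
should_stop, fields = reader.read(read_next, batch_size=2)
workspace.RunNetOnce(read_init)
workspace.RunNetOnce(read_next)
print(workspace.FetchBlob(str(fields[0])))  # expected: first two rows, [1 2]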

View File

@ -1,6 +1,6 @@
import numpy as np
import copy
from caffe2.python import core, workspace
from caffe2.python import workspace
class DeviceChecker(object):
@ -41,9 +41,8 @@ class DeviceChecker(object):
op.device_option.CopyFrom(device_option)
workspace.RunOperatorOnce(op)
results.append(
[workspace.FetchBlob(op.output[idx]) for idx in outputs_to_check
]
)
[workspace.FetchBlob(op.output[idx])
for idx in outputs_to_check])
# Everything is done, reset the workspace.
workspace.ResetWorkspace()
# After running on all devices, check correctness
@ -61,15 +60,15 @@ class DeviceChecker(object):
print(y.flatten())
print(np.max(np.abs(x - y)))
success = False
#else:
# print ('Passed device pair (0, %d), %s %s' %
# (i, outputs_to_check[j], y.shape))
# else:
# print ('Passed device pair (0, %d), %s %s' %
# (i, outputs_to_check[j], y.shape))
workspace.SwitchWorkspace(old_ws_name)
return success
def CheckNet(self, net, inputs={}, blobs_to_check=None, ignore=set()):
"""Checks a network by inspecting all of its intermediate results, and see
if things match.
"""Checks a network by inspecting all of its intermediate results, and
see if things match.
"""
old_ws_name = workspace.CurrentWorkspace()
results = []
@ -78,8 +77,8 @@ class DeviceChecker(object):
blobs_to_check = [b for b in blobs_to_check if b not in ignore]
workspace.SwitchWorkspace("_device_check_", True)
for i, device_option in enumerate(self._device_options):
for name, arr in inputs.iteritems():
#print 'feeding', name
for name, arr in inputs.items():
# print 'feeding', name
workspace.FeedBlob(name, arr, device_option)
for op in net.op:
op.device_option.CopyFrom(device_option)
@ -93,15 +92,18 @@ class DeviceChecker(object):
for j in range(len(blobs_to_check)):
x = results[i][j]
y = results[0][j]
if np.any(np.abs(x - y) > self._threshold):
if not np.allclose(x, y,
atol=self._threshold, rtol=self._threshold):
print('Failure in checking device option {}'
' and output {}. The outputs are:'
.format(i, blobs_to_check[j]))
print(x.flatten())
print(y.flatten())
print(np.max(np.abs(x - y)))
success = False
#else:
# print ('Passed device pair (%d, %d), %s %s: %s' %
# (i, j, blobs_to_check[j], y.shape, str(y.flatten())))
# else:
# print ('Passed device pair (%d, %d), %s %s: %s' %
# (i, j, blobs_to_check[j], y.shape,
# str(y.flatten())))
workspace.SwitchWorkspace(old_ws_name)
return success

View File

@ -178,7 +178,12 @@ class TestOperators(hu.HypothesisTestCase):
return st.sampled_from([np.float32, np.float64])
_test_binary(
"Div", ref, filter_=non_zero, test_gradient=True, dtypes=div_dtypes
"Div", ref, filter_=non_zero, test_gradient=True,
dtypes=div_dtypes, gcs=hu.gcs_cpu_only
)(self)
_test_binary(
"Div", ref, filter_=non_zero, test_gradient=False,
dtypes=div_dtypes
)(self)
_test_binary_broadcast(
"Div", ref, filter_=non_zero, dtypes=div_dtypes)(self)
@ -269,8 +274,8 @@ class TestOperators(hu.HypothesisTestCase):
for param, _ in enumerate(inputs):
self.assertGradientChecks(gc, op, inputs, param, [0])
@unittest.skipIf(True,
"Recurrent only works on CUDA 7.5 and above")
@unittest.skipIf(not workspace.has_gpu_support,
"Skipping test due to no gpu present.")
@given(hidden_size=st.integers(min_value=1, max_value=3),
num_layers=st.integers(min_value=1, max_value=3),
bidirectional=st.booleans(),
@ -366,27 +371,27 @@ class TestOperators(hu.HypothesisTestCase):
# CUDNN does NOT support different padding values and we skip it
@given(stride_h=st.integers(1, 3),
stride_w=st.integers(1, 3),
pad_t=st.integers(0, 3),
pad_l=st.integers(0, 3),
pad_b=st.integers(0, 3),
pad_r=st.integers(0, 3),
kernel=st.integers(1, 5),
size=st.integers(7, 10),
input_channels=st.integers(1, 8),
output_channels=st.integers(1, 8),
batch_size=st.integers(1, 3),
order=st.sampled_from(["NCHW", "NHWC"]),
engine=st.sampled_from([""]),
**hu.gcs)
stride_w=st.integers(1, 3),
pad_t=st.integers(0, 3),
pad_l=st.integers(0, 3),
pad_b=st.integers(0, 3),
pad_r=st.integers(0, 3),
kernel=st.integers(3, 5),
size=st.integers(8, 8),
input_channels=st.integers(1, 3),
output_channels=st.integers(1, 3),
batch_size=st.integers(1, 3),
order=st.sampled_from(["NCHW", "NHWC"]),
engine=st.sampled_from([""]),
**hu.gcs)
@settings(max_examples=2, timeout=100)
def test_convolution_separate_stride_pad_gradients(self, stride_h, stride_w,
pad_t, pad_l, pad_b,
pad_r, kernel, size,
input_channels,
output_channels,
batch_size, order,
engine, gc, dc):
pad_t, pad_l, pad_b,
pad_r, kernel, size,
input_channels,
output_channels,
batch_size, order,
engine, gc, dc):
assume(stride_h <= kernel)
assume(stride_w <= kernel)
op = core.CreateOperator(
@ -654,18 +659,19 @@ class TestOperators(hu.HypothesisTestCase):
rtol=1e-5)
@given(stride=st.integers(1, 3),
pad=st.integers(0, 3),
kernel=st.integers(1, 5),
size=st.integers(7, 10),
input_channels=st.integers(1, 8),
output_channels=st.integers(1, 8),
batch_size=st.integers(1, 3),
order=st.sampled_from(["NCHW", "NHWC"]),
engine=st.sampled_from([""]), **hu.gcs)
pad=st.integers(0, 3),
kernel=st.integers(1, 5),
size=st.integers(7, 10),
input_channels=st.integers(1, 8),
output_channels=st.integers(1, 8),
batch_size=st.integers(1, 3),
order=st.sampled_from(["NCHW", "NHWC"]),
engine=st.sampled_from(["", "CUDNN"]), **hu.gcs)
@settings(max_examples=2, timeout=100)
def test_convolution_transpose_gradients(self, stride, pad, kernel,
size, input_channels,
output_channels, batch_size,
order, engine, gc, dc):
size, input_channels,
output_channels, batch_size,
order, engine, gc, dc):
assume(stride <= kernel)
X = np.random.rand(
batch_size, size, size, input_channels).astype(np.float32) - 0.5
@ -692,17 +698,17 @@ class TestOperators(hu.HypothesisTestCase):
self.assertGradientChecks(gc, op, [X, w, b], i, [0])
@given(stride=st.integers(1, 3),
pad=st.integers(0, 3),
kernel=st.integers(1, 5),
size=st.integers(7, 10),
input_channels=st.integers(1, 8),
output_channels=st.integers(1, 8),
batch_size=st.integers(1, 3),
engine=st.sampled_from([""]), **hu.gcs)
pad=st.integers(0, 3),
kernel=st.integers(1, 5),
size=st.integers(7, 10),
input_channels=st.integers(1, 8),
output_channels=st.integers(1, 8),
batch_size=st.integers(1, 3),
engine=st.sampled_from(["", "CUDNN"]), **hu.gcs)
def test_convolution_transpose_layout(self, stride, pad, kernel,
size, input_channels,
output_channels, batch_size,
engine, gc, dc):
size, input_channels,
output_channels, batch_size,
engine, gc, dc):
assume(stride <= kernel)
X = np.random.rand(
batch_size, size, size, input_channels).astype(np.float32) - 0.5
@ -1049,7 +1055,7 @@ class TestOperators(hu.HypothesisTestCase):
@given(target_probabilities=hu.arrays(
dims=[10], elements=st.floats(allow_nan=False,
allow_infinity=False,
min_value=0,
min_value=0.01,
max_value=1)),
**hu.gcs)
def test_perplexity(self, target_probabilities, gc, dc):
@ -1477,9 +1483,13 @@ class TestOperators(hu.HypothesisTestCase):
op = core.CreateOperator("HasElements", ["data"], ["has_elements"])
self.assertReferenceChecks(gc, op, [data], lambda x: (len(x) > 0, ))
op = core.CreateOperator("IsEmpty", ["data"], ["is_empty"])
self.assertReferenceChecks(gc, op, [data], lambda x: (len(x) == 0, ))
@given(initial_iters=st.integers(0, 100),
max_iters=st.integers(0, 100))
def test_criteria_net_with_execution_step(self, initial_iters, max_iters):
def test_should_stop_as_criteria_net_execution_step(
self, initial_iters, max_iters):
net = core.Net("net")
net.Iter(["iter"], ["iter"])
workspace.FeedBlob(
@ -1487,16 +1497,87 @@ class TestOperators(hu.HypothesisTestCase):
workspace.FeedBlob(
"num_iters", np.asarray([max_iters]).astype(np.int32))
criteria_net = core.Net("criteria")
criteria_net.LT(["iter", "num_iters"], ["continue"])
criteria_net.Proto().external_output.extend(["continue"])
criteria_net.GE(["iter", "num_iters"], ["stop"])
criteria_net.Proto().external_output.extend(["stop"])
plan = core.Plan('plan')
plan.AddStep(core.execution_step('step', net, criteria=criteria_net))
plan.AddStep(core.execution_step(
'step', [criteria_net, net],
should_stop_blob=core.BlobReference("stop")))
workspace.RunPlan(plan)
iters = workspace.FetchBlob("iter")
self.assertEqual(iters.dtype, np.int32)
self.assertEqual(iters[0], max(initial_iters, max_iters))
def test_disabled_execution_step(self):
def createNets(i, disabled):
should_stop = 'should_stop_{}'.format(i)
output = 'output_{}'.format(i)
# init content and stop signal
init = core.Net("init_{}".format(i))
init.ConstantFill(
[],
[output],
shape=[1],
value=0.0
)
init.Cast([output], [should_stop], to='bool')
# decide if disabled or not
criterion = core.Net("criterion_{}".format(i))
tmp = criterion.ConstantFill(
[],
shape=[1],
value=1.0 if disabled else 0.0
)
criterion.Cast([tmp], [should_stop], to='bool')
criterion.Proto().external_output.extend([should_stop])
# the body net is just to turn a 0 blob to 1
net = core.Net("net_{}".format(i))
net.ConstantFill(
[],
[output],
shape=[1],
value=1.0
)
# always end the loop
ender = core.Net("ender_{}".format(i))
tmp = ender.ConstantFill(
[],
shape=[1],
value=1.0
)
ender.Cast([tmp], [should_stop], to='bool')
ender.Proto().external_output.extend([should_stop])
return [init, criterion, net, ender]
nets = [createNets(1, False),
createNets(2, True),
createNets(3, False)]
steps = [
core.execution_step(
'step_1', nets[0],
should_stop_blob=core.BlobReference('should_stop_1')),
core.execution_step(
'step_2', nets[1],
should_stop_blob=core.BlobReference('should_stop_2')),
core.execution_step('step_3', nets[2])
]
expected = [1.0, 0.0, 1.0]
plan = core.Plan('plan')
plan.AddStep(core.execution_step('all_steps', steps, num_iter=3))
workspace.RunPlan(plan)
for i, net in enumerate(nets):
self.assertEqual(
workspace.FetchBlob('output_{}'.format(i + 1))[0],
expected[i])
@given(initial_iters=st.integers(0, 100),
num_iters=st.integers(0, 100))
def test_iter_count_with_execution_step(self, initial_iters, num_iters):
@ -1523,6 +1604,13 @@ class TestOperators(hu.HypothesisTestCase):
def test_cast(self, a, src, dst, use_name, gc, dc):
a = a.astype(src)
# Casting from a float type outside the range of the integral
# type is UB.
ftypes = [np.float32, np.float64]
if src in ftypes and dst not in ftypes and dst is not np.bool:
info = np.iinfo(dst)
a = np.clip(a, info.min, info.max)
def ref(data):
return [data.astype(dst)]
@ -1571,7 +1659,8 @@ class TestOperators(hu.HypothesisTestCase):
self.assertDeviceChecks(dc, op, [X], [0])
self.assertGradientChecks(gc, op, [X], 0, [0])
@given(X=hu.tensor(), seed=st.integers(min_value=0, max_value=65536),
@given(X=_dtypes().flatmap(lambda dtype: hu.tensor(dtype=dtype)),
seed=st.integers(min_value=0, max_value=65536),
null_axes=st.booleans(),
**hu.gcs)
def test_transpose(self, X, seed, null_axes, gc, dc):
@ -1589,7 +1678,8 @@ class TestOperators(hu.HypothesisTestCase):
self.assertReferenceChecks(gc, op, [X, axes],
transpose_ref)
self.assertGradientChecks(gc, op, [X], 0, [0])
if X.dtype != np.int32 and X.dtype != np.int64:
self.assertGradientChecks(gc, op, [X], 0, [0])
@given(n=st.integers(1, 3),
dim=st.integers(4, 16),

View File

@ -263,7 +263,11 @@ class HypothesisTestCase(test_util.TestCase):
outs = []
for (n, ref) in zip(op.output, reference_outputs):
output = workspace.FetchBlob(n)
np.testing.assert_allclose(output, ref, atol=1e-4, rtol=1e-4)
if output.dtype.kind in ('S', 'O'):
np.testing.assert_array_equal(output, ref)
else:
np.testing.assert_allclose(
output, ref, atol=1e-4, rtol=1e-4)
outs.append(output)
if grad_reference and output_to_grad:
self._assertGradReferenceChecks(

caffe2/python/io.py (new file, 138 lines)
View File

@ -0,0 +1,138 @@
"""
Defines the base interface for reading and writing operations.
Readers/Writers are objects that produce operations that read/write sequences
of data. Each operation reads or writes a list of BlobReferences.
Readers and Writers must be implemented such that read and write operations
are atomic and thread safe.
Examples of possible Readers and Writers:
HiveReader, HiveWriter,
QueueReader, QueueWriter,
DatasetReader, DatasetWriter,
DBReader, DBWriter,
See `dataset.py` for an example of implementation.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
class Reader(object):
"""
Reader is an abstract class to be implemented in order to provide
operations capable of iterating through a dataset or stream of data.
A Reader must implement at least one operation, `read`, which
adds operations to a net that read the next batch of data. Readers can
optionally support the `reset` operation, which is useful when multiple
passes over the data are required.
"""
def read(self, read_net, batch_size=1, *args):
"""
Add operations to read_net that will read the next batch of data
and return a list of BlobReference representing the blobs that will
contain the batches produced.
Operations added to `read_net` must be thread safe and atomic, that is,
it should be possible to clone `read_net` and run multiple instances of
it in parallel.
Args:
read_net: the net that will be appended with read operations
batch_size: number of entries to read
Returns:
A tuple (should_stop, fields), with:
should_stop: BlobReference pointing to a boolean scalar
blob that indicates whether the read operation
was successful or whether the end of data has
been reached.
fields: A tuple of BlobReference containing the latest batch
of data that was read.
"""
raise NotImplementedError('Readers must implement `read`.')
def reset(self, net):
"""Append operations to `net` that will reset the reader.
This can be used to read the data multiple times.
Not all readers support this operation.
"""
raise NotImplementedError('This reader cannot be reset.')
def execution_step(self, reader_net_name=None, batch_size=1):
"""Create an execution step with a net containing read operators.
The execution step will contain a `stop_blob` that knows how to stop
the execution loop when the end of data is reached.
E.g.:
read_step, fields = reader.execution_step()
consume_net = core.Net('consume')
consume_net.Print(fields[0], [])
p = core.Plan('reader')
p.AddStep(read_step.AddNet(consume_net))
core.RunPlan(p)
Args:
reader_net_name: (optional) the name of the reader_net to be
created. The execution step will
be named accordingly.
batch_size: the batch size
Returns:
A tuple (read_step, fields), with:
read_step: A newly created execution step containing a net with
read operations. The step will have `stop_blob` set,
in order to stop the loop on end of data.
fields: A tuple of BlobReference containing the latest batch
of data that was read.
"""
reader_net = core.Net(reader_net_name or 'reader')
should_stop, fields = self.read(reader_net, batch_size=batch_size)
read_step = core.execution_step(
'{}_step'.format(reader_net_name),
reader_net,
should_stop_blob=should_stop)
return (read_step, fields)
class Writer(object):
"""
Writer is a abstract class to be implemented in order to provide
operations capable of feeding a data stream or a dataset.
A Writer must implement 2 operations:
`write`, which adds operations to a net that write the next batch of
data, and `commit`, which adds operations to a net in order to indicate
that no more data will be written.
"""
def write(self, writer_net, fields):
"""Add operations to `writer_net` that write the next batch of data.
Operations added to the net must be thread-safe and unique, that is:
multiple writers must be able to write to the dataset in parallel.
Args:
fields: a tuple of BlobReference containing the batch of data to
write.
"""
raise NotImplementedError('Writers must implement write.')
def commit(self, finish_net):
"""Add operations to `finish_net` that signal end of data.
This must be implemented by all Writers, but may be a no-op for some
of them.
"""
raise NotImplementedError('Writers must implement commit.')
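To make the contract concrete, a minimal hypothetical implementation could look like the sketch below; 'MyReadNext' and 'MyResetReader' are made-up operator names standing in for whatever operators actually back a concrete reader.

class CountingReader(Reader):
    """Hypothetical reader wrapping a pair of made-up operators."""
    def __init__(self, handle):
        self._handle = handle  # blob holding the reader state (assumed)

    def read(self, read_net, batch_size=1):
        # MyReadNext is assumed to output (should_stop, data).
        should_stop, data = read_net.MyReadNext(
            [self._handle], ['should_stop', 'data'], batch_size=batch_size)
        return should_stop, [data]

    def reset(self, net):
        net.MyResetReader([self._handle], [])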

View File

@ -116,7 +116,7 @@ class TestMiniAlexNet(test_util.TestCase):
gpu_device = caffe2_pb2.DeviceOption()
gpu_device.device_type = caffe2_pb2.CUDA
checker = device_checker.DeviceChecker(1e-2, [cpu_device, gpu_device])
checker = device_checker.DeviceChecker(0.05, [cpu_device, gpu_device])
ret = checker.CheckNet(
model.net.Proto(),
inputs,
@ -126,15 +126,16 @@ class TestMiniAlexNet(test_util.TestCase):
)
self.assertEqual(ret, True)
def testMiniAlexNet(self):
@unittest.skipIf(not workspace.has_gpu_support,
"No GPU support. Skipping test.")
def testMiniAlexNetNCHW(self):
self._testMiniAlexNet("NCHW")
@unittest.skipIf(not workspace.has_gpu_support,
"No GPU support. Skipping test.")
def testMiniAlexNetNHWC(self):
self._testMiniAlexNet("NHWC")
if __name__ == '__main__':
if not workspace.has_gpu_support:
print('No GPU support. Skipping gpu test.')
elif workspace.NumCudaDevices() == 0:
print('No GPU device. Skipping gpu test.')
else:
unittest.main()
unittest.main()

View File

@ -3,6 +3,7 @@ from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import json
from collections import defaultdict
from caffe2.python import utils
@ -53,6 +54,11 @@ def _rectify_operator_and_name(operators_or_net, name):
return operators, name
def _escape_label(name):
# json.dumps is poor man's escaping
return json.dumps(name)
def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
operators, name = _rectify_operator_and_name(operators_or_net, name)
graph = pydot.Dot(name, rankdir=rankdir)
@ -73,7 +79,7 @@ def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
if input_name not in pydot_nodes:
input_node = pydot.Node(
input_name + str(pydot_node_counts[input_name]),
label=input_name,
label=_escape_label(input_name),
**BLOB_STYLE
)
pydot_nodes[input_name] = input_node
@ -87,7 +93,7 @@ def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
pydot_node_counts[output_name] += 1
output_node = pydot.Node(
output_name + str(pydot_node_counts[output_name]),
label=output_name,
label=_escape_label(output_name),
**BLOB_STYLE
)
pydot_nodes[output_name] = output_node

View File

@ -0,0 +1,43 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase
class TestAtomicOps(TestCase):
def test_atomic_ops(self):
"""
Test that both countdown and checksum are updated atomically by having
countdown count from 20k to 0 across parallel workers, updating
the checksum with each value fetched. If operations are truly atomic,
each value from 1 to 20k should be fetched exactly once from the
countdown, and fed exactly once to the checksum, so that at the end
the checksum must contain the exact value of sum[i=1..20000](i).
"""
init_net = core.Net('init')
mutex_countdown = init_net.CreateMutex([])
mutex_checksum = init_net.CreateMutex([])
countdown = init_net.ConstantIntFill([], shape=[], value=20000.)
checksum = init_net.ConstantIntFill([], shape=[], value=0.)
minus_one = init_net.ConstantIntFill([], shape=[], value=-1.)
steps = []
for i in range(0, 100):
net = core.Net('net:%d' % i)
_, fetched_count = net.AtomicFetchAdd(
[mutex_countdown, countdown, minus_one],
[countdown, 'fetched_count:%d' % i])
net.AtomicFetchAdd(
[mutex_checksum, checksum, fetched_count],
[checksum, 'not_used'])
steps.append(
core.execution_step('worker:%d' % i, net, num_iter=200))
super_step = core.execution_step(
'parent', steps, concurrent_substeps=True)
plan = core.Plan('plan')
plan.AddStep(core.execution_step('init', init_net))
plan.AddStep(super_step)
workspace.RunPlan(plan)
# checksum = sum[i=1..20000](i) = 20000 * 20001 / 2 = 200010000
self.assertEquals(workspace.FetchBlob(checksum), 200010000)

View File

@ -0,0 +1,42 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase
class TestCounterOps(TestCase):
def test_counter_ops(self):
workspace.RunOperatorOnce(core.CreateOperator(
'CreateCounter', [], ['c'], init_count=1))
workspace.RunOperatorOnce(core.CreateOperator(
'CountDown', ['c'], ['t1'])) # 1 -> 0
assert not workspace.FetchBlob('t1')
workspace.RunOperatorOnce(core.CreateOperator(
'CountDown', ['c'], ['t2'])) # 0 -> 0
assert workspace.FetchBlob('t2')
workspace.RunOperatorOnce(core.CreateOperator(
'ResetCounter', ['c'], [], init_count=1)) # -> 1
workspace.RunOperatorOnce(core.CreateOperator(
'CountDown', ['c'], ['t3'])) # 1 -> 0
assert not workspace.FetchBlob('t3')
workspace.RunOperatorOnce(core.CreateOperator(
'ConstantBoolFill', [], ['t4'], value=0.0, shape=[]))
assert workspace.FetchBlob('t4') == workspace.FetchBlob('t1')
workspace.RunOperatorOnce(core.CreateOperator(
'ConstantBoolFill', [], ['t5'], value=1.0, shape=[]))
assert workspace.FetchBlob('t5') == workspace.FetchBlob('t2')
assert workspace.RunOperatorOnce(core.CreateOperator(
'And', ['t1', 't2'], ['t6']))
assert not workspace.FetchBlob('t6') # True && False
assert workspace.RunOperatorOnce(core.CreateOperator(
'And', ['t2', 't5'], ['t7']))
assert workspace.FetchBlob('t7') # True && True

View File

@ -0,0 +1,71 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np
def sigmoid(x):
return 1.0 / (1.0 + np.exp(-x))
def sigmoid_cross_entropy_with_logits(x, z):
return np.maximum(x, 0) - x * z + np.log(1 + np.exp(-np.abs(x)))
def sigmoid_cross_entropy_with_logits_grad(x, z):
return z - sigmoid(x)
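# Editorial note: sigmoid_cross_entropy_with_logits above is the numerically
# stable form of the sigmoid cross-entropy with logits,
#     max(x, 0) - x*z + log(1 + exp(-|x|))
#         == -(z*log(sigmoid(x)) + (1 - z)*log(1 - sigmoid(x))),
# which avoids overflowing exp() for large |x|. The derivative of that loss
# w.r.t. x is sigmoid(x) - z, so the gradient helper returns its negative,
# z - sigmoid(x).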
class TestCrossEntropyOps(hu.HypothesisTestCase):
@given(
inputs=st.lists(
elements=st.integers(min_value=1, max_value=5),
min_size=1,
max_size=2,
average_size=2,
).flatmap(
lambda shape: st.tuples(
hu.arrays(
dims=shape,
elements=st.one_of(
st.floats(min_value=-1.0, max_value=-0.1),
st.floats(min_value=0.1, max_value=1.0),
)),
hu.arrays(
dims=shape,
elements=st.sampled_from([0.0, 1.0]),
),
)
),
)
def test_sigmoid_cross_entropy_with_logits(self, inputs):
logits, targets = inputs
def sigmoid_xentr_logit_ref(logits, targets):
s = sigmoid_cross_entropy_with_logits(logits, targets)
m = np.mean(s, axis=len(logits.shape) - 1)
return (m, )
def sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs):
fwd_logits, fwd_targets = fwd_inputs
inner_size = fwd_logits.shape[-1]
m = fwd_targets - sigmoid(fwd_logits)
g_in = -np.expand_dims(g_out, axis=-1) * m / inner_size
return (g_in, None)
op = core.CreateOperator(
'SigmoidCrossEntropyWithLogits',
['logits', 'targets'],
['xentropy'])
self.assertReferenceChecks(
hu.cpu_do,
op,
[logits, targets],
sigmoid_xentr_logit_ref,
output_to_grad='xentropy',
grad_reference=sigmoid_xentr_logit_grad_ref)

View File

@ -0,0 +1,284 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from caffe2.python import core, workspace, dataset
from caffe2.python.dataset import Const
from caffe2.python.schema import List, Struct, Scalar, Map
from caffe2.python.test_util import TestCase
def _assert_arrays_equal(actual, ref, err_msg):
if ref.dtype.kind in ('S', 'O'):
np.testing.assert_array_equal(actual, ref, err_msg=err_msg)
else:
np.testing.assert_allclose(
actual, ref, atol=1e-4, rtol=1e-4, err_msg=err_msg)
class TestDatasetOps(TestCase):
def test_dataset_ops(self):
"""
1. Defining the schema of our dataset.
This example schema could represent, for example, a search query log.
"""
schema = Struct(
# fixed size vector, which will be stored as a matrix when batched
('dense', Scalar((np.float32, 3))),
# could represent a feature map from feature ID to float value
('floats', Map(
Scalar(np.int32),
Scalar(np.float32))),
# could represent a multi-valued categorical feature map
('int_lists', Map(
Scalar(np.int32),
List(Scalar(np.int64)),
)),
# could represent a multi-valued, weighted categorical feature map
('id_score_pairs', Map(
Scalar(np.int32),
Map(
Scalar(np.int64),
Scalar(np.float32),
keys_name='ids',
values_name='scores'),
)),
# additional scalar information
('metadata', Struct(
('user_id', Scalar(np.int64)),
('user_embed', Scalar((np.float32, 2))),
('query', Scalar(str)),
)),
)
"""
This is what the flattened fields for this schema look like, along
with their types. Each one of these fields will be stored, read and
written as a tensor.
"""
expected_fields = [
('dense', (np.float32, 3)),
('floats:lengths', np.int32),
('floats:values:keys', np.int32),
('floats:values:values', np.float32),
('int_lists:lengths', np.int32),
('int_lists:values:keys', np.int32),
('int_lists:values:values:lengths', np.int32),
('int_lists:values:values:values', np.int64),
('id_score_pairs:lengths', np.int32),
('id_score_pairs:values:keys', np.int32),
('id_score_pairs:values:values:lengths', np.int32),
('id_score_pairs:values:values:values:ids', np.int64),
('id_score_pairs:values:values:values:scores', np.float32),
('metadata:user_id', np.int64),
('metadata:user_embed', (np.float32, 2)),
('metadata:query', str),
]
zipped = zip(
expected_fields,
schema.field_names(),
schema.field_types())
for (ref_name, ref_type), name, dtype in zipped:
self.assertEquals(ref_name, name)
self.assertEquals(np.dtype(ref_type), dtype)
"""
2. The contents of our dataset.
Contents as defined below could represent, for example, a log of
search queries along with dense, sparse features and metadata.
The dataset below has 3 top-level entries.
"""
contents_raw = [
# dense
[[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
# floats
[1, 2, 3], # len
[11, 21, 22, 31, 32, 33], # key
[1.1, 2.1, 2.2, 3.1, 3.2, 3.3], # value
# int lists
[2, 0, 2], # len
[11, 12, 31, 32], # key
[2, 4, 3, 1], # value:len
[111, 112, 121, 122, 123, 124, 311, 312, 313, 321], # value:value
# id score pairs
[1, 2, 2], # len
[11, 21, 22, 31, 32], # key
[1, 1, 2, 2, 3], # value:len
[111, 211, 221, 222, 311, 312, 321, 322, 323], # value:ids
[11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3], # val:score
# metadata
[123, 234, 456], # user_id
[[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]], # user_embed
['dog posts', 'friends who like to', 'posts about ca'], # query
]
# convert the above content to ndarrays, checking against the schema
contents = dataset.to_ndarray_list(contents_raw, schema)
"""
3. Creating and appending to the dataset.
We first create an empty dataset with the given schema.
Then, a Writer is used to append these entries to the dataset.
"""
ds = dataset.Dataset(schema)
net = core.Net('init')
ds.init_empty(net)
blobs_to_append = [Const(net, c) for c in contents]
writer = ds.writer(init_net=net)
writer.write(net, blobs_to_append)
workspace.RunNetOnce(net)
"""
4. Iterating through the dataset contents.
If we were to iterate through the top level entries of our dataset,
this is what we should expect to see:
"""
entries_raw = [
(
[[1.1, 1.2, 1.3]], # dense
[1], [11], [1.1], # floats
[2], [11, 12], [2, 4], [111, 112, 121, 122, 123, 124], # intlst
[1], [11], [1], [111], [11.1], # id score pairs
[123], [[0.2, 0.8]], ['dog posts'], # metadata
),
(
[[2.1, 2.2, 2.3]], # dense
[2], [21, 22], [2.1, 2.2], # floats
[0], [], [], [], # int list
[2], [21, 22], [1, 2], [211, 221, 222], [21.1, 22.1, 22.2],
[234], [[0.5, 0.5]], ['friends who like to'], # metadata
),
(
[[3.1, 3.2, 3.3]], # dense
[3], [31, 32, 33], [3.1, 3.2, 3.3], # floats
[2], [31, 32], [3, 1], [311, 312, 313, 321], # int lst
[2], [31, 32], [2, 3], [311, 312, 321, 322, 323],
[31.1, 31.2, 32.1, 32.2, 32.3], # id score list
[456], [[0.7, 0.3]], ['posts about ca'], # metadata
),
# after the end of the dataset, we will keep getting empty vectors
([],) * 16,
([],) * 16,
]
entries = [dataset.to_ndarray_list(e, schema) for e in entries_raw]
"""
Let's go ahead and create the reading nets.
We will run the `read` net multiple times and assert that we are reading the
entries the way we stated above.
"""
read_init_net = core.Net('read_init')
read_next_net = core.Net('read_next')
reader = ds.reader(read_init_net)
should_continue, batch_blobs = reader.read(read_next_net)
workspace.RunNetOnce(read_init_net)
workspace.CreateNet(read_next_net)
read_next_net_name = str(read_next_net)
for i, entry in enumerate(entries):
workspace.RunNet(read_next_net_name)
for name, blob, base in zip(ds.field_names(), batch_blobs, entry):
data = workspace.FetchBlob(str(blob))
_assert_arrays_equal(
data, base,
err_msg='Mismatch in entry %d, field %s' % (i, name))
"""
5. Reading/writing in a single plan
If all of the operations on the data are expressible as Caffe2 operators,
we don't need to load the data into Python: we can iterate through the
dataset in a single Plan.
Here we will process the dataset a little and store it in a second
dataset. We can reuse the same Reader since it supports reset.
"""
reset_net = core.Net('reset_net')
reader.reset(reset_net)
read_step, fields = reader.execution_step()
""" We will add the line number * 1000 to the feature ids. """
process_net = core.Net('process')
line_no = Const(process_net, 0, dtype=np.int32)
const_one = Const(process_net, 1000, dtype=np.int32)
process_net.Add([line_no, const_one], [line_no])
fid = schema.floats.values.keys.id()
process_net.Print(fields[fid], [])
process_net.Add([fields[fid], line_no], fields[fid], broadcast=1)
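# Worked example (sketch, not part of the original test): line_no becomes
# 1000, 2000, 3000 on the three read iterations, so the `floats` keys
# [11], [21, 22], [31, 32, 33] are rewritten to [1011], [2021, 2022],
# [3031, 3032, 3033] before being appended to the second dataset below.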
""" Lets create a second dataset and append to it. """
ds2 = dataset.Dataset(schema, name='dataset2')
ds2.init_empty(reset_net)
writer = ds2.writer(reset_net)
writer.write(process_net, fields)
# commit is not necessary for DatasetWriter, but we add it here for
# generality of the example
commit_net = core.Net('commit')
writer.commit(commit_net)
""" Time to create and run a plan which will do the processing """
plan = core.Plan('process')
plan.AddStep(core.execution_step('reset', reset_net))
plan.AddStep(read_step.AddNet(process_net))
plan.AddStep(core.execution_step('commit', commit_net))
workspace.RunPlan(plan)
"""
Now we should have dataset2 populated.
"""
ds2blobs = ds2.get_blobs()
for i, (name, blob) in enumerate(zip(schema.field_names(), ds2blobs)):
data = workspace.FetchBlob(str(blob))
content = contents[i]
if i == fid:
# this field has had line_no * 1000 added to each of its ids
content += [1000, 2000, 2000, 3000, 3000, 3000]
_assert_arrays_equal(
data, content, err_msg='Mismatch in field %s.' % name)
"""
6. Slicing a dataset
You can create a new schema from pieces of another schema and reuse
the same data.
"""
subschema = Struct(('top_level', schema.int_lists.values))
int_list_contents = contents[schema.int_lists.values.slice()]
self.assertEquals(len(subschema.field_names()), len(int_list_contents))
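# A minimal sketch (assumption, not in the original test) of how the sliced
# schema could be reused: the blobs selected by `slice()` line up with
# `subschema`, so a second dataset over just the int_lists values could be
# built the same way as ds2 above, e.g.:
#   ds_sub = dataset.Dataset(subschema, name='int_lists_only')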
"""
7. Random Access a dataset
"""
read_init_net = core.Net('read_init')
read_next_net = core.Net('read_next')
idx = np.array([2, 1, 0])
workspace.FeedBlob('idx', idx)
reader = ds.random_reader(read_init_net, 'idx')
reader.computeoffset(read_init_net)
should_continue, batch_blobs = reader.read(read_next_net)
workspace.CreateNet(read_init_net)
workspace.RunNetOnce(read_init_net)
workspace.CreateNet(read_next_net)
read_next_net_name = str(read_next_net)
for i in range(len(entries)):
k = idx[i] if i < len(idx) else i
entry = entries[k]
workspace.RunNet(read_next_net_name)
for name, blob, base in zip(ds.field_names(), batch_blobs, entry):
data = workspace.FetchBlob(str(blob))
_assert_arrays_equal(
data, base,
err_msg='Mismatch in entry %d, field %s' % (i, name))

View File

@ -2,9 +2,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase
import numpy as np
class TestIndexOps(TestCase):
@ -47,6 +47,13 @@ class TestIndexOps(TestCase):
result2 = workspace.FetchBlob('result2')
np.testing.assert_array_equal([0, 5, 1, 0, 0], result2)
workspace.RunOperatorOnce(core.CreateOperator(
'IndexSize',
['index'],
['index_size']))
size = workspace.FetchBlob('index_size')
self.assertEquals(size, 6)
workspace.RunOperatorOnce(core.CreateOperator(
'IndexStore',
['index'],
@ -55,3 +62,21 @@ class TestIndexOps(TestCase):
new_entries = np.array(['new_entry1', 'new_entry2'], dtype=str)
np.testing.assert_array_equal(
np.concatenate((entries, new_entries)), stored_actual)
workspace.RunOperatorOnce(core.CreateOperator(
'StringIndexCreate',
[],
['index2']))
workspace.RunOperatorOnce(core.CreateOperator(
'IndexLoad',
['index2', 'stored_entries'],
[],
skip_first_entry=1))
workspace.RunOperatorOnce(core.CreateOperator(
'IndexSize',
['index2'],
['index2_size']))
index2_size = workspace.FetchBlob('index2_size')
self.assertEquals(index2_size, 5)

View File

@ -0,0 +1,76 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np
def _one_hots():
index_size = st.integers(min_value=1, max_value=5)
lengths = st.lists(
elements=st.integers(min_value=0, max_value=5))
return st.tuples(index_size, lengths).flatmap(
lambda x: st.tuples(
st.just(x[0]),
st.just(x[1]),
st.lists(
elements=st.integers(min_value=0, max_value=x[0] - 1),
min_size=sum(x[1]),
max_size=sum(x[1]))))
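# Example draw from the strategy above (illustrative): (3, [2, 0, 1],
# [0, 2, 1]) -- an index size of 3, per-segment lengths summing to 3, and
# one index in [0, index_size - 1] for each element.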
class TestOneHotOps(hu.HypothesisTestCase):
@given(
hot_indices=hu.tensor(
min_dim=1, max_dim=1, dtype=np.int64,
elements=st.integers(min_value=0, max_value=42)),
end_padding=st.integers(min_value=0, max_value=2))
def test_one_hot(self, hot_indices, end_padding):
def one_hot_ref(hot_indices, size):
out = np.zeros([len(hot_indices), size], dtype=float)
for i, x in enumerate(hot_indices):
out[i, x] = 1.
return (out, )
size = np.array(max(hot_indices) + end_padding + 1, dtype=np.int64)
if size == 0:
size = 1
op = core.CreateOperator('OneHot', ['hot_indices', 'size'], ['output'])
self.assertReferenceChecks(
hu.cpu_do,
op,
[hot_indices, size],
one_hot_ref)
@given(hot_indices=_one_hots())
def test_segment_one_hot(self, hot_indices):
index_size, lengths, indices = hot_indices
index_size = np.array(index_size, dtype=np.int64)
lengths = np.array(lengths, dtype=np.int32)
indices = np.array(indices, dtype=np.int64)
def segment_one_hot_ref(lengths, hot_indices, size):
offset = 0
out = np.zeros([len(lengths), size], dtype=float)
for i, length in enumerate(lengths):
for idx in hot_indices[offset:offset + length]:
out[i, idx] = 1.
offset += length
return (out, )
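# Worked example (illustrative): lengths=[2, 1], hot_indices=[0, 2, 1],
# size=3 produces two rows; row 0 has ones at columns 0 and 2, row 1 at
# column 1:
#   [[1., 0., 1.],
#    [0., 1., 0.]]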
op = core.CreateOperator(
'SegmentOneHot',
['lengths', 'hot_indices', 'size'],
['output'])
self.assertReferenceChecks(
hu.cpu_do,
op,
[lengths, indices, index_size],
segment_one_hot_ref)

View File

@ -83,6 +83,10 @@ def logsumexp_grad(grad_out, outputs, inputs):
axis=0) * np.exp(inputs[0])
def logmeanexp(x):
return np.log(np.mean(np.exp(x), axis=0))
def mean(x):
return np.mean(x, axis=0)
@ -94,6 +98,30 @@ def mean_grad(grad_out, outputs, inputs):
axis=0)
def max(x):
return np.amax(x, axis=0)
def max_grad(grad_out, outputs, inputs):
flat_inputs = inputs[0].flatten()
flat_outputs = np.array(outputs[0]).flatten()
flat_grad_in = np.zeros(flat_inputs.shape)
flat_grad_out = np.array(grad_out).flatten()
blocks = inputs[0].shape[0]
block_size = flat_inputs.shape[0] // blocks
for i in range(block_size):
out_grad = flat_grad_out[i]
out = flat_outputs[i]
for j in range(blocks):
idx = j * block_size + i
if out == flat_inputs[idx]:
flat_grad_in[idx] = out_grad
break
return np.resize(flat_grad_in, inputs[0].shape)
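# Worked example (illustrative): for inputs [[1, 5], [3, 2]] the column-wise
# max is [3, 5]; max_grad routes each output gradient to the first input
# element that attains the max, so grad_out=[10, 20] yields
# grad_in=[[0, 20], [10, 0]].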
REFERENCES_ALL = [
('Sum', partial(np.sum, axis=0), sum_grad),
]
@ -101,7 +129,10 @@ REFERENCES_ALL = [
REFERENCES_SORTED = [
('RangeSum', partial(np.sum, axis=0), sum_grad),
('RangeLogSumExp', logsumexp, logsumexp_grad),
# gradient is the same as sum
('RangeLogMeanExp', logmeanexp, logsumexp_grad),
('RangeMean', mean, mean_grad),
('RangeMax', max, max_grad),
]

View File

@ -185,3 +185,37 @@ class TestSequenceOps(hu.HypothesisTestCase):
op,
[padded_data, padded_lengths],
partial(_gather_padding_ref, start_pad_width, end_pad_width))
@given(data=hu.tensor(min_dim=3, max_dim=3, dtype=np.float32,
elements=st.floats(min_value=-np.inf,
max_value=np.inf),
min_value=1, max_value=10),
**hu.gcs_cpu_only)
def test_reverse_packed_segs(self, data, gc, dc):
max_length = data.shape[0]
batch_size = data.shape[1]
lengths = np.random.randint(max_length + 1, size=batch_size)
op = core.CreateOperator(
"ReversePackedSegs",
["data", "lengths"],
["reversed_data"])
def op_ref(data, lengths):
rev_data = np.array(data, copy=True)
for i in range(batch_size):
seg_length = lengths[i]
for j in range(seg_length):
rev_data[j][i] = data[seg_length - 1 - j][i]
return (rev_data,)
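# Worked example (illustrative): for a batch column whose time steps are
# [a, b, c] and lengths[i] = 2, only the first two steps are reversed,
# giving [b, a, c].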
def op_grad_ref(grad_out, outputs, inputs):
return op_ref(grad_out, inputs[1]) + (None,)
self.assertReferenceChecks(
device_option=gc,
op=op,
inputs=[data, lengths],
reference=op_ref,
output_to_grad='reversed_data',
grad_reference=op_grad_ref)

View File

@ -0,0 +1,106 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np
def _string_lists(alphabet=None):
return st.lists(
elements=st.text(alphabet=alphabet, average_size=3),
min_size=0,
max_size=3)
class TestStringOps(hu.HypothesisTestCase):
@given(strings=_string_lists())
def test_string_prefix(self, strings):
length = 3
# Although we utf-8 encode the strings below to avoid Python exceptions,
# the StringPrefix op deals with byte-length prefixes, which may produce
# an invalid utf-8 string. The goal here is just to avoid Python
# complaining about the unicode -> str conversion.
strings = np.array(
map(lambda a: a.encode('utf-8'), strings), dtype=np.object)
def string_prefix_ref(strings):
return (
np.array(map(lambda a: a[:length], strings), dtype=object), )
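# e.g. (illustrative): with length=3, b'hello world' -> b'hel'; a multi-byte
# utf-8 character may be split in the middle, which is acceptable here since
# StringPrefix operates on raw bytes.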
op = core.CreateOperator(
'StringPrefix',
['strings'],
['stripped'],
length=length)
self.assertReferenceChecks(
hu.cpu_do,
op,
[strings],
string_prefix_ref)
@given(strings=_string_lists())
def test_string_suffix(self, strings):
length = 3
strings = np.array(
map(lambda a: a.encode('utf-8'), strings), dtype=np.object)
def string_suffix_ref(strings):
return (
np.array(map(lambda a: a[-length:], strings), dtype=object), )
op = core.CreateOperator(
'StringSuffix',
['strings'],
['stripped'],
length=length)
self.assertReferenceChecks(
hu.cpu_do,
op,
[strings],
string_suffix_ref)
@given(strings=st.text(alphabet=['a', 'b'], average_size=3))
def test_string_starts_with(self, strings):
prefix = 'a'
strings = np.array(
map(lambda a: str(strings), strings), dtype=np.object)
def string_starts_with_ref(strings):
return (np.array(
map(lambda a: a.startswith(prefix), strings), dtype=bool), )
op = core.CreateOperator(
'StringStartsWith',
['strings'],
['bools'],
prefix=prefix)
self.assertReferenceChecks(
hu.cpu_do,
op,
[strings],
string_starts_with_ref)
@given(strings=st.text(alphabet=['a', 'b'], average_size=3))
def test_string_ends_with(self, strings):
suffix = 'a'
strings = np.array(
map(lambda a: str(strings), strings), dtype=np.object)
def string_ends_with_ref(strings):
return (np.array(
map(lambda a: a.endswith(suffix), strings), dtype=bool), )
op = core.CreateOperator(
'StringEndsWith',
['strings'],
['bools'],
suffix=suffix)
self.assertReferenceChecks(
hu.cpu_do,
op,
[strings],
string_ends_with_ref)

caffe2/python/schema.py (new file, 348 lines)
View File

@ -0,0 +1,348 @@
"""
Defines a minimal set of data types that allow one to represent datasets
with arbitrary nested structure, including objects of variable length,
such as maps and lists.
This defines a columnar storage format for such datasets on top of caffe2
tensors. In terms of representational capacity, it can represent most of
the data types supported by the Parquet, ORC and DWRF file formats.
See comments in operator_test/dataset_ops_test.py for an example and
walkthrough on how to use schema to store and iterate through a structured
in-memory dataset.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from collections import OrderedDict
import logging
import numpy as np
logger = logging.getLogger(__name__)
def _join_field_name(prefix, suffix):
if prefix and suffix:
return '{}:{}'.format(prefix, suffix)
elif prefix:
return prefix
elif suffix:
return suffix
else:
return ''
class Field(object):
"""Represents an abstract field type in a dataset.
"""
def __init__(self, children):
"""Derived classes must call this after their initialization."""
self._parent = (None, 0)
offset = 0
self._field_offsets = []
for child in children:
self._field_offsets.append(offset)
offset += len(child.field_names())
self._field_offsets.append(offset)
def field_names(self):
"""Return the children field names for this field."""
raise NotImplementedError('Field is an abstract class.')
def field_types(self):
"""Return the numpy.dtype for each of the children fields."""
raise NotImplementedError('Field is an abstract class.')
def clone(self):
"""Clone this Field along with its children."""
raise NotImplementedError('Field is an abstract class.')
def _set_parent(self, parent, relative_id):
self._parent = (parent, relative_id)
def slice(self):
"""
Returns a slice representing the range of field ids that belong to
this field. This slice can be used to index a list of fields.
E.g.:
>>> s = Struct(
>>> ('a', Scalar()),
>>> ('b', Struct(
>>> ('b1', Scalar()),
>>> ('b2', Scalar()),
>>> )),
>>> ('c', Scalar()),
>>> )
>>> field_data = ['da', 'db1', 'db2', 'dc']
>>> field_data[s.b.slice()]
['db1', 'db2']
"""
base_id = self._child_base_id()
return slice(base_id, base_id + len(self.field_names()))
def _child_base_id(self, child_index=None):
"""Get the base id of the given child"""
p, i = self._parent
pos = 0 if child_index is None else self._field_offsets[child_index]
if p:
pos += p._child_base_id(i)
return pos
def __eq__(self, other):
"""Equivalance of two schemas"""
return ((self.field_names() == other.field_names()) and
(self.field_types() == other.field_types()))
class List(Field):
"""Represents a variable-length list.
Values of a list can also be complex fields such as Lists and Structs.
In addition to the fields exposed by its `values` field, a List exposes an
additional `lengths` field, which will contain the size of each list under
the parent domain.
"""
def __init__(self, values):
assert isinstance(values, Field)
self.lengths = Scalar(np.int32)
self.values = values.clone()
self.lengths._set_parent(self, 0)
self.values._set_parent(self, 1)
Field.__init__(self, [self.lengths, self.values])
def field_names(self):
value_fields = self.values.field_names()
return (
['lengths'] +
[_join_field_name('values', v) for v in value_fields])
def field_types(self):
return self.lengths.field_types() + self.values.field_types()
def clone(self):
return List(self.values)
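# Example (sketch): List(Scalar(np.float32)) exposes two fields --
#   field_names() == ['lengths', 'values']
#   field_types() == [dtype('int32'), dtype('float32')]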
class Struct(Field):
"""Represents a named list of fields sharing the same domain.
"""
def __init__(self, *fields):
for field in fields:
assert len(field) == 2
assert field[0], 'Field names cannot be empty'
assert field[0] != 'lengths', (
'Struct cannot contain a field named `lengths`.')
assert isinstance(field[1], Field)
fields = [(name, field.clone()) for name, field in fields]
for id, (name, field) in enumerate(fields):
field._set_parent(self, id)
self.fields = OrderedDict(fields)
Field.__init__(self, self.fields.values())
def field_names(self):
names = []
for name, field in self.fields.items():
names += [_join_field_name(name, f) for f in field.field_names()]
return names
def field_types(self):
types = []
for name, field in self.fields.items():
types += field.field_types()
return types
def clone(self):
return Struct(*self.fields.items())
def __getattr__(self, item):
return self.fields[item]
class Scalar(Field):
"""Represents a typed scalar or tensor of fixed shape.
A Scalar is a leaf in a schema tree, translating to exactly one tensor in
the dataset's underlying storage.
Usually, the tensor storing the actual values of this field is a 1D tensor,
representing a series of values in its domain. It is possible however to
have higher rank values stored as a Scalar, as long as all entries have
the same shape.
E.g.:
Scalar(np.float64)
Scalar field of type float64. Caffe2 will expect readers and
datasets to expose it as a 1D tensor of doubles (vector), where
the size of the vector is determined by this field's domain.
Scalar((np.int32, 5))
Tensor field of type int32. Caffe2 will expect readers and
datasets to implement it as a 2D tensor (matrix) of shape (L, 5),
where L is determined by this field's domain.
Scalar((str, (10, 20)))
Tensor field of type str. Caffe2 will expect readers and
datasets to implement it as a 3D tensor of shape (L, 10, 20),
where L is determined by this field's domain.
If the field type is unknown at construction time, call Scalar(), which
will default to np.void as its dtype.
It is an error to pass a structured dtype to Scalar, since it would contain
more than one field. Instead, use from_dtype, which will construct
a nested `Struct` field reflecting the given dtype's structure.
"""
def __init__(self, dtype=None):
self._original_dtype = dtype
self.dtype = np.dtype(dtype or np.void)
assert not self.dtype.fields, (
'Cannot create Scalar with a structured dtype. ' +
'Use from_dtype instead.')
Field.__init__(self, [])
def field_names(self):
return ['']
def field_types(self):
return [self.dtype]
def clone(self):
return Scalar(self._original_dtype)
def id(self):
"""
Return the zero-indexed position of this scalar field in its schema.
Used in order to index into the field_blob list returned by readers or
accepted by writers.
"""
return self._child_base_id()
def Map(keys, values, keys_name='keys', values_name='values'):
"""A map is a List of Struct containing keys and values fields.
Optionally, you can provide custom name for the key and value fields.
"""
return List(Struct((keys_name, keys), (values_name, values)))
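# Example (sketch): a map from int64 ids to float scores with a custom
# value field name:
#   scores = Map(Scalar(np.int64), Scalar(np.float32), values_name='score')
#   scores.field_names()  # ['lengths', 'values:keys', 'values:score']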
def from_dtype(dtype, _outer_shape=()):
"""Constructs a Caffe2 schema from the given numpy's dtype.
Numpy supports scalar, array-like and structured datatypes, as long as
all the shapes are fixed. This function breaks down the given dtype into
a Caffe2 schema containing `Struct` and `Scalar` types.
Fields containing byte offsets are not currently supported.
"""
if not isinstance(dtype, np.dtype):
# wrap into an np.dtype
shape = _outer_shape
dtype = np.dtype((dtype, _outer_shape))
else:
# concatenate shapes if necessary
shape = _outer_shape + dtype.shape
if shape != dtype.shape:
dtype = np.dtype((dtype.base, shape))
if not dtype.fields:
return Scalar(dtype)
struct_fields = []
for name, (fdtype, offset) in dtype.fields.items():
assert offset == 0, ('Fields with byte offsets are not supported.')
struct_fields.append((name, from_dtype(fdtype, _outer_shape=shape)))
return Struct(*struct_fields)
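# Example (sketch): a plain numpy type becomes a single Scalar leaf:
#   s = from_dtype(np.dtype(np.float32))
#   isinstance(s, Scalar)  # True
#   s.field_types()        # [dtype('float32')]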
class _SchemaNode(object):
"""This is a private class used to represent a Schema Node"""
def __init__(self, name, type_str=''):
self.name = name
self.children = []
self.type_str = type_str
self.field = None
def add_child(self, name, type_str=''):
for child in self.children:
if child.name == name and child.type_str == type_str:
return child
child = _SchemaNode(name, type_str)
self.children.append(child)
return child
def get_field(self):
list_names = ['lengths', 'values']
map_names = ['lengths', 'keys', 'values']
if len(self.children) == 0 or self.field is not None:
assert self.field is not None
return self.field
child_names = []
for child in self.children:
child_names.append(child.name)
if (set(child_names) == set(list_names)):
for child in self.children:
if child.name == 'values':
self.field = List(child.get_field())
self.type_str = "List"
return self.field
elif (set(child_names) == set(map_names)):
for child in self.children:
if child.name == 'keys':
key_field = child.get_field()
elif child.name == 'values':
values_field = child.get_field()
self.field = Map(key_field, values_field)
self.type_str = "Map"
return self.field
else:
struct_fields = []
for child in self.children:
if child.field is not None:
struct_fields.append((child.name, child.field))
else:
struct_fields.append((child.name, child.get_field()))
self.field = Struct(*struct_fields)
self.type_str = "Struct"
return self.field
def print_recursively(self):
for child in self.children:
child.print_recursively()
logger.info("Printing node: Name and type")
logger.info(self.name)
logger.info(self.type_str)
def from_column_list(column_names, column_types):
root = _SchemaNode('root', 'Struct')
for column_name, column_type in zip(column_names, column_types):
columns = column_name.split(':')
current = root
for i in range(len(columns)):
name = columns[i]
type_str = ''
field = None
if i == len(columns) - 1:
type_str = column_type
field = Scalar(column_type)
next = current.add_child(name, type_str)
if field is not None:
next.field = field
current = next
return root.get_field()
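# Example (sketch): ':'-separated column names are folded back into nested
# fields; a (lengths, values) pair of children is recognized as a List:
#   f = from_column_list(
#       ['dense', 'floats:lengths', 'floats:values'],
#       [np.float32, np.int32, np.float32])
#   f.field_names()  # ['dense', 'floats:lengths', 'floats:values']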

View File

@ -0,0 +1,82 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.test_util import TestCase
import numpy as np
class TestSparseToDenseMask(TestCase):
def test_sparse_to_dense_mask_float(self):
op = core.CreateOperator(
'SparseToDenseMask',
['indices', 'values', 'default', 'lengths'],
['output'],
mask=[999999999, 2, 6])
workspace.FeedBlob(
'indices',
np.array([2, 4, 6, 1, 2, 999999999, 2], dtype=np.int32))
workspace.FeedBlob(
'values',
np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.float))
workspace.FeedBlob('default', np.array(-1, dtype=np.float))
workspace.FeedBlob('lengths', np.array([3, 4], dtype=np.int32))
workspace.RunOperatorOnce(op)
output = workspace.FetchBlob('output')
expected = np.array([[-1, 1, 3], [6, 7, -1]], dtype=np.float)
self.assertEqual(output.shape, expected.shape)
self.assertFalse(np.any(output - expected))
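# How the expected output above is formed (illustrative): mask positions
# map index 999999999 -> column 0, 2 -> column 1, 6 -> column 2. Row 0
# covers the first 3 sparse entries (indices [2, 4, 6], values [1, 2, 3]):
# index 4 is not in the mask and is dropped, column 0 keeps the default -1,
# giving [-1, 1, 3]. Row 1 covers the remaining 4 entries; index 2 appears
# twice and the later value (7) wins, giving [6, 7, -1].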
def test_sparse_to_dense_mask_string(self):
op = core.CreateOperator(
'SparseToDenseMask',
['indices', 'values', 'default', 'lengths'],
['output'],
mask=[999999999, 2, 6])
workspace.FeedBlob(
'indices',
np.array([2, 4, 6, 1, 2, 999999999, 2], dtype=np.int32))
workspace.FeedBlob(
'values',
np.array(['1', '2', '3', '4', '5', '6', '7'], dtype=np.str))
workspace.FeedBlob('default', np.array('-1', dtype=np.str))
workspace.FeedBlob('lengths', np.array([3, 4], dtype=np.int32))
workspace.RunOperatorOnce(op)
output = workspace.FetchBlob('output')
expected = np.array([['-1', '1', '3'], ['6', '7', '-1']], dtype=np.str)
self.assertEqual(output.shape, expected.shape)
self.assertTrue(np.all(np.equal(output, expected)))
def test_sparse_to_dense_mask_empty_lengths(self):
op = core.CreateOperator(
'SparseToDenseMask',
['indices', 'values', 'default', 'lengths'],
['output'],
mask=[1, 2, 6])
workspace.FeedBlob('indices', np.array([2, 4, 6], dtype=np.int32))
workspace.FeedBlob('values', np.array([1, 2, 3], dtype=np.float))
workspace.FeedBlob('default', np.array(-1, dtype=np.float))
workspace.FeedBlob('lengths', np.array([], dtype=np.int32))
workspace.RunOperatorOnce(op)
output = workspace.FetchBlob('output')
expected = np.array([-1, 1, 3], dtype=np.float)
self.assertEqual(output.shape, expected.shape)
self.assertFalse(np.any(output - expected))
def test_sparse_to_dense_mask_no_lengths(self):
op = core.CreateOperator(
'SparseToDenseMask',
['indices', 'values', 'default'],
['output'],
mask=[1, 2, 6])
workspace.FeedBlob('indices', np.array([2, 4, 6], dtype=np.int32))
workspace.FeedBlob('values', np.array([1, 2, 3], dtype=np.float))
workspace.FeedBlob('default', np.array(-1, dtype=np.float))
workspace.RunOperatorOnce(op)
output = workspace.FetchBlob('output')
expected = np.array([-1, 1, 3], dtype=np.float)
self.assertEqual(output.shape, expected.shape)
self.assertFalse(np.any(output - expected))

View File

@ -6,10 +6,18 @@ import shutil
import socket
import tempfile
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import scope, utils
from ._import_c_extension import * # noqa
# Python 2 and 3 compatibility: test if basestring exists
try:
basestring # NOQA
except NameError:
# This is python3 so we define basestring.
basestring = str
def _GetFreeFlaskPort():
"""Get a free flask port."""
@ -86,7 +94,9 @@ def ResetWorkspace(root_folder=None):
return cc_ResetWorkspace(root_folder)
def CreateNet(net, input_blobs=[]):
def CreateNet(net, input_blobs=None):
if input_blobs is None:
input_blobs = []
for input_blob in input_blobs:
CreateBlob(input_blob)
return cc_CreateNet(StringfyProto(net))
@ -112,6 +122,14 @@ def RunPlan(plan):
return cc_RunPlan(StringfyProto(plan))
def _StringifyBlobName(name):
if isinstance(name, basestring):
return name
assert type(name).__name__ == 'BlobReference', \
"Expected a string or BlobReference"
return str(name)
def FeedBlob(name, arr, device_option=None):
"""Feeds a blob into the workspace.
@ -125,6 +143,10 @@ def FeedBlob(name, arr, device_option=None):
"""
if type(arr) is caffe2_pb2.TensorProto:
arr = utils.Caffe2TensorToNumpyArray(arr)
if type(arr) is np.ndarray and arr.dtype.kind == 'S':
# Plain NumPy strings are weird, let's use objects instead
arr = arr.astype(np.object)
name = _StringifyBlobName(name)
if device_option is not None:
return cc_FeedBlob(name, arr, StringfyProto(device_option))
elif scope.DEVICESCOPE is not None:
@ -133,6 +155,40 @@ def FeedBlob(name, arr, device_option=None):
return cc_FeedBlob(name, arr)
def FetchBlob(name):
"""Fetches a blob from the workspace.
Inputs:
name: the name of the blob - a string or a BlobReference
Returns:
Fetched blob (numpy array or string) if successful
"""
name = _StringifyBlobName(name)
return cc_FetchBlob(name)
class _BlobDict(object):
"""Provides python dict compatible way to do fetching and feeding"""
def __getitem__(self, key):
return FetchBlob(key)
def __setitem__(self, key, value):
return FeedBlob(key, value)
def __len__(self):
return len(Blobs())
def __iter__(self):
return Blobs().__iter__()
def __contains__(self, item):
return HasBlob(item)
blobs = _BlobDict()
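# Usage sketch (mirrors workspace_test.py below): workspace.blobs behaves
# like a dict over the blobs of the current workspace, e.g.:
#   workspace.blobs['x'] = np.ones((2, 3), dtype=np.float32)
#   y = workspace.blobs['x']
#   'x' in workspace.blobs  # True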
class Model(object):
def __init__(self, net, parameters, inputs, outputs, device_option=None):
"""Initializes a model.

View File

@ -8,7 +8,8 @@ from caffe2.python import core, test_util, workspace
class TestWorkspace(unittest.TestCase):
def setUp(self):
self.net = core.Net("test-net")
self.net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0)
self.testblob_ref = self.net.ConstantFill(
[], "testblob", shape=[1, 2, 3, 4], value=1.0)
workspace.ResetWorkspace()
def testRootFolder(self):
@ -64,6 +65,20 @@ class TestWorkspace(unittest.TestCase):
self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
np.testing.assert_array_equal(fetched_again, 2.0)
def testFetchFeedBlobViaBlobReference(self):
self.assertEqual(
workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
fetched = workspace.FetchBlob(self.testblob_ref)
# check if fetched is correct.
self.assertEqual(fetched.shape, (1, 2, 3, 4))
np.testing.assert_array_equal(fetched, 1.0)
fetched[:] = 2.0
self.assertEqual(workspace.FeedBlob(self.testblob_ref, fetched), True)
fetched_again = workspace.FetchBlob("testblob") # fetch by name now
self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
np.testing.assert_array_equal(fetched_again, 2.0)
def testFetchFeedBlobTypes(self):
for dtype in [np.float16, np.float32, np.float64, np.bool,
np.int8, np.int16, np.int32, np.int64,
@ -101,7 +116,8 @@ class TestWorkspace(unittest.TestCase):
strs = np.array([
' '.join(10 * ['long string']),
' '.join(128 * ['very long string']),
'small string'])
'small \0\1\2 string',
"Hello, world! I have special \0 symbols \1!"])
workspace.FeedBlob('my_str_tensor', strs)
strs2 = workspace.FetchBlob('my_str_tensor')
self.assertEqual(strs.shape, strs2.shape)
@ -117,6 +133,32 @@ class TestWorkspace(unittest.TestCase):
for i in range(0, strs.shape[0]):
self.assertEqual(strs[i], strs2[i])
def testFetchFeedPlainString(self):
# this is an actual string, not a tensor of strings
s = "Hello, world! I have special \0 symbols \1!"
workspace.FeedBlob('my_plain_string', s)
s2 = workspace.FetchBlob('my_plain_string')
self.assertEqual(s, s2)
def testFetchFeedViaBlobDict(self):
self.assertEqual(
workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
fetched = workspace.blobs["testblob"]
# check if fetched is correct.
self.assertEqual(fetched.shape, (1, 2, 3, 4))
np.testing.assert_array_equal(fetched, 1.0)
fetched[:] = 2.0
workspace.blobs["testblob"] = fetched
fetched_again = workspace.blobs["testblob"]
self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
np.testing.assert_array_equal(fetched_again, 2.0)
self.assertTrue("testblob" in workspace.blobs)
self.assertFalse("non_existant" in workspace.blobs)
self.assertEqual(len(workspace.blobs), 1)
for key in workspace.blobs:
self.assertEqual(key, "testblob")
class TestMultiWorkspaces(unittest.TestCase):
def setUp(self):

View File

@ -14,7 +14,8 @@ void adagrad_update(
float epsilon,
const float* lr,
Context* context) {
#pragma omp parallel for
// TODO(cxj): use OMP when it is reliable
// #pragma omp parallel for
for (auto i = 0; i < N; ++i) {
float gi = g[i];
float hi = nh[i] = h[i] + gi * gi;
@ -78,8 +79,8 @@ class SparseAdagradOp final : public Operator<Context> {
const auto* momentIn = Input(MOMENT_1).template data<T>();
auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data<T>();
auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data<T>();
#pragma omp parallel for
// TODO(cxj): use OMP when it is reliable
// #pragma omp parallel for
for (auto i = 0; i < n; ++i) {
auto idx = indices[i];
if (block_size == 1) {

View File

@ -41,7 +41,8 @@ void ftrl_update(
T* new_nz,
const FtrlParams<T>& params,
Context* context) {
#pragma omp parallel for
// TODO(cxj): use OMP when it is reliable
// #pragma omp parallel for
for (auto i = 0; i < N; ++i) {
ftrl_compute(
w[i],
@ -93,11 +94,13 @@ void SparseFtrlOp<T>::DoRun() {
T* nz = n_z->template mutable_data<T>();
const SIndex* idxs = indices.template data<SIndex>();
const T* g = grad.template data<T>();
#pragma omp parallel for
// TODO(cxj): use OMP when it is reliable
// #pragma omp parallel for
for (TIndex i = 0; i < K; ++i) {
SIndex idx = idxs[i];
DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
<< ", range 0 to " << N;
<< ", range 0 to " << N;
if (block_size == 1) {
ftrl_compute(
w[idx],

View File

@ -11,9 +11,11 @@
// platforms, it allows one to quickly port Caffe2 to different platforms
// where BLAS may not be present.
#include <random>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include <atomic>
#include <random>
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
@ -486,6 +488,7 @@ void Set<T, CPUContext>(const int N, const T alpha, T *Y, \
CAFFE2_SPECIALIZED_SET(float);
CAFFE2_SPECIALIZED_SET(double);
CAFFE2_SPECIALIZED_SET(int);
CAFFE2_SPECIALIZED_SET(bool);
#undef CAFFE2_SPECIALIZED_SET
#define CAFFE2_INSTANTIATE_BINARY_OP(name, op, T) \

View File

@ -0,0 +1,14 @@
#include "caffe2/utils/string_utils.h"
namespace caffe2 {
std::vector<std::string> split(char separator, const std::string& string) {
std::vector<std::string> pieces;
std::stringstream ss(string);
std::string item;
while (getline(ss, item, separator)) {
pieces.push_back(std::move(item));
}
return pieces;
}
}

View File

@ -0,0 +1,10 @@
#pragma once
#include <sstream>
#include <string>
#include <vector>
namespace caffe2 {
std::vector<std::string> split(char separator, const std::string& string);
}
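// Usage sketch (illustrative):
//   auto pieces = caffe2::split(':', "floats:values:keys");
//   // pieces == {"floats", "values", "keys"}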