Speeding up the case for sparse float columns that have only 1 value.

PiperOrigin-RevId: 173971121
This commit is contained in:
A. Unique TensorFlower 2017-10-30 16:54:23 -07:00 committed by TensorFlower Gardener
parent c315cf1ee6
commit 09f62ab38b
5 changed files with 149 additions and 80 deletions

View File

@ -17,7 +17,6 @@
#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
#include <algorithm>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "tensorflow/contrib/boosted_trees/lib/utils/optional_value.h"
@ -25,55 +24,85 @@
namespace tensorflow {
namespace boosted_trees {
namespace utils {
// A matrix that given feature column id and feature value id will return
// either a value or an optional. First index indicates feature column, second
// index - the index of the value within this column - for single valued, it
// will be 0.
// Allows double-subscript access [][].
// Represents a sparse vector that has a value for some feature indices within
// the feature column.
// Allows subscript access [].
template <class T>
class SparseMatrix {
typedef std::vector<std::tuple<int32, int32, T>> SparseMap;
class Proxy {
public:
Proxy(const int32 feature_column_idx, const SparseMap& values)
: feature_column_idx_(feature_column_idx), values_(values) {}
OptionalValue<T> operator[](int feature_idx) const {
auto value_iter = std::find_if(
values_.begin(), values_.end(),
[this, &feature_idx](const std::tuple<int32, int32, T>& element) {
return std::get<0>(element) == feature_column_idx_ &&
std::get<1>(element) == feature_idx;
});
if (value_iter == values_.end()) {
return OptionalValue<T>();
}
// There is this feature column and feature id.
return OptionalValue<T>(std::get<2>(*value_iter));
}
private:
int32 feature_column_idx_;
const SparseMap& values_;
};
class SparseMultidimensionalValues {
public:
void addElement(const int32 feature_column_idx, const int32 feature_idx,
const T value) {
values_.emplace_back(feature_column_idx, feature_idx, value);
void Add(const int32 feature_idx, const T value) {
values_.emplace_back(feature_idx, value);
}
void clear() { values_.clear(); }
void Clear() { values_.clear(); }
Proxy operator[](int feature_column_idx) const {
return Proxy(feature_column_idx, values_);
void Reserve(const int32 size) { values_.reserve(size); }
OptionalValue<T> operator[](int feature_idx) const {
auto value_iter =
std::find_if(values_.begin(), values_.end(),
[&feature_idx](const std::pair<int32, T>& element) {
return element.first == feature_idx;
});
if (value_iter == values_.end()) {
return OptionalValue<T>();
}
return OptionalValue<T>(value_iter->second);
}
private:
SparseMap values_;
std::vector<std::pair<int32, T>> values_;
};
// Represents storage for a sparse float feature column. Can store values either
// for one dimensional or a multivalent (multidimensional) sparse column.
// Allows subscript operator access [feature_id].
template <class T>
class SparseFloatFeatureColumn {
public:
void Reserve(const int32 size) {
if (!single_dimensional_) {
mutlidimensional_values.Reserve(size);
}
}
void SetDimension(const int32 dimension) {
single_dimensional_ = dimension <= 1;
}
void Add(const int32 feature_idx, const float value) {
if (single_dimensional_) {
DCHECK_EQ(0, feature_idx);
single_value_ = value;
} else {
mutlidimensional_values.Add(feature_idx, value);
}
initialized_ = true;
}
void Clear() {
single_dimensional_ = false;
initialized_ = false;
mutlidimensional_values.Clear();
}
OptionalValue<T> operator[](int feature_idx) const {
if (!initialized_) {
return OptionalValue<T>();
}
if (single_dimensional_) {
return OptionalValue<T>(single_value_);
} else {
return mutlidimensional_values[feature_idx];
}
}
private:
bool single_dimensional_;
bool initialized_;
T single_value_;
SparseMultidimensionalValues<T> mutlidimensional_values;
};
// Holds data for one example and enables lookup by feature column.
@ -87,9 +116,10 @@ struct Example {
// Dense and sparse float features indexed by feature column.
// TODO(salehay): figure out a design to support multivalent float features.
std::vector<float> dense_float_features;
// Sparse float features are allowed to be multivalent and thus can be
// represented as a sparse matrix.
SparseMatrix<float> sparse_float_features;
// Sparse float feature columns; each column can be either single-valued or
// multivalent (multidimensional).
std::vector<SparseFloatFeatureColumn<float>> sparse_float_features;
// Sparse integer features indexed by feature column.
// Note that all integer features are assumed to be categorical, i.e. will

View File

@ -25,21 +25,33 @@ namespace {
class ExampleTest : public ::testing::Test {};
TEST_F(ExampleTest, TestSparseMatrix) {
// Create the following matrix:
// row id | | 0.4 | 0.3
// 0 | 1 | | 2
// 1 | 3 | 1 | 5
// 2 | | | -4
// 3 | | |
SparseMatrix<float> matrix;
matrix.addElement(0, 1, 0.4f);
matrix.addElement(0, 2, 0.3f);
matrix.addElement(1, 0, 1.f);
matrix.addElement(1, 2, 2.f);
matrix.addElement(2, 0, 3.f);
matrix.addElement(2, 1, 1.f);
matrix.addElement(2, 2, 5.f);
matrix.addElement(3, 2, -4.f);
// Create the following matrix (FC is feature column):
// FC | f0 | f1 | f2
// multidimensional
// 0 | | 0.4 | 0.3
// 1 | 1 | | 2
// 2 | 3 | 1 | 5
// 3 | | |
// one dimensional columns
// 4 | -4
// 5 |
std::vector<SparseFloatFeatureColumn<float>> matrix;
matrix.resize(6);
matrix[0].SetDimension(3);
matrix[1].SetDimension(3);
matrix[2].SetDimension(3);
matrix[3].SetDimension(3);
matrix[4].SetDimension(1);
matrix[5].SetDimension(1);
matrix[0].Add(1, 0.4f);
matrix[0].Add(2, 0.3f);
matrix[1].Add(0, 1.f);
matrix[1].Add(2, 2.f);
matrix[2].Add(0, 3.f);
matrix[2].Add(1, 1.f);
matrix[2].Add(2, 5.f);
matrix[4].Add(0, -4.f);
// Row 0.
EXPECT_FALSE(matrix[0][0].has_value());
@ -66,13 +78,14 @@ TEST_F(ExampleTest, TestSparseMatrix) {
// Row 3.
EXPECT_FALSE(matrix[3][0].has_value());
EXPECT_FALSE(matrix[3][1].has_value());
EXPECT_TRUE(matrix[3][2].has_value());
EXPECT_EQ(-4.f, matrix[3][2].get_value());
EXPECT_FALSE(matrix[3][2].has_value());
// Row 4.
EXPECT_FALSE(matrix[4][0].has_value());
EXPECT_FALSE(matrix[4][1].has_value());
EXPECT_FALSE(matrix[4][2].has_value());
EXPECT_TRUE(matrix[4][0].has_value());
EXPECT_EQ(-4.f, matrix[4][0].get_value());
// Row 5.
EXPECT_FALSE(matrix[5][0].has_value());
}
} // namespace

View File

@ -36,12 +36,14 @@ ExamplesIterable::ExamplesIterable(
// Create sparse float column iterables and values.
sparse_float_column_iterables_.reserve(sparse_float_feature_columns.size());
sparse_float_column_values_.reserve(sparse_float_feature_columns.size());
sparse_float_dimensions_.reserve(sparse_float_feature_columns.size());
for (auto& sparse_float_column : sparse_float_feature_columns) {
sparse_float_column_iterables_.emplace_back(
sparse_float_column.indices().template matrix<int64>(), example_start,
example_end);
sparse_float_column_values_.emplace_back(
sparse_float_column.values().template vec<float>());
sparse_float_dimensions_.push_back(sparse_float_column.shape()[1]);
}
// Create sparse int column iterables and values.
@ -74,6 +76,8 @@ Iterator::Iterator(ExamplesIterable* iter, int64 example_idx)
example_.dense_float_features.resize(
iter_->dense_float_column_values_.size());
example_.sparse_int_features.resize(iter_->sparse_int_column_values_.size());
example_.sparse_float_features.resize(
iter_->sparse_float_column_values_.size());
}
} // namespace utils

View File

@ -87,33 +87,51 @@ class ExamplesIterable {
// Get sparse float values per column.
auto& sparse_float_features = example_.sparse_float_features;
sparse_float_features.clear();
// Iterate through each sparse float feature column.
for (size_t sparse_float_idx = 0;
sparse_float_idx < iter_->sparse_float_column_iterables_.size();
++sparse_float_idx) {
// Clear info from a previous instance.
sparse_float_features[sparse_float_idx].Clear();
// Get range for values tensor.
const auto& row_range =
(*sparse_float_column_iterators_[sparse_float_idx]);
DCHECK_EQ(example_idx_, row_range.example_idx);
// If the example has this feature column.
if (row_range.start < row_range.end) {
// Retrieve original indices tensor.
const TTypes<int64>::ConstMatrix& indices =
iter_->sparse_float_column_iterables_[sparse_float_idx]
.sparse_indices();
const int32 dimension =
iter_->sparse_float_dimensions_[sparse_float_idx];
sparse_float_features[sparse_float_idx].SetDimension(dimension);
if (dimension <= 1) {
// single dimensional sparse feature column.
DCHECK_EQ(1, row_range.end - row_range.start);
sparse_float_features[sparse_float_idx].Add(
0, iter_->sparse_float_column_values_[sparse_float_idx](
row_range.start));
} else {
// Retrieve original indices tensor.
const TTypes<int64>::ConstMatrix& indices =
iter_->sparse_float_column_iterables_[sparse_float_idx]
.sparse_indices();
// For each value.
for (int64 row_idx = row_range.start; row_idx < row_range.end;
++row_idx) {
// Get the feature id for the feature column and the value.
const int32 feature_id = indices(row_idx, 1);
DCHECK_EQ(example_idx_, indices(row_idx, 0));
sparse_float_features[sparse_float_idx].Reserve(row_range.end -
row_range.start);
// Save the value to our sparse matrix.
sparse_float_features.addElement(
sparse_float_idx, feature_id,
iter_->sparse_float_column_values_[sparse_float_idx](row_idx));
// For each value.
for (int64 row_idx = row_range.start; row_idx < row_range.end;
++row_idx) {
// Get the feature id for the feature column and the value.
const int32 feature_id = indices(row_idx, 1);
DCHECK_EQ(example_idx_, indices(row_idx, 0));
// Save the value to our sparse matrix.
sparse_float_features[sparse_float_idx].Add(
feature_id,
iter_->sparse_float_column_values_[sparse_float_idx](
row_idx));
}
}
}
}
@ -173,6 +191,9 @@ class ExamplesIterable {
// Sparse float column values.
std::vector<TTypes<float>::ConstVec> sparse_float_column_values_;
// Dimensions for sparse float feature columns.
std::vector<int32> sparse_float_dimensions_;
// Sparse int column iterables.
std::vector<SparseColumnIterable> sparse_int_column_iterables_;

View File

@ -194,6 +194,7 @@ TEST_F(ExamplesIterableTest, Iterate) {
{dense_float_tensor}, {sparse_float_tensor1, sparse_float_tensor2},
{sparse_int_tensor1, sparse_int_tensor2}, 0, 8);
int64 example_idx = 0;
for (const auto& example : full_iterable) {
validate_example_features(example_idx, example);
++example_idx;