mirror of
https://github.com/zebrajr/tensorflow.git
synced 2025-12-07 12:20:24 +01:00
Speeding up the case for sparse float columns that have only 1 value.
PiperOrigin-RevId: 173971121
This commit is contained in:
parent
c315cf1ee6
commit
09f62ab38b
|
|
@ -17,7 +17,6 @@
|
|||
#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include "tensorflow/contrib/boosted_trees/lib/utils/optional_value.h"
|
||||
|
|
@ -25,55 +24,85 @@
|
|||
namespace tensorflow {
|
||||
namespace boosted_trees {
|
||||
namespace utils {
|
||||
|
||||
// A matrix that given feature column id and feature value id will return
|
||||
// either a value or an optional. First index indicates feature column, second
|
||||
// index - the index of the value within this column - for single valued, it
|
||||
// will be 0.
|
||||
// Allows double-subscript access [][].
|
||||
// Represents a sparse vector that has a value for some feature indices within
|
||||
// the feature column.
|
||||
// Allows subscript access [].
|
||||
template <class T>
|
||||
class SparseMatrix {
|
||||
typedef std::vector<std::tuple<int32, int32, T>> SparseMap;
|
||||
|
||||
class Proxy {
|
||||
public:
|
||||
Proxy(const int32 feature_column_idx, const SparseMap& values)
|
||||
: feature_column_idx_(feature_column_idx), values_(values) {}
|
||||
|
||||
OptionalValue<T> operator[](int feature_idx) const {
|
||||
auto value_iter = std::find_if(
|
||||
values_.begin(), values_.end(),
|
||||
[this, &feature_idx](const std::tuple<int32, int32, T>& element) {
|
||||
return std::get<0>(element) == feature_column_idx_ &&
|
||||
std::get<1>(element) == feature_idx;
|
||||
});
|
||||
|
||||
if (value_iter == values_.end()) {
|
||||
return OptionalValue<T>();
|
||||
}
|
||||
// There is this feature column and feature id.
|
||||
return OptionalValue<T>(std::get<2>(*value_iter));
|
||||
}
|
||||
|
||||
private:
|
||||
int32 feature_column_idx_;
|
||||
const SparseMap& values_;
|
||||
};
|
||||
|
||||
class SparseMultidimensionalValues {
|
||||
public:
|
||||
void addElement(const int32 feature_column_idx, const int32 feature_idx,
|
||||
const T value) {
|
||||
values_.emplace_back(feature_column_idx, feature_idx, value);
|
||||
void Add(const int32 feature_idx, const T value) {
|
||||
values_.emplace_back(feature_idx, value);
|
||||
}
|
||||
|
||||
void clear() { values_.clear(); }
|
||||
void Clear() { values_.clear(); }
|
||||
|
||||
Proxy operator[](int feature_column_idx) const {
|
||||
return Proxy(feature_column_idx, values_);
|
||||
void Reserve(const int32 size) { values_.reserve(size); }
|
||||
|
||||
OptionalValue<T> operator[](int feature_idx) const {
|
||||
auto value_iter =
|
||||
std::find_if(values_.begin(), values_.end(),
|
||||
[&feature_idx](const std::pair<int32, T>& element) {
|
||||
return element.first == feature_idx;
|
||||
});
|
||||
|
||||
if (value_iter == values_.end()) {
|
||||
return OptionalValue<T>();
|
||||
}
|
||||
return OptionalValue<T>(value_iter->second);
|
||||
}
|
||||
|
||||
private:
|
||||
SparseMap values_;
|
||||
std::vector<std::pair<int32, T>> values_;
|
||||
};
|
||||
|
||||
// Represents storage for a sparse float feature column. Can store values either
|
||||
// for a one-dimensional or a multivalent (multidimensional) sparse column.
|
||||
// Allows subscript operator access [feature_id].
|
||||
template <class T>
|
||||
class SparseFloatFeatureColumn {
|
||||
public:
|
||||
void Reserve(const int32 size) {
|
||||
if (!single_dimensional_) {
|
||||
mutlidimensional_values.Reserve(size);
|
||||
}
|
||||
}
|
||||
|
||||
void SetDimension(const int32 dimension) {
|
||||
single_dimensional_ = dimension <= 1;
|
||||
}
|
||||
|
||||
void Add(const int32 feature_idx, const float value) {
|
||||
if (single_dimensional_) {
|
||||
DCHECK_EQ(0, feature_idx);
|
||||
single_value_ = value;
|
||||
} else {
|
||||
mutlidimensional_values.Add(feature_idx, value);
|
||||
}
|
||||
initialized_ = true;
|
||||
}
|
||||
|
||||
void Clear() {
|
||||
single_dimensional_ = false;
|
||||
initialized_ = false;
|
||||
mutlidimensional_values.Clear();
|
||||
}
|
||||
|
||||
OptionalValue<T> operator[](int feature_idx) const {
|
||||
if (!initialized_) {
|
||||
return OptionalValue<T>();
|
||||
}
|
||||
if (single_dimensional_) {
|
||||
return OptionalValue<T>(single_value_);
|
||||
} else {
|
||||
return mutlidimensional_values[feature_idx];
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
bool single_dimensional_;
|
||||
bool initialized_;
|
||||
T single_value_;
|
||||
SparseMultidimensionalValues<T> mutlidimensional_values;
|
||||
};
|
||||
|
||||
// Holds data for one example and enables lookup by feature column.
|
||||
|
|
@ -87,9 +116,10 @@ struct Example {
|
|||
// Dense and sparse float features indexed by feature column.
|
||||
// TODO(salehay): figure out a design to support multivalent float features.
|
||||
std::vector<float> dense_float_features;
|
||||
// Sparse float features are allowed to be multivalent and thus can be
|
||||
// represented as a sparse matrix.
|
||||
SparseMatrix<float> sparse_float_features;
|
||||
|
||||
// Sparse float feature columns (can be either single or multivalent
|
||||
// (multidimensional)).
|
||||
std::vector<SparseFloatFeatureColumn<float>> sparse_float_features;
|
||||
|
||||
// Sparse integer features indexed by feature column.
|
||||
// Note that all integer features are assumed to be categorical, i.e. will
|
||||
|
|
|
|||
|
|
@ -25,21 +25,33 @@ namespace {
|
|||
class ExampleTest : public ::testing::Test {};
|
||||
|
||||
TEST_F(ExampleTest, TestSparseMatrix) {
|
||||
// Create the following matrix:
|
||||
// row id | | 0.4 | 0.3
|
||||
// 0 | 1 | | 2
|
||||
// 1 | 3 | 1 | 5
|
||||
// 2 | | | -4
|
||||
// 3 | | |
|
||||
SparseMatrix<float> matrix;
|
||||
matrix.addElement(0, 1, 0.4f);
|
||||
matrix.addElement(0, 2, 0.3f);
|
||||
matrix.addElement(1, 0, 1.f);
|
||||
matrix.addElement(1, 2, 2.f);
|
||||
matrix.addElement(2, 0, 3.f);
|
||||
matrix.addElement(2, 1, 1.f);
|
||||
matrix.addElement(2, 2, 5.f);
|
||||
matrix.addElement(3, 2, -4.f);
|
||||
// Create the following matrix (FC is feature column):
|
||||
// FC | f0 | f1 | f2
|
||||
// multidimensional
|
||||
// 0 | | 0.4 | 0.3
|
||||
// 1 | 1 | | 2
|
||||
// 2 | 3 | 1 | 5
|
||||
// 3 | | |
|
||||
// one dimensional columns
|
||||
// 4 | -4
|
||||
// 5 |
|
||||
std::vector<SparseFloatFeatureColumn<float>> matrix;
|
||||
matrix.resize(6);
|
||||
matrix[0].SetDimension(3);
|
||||
matrix[1].SetDimension(3);
|
||||
matrix[2].SetDimension(3);
|
||||
matrix[3].SetDimension(3);
|
||||
matrix[4].SetDimension(1);
|
||||
matrix[5].SetDimension(1);
|
||||
|
||||
matrix[0].Add(1, 0.4f);
|
||||
matrix[0].Add(2, 0.3f);
|
||||
matrix[1].Add(0, 1.f);
|
||||
matrix[1].Add(2, 2.f);
|
||||
matrix[2].Add(0, 3.f);
|
||||
matrix[2].Add(1, 1.f);
|
||||
matrix[2].Add(2, 5.f);
|
||||
matrix[4].Add(0, -4.f);
|
||||
|
||||
// Row 0.
|
||||
EXPECT_FALSE(matrix[0][0].has_value());
|
||||
|
|
@ -66,13 +78,14 @@ TEST_F(ExampleTest, TestSparseMatrix) {
|
|||
// Row 3.
|
||||
EXPECT_FALSE(matrix[3][0].has_value());
|
||||
EXPECT_FALSE(matrix[3][1].has_value());
|
||||
EXPECT_TRUE(matrix[3][2].has_value());
|
||||
EXPECT_EQ(-4.f, matrix[3][2].get_value());
|
||||
EXPECT_FALSE(matrix[3][2].has_value());
|
||||
|
||||
// Row 4.
|
||||
EXPECT_FALSE(matrix[4][0].has_value());
|
||||
EXPECT_FALSE(matrix[4][1].has_value());
|
||||
EXPECT_FALSE(matrix[4][2].has_value());
|
||||
EXPECT_TRUE(matrix[4][0].has_value());
|
||||
EXPECT_EQ(-4.f, matrix[4][0].get_value());
|
||||
|
||||
// Row 5.
|
||||
EXPECT_FALSE(matrix[5][0].has_value());
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
|
|
|||
|
|
@ -36,12 +36,14 @@ ExamplesIterable::ExamplesIterable(
|
|||
// Create sparse float column iterables and values.
|
||||
sparse_float_column_iterables_.reserve(sparse_float_feature_columns.size());
|
||||
sparse_float_column_values_.reserve(sparse_float_feature_columns.size());
|
||||
sparse_float_dimensions_.reserve(sparse_float_feature_columns.size());
|
||||
for (auto& sparse_float_column : sparse_float_feature_columns) {
|
||||
sparse_float_column_iterables_.emplace_back(
|
||||
sparse_float_column.indices().template matrix<int64>(), example_start,
|
||||
example_end);
|
||||
sparse_float_column_values_.emplace_back(
|
||||
sparse_float_column.values().template vec<float>());
|
||||
sparse_float_dimensions_.push_back(sparse_float_column.shape()[1]);
|
||||
}
|
||||
|
||||
// Create sparse int column iterables and values.
|
||||
|
|
@ -74,6 +76,8 @@ Iterator::Iterator(ExamplesIterable* iter, int64 example_idx)
|
|||
example_.dense_float_features.resize(
|
||||
iter_->dense_float_column_values_.size());
|
||||
example_.sparse_int_features.resize(iter_->sparse_int_column_values_.size());
|
||||
example_.sparse_float_features.resize(
|
||||
iter_->sparse_float_column_values_.size());
|
||||
}
|
||||
|
||||
} // namespace utils
|
||||
|
|
|
|||
|
|
@ -87,33 +87,51 @@ class ExamplesIterable {
|
|||
|
||||
// Get sparse float values per column.
|
||||
auto& sparse_float_features = example_.sparse_float_features;
|
||||
sparse_float_features.clear();
|
||||
// Iterate through each sparse float feature column.
|
||||
for (size_t sparse_float_idx = 0;
|
||||
sparse_float_idx < iter_->sparse_float_column_iterables_.size();
|
||||
++sparse_float_idx) {
|
||||
// Clear info from a previous instance.
|
||||
sparse_float_features[sparse_float_idx].Clear();
|
||||
|
||||
// Get range for values tensor.
|
||||
const auto& row_range =
|
||||
(*sparse_float_column_iterators_[sparse_float_idx]);
|
||||
DCHECK_EQ(example_idx_, row_range.example_idx);
|
||||
|
||||
// If the example has this feature column.
|
||||
if (row_range.start < row_range.end) {
|
||||
// Retrieve original indices tensor.
|
||||
const TTypes<int64>::ConstMatrix& indices =
|
||||
iter_->sparse_float_column_iterables_[sparse_float_idx]
|
||||
.sparse_indices();
|
||||
const int32 dimension =
|
||||
iter_->sparse_float_dimensions_[sparse_float_idx];
|
||||
sparse_float_features[sparse_float_idx].SetDimension(dimension);
|
||||
if (dimension <= 1) {
|
||||
// single dimensional sparse feature column.
|
||||
DCHECK_EQ(1, row_range.end - row_range.start);
|
||||
sparse_float_features[sparse_float_idx].Add(
|
||||
0, iter_->sparse_float_column_values_[sparse_float_idx](
|
||||
row_range.start));
|
||||
} else {
|
||||
// Retrieve original indices tensor.
|
||||
const TTypes<int64>::ConstMatrix& indices =
|
||||
iter_->sparse_float_column_iterables_[sparse_float_idx]
|
||||
.sparse_indices();
|
||||
|
||||
// For each value.
|
||||
for (int64 row_idx = row_range.start; row_idx < row_range.end;
|
||||
++row_idx) {
|
||||
// Get the feature id for the feature column and the value.
|
||||
const int32 feature_id = indices(row_idx, 1);
|
||||
DCHECK_EQ(example_idx_, indices(row_idx, 0));
|
||||
sparse_float_features[sparse_float_idx].Reserve(row_range.end -
|
||||
row_range.start);
|
||||
|
||||
// Save the value to our sparse matrix.
|
||||
sparse_float_features.addElement(
|
||||
sparse_float_idx, feature_id,
|
||||
iter_->sparse_float_column_values_[sparse_float_idx](row_idx));
|
||||
// For each value.
|
||||
for (int64 row_idx = row_range.start; row_idx < row_range.end;
|
||||
++row_idx) {
|
||||
// Get the feature id for the feature column and the value.
|
||||
const int32 feature_id = indices(row_idx, 1);
|
||||
DCHECK_EQ(example_idx_, indices(row_idx, 0));
|
||||
|
||||
// Save the value to our sparse matrix.
|
||||
sparse_float_features[sparse_float_idx].Add(
|
||||
feature_id,
|
||||
iter_->sparse_float_column_values_[sparse_float_idx](
|
||||
row_idx));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -173,6 +191,9 @@ class ExamplesIterable {
|
|||
// Sparse float column values.
|
||||
std::vector<TTypes<float>::ConstVec> sparse_float_column_values_;
|
||||
|
||||
// Dimensions for sparse float feature columns.
|
||||
std::vector<int32> sparse_float_dimensions_;
|
||||
|
||||
// Sparse int column iterables.
|
||||
std::vector<SparseColumnIterable> sparse_int_column_iterables_;
|
||||
|
||||
|
|
|
|||
|
|
@ -194,6 +194,7 @@ TEST_F(ExamplesIterableTest, Iterate) {
|
|||
{dense_float_tensor}, {sparse_float_tensor1, sparse_float_tensor2},
|
||||
{sparse_int_tensor1, sparse_int_tensor2}, 0, 8);
|
||||
int64 example_idx = 0;
|
||||
|
||||
for (const auto& example : full_iterable) {
|
||||
validate_example_features(example_idx, example);
|
||||
++example_idx;
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user