Speeding up the case for sparse float columns that have only 1 value.

PiperOrigin-RevId: 173971121
2025-12-07 12:20:24 +01:00 · 2017-10-30 16:54:23 -07:00 · 2017-10-30 16:54:23 -07:00 · 09f62ab38b
commit 09f62ab38b
parent c315cf1ee6
5 changed files with 149 additions and 80 deletions
--- a/tensorflow/contrib/boosted_trees/lib/utils/example.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/example.h
@ -17,7 +17,6 @@
 #define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_

 #include <algorithm>
-#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "tensorflow/contrib/boosted_trees/lib/utils/optional_value.h"
@ -25,55 +24,85 @@
 namespace tensorflow {
 namespace boosted_trees {
 namespace utils {
-
-// A matrix that given feature column id and feature value id will return
-// either a value or an optional. First index indicates feature column, second
-// index - the index of the value within this column - for single valued, it
-// will be 0.
-// Allows double-subscript access [][].
+// Represents a sparse vector that has values for some of the feature indices
+// within a feature column.
+// Allows subscript access [].
 template <class T>
-class SparseMatrix {
-  typedef std::vector<std::tuple<int32, int32, T>> SparseMap;
-
-  class Proxy {
-   public:
-    Proxy(const int32 feature_column_idx, const SparseMap& values)
-        : feature_column_idx_(feature_column_idx), values_(values) {}
-
-    OptionalValue<T> operator[](int feature_idx) const {
-      auto value_iter = std::find_if(
-          values_.begin(), values_.end(),
-          [this, &feature_idx](const std::tuple<int32, int32, T>& element) {
-            return std::get<0>(element) == feature_column_idx_ &&
-                   std::get<1>(element) == feature_idx;
-          });
-
-      if (value_iter == values_.end()) {
-        return OptionalValue<T>();
-      }
-      // There is this feature column and feature id.
-      return OptionalValue<T>(std::get<2>(*value_iter));
-    }
-
-   private:
-    int32 feature_column_idx_;
-    const SparseMap& values_;
-  };
-
+class SparseMultidimensionalValues {
 public:
-  void addElement(const int32 feature_column_idx, const int32 feature_idx,
-                  const T value) {
-    values_.emplace_back(feature_column_idx, feature_idx, value);
+  void Add(const int32 feature_idx, const T value) {
+    values_.emplace_back(feature_idx, value);
  }

-  void clear() { values_.clear(); }
+  void Clear() { values_.clear(); }

-  Proxy operator[](int feature_column_idx) const {
-    return Proxy(feature_column_idx, values_);
+  void Reserve(const int32 size) { values_.reserve(size); }
+
+  OptionalValue<T> operator[](int feature_idx) const {
+    auto value_iter =
+        std::find_if(values_.begin(), values_.end(),
+                     [&feature_idx](const std::pair<int32, T>& element) {
+                       return element.first == feature_idx;
+                     });
+
+    if (value_iter == values_.end()) {
+      return OptionalValue<T>();
+    }
+    return OptionalValue<T>(value_iter->second);
  }

 private:
-  SparseMap values_;
+  std::vector<std::pair<int32, T>> values_;
+};
+
+// Represents storage for a sparse float feature column. Can store values for
+// either a one-dimensional or a multivalent (multidimensional) sparse column.
+// Allows subscript operator access [feature_id].
+template <class T>
+class SparseFloatFeatureColumn {
+ public:
+  void Reserve(const int32 size) {
+    if (!single_dimensional_) {
+      mutlidimensional_values.Reserve(size);
+    }
+  }
+
+  void SetDimension(const int32 dimension) {
+    single_dimensional_ = dimension <= 1;
+  }
+
+  void Add(const int32 feature_idx, const float value) {
+    if (single_dimensional_) {
+      DCHECK_EQ(0, feature_idx);
+      single_value_ = value;
+    } else {
+      mutlidimensional_values.Add(feature_idx, value);
+    }
+    initialized_ = true;
+  }
+
+  void Clear() {
+    single_dimensional_ = false;
+    initialized_ = false;
+    mutlidimensional_values.Clear();
+  }
+
+  OptionalValue<T> operator[](int feature_idx) const {
+    if (!initialized_) {
+      return OptionalValue<T>();
+    }
+    if (single_dimensional_) {
+      return OptionalValue<T>(single_value_);
+    } else {
+      return mutlidimensional_values[feature_idx];
+    }
+  }
+
+ private:
+  bool single_dimensional_;
+  bool initialized_;
+  T single_value_;
+  SparseMultidimensionalValues<T> mutlidimensional_values;
 };

 // Holds data for one example and enables lookup by feature column.
@ -87,9 +116,10 @@ struct Example {
  // Dense and sparse float features indexed by feature column.
  // TODO(salehay): figure out a design to support multivalent float features.
  std::vector<float> dense_float_features;
-  // Sparse float features are allowed to be multivalent and thus can be
-  // represented as a sparse matrix.
-  SparseMatrix<float> sparse_float_features;
+
+  // Sparse float feature columns (each can be either single-valued or
+  // multivalent, i.e. multidimensional).
+  std::vector<SparseFloatFeatureColumn<float>> sparse_float_features;

  // Sparse integer features indexed by feature column.
  // Note that all integer features are assumed to be categorical, i.e. will
--- a/tensorflow/contrib/boosted_trees/lib/utils/example_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/example_test.cc
@ -25,21 +25,33 @@ namespace {
 class ExampleTest : public ::testing::Test {};

 TEST_F(ExampleTest, TestSparseMatrix) {
-  // Create the following matrix:
-  // row id |   | 0.4 |  0.3
-  // 0      | 1 |     |   2
-  // 1      | 3 |  1  |   5
-  // 2      |   |     |  -4
-  // 3      |   |     |
-  SparseMatrix<float> matrix;
-  matrix.addElement(0, 1, 0.4f);
-  matrix.addElement(0, 2, 0.3f);
-  matrix.addElement(1, 0, 1.f);
-  matrix.addElement(1, 2, 2.f);
-  matrix.addElement(2, 0, 3.f);
-  matrix.addElement(2, 1, 1.f);
-  matrix.addElement(2, 2, 5.f);
-  matrix.addElement(3, 2, -4.f);
+  // Create the following matrix (FC is feature column):
+  // FC | f0 | f1  | f2
+  // multidimensional
+  // 0  |    | 0.4 |  0.3
+  // 1  | 1  |     |   2
+  // 2  | 3  |  1  |   5
+  // 3  |    |     |
+  // one dimensional columns
+  // 4  |     -4
+  // 5  |
+  std::vector<SparseFloatFeatureColumn<float>> matrix;
+  matrix.resize(6);
+  matrix[0].SetDimension(3);
+  matrix[1].SetDimension(3);
+  matrix[2].SetDimension(3);
+  matrix[3].SetDimension(3);
+  matrix[4].SetDimension(1);
+  matrix[5].SetDimension(1);
+
+  matrix[0].Add(1, 0.4f);
+  matrix[0].Add(2, 0.3f);
+  matrix[1].Add(0, 1.f);
+  matrix[1].Add(2, 2.f);
+  matrix[2].Add(0, 3.f);
+  matrix[2].Add(1, 1.f);
+  matrix[2].Add(2, 5.f);
+  matrix[4].Add(0, -4.f);

  // Row 0.
  EXPECT_FALSE(matrix[0][0].has_value());
@ -66,13 +78,14 @@ TEST_F(ExampleTest, TestSparseMatrix) {
  // Row 3.
  EXPECT_FALSE(matrix[3][0].has_value());
  EXPECT_FALSE(matrix[3][1].has_value());
-  EXPECT_TRUE(matrix[3][2].has_value());
-  EXPECT_EQ(-4.f, matrix[3][2].get_value());
+  EXPECT_FALSE(matrix[3][2].has_value());

  // Row 4.
-  EXPECT_FALSE(matrix[4][0].has_value());
-  EXPECT_FALSE(matrix[4][1].has_value());
-  EXPECT_FALSE(matrix[4][2].has_value());
+  EXPECT_TRUE(matrix[4][0].has_value());
+  EXPECT_EQ(-4.f, matrix[4][0].get_value());
+
+  // Row 5.
+  EXPECT_FALSE(matrix[5][0].has_value());
 }

 }  // namespace
--- a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.cc
@ -36,12 +36,14 @@ ExamplesIterable::ExamplesIterable(
  // Create sparse float column iterables and values.
  sparse_float_column_iterables_.reserve(sparse_float_feature_columns.size());
  sparse_float_column_values_.reserve(sparse_float_feature_columns.size());
+  sparse_float_dimensions_.reserve(sparse_float_feature_columns.size());
  for (auto& sparse_float_column : sparse_float_feature_columns) {
    sparse_float_column_iterables_.emplace_back(
        sparse_float_column.indices().template matrix<int64>(), example_start,
        example_end);
    sparse_float_column_values_.emplace_back(
        sparse_float_column.values().template vec<float>());
+    sparse_float_dimensions_.push_back(sparse_float_column.shape()[1]);
  }

  // Create sparse int column iterables and values.
@ -74,6 +76,8 @@ Iterator::Iterator(ExamplesIterable* iter, int64 example_idx)
  example_.dense_float_features.resize(
      iter_->dense_float_column_values_.size());
  example_.sparse_int_features.resize(iter_->sparse_int_column_values_.size());
+  example_.sparse_float_features.resize(
+      iter_->sparse_float_column_values_.size());
 }

 }  // namespace utils
--- a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
@ -87,33 +87,51 @@ class ExamplesIterable {

      // Get sparse float values per column.
      auto& sparse_float_features = example_.sparse_float_features;
-      sparse_float_features.clear();
      // Iterate through each sparse float feature column.
      for (size_t sparse_float_idx = 0;
           sparse_float_idx < iter_->sparse_float_column_iterables_.size();
           ++sparse_float_idx) {
+        // Clear info from a previous instance.
+        sparse_float_features[sparse_float_idx].Clear();
+
        // Get range for values tensor.
        const auto& row_range =
            (*sparse_float_column_iterators_[sparse_float_idx]);
        DCHECK_EQ(example_idx_, row_range.example_idx);
+
        // If the example has this feature column.
        if (row_range.start < row_range.end) {
-          // Retrieve original indices tensor.
-          const TTypes<int64>::ConstMatrix& indices =
-              iter_->sparse_float_column_iterables_[sparse_float_idx]
-                  .sparse_indices();
+          const int32 dimension =
+              iter_->sparse_float_dimensions_[sparse_float_idx];
+          sparse_float_features[sparse_float_idx].SetDimension(dimension);
+          if (dimension <= 1) {
+            // Single-dimensional sparse feature column.
+            DCHECK_EQ(1, row_range.end - row_range.start);
+            sparse_float_features[sparse_float_idx].Add(
+                0, iter_->sparse_float_column_values_[sparse_float_idx](
+                       row_range.start));
+          } else {
+            // Retrieve original indices tensor.
+            const TTypes<int64>::ConstMatrix& indices =
+                iter_->sparse_float_column_iterables_[sparse_float_idx]
+                    .sparse_indices();

-          // For each value.
-          for (int64 row_idx = row_range.start; row_idx < row_range.end;
-               ++row_idx) {
-            // Get the feature id for the feature column and the value.
-            const int32 feature_id = indices(row_idx, 1);
-            DCHECK_EQ(example_idx_, indices(row_idx, 0));
+            sparse_float_features[sparse_float_idx].Reserve(row_range.end -
+                                                            row_range.start);

-            // Save the value to our sparse matrix.
-            sparse_float_features.addElement(
-                sparse_float_idx, feature_id,
-                iter_->sparse_float_column_values_[sparse_float_idx](row_idx));
+            // For each value.
+            for (int64 row_idx = row_range.start; row_idx < row_range.end;
+                 ++row_idx) {
+              // Get the feature id for the feature column and the value.
+              const int32 feature_id = indices(row_idx, 1);
+              DCHECK_EQ(example_idx_, indices(row_idx, 0));
+
+              // Save the value to this sparse float feature column.
+              sparse_float_features[sparse_float_idx].Add(
+                  feature_id,
+                  iter_->sparse_float_column_values_[sparse_float_idx](
+                      row_idx));
+            }
          }
        }
      }
@ -173,6 +191,9 @@ class ExamplesIterable {
  // Sparse float column values.
  std::vector<TTypes<float>::ConstVec> sparse_float_column_values_;

+  // Dimensions for sparse float feature columns.
+  std::vector<int32> sparse_float_dimensions_;
+
  // Sparse int column iterables.
  std::vector<SparseColumnIterable> sparse_int_column_iterables_;

--- a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable_test.cc
@ -194,6 +194,7 @@ TEST_F(ExamplesIterableTest, Iterate) {
      {dense_float_tensor}, {sparse_float_tensor1, sparse_float_tensor2},
      {sparse_int_tensor1, sparse_int_tensor2}, 0, 8);
  int64 example_idx = 0;
+
  for (const auto& example : full_iterable) {
    validate_example_features(example_idx, example);
    ++example_idx;