Speed up multinomial_op on CPU by using a vectorized Eigen expression and avoiding unnecessary casts.

Benchmark with AVX+FMA enabled:

Run on <redacted> (12 X 3492 MHz CPUs); 2017-06-05T12:54:07.881672447-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB

Benchmark                          Base (ns)    New (ns)  Improvement
------------------------------------------------------------------
BM_Multinomial_cpu_1_10000_4          250817      172953       +31.0%
BM_Multinomial_cpu_1_10000_128        273834      187552       +31.5%
BM_Multinomial_cpu_1_10000_10000     1174175     1130778        +3.7%
BM_Multinomial_cpu_1_100000_4        2040741     1276761       +37.4%
BM_Multinomial_cpu_32_10000_4       10221765     4498666       +56.0%
BM_Multinomial_cpu_32_10000_128     10638159     4994754       +53.0%
BM_Multinomial_cpu_32_100000_4     100790019    44193314       +56.2%
BM_Multinomial_cpu_128_100000_1    431269640   182506078       +57.7%

PiperOrigin-RevId: 158061480
2025-12-07 00:20:20 +01:00 · 2017-06-05 14:23:14 -07:00 · 2017-06-05 14:23:14 -07:00 · bee26215c9
commit bee26215c9
parent 515b3ac677
2 changed files with 23 additions and 15 deletions
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -3602,6 +3602,7 @@ tf_kernel_library(
        "//tensorflow/core:framework",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
+        "//third_party/eigen3",
    ],
 )

--- a/tensorflow/core/kernels/multinomial_op.cc
+++ b/tensorflow/core/kernels/multinomial_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <cmath>
 #include <memory>

+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -67,7 +68,7 @@ struct MultinomialFunctor<CPUDevice, T> {
    //
    // This takes O(BatchSize * NumSamples * log(NumClasses) + NumClasses) CPU
    // time.
-    auto DoWork = [num_samples, num_classes, &gen, &output, &logits](
+    auto DoWork = [ctx, num_samples, num_classes, &gen, &output, &logits](
                      int64 start_row, int64 limit_row) {
      // Capturing "gen" by-value would only make a copy for the _shared_
      // lambda.  Since we want to let each worker have its own copy, we pass
@@ -78,15 +79,18 @@ struct MultinomialFunctor<CPUDevice, T> {
      gen_copy.Skip(start_row * (num_samples + 3) / 4);
      random::SimplePhilox simple_philox(&gen_copy);

-      std::vector<double> cdf(num_classes);
-
+      Tensor cdf_tensor;
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_temp(DT_DOUBLE, TensorShape({num_classes}),
+                                        &cdf_tensor));
+      auto cdf = cdf_tensor.flat<double>();
      for (int64 b = start_row; b < limit_row; ++b) {
        const auto* logits_row = &logits(b, 0);

        // Takes an along-class maximum (for numerical stability).
        T max = std::numeric_limits<T>::lowest();
        for (int64 j = 0; j < num_classes; ++j) {
-          if (std::isfinite(static_cast<double>(logits_row[j]))) {
+          if (Eigen::numext::isfinite(logits_row[j])) {
            max = std::max(max, logits_row[j]);
          }
        }
@@ -94,19 +98,22 @@ struct MultinomialFunctor<CPUDevice, T> {

        // Precompute cumulative probability distribution across classes.
        // Note: This isn't normalized.
+        cdf = (logits.template chip<0>(b).template cast<double>() - max_logit)
+                  .exp();
        double running_total = 0;
        for (int64 j = 0; j < num_classes; ++j) {
-          if (std::isfinite(static_cast<double>(logits_row[j]))) {
-            running_total +=
-                std::exp(static_cast<double>(logits_row[j]) - max_logit);
+          if (Eigen::numext::isfinite(logits_row[j])) {
+            running_total += cdf(j);
          }
-          cdf[j] = running_total;
+          cdf(j) = running_total;
        }
        // Generate each sample.
+        const double* cdf_begin = cdf.data();
+        const double* cdf_end = cdf.data() + num_classes;
        for (int64 j = 0; j < num_samples; ++j) {
-          double to_find = simple_philox.RandDouble() * running_total;
-          auto found_iter = std::upper_bound(cdf.begin(), cdf.end(), to_find);
-          output(b, j) = std::distance(cdf.begin(), found_iter);
+          const double to_find = simple_philox.RandDouble() * running_total;
+          auto found_iter = std::upper_bound(cdf_begin, cdf_end, to_find);
+          output(b, j) = std::distance(cdf_begin, found_iter);
        }
      }
    };
@@ -148,8 +155,8 @@ class MultinomialOp : public OpKernel {
    for (int i = 0; i < 2; i++) {
      const int64 dim = logits_t.dim_size(i);
      OP_REQUIRES(ctx, static_cast<int>(dim) == dim,
-                  errors::InvalidArgument("logits.shape = ",
-                                          logits_t.shape().DebugString(),
+                  errors::InvalidArgument(
+                      "logits.shape = ", logits_t.shape().DebugString(),
                      " too large for int"));
    }
    const int batch_size = static_cast<int>(logits_t.dim_size(0));