Speed up multinomial_op on CPU by using a vectorized Eigen expression and avoiding unnecessary casts.
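
The core of the change, as a minimal standalone sketch (simplified from the kernel diff below; the function name and 1-D shape are illustrative, not TensorFlow code):

    #include <unsupported/Eigen/CXX11/Tensor>

    // Minimal sketch of the vectorized expression the kernel adopts: one
    // Eigen tensor expression fuses the double cast, the max-logit shift,
    // and exp() into a single pass that Eigen vectorizes with AVX/FMA where
    // available, replacing a scalar loop of static_cast + std::exp.
    Eigen::Tensor<double, 1> ExpShiftedLogits(
        const Eigen::Tensor<float, 1>& logits, double max_logit) {
      Eigen::Tensor<double, 1> result(logits.dimension(0));
      result = (logits.cast<double>() - max_logit).exp();
      return result;
    }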

Benchmark with AVX+FMA enabled:

Run on <redacted> (12 X 3492 MHz CPUs); 2017-06-05T12:54:07.881672447-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
Benchmark                         Base (ns)   New (ns)  Improvement
--------------------------------------------------------------------
BM_Multinomial_cpu_1_10000_4         250817     172953       +31.0%
BM_Multinomial_cpu_1_10000_128       273834     187552       +31.5%
BM_Multinomial_cpu_1_10000_10000    1174175    1130778        +3.7%
BM_Multinomial_cpu_1_100000_4       2040741    1276761       +37.4%
BM_Multinomial_cpu_32_10000_4      10221765    4498666       +56.0%
BM_Multinomial_cpu_32_10000_128    10638159    4994754       +53.0%
BM_Multinomial_cpu_32_100000_4    100790019   44193314       +56.2%
BM_Multinomial_cpu_128_100000_1   431269640  182506078       +57.7%
PiperOrigin-RevId: 158061480
A. Unique TensorFlower 2017-06-05 14:23:14 -07:00 committed by TensorFlower Gardener
parent 515b3ac677
commit bee26215c9
2 changed files with 23 additions and 15 deletions

@@ -3602,6 +3602,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//third_party/eigen3",
     ],
 )

@@ -23,6 +23,7 @@ limitations under the License.
 #include <cmath>
 #include <memory>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@ -67,8 +68,8 @@ struct MultinomialFunctor<CPUDevice, T> {
// //
// This takes O(BatchSize * NumSamples * log(NumClasses) + NumClasses) CPU // This takes O(BatchSize * NumSamples * log(NumClasses) + NumClasses) CPU
// time. // time.
auto DoWork = [num_samples, num_classes, &gen, &output, &logits]( auto DoWork = [ctx, num_samples, num_classes, &gen, &output, &logits](
int64 start_row, int64 limit_row) { int64 start_row, int64 limit_row) {
// Capturing "gen" by-value would only make a copy for the _shared_ // Capturing "gen" by-value would only make a copy for the _shared_
// lambda. Since we want to let each worker have its own copy, we pass // lambda. Since we want to let each worker have its own copy, we pass
// "gen" by reference and explicitly do a copy assignment here. // "gen" by reference and explicitly do a copy assignment here.
@@ -78,15 +79,18 @@ struct MultinomialFunctor<CPUDevice, T> {
       gen_copy.Skip(start_row * (num_samples + 3) / 4);
       random::SimplePhilox simple_philox(&gen_copy);
-      std::vector<double> cdf(num_classes);
+      Tensor cdf_tensor;
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_temp(DT_DOUBLE, TensorShape({num_classes}),
+                                        &cdf_tensor));
+      auto cdf = cdf_tensor.flat<double>();
       for (int64 b = start_row; b < limit_row; ++b) {
         const auto* logits_row = &logits(b, 0);
         // Takes an along-class maximum (for numerical stability).
         T max = std::numeric_limits<T>::lowest();
         for (int64 j = 0; j < num_classes; ++j) {
-          if (std::isfinite(static_cast<double>(logits_row[j]))) {
+          if (Eigen::numext::isfinite(logits_row[j])) {
             max = std::max(max, logits_row[j]);
           }
         }
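
A design note on the std::vector → Tensor swap above: flat<double>() returns an Eigen map, which is what lets the next hunk assign a vectorized expression straight into the buffer. A standalone sketch of the same idea over raw storage (hypothetical names, not the kernel's code):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <vector>

    // Standalone sketch (hypothetical names): an Eigen::TensorMap over
    // existing storage can be assigned a vectorized expression directly,
    // mirroring what cdf_tensor.flat<double>() provides inside the kernel.
    void FillCdfBuffer(const Eigen::Tensor<float, 1>& logits_row,
                       double max_logit, std::vector<double>& storage) {
      Eigen::TensorMap<Eigen::Tensor<double, 1>> cdf(
          storage.data(), static_cast<Eigen::Index>(storage.size()));
      cdf = (logits_row.cast<double>() - max_logit).exp();  // one fused pass
    }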
@@ -94,19 +98,22 @@ struct MultinomialFunctor<CPUDevice, T> {
         // Precompute cumulative probability distribution across classes.
         // Note: This isn't normalized.
+        cdf = (logits.template chip<0>(b).template cast<double>() - max_logit)
+                  .exp();
         double running_total = 0;
         for (int64 j = 0; j < num_classes; ++j) {
-          if (std::isfinite(static_cast<double>(logits_row[j]))) {
-            running_total +=
-                std::exp(static_cast<double>(logits_row[j]) - max_logit);
+          if (Eigen::numext::isfinite(logits_row[j])) {
+            running_total += cdf(j);
           }
-          cdf[j] = running_total;
+          cdf(j) = running_total;
         }
         // Generate each sample.
+        const double* cdf_begin = cdf.data();
+        const double* cdf_end = cdf.data() + num_classes;
         for (int64 j = 0; j < num_samples; ++j) {
-          double to_find = simple_philox.RandDouble() * running_total;
-          auto found_iter = std::upper_bound(cdf.begin(), cdf.end(), to_find);
-          output(b, j) = std::distance(cdf.begin(), found_iter);
+          const double to_find = simple_philox.RandDouble() * running_total;
+          auto found_iter = std::upper_bound(cdf_begin, cdf_end, to_find);
+          output(b, j) = std::distance(cdf_begin, found_iter);
         }
       }
     };
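
For readers outside the kernel, a self-contained sketch of the sampling scheme above: build an unnormalized running-sum CDF from max-shifted logits, then binary-search it once per sample. Names and the std::mt19937_64 generator are stand-ins, not TensorFlow code:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <limits>
    #include <random>
    #include <vector>

    // Self-contained sketch of the per-row sampling scheme (stand-in names;
    // std::mt19937_64 replaces SimplePhilox): O(NumClasses) to build the
    // unnormalized CDF, then O(log NumClasses) per sample via binary search.
    std::vector<int64_t> SampleMultinomialRow(
        const std::vector<float>& logits, int64_t num_samples, uint64_t seed) {
      // Max-shift for numerical stability before exponentiating.
      float max_logit = std::numeric_limits<float>::lowest();
      for (float l : logits)
        if (std::isfinite(l)) max_logit = std::max(max_logit, l);

      // Unnormalized cumulative sums; normalizing is unnecessary because
      // the random threshold below is scaled by the same running total.
      std::vector<double> cdf(logits.size());
      double running_total = 0.0;
      for (size_t j = 0; j < logits.size(); ++j) {
        if (std::isfinite(logits[j]))
          running_total +=
              std::exp(static_cast<double>(logits[j]) - max_logit);
        cdf[j] = running_total;
      }

      std::mt19937_64 rng(seed);
      std::uniform_real_distribution<double> uniform(0.0, 1.0);
      std::vector<int64_t> samples(num_samples);
      for (int64_t i = 0; i < num_samples; ++i) {
        const double to_find = uniform(rng) * running_total;
        auto it = std::upper_bound(cdf.begin(), cdf.end(), to_find);
        samples[i] = std::distance(cdf.begin(), it);
      }
      return samples;
    }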
@@ -148,9 +155,9 @@ class MultinomialOp : public OpKernel {
     for (int i = 0; i < 2; i++) {
       const int64 dim = logits_t.dim_size(i);
       OP_REQUIRES(ctx, static_cast<int>(dim) == dim,
-                  errors::InvalidArgument("logits.shape = ",
-                                          logits_t.shape().DebugString(),
-                                          " too large for int"));
+                  errors::InvalidArgument(
+                      "logits.shape = ", logits_t.shape().DebugString(),
+                      " too large for int"));
     }
     const int batch_size = static_cast<int>(logits_t.dim_size(0));
     const int num_classes = static_cast<int>(logits_t.dim_size(1));
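
The dimension check in the last hunk guards the int64-to-int narrowing; a standalone equivalent of the cast-and-compare idiom (hypothetical helper, throwing where the kernel uses OP_REQUIRES):

    #include <cstdint>
    #include <stdexcept>
    #include <string>

    // Hypothetical standalone equivalent of the OP_REQUIRES guard above:
    // round-tripping through int detects int64 values that overflow it.
    int NarrowToIntOrThrow(int64_t dim) {
      if (static_cast<int>(dim) != dim) {
        throw std::invalid_argument("dim = " + std::to_string(dim) +
                                    " too large for int");
      }
      return static_cast<int>(dim);
    }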