mirror of
https://github.com/zebrajr/tensorflow.git
synced 2025-12-07 00:20:20 +01:00
Speed up multinomial_op on CPU by using a vectorized Eigen expression and avoiding unnecessary casts.
Benchmark with AVX+FMA enabled:

Run on <redacted> (12 X 3492 MHz CPUs); 2017-06-05T12:54:07.881672447-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB

Benchmark                           Base (ns)     New (ns)  Improvement
------------------------------------------------------------------
BM_Multinomial_cpu_1_10000_4           250817       172953       +31.0%
BM_Multinomial_cpu_1_10000_128         273834       187552       +31.5%
BM_Multinomial_cpu_1_10000_10000      1174175      1130778        +3.7%
BM_Multinomial_cpu_1_100000_4         2040741      1276761       +37.4%
BM_Multinomial_cpu_32_10000_4        10221765      4498666       +56.0%
BM_Multinomial_cpu_32_10000_128      10638159      4994754       +53.0%
BM_Multinomial_cpu_32_100000_4      100790019     44193314       +56.2%
BM_Multinomial_cpu_128_100000_1     431269640    182506078       +57.7%

PiperOrigin-RevId: 158061480
This commit is contained in:
parent
515b3ac677
commit
bee26215c9
|
|
@ -3602,6 +3602,7 @@ tf_kernel_library(
|
|||
"//tensorflow/core:framework",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:lib_internal",
|
||||
"//third_party/eigen3",
|
||||
],
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ limitations under the License.
|
|||
#include <cmath>
|
||||
#include <memory>
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "tensorflow/core/framework/op_kernel.h"
|
||||
#include "tensorflow/core/framework/register_types.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
|
|
@ -67,7 +68,7 @@ struct MultinomialFunctor<CPUDevice, T> {
|
|||
//
|
||||
// This takes O(BatchSize * NumSamples * log(NumClasses) + NumClasses) CPU
|
||||
// time.
|
||||
auto DoWork = [num_samples, num_classes, &gen, &output, &logits](
|
||||
auto DoWork = [ctx, num_samples, num_classes, &gen, &output, &logits](
|
||||
int64 start_row, int64 limit_row) {
|
||||
// Capturing "gen" by-value would only make a copy for the _shared_
|
||||
// lambda. Since we want to let each worker have its own copy, we pass
|
||||
|
|
@ -78,15 +79,18 @@ struct MultinomialFunctor<CPUDevice, T> {
|
|||
gen_copy.Skip(start_row * (num_samples + 3) / 4);
|
||||
random::SimplePhilox simple_philox(&gen_copy);
|
||||
|
||||
std::vector<double> cdf(num_classes);
|
||||
|
||||
Tensor cdf_tensor;
|
||||
OP_REQUIRES_OK(ctx,
|
||||
ctx->allocate_temp(DT_DOUBLE, TensorShape({num_classes}),
|
||||
&cdf_tensor));
|
||||
auto cdf = cdf_tensor.flat<double>();
|
||||
for (int64 b = start_row; b < limit_row; ++b) {
|
||||
const auto* logits_row = &logits(b, 0);
|
||||
|
||||
// Takes an along-class maximum (for numerical stability).
|
||||
T max = std::numeric_limits<T>::lowest();
|
||||
for (int64 j = 0; j < num_classes; ++j) {
|
||||
if (std::isfinite(static_cast<double>(logits_row[j]))) {
|
||||
if (Eigen::numext::isfinite(logits_row[j])) {
|
||||
max = std::max(max, logits_row[j]);
|
||||
}
|
||||
}
|
||||
|
|
@ -94,19 +98,22 @@ struct MultinomialFunctor<CPUDevice, T> {
|
|||
|
||||
// Precompute cumulative probability distribution across classes.
|
||||
// Note: This isn't normalized.
|
||||
cdf = (logits.template chip<0>(b).template cast<double>() - max_logit)
|
||||
.exp();
|
||||
double running_total = 0;
|
||||
for (int64 j = 0; j < num_classes; ++j) {
|
||||
if (std::isfinite(static_cast<double>(logits_row[j]))) {
|
||||
running_total +=
|
||||
std::exp(static_cast<double>(logits_row[j]) - max_logit);
|
||||
if (Eigen::numext::isfinite(logits_row[j])) {
|
||||
running_total += cdf(j);
|
||||
}
|
||||
cdf[j] = running_total;
|
||||
cdf(j) = running_total;
|
||||
}
|
||||
// Generate each sample.
|
||||
const double* cdf_begin = cdf.data();
|
||||
const double* cdf_end = cdf.data() + num_classes;
|
||||
for (int64 j = 0; j < num_samples; ++j) {
|
||||
double to_find = simple_philox.RandDouble() * running_total;
|
||||
auto found_iter = std::upper_bound(cdf.begin(), cdf.end(), to_find);
|
||||
output(b, j) = std::distance(cdf.begin(), found_iter);
|
||||
const double to_find = simple_philox.RandDouble() * running_total;
|
||||
auto found_iter = std::upper_bound(cdf_begin, cdf_end, to_find);
|
||||
output(b, j) = std::distance(cdf_begin, found_iter);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
@ -148,8 +155,8 @@ class MultinomialOp : public OpKernel {
|
|||
for (int i = 0; i < 2; i++) {
|
||||
const int64 dim = logits_t.dim_size(i);
|
||||
OP_REQUIRES(ctx, static_cast<int>(dim) == dim,
|
||||
errors::InvalidArgument("logits.shape = ",
|
||||
logits_t.shape().DebugString(),
|
||||
errors::InvalidArgument(
|
||||
"logits.shape = ", logits_t.shape().DebugString(),
|
||||
" too large for int"));
|
||||
}
|
||||
const int batch_size = static_cast<int>(logits_t.dim_size(0));
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user