pytorch/test/cpp/tensorexpr/test_approx.cpp
Bert Maher 2e35fe9535 [te] Implement log approximation using the VML approach (#51752)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51752

Using a straight power series approximation with enough terms gives
precision down to the denormal range, and avoids the fp division used in the
sleef approach.  This is nice because recent CPUs have dual pipelined fma units,
so we can compute 16 logarithms in parallel; whereas there's usually only one
FP divider and it has a fairly high latency/low throughput.
ghstack-source-id: 121392347

Test Plan:
On my avx2+fma broadwell:
```
---------------------------------------------------------------------------
Benchmark                    Time           CPU Iterations UserCounters...
---------------------------------------------------------------------------
log_nnc_sleef/64           178 ns        178 ns    3933565 log/s=358.993M/s
log_nnc_sleef/512         1286 ns       1285 ns     559459 log/s=398.354M/s
log_nnc_sleef/8192       19366 ns      19364 ns      36619 log/s=423.053M/s
log_nnc_sleef/32768      79288 ns      79286 ns       8718 log/s=413.287M/s

log_nnc_fast/64             92 ns         92 ns    7644990 log/s=696.939M/s
log_nnc_fast/512           483 ns        483 ns    1426802 log/s=1059.49M/s
log_nnc_fast/8192         7519 ns       7514 ns      95319 log/s=1090.23M/s
log_nnc_fast/32768       31344 ns      31338 ns      22397 log/s=1045.62M/s

log_nnc_vml/64              88 ns         88 ns    7923812 log/s=728.469M/s
log_nnc_vml/512            454 ns        454 ns    1521437 log/s=1.12739G/s
log_nnc_vml/8192          6763 ns       6763 ns     103264 log/s=1.21136G/s
log_nnc_vml/32768        26565 ns      26564 ns      23609 log/s=1.23354G/s

log_aten/64                418 ns        418 ns    1651401 log/s=153.117M/s
log_aten/512               801 ns        801 ns     875857 log/s=638.923M/s
log_aten/8192             6877 ns       6872 ns     100840 log/s=1.19208G/s
log_aten/32768           26989 ns      26988 ns      26268 log/s=1.21416G/s
```

Reviewed By: bwasti, zheng-xq

Differential Revision: D26246400

fbshipit-source-id: dae47ee6baeab1a813ec4d4440748164051aed3d
2021-02-10 02:09:10 -08:00

80 lines
2.3 KiB
C++

#ifdef TORCH_ENABLE_LLVM
#include <gtest/gtest.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
#include <cstring>
namespace te = torch::jit::tensorexpr;
// Split the outermost loop of `target` into a main loop of `width`-sized
// chunks plus a scalar tail, then vectorize the chunked inner loop.
static void vectorize(te::LoopNest* ln, te::Tensor* target, int width) {
  te::For* outer = nullptr;
  te::For* inner = nullptr;
  te::For* tail = nullptr;
  auto loops = ln->getLoopStmtsFor(target);
  ln->splitWithTail(loops[0], width, &outer, &inner, &tail);
  ln->vectorize(inner);
}
// Exhaustively checks the NNC log_vml approximation against at::log.
// Normal-range inputs must match bit-for-bit; denormal inputs only need
// to be close (sleef, the reference path, isn't bit-precise there).
TEST(Approx, log_vml) {
  te::KernelScope kernel_scope;
  te::VarHandle N("N", te::kInt);
  te::Placeholder A("A", te::kFloat, {N});
  te::Tensor* B = te::Compute(
      "B", {N}, [&](const te::VarHandle& i) { return log_vml(A.load(i)); });

  te::LoopNest ln({B});
  ln.prepareForCodegen();
  vectorize(&ln, B, 8);
  te::Stmt* stmt = te::IRSimplifier::simplify(ln.root_stmt());
  te::LLVMCodeGen cg(stmt, {A, B, N});

  // Run the compiled kernel on `input` and require the result to be
  // bit-identical to at::log.
  auto check = [&](const at::Tensor& input) {
    at::Tensor expected = at::log(input);
    at::Tensor actual = at::empty_like(input);
    cg.call({input.data_ptr<float>(), actual.data_ptr<float>(), input.numel()});
    // Results should be bit-identical.
    ASSERT_EQ(
        std::memcmp(
            expected.data_ptr<float>(),
            actual.data_ptr<float>(),
            expected.nbytes()),
        0);
  };

  // Generate every single-precision FP value in [1.0, 2.0).
  auto eps = std::numeric_limits<float>::epsilon();
  at::Tensor inputs = torch::arange(1.0f, 2.0f, eps);
  ASSERT_EQ(inputs.numel(), 1 << 23);

  check(inputs);
  // Scaling by powers of two exercises other exponent ranges while
  // reusing the exhaustive mantissa coverage above.
  check(inputs * 2.0f);
  check(inputs * 0.5f);
  check(inputs * 4.0f);
  check(inputs * 0.25f);
  check(inputs * powf(2.0f, 16));
  check(inputs * powf(2.0f, -16));
  check(inputs * powf(2.0f, 126));
  check(inputs * powf(2.0f, -126));
  // Special values.
  check(torch::full({32}, INFINITY));
  check(torch::full({32}, NAN));

  // Denormals aren't bit precise, because sleef isn't bit-precise either.
  auto min = std::numeric_limits<float>::min();
  auto denorm_min = std::numeric_limits<float>::denorm_min();
  inputs = torch::arange(0.0f, min, denorm_min);
  ASSERT_EQ(inputs.numel(), 1 << 23);
  auto expected = at::log(inputs);
  auto actual = at::empty_like(expected);
  cg.call({inputs.data_ptr<float>(), actual.data_ptr<float>(), inputs.numel()});
  ASSERT_TRUE(torch::allclose(actual, expected));
}
#endif // TORCH_ENABLE_LLVM