Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51491

The vectorizer heuristic is pretty dumb and only kicks in if the unroll factor is exactly 8 or 4. It's still slower than the direct implementation, which isn't surprising.

ghstack-source-id: 120783426

Test Plan: `buck run mode/opt //caffe2/benchmarks/cpp/tensorexpr:tensorexpr_bench`

Before:
```
---------------------------------------------------------------------------
Benchmark                   Time           CPU   Iterations UserCounters...
---------------------------------------------------------------------------
log_nnc_sleef/64          438 ns        438 ns      1795511 log/s=146.259M/s
log_nnc_sleef/512        3196 ns       3195 ns       210032 log/s=160.235M/s
log_nnc_sleef/8192      77467 ns      77466 ns         8859 log/s=105.749M/s
log_nnc_sleef/32768    310206 ns     310202 ns         2170 log/s=105.634M/s
log_nnc_fast/64           100 ns        100 ns      7281074 log/s=637.144M/s
log_nnc_fast/512          546 ns        546 ns      1335816 log/s=938.361M/s
log_nnc_fast/8192        7360 ns       7359 ns        91971 log/s=1.11316G/s
log_nnc_fast/32768      30793 ns      30792 ns        22633 log/s=1064.17M/s
log_aten/64               427 ns        427 ns      1634897 log/s=150.021M/s
log_aten/512              796 ns        796 ns       877318 log/s=643.566M/s
log_aten/8192            6690 ns       6690 ns       102649 log/s=1.22452G/s
log_aten/32768          25357 ns      25350 ns        27808 log/s=1.29263G/s
```

After:
```
---------------------------------------------------------------------------
Benchmark                   Time           CPU   Iterations UserCounters...
---------------------------------------------------------------------------
log_nnc_sleef/64          189 ns        188 ns      3872475 log/s=340.585M/s
log_nnc_sleef/512        1307 ns       1307 ns       557770 log/s=391.709M/s
log_nnc_sleef/8192      20259 ns      20257 ns        34240 log/s=404.404M/s
log_nnc_sleef/32768     81556 ns      81470 ns         8767 log/s=402.209M/s
log_nnc_fast/64           110 ns        110 ns      6564558 log/s=581.116M/s
log_nnc_fast/512          554 ns        554 ns      1279304 log/s=923.376M/s
log_nnc_fast/8192        7774 ns       7774 ns        91421 log/s=1053.75M/s
log_nnc_fast/32768      31008 ns      31006 ns        21279 log/s=1056.83M/s
```

Reviewed By: bwasti

Differential Revision: D26139067

fbshipit-source-id: db31897ee9922695ff9dff4ff46e3d3fbd61f4c2
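For a rough sense of the gain, here is a minimal standalone sketch (not part of this benchmark file; the throughput constants are simply copied from the before/after tables above) that computes the speedups implied on the sleef path:

```cpp
// Back-of-the-envelope speedup check for log_nnc_sleef; the throughputs are
// the M-elements/s counters copied from the before/after tables above.
#include <cstdio>

int main() {
  const int sizes[] = {64, 512, 8192, 32768};
  const double before[] = {146.259, 160.235, 105.749, 105.634};
  const double after[] = {340.585, 391.709, 404.404, 402.209};
  for (int i = 0; i < 4; ++i) {
    // Prints roughly 2.33x, 2.44x, 3.82x, 3.81x.
    std::printf("N=%d: %.2fx\n", sizes[i], after[i] / before[i]);
  }
  return 0;
}
```

That is roughly a 2.3-2.4x gain at the small sizes and about 3.8x at 8192 and 32768 elements, while the log_nnc_fast numbers stay essentially unchanged.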
326 lines · 11 KiB · C++
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/torch.h>
#include "caffe2/operators/tanh_op.h"
#include "caffe2/operators/logit_op.h"

using namespace torch::jit;
using namespace torch::jit::tensorexpr;

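// Split the target loop by 8 and vectorize the inner loop so the log() call
// can be lowered to the vectorized (sleef-backed) math routines.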
void optimizeSleef(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target) {
  auto loops = ln->getLoopStmtsFor(target);
  For *outer, *inner, *tail;
  ln->splitWithTail(loops[0], 8, &outer, &inner, &tail);
  ln->vectorize(inner);
}

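// Split the target loop into 128-element chunks, vectorize each chunk, then
// split the chunk loop by 8 and unroll the resulting inner loop.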
void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target) {
  std::vector<For*> loops = ln->getLoopStmtsFor(target);
  For *outer, *inner, *tail;
  ln->splitWithTail(loops[0], 16 * 8, &outer, &inner, &tail);
  ln->vectorize(inner);
  ln->splitWithTail(outer, 8, &outer, &inner, &tail);
  Stmt* unrolled;
  LoopNest::unroll(inner, &unrolled);
}

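// Each *_nnc_* benchmark below follows the same pattern: build a 1-D
// tensorexpr kernel, lower it through LLVMCodeGen, validate the output once
// against the corresponding ATen op, then time repeated cg.call()s.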
static void log_nnc_sleef(benchmark::State& state) {
  KernelScope ks;
  auto N = VarHandle("N", kInt);
  Placeholder A("A", kFloat, {N});
  torch::jit::tensorexpr::Tensor* B =
      Compute("B", {N}, [&](const VarHandle& i) {
        return log(A.load(i));
      });
  LoopNest ln({B});
  ln.prepareForCodegen();
  optimizeSleef(&ln, B);
  Stmt* s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::log(A_t);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(B_t, B_ref));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["log/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
}

static void log_nnc_fast(benchmark::State& state) {
  KernelScope ks;
  auto N = VarHandle("N", kInt);
  Placeholder A("A", kFloat, {N});
  torch::jit::tensorexpr::Tensor* B =
      Compute("B", {N}, [&](const VarHandle& i) {
        return fast_log(A.load(i));
      });
  LoopNest ln({B});
  optimizePointwise(&ln, B);
  ln.prepareForCodegen();
  Stmt* s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::log(A_t);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(B_t, B_ref));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["log/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
}

static void log_aten(benchmark::State& state) {
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  for (auto _ : state) {
    at::native::log_out(B_t, A_t);
  }
  state.counters["log/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
}

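// The logit benchmarks clamp the input to [eps, 1 - eps] with CompareSelect
// before computing log(x / (1 - x)); the result is compared against at::logit
// through nan_to_num to tolerate non-finite values near the clamp boundary.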
static void logit_nnc_sleef(benchmark::State& state) {
  KernelScope ks;
  auto N = VarHandle("N", kInt);
  Placeholder A("A", kFloat, {N});
  auto clamp = 1e-6f;
  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
    auto A_elem = [&]() {
      auto elem = A.load(i);
      auto min = FloatImm::make(clamp);
      auto max = FloatImm::make(1.0f - clamp);
      elem = CompareSelect::make(elem, min, min, elem, kLT);
      return CompareSelect::make(elem, max, max, elem, kGT);
    }();
    return log(A_elem / (FloatImm::make(1.0f) - A_elem));
  });
  LoopNest ln({B});
  ln.prepareForCodegen();
  optimizePointwise(&ln, B);
  Stmt* s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::logit(A_t, clamp);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["logit/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
}

static void logit_nnc_fast(benchmark::State& state) {
  KernelScope ks;
  auto N = VarHandle("N", kInt);
  Placeholder A("A", kFloat, {N});
  auto clamp = 1e-6f;
  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
    auto A_elem = [&]() {
      auto elem = A.load(i);
      auto min = FloatImm::make(clamp);
      auto max = FloatImm::make(1.0f - clamp);
      elem = CompareSelect::make(elem, min, min, elem, kLT);
      return CompareSelect::make(elem, max, max, elem, kGT);
    }();
    return fast_log(A_elem / (FloatImm::make(1.0f) - A_elem));
  });
  LoopNest ln({B});
  ln.prepareForCodegen();
  optimizePointwise(&ln, B);
  Stmt* s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::logit(A_t, clamp);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["logit/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
}

static void logit_aten(benchmark::State& state) {
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto clamp = 1e-6f;
  for (auto _ : state) {
    at::native::logit_out(B_t, A_t, clamp);
  }
  state.counters["logit/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
}

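// Eigen-based logit mirroring the caffe2 Logit CPU operator: clamp to
// [eps, 1 - eps], then compute log(x / (1 - x)).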
template <typename T>
void logit_caffe2_impl(int size, const T* X, T* Y, float eps_ = 1e-6f) {
  using namespace caffe2;
  ConstEigenVectorMap<T> X_vec(X, size);
  EigenVectorMap<T> Y_vec(Y, size);
  Y_vec = X_vec.array().min(static_cast<T>(1.0f - eps_));
  Y_vec = Y_vec.array().max(eps_);
  Y_vec = (Y_vec.array() / (T(1) - Y_vec.array())).log();
}

static void logit_caffe2(benchmark::State& state) {
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  at::Tensor B_ref = torch::randn({state.range(0)});
  auto N = state.range(0);
  auto X = A_t.data_ptr<float>();
  auto Y = B_t.data_ptr<float>();
  auto clamp = 1e-6f;
  at::native::logit_out(B_ref, A_t, clamp);
  logit_caffe2_impl(N, X, Y, clamp);
  TORCH_CHECK(at::allclose(at::nan_to_num(B_t), at::nan_to_num(B_ref)));

  for (auto _ : state) {
    logit_caffe2_impl(N, X, Y, clamp);
  }

  state.counters["logit/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
}

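// The tanh benchmarks compare NNC's fast_tanh approximation against at::tanh
// and caffe2's TanhFunctor; the correctness checks use a loose tolerance
// (rtol=1e-3, atol=1e-6) since fast_tanh is approximate.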
static void tanh_nnc_fast(benchmark::State& state) {
  KernelScope ks;
  auto N = VarHandle("N", kInt);
  Placeholder A("A", kFloat, {N});
  torch::jit::tensorexpr::Tensor* B =
      Compute("B", {N}, [&](const VarHandle& i) {
        return fast_tanh(A.load(i));
      });
  LoopNest ln({B});
  optimizePointwise(&ln, B);
  ln.prepareForCodegen();
  Stmt* s = ln.root_stmt();
  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
  std::vector<CodeGen::BufferArg> args;
  args.emplace_back(B);
  args.emplace_back(A);
  args.emplace_back(N);
  LLVMCodeGen cg(s, args);
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  auto B_ref = at::tanh(A_t);
  cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  TORCH_CHECK(at::allclose(B_t, B_ref, 1e-3f, 1e-6f));
  for (auto _ : state) {
    cg.call({B_t.data_ptr<float>(), A_t.data_ptr<float>(), state.range(0)});
  }
  state.counters["tanh/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
}

static void tanh_aten(benchmark::State& state) {
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  for (auto _ : state) {
    at::native::tanh_out(B_t, A_t);
  }
  state.counters["tanh/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
}

static void tanh_caffe2(benchmark::State& state) {
  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
  at::Tensor B_t = torch::randn({state.range(0)});
  at::Tensor B_ref = torch::randn({state.range(0)});

  auto N = state.range(0);
  auto X = A_t.data_ptr<float>();
  auto Y = B_t.data_ptr<float>();
  caffe2::CPUContext c;
  auto tanh = caffe2::TanhFunctor<caffe2::CPUContext>();
  at::native::tanh_out(B_ref, A_t);
  tanh(N, X, Y, &c);
  TORCH_CHECK(at::native::allclose(B_t, B_ref, 1e-3f, 1e-6f));

  for (auto _ : state) {
    tanh(N, X, Y, &c);
  }
  state.counters["tanh/s"] = benchmark::Counter(
      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
}

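// Register each benchmark at problem sizes 2<<5 = 64, 2<<8 = 512,
// 2<<12 = 8192, and 2<<14 = 32768.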
BENCHMARK(log_nnc_sleef)
    ->Args({2<<5})
    ->Args({2<<8})
    ->Args({2<<12})
    ->Args({2<<14});
BENCHMARK(log_nnc_fast)
    ->Args({2<<5})
    ->Args({2<<8})
    ->Args({2<<12})
    ->Args({2<<14});
BENCHMARK(log_aten)
    ->Args({2<<5})
    ->Args({2<<8})
    ->Args({2<<12})
    ->Args({2<<14});
BENCHMARK(logit_nnc_sleef)
    ->Args({2<<5})
    ->Args({2<<8})
    ->Args({2<<12})
    ->Args({2<<14});
BENCHMARK(logit_nnc_fast)
    ->Args({2<<5})
    ->Args({2<<8})
    ->Args({2<<12})
    ->Args({2<<14});
BENCHMARK(logit_aten)
    ->Args({2<<5})
    ->Args({2<<8})
    ->Args({2<<12})
    ->Args({2<<14});
BENCHMARK(logit_caffe2)
    ->Args({2<<5})
    ->Args({2<<8})
    ->Args({2<<12})
    ->Args({2<<14});
BENCHMARK(tanh_nnc_fast)
    ->Args({2<<5})
    ->Args({2<<8})
    ->Args({2<<12})
    ->Args({2<<14});
BENCHMARK(tanh_aten)
    ->Args({2<<5})
    ->Args({2<<8})
    ->Args({2<<12})
    ->Args({2<<14});
BENCHMARK(tanh_caffe2)
    ->Args({2<<5})
    ->Args({2<<8})
    ->Args({2<<12})
    ->Args({2<<14});