pytorch/benchmarks/cpp/tensorexpr/bench_compile.cpp
Bert Maher b7261de0df [pytorch][te] Add compilation time benchmark (#46124)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46124

We want to make sure we can actually fuse kernels within a fairly
tight time budget.  So here's a quick benchmark of codegen for a simple
pointwise activation function (swish).  I kept all the intermediate tensors
separate to force TE to actually do inlining.
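
For reference, the five stages below compose into a single pointwise function; here's a plain scalar sketch of what the fused kernel computes (the helper name and stage comments are just my annotations, not part of the benchmark):
```
#include <algorithm>

// Scalar view of the kernel the benchmark builds; each local variable
// mirrors one of the intermediate tensors kept separate below.
inline float swish_like(float x) {
  float relu = std::max(x, 0.f);     // "relu" tensor
  float min6 = std::min(relu, 6.f);  // "min6" tensor
  float plus3 = min6 + 3.f;          // "plus3" tensor
  float times = x * plus3;           // "times" tensor
  return times * 1.f / 6.f;          // "sixth" tensor
}
```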

Test Plan:
```
buck run mode/opt //caffe2/benchmarks/cpp/tensorexpr:tensorexpr_bench
```
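
To time just these two cases, Google Benchmark's standard filter flag should work (assuming the binary forwards its arguments to the benchmark runner):
```
buck run mode/opt //caffe2/benchmarks/cpp/tensorexpr:tensorexpr_bench -- --benchmark_filter=BM_CompileSwish
```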

I've only run this in debug mode, so the results aren't super meaningful, but even in
that mode compilation takes about 18ms, roughly 15ms of which is spent in LLVM.

Update, opt build mode:
```
----------------------------------------------------------------------------
Benchmark                                     Time           CPU Iterations
----------------------------------------------------------------------------
BM_CompileSwish                         5123276 ns    5119846 ns        148
BM_CompileSwishLLVMOnly                 4754361 ns    4753701 ns        160
```

Reviewed By: asuhan

Differential Revision: D24232801

fbshipit-source-id: d58a8b7f79bcd9244c49366af7a693e09f24bf76
2020-10-09 23:11:37 -07:00


#include <benchmark/benchmark.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>

#ifdef TORCH_ENABLE_LLVM
namespace te = torch::jit::tensorexpr;

// Times the full TE compilation pipeline for the swish-like pointwise kernel:
// IR construction, inlining, simplification, and LLVM codegen all happen
// inside the timed loop.
static void BM_CompileSwish(benchmark::State& state) {
  for (auto _ : state) {
    constexpr int N = 512;
    te::KernelScope ks;
    te::VarHandle n("n", te::kInt);
    te::Placeholder A(te::BufHandle("A", {N}, te::kFloat));
    // Each stage is a separate tensor so the LoopNest has real inlining to do.
    te::Tensor* relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
      return te::Max::make(A.load(i), 0.f, false);
    });
    te::Tensor* min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
      return te::Min::make(relu->call(i), 6.f, false);
    });
    te::Tensor* plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
      return min6->call(i) + 3.f;
    });
    te::Tensor* times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
      return A.load(i) * plus3->call(i);
    });
    te::Tensor* sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
      return times->call(i) * 1.f / 6.f;
    });
    te::LoopNest nest({sixth});
    // Inline every intermediate tensor into the final expression.
    for (auto tensor : {relu, min6, plus3, times}) {
      nest.computeInline(tensor->buf());
    }
    nest.prepareForCodegen();
    te::Stmt* s = te::IRSimplifier::simplify(nest.root_stmt());
    te::LLVMCodeGen cg(s, {A, sixth});
  }
}

// Same kernel, but only LLVM code generation is timed; IR construction,
// inlining, and simplification happen once, outside the benchmark loop.
static void BM_CompileSwishLLVMOnly(benchmark::State& state) {
  constexpr int N = 512;
  te::KernelScope ks;
  te::VarHandle n("n", te::kInt);
  te::Placeholder A(te::BufHandle("A", {N}, te::kFloat));
  te::Tensor* relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
    return te::Max::make(A.load(i), 0.f, false);
  });
  te::Tensor* min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
    return te::Min::make(relu->call(i), 6.f, false);
  });
  te::Tensor* plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
    return min6->call(i) + 3.f;
  });
  te::Tensor* times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
    return A.load(i) * plus3->call(i);
  });
  te::Tensor* sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
    return times->call(i) * 1.f / 6.f;
  });
  te::LoopNest nest({sixth});
  for (auto tensor : {relu, min6, plus3, times}) {
    nest.computeInline(tensor->buf());
  }
  nest.prepareForCodegen();
  te::Stmt* s = te::IRSimplifier::simplify(nest.root_stmt());
  for (auto _ : state) {
    te::LLVMCodeGen cg(s, {A, sixth});
  }
}
BENCHMARK(BM_CompileSwish);
BENCHMARK(BM_CompileSwishLLVMOnly);
#endif // TORCH_ENABLE_LLVM
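
The BENCHMARK macros only register the two cases; the driver comes from whatever main the benchmark target links. If you ever build this file on its own, a minimal Google Benchmark driver would look like this (a sketch; the buck target presumably supplies its own main):
```
#include <benchmark/benchmark.h>

// Minimal standalone driver for the benchmarks registered above.
int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);
  benchmark::RunSpecifiedBenchmarks();
  return 0;
}
```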