Summary: This PR adds a C++ benchmark for "concat" with 3 different versions: 1) aten::cat, 2) an NNC implementation with if-then-else, 3) an NNC implementation using multiple loops. It also adds a Python benchmark for "concat", which can now be invoked with and without CPU fusion.

Here are the results of these benchmarks on an `Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz` machine with `OMP_NUM_THREADS=1`:

```
--------------------------------------------------------------------------------------------------------------------------
Benchmark                                                           Time             CPU   Iterations UserCounters...
--------------------------------------------------------------------------------------------------------------------------
Concat2D2Input/ATen/1/160/1/14/1                                 1211 ns         1211 ns       567896 GB/s=1.14953G/s
Concat2D2Input/ATen/1/580/1/174/1                                1296 ns         1296 ns       537060 GB/s=4.65362G/s
Concat2D2Input/ATen/20/160/20/14/1                               1823 ns         1823 ns       382052 GB/s=15.2677G/s
Concat2D2Input/ATen/20/580/20/174/1                              3347 ns         3347 ns       210036 GB/s=36.0432G/s
Concat2D2Input/ATen/8/512/8/512/1                                2093 ns         2093 ns       324760 GB/s=31.3061G/s
Concat2D2Input/NNC/1/160/1/14/1                                   694 ns          694 ns      1002902 GB/s=2.00692G/s
Concat2D2Input/NNC/1/580/1/174/1                                  852 ns          852 ns       803002 GB/s=7.08127G/s
Concat2D2Input/NNC/20/160/20/14/1                                1639 ns         1639 ns       419683 GB/s=16.9828G/s
Concat2D2Input/NNC/20/580/20/174/1                               5956 ns         5956 ns       117833 GB/s=20.2548G/s
Concat2D2Input/NNC/8/512/8/512/1                                 3136 ns         3136 ns       224122 GB/s=20.8958G/s
Concat2D2Input/NNCLoop/1/160/1/14/1                               581 ns          581 ns      1209873 GB/s=2.39737G/s
Concat2D2Input/NNCLoop/1/580/1/174/1                              614 ns          614 ns      1132332 GB/s=9.82955G/s
Concat2D2Input/NNCLoop/20/160/20/14/1                            1091 ns         1091 ns       622952 GB/s=25.5247G/s
Concat2D2Input/NNCLoop/20/580/20/174/1                           2399 ns         2399 ns       288376 GB/s=50.289G/s
Concat2D2Input/NNCLoop/8/512/8/512/1                             1500 ns         1500 ns       478360 GB/s=43.6968G/s
Concat2D3Input/ATen/8/512/8/512/8/512/1                          2584 ns         2584 ns       266394 GB/s=38.0397G/s
Concat2D3Input/NNC/8/512/8/512/8/512/1                           5056 ns         5056 ns       139768 GB/s=19.4416G/s
Concat2D3Input/NNCLoop/8/512/8/512/8/512/1                       1917 ns         1917 ns       369626 GB/s=51.2758G/s
Concat2D7Input/ATen/8/128/8/256/8/384/8/512/8/512/8/512/8/512/1  3888 ns         3888 ns       178124 GB/s=46.3571G/s
Concat2D7Input/NNC/8/128/8/256/8/384/8/512/8/512/8/512/8/512/1  24639 ns        24638 ns        28336 GB/s=7.31481G/s
Concat2D7Input/NNCLoop/8/128/8/256/8/384/8/512/8/512/8/512/8/512/1  3093 ns      3093 ns       226326 GB/s=58.265G/s
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/52592
Reviewed By: bertmaher
Differential Revision: D26596701
Pulled By: navahgar
fbshipit-source-id: 650fa88febf4423ea49f5a1d3d734edc2294d257
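For intuition about versions 2 and 3, here is a minimal plain-C++ sketch of what the two NNC kernels compute for the 2-D, concat-along-columns case benchmarked below (function and parameter names here are illustrative, not part of the PR): the if-then-else version resolves, per output element, which input owns that column, while the multiple-loops version emits one contiguous copy nest per input, which is why it vectorizes and benchmarks so much better.

```cpp
#include <cstddef>
#include <vector>

// Sketch only: scalar C++ equivalents of the two NNC strategies, assuming
// row-major 2-D float inputs concatenated along dim 1 (columns).

// Version 2 (if-then-else): for each output element (m, n), walk the select
// chain to find which input owns column n, then load from it.
void cat_select(const std::vector<const float*>& ins,
                const std::vector<int>& cols, // per-input column counts
                float* out, int M, int N) {   // N == sum of cols
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      int i = 0, off = 0;
      while (n >= off + cols[i]) off += cols[i++]; // the select chain
      out[m * N + n] = ins[i][m * cols[i] + (n - off)];
    }
  }
}

// Version 3 (multiple loops): one copy loop nest per input, writing into the
// output at a fixed column offset; the inner loop is contiguous.
void cat_loops(const std::vector<const float*>& ins,
               const std::vector<int>& cols,
               float* out, int M, int N) {
  int off = 0;
  for (std::size_t i = 0; i < ins.size(); ++i) {
    for (int m = 0; m < M; ++m)
      for (int n = 0; n < cols[i]; ++n)
        out[m * N + off + n] = ins[i][m * cols[i] + n];
    off += cols[i];
  }
}
```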
294 lines | 8.5 KiB | C++
#include <benchmark/benchmark.h>

#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>

using namespace torch::jit::tensorexpr;

namespace {

class ConcatBench : public benchmark::Fixture {
 public:
  // Builds the inputs, the expected output size, and a reference result
  // (via at::cat) that TearDown uses to validate each benchmarked kernel.
  // The parameter is taken by value (not const) so the std::move below
  // actually moves instead of silently copying.
  void init(std::vector<std::vector<int>> input_sizes, int concat_dim) {
    input_sizes_ = std::move(input_sizes);
    concat_dim_ = concat_dim;
    inputs_.resize(input_sizes_.size());
    for (size_t i = 0; i < input_sizes_.size(); ++i) {
      inputs_[i] = torch::ones({input_sizes_[i][0], input_sizes_[i][1]});
    }
    output_size_.resize(input_sizes_.front().size());
    for (size_t i = 0; i < output_size_.size(); ++i) {
      if (i == static_cast<size_t>(concat_dim_)) {
        output_size_[i] = 0;
        for (size_t j = 0; j < input_sizes_.size(); ++j) {
          output_size_[i] += input_sizes_[j][i];
        }
      } else {
        output_size_[i] = input_sizes_.front()[i];
      }
    }
    ref_ = at::cat(inputs_, concat_dim_);
    output_ = at::empty_like(ref_);
  }

  void TearDown(benchmark::State& state) override {
    TORCH_CHECK(at::allclose(ref_, output_));
    // Each iteration reads output_.nbytes() bytes from the inputs and writes
    // the same amount to the output, hence the factor of 2 in the rate.
    state.counters["GB/s"] = benchmark::Counter(
        uint64_t(state.iterations()) * 2 * output_.nbytes(),
        benchmark::Counter::kIsRate);
  }

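  // Worked example against the numbers in the summary above: for the
  // Concat2D2Input 8/512/8/512 case the output is 8 x 1024 floats =
  // 32768 bytes, so one iteration counts 2 * 32768 = 65536 bytes. At the
  // measured 1500 ns per NNCLoop iteration, that is
  // 65536 B / 1.5e-6 s ~= 43.7 GB/s, matching the reported 43.6968G/s.
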
  // Version 1: the baseline, dispatching to aten::cat.
  void runATen(benchmark::State& state) {
    for (auto _ : state) {
      output_ = at::cat(inputs_, concat_dim_);
    }
  }

  // Version 2: a single NNC Compute expression that picks the right input
  // for every output element via a chain of if-then-else selects.
  void runNNC(benchmark::State& state) {
    KernelScope ks;

    size_t num_inputs = inputs_.size();

    std::vector<Placeholder> inputs;
    for (size_t i = 0; i < num_inputs; ++i) {
      inputs.emplace_back(Placeholder(
          "input" + std::to_string(i),
          kFloat,
          {input_sizes_[i][0], input_sizes_[i][1]}));
    }

    Tensor* output = Compute(
        "aten_cat",
        {{output_size_[0], "M"}, {output_size_[1], "N"}},
        [&](const VarHandle& m, const VarHandle& n) {
          // Starting offset of each input along the concatenation dim.
          int d = 0;
          std::vector<int> cumulative_concat_dim_sizes(num_inputs);
          for (size_t i = 0; i < num_inputs; ++i) {
            cumulative_concat_dim_sizes[i] = d;
            d += input_sizes_[i][concat_dim_];
          }
          // Build the select chain from the last input backwards.
          auto load =
              inputs.back().load(m, n - cumulative_concat_dim_sizes.back());
          for (size_t i = num_inputs - 1; i > 0; --i) {
            load = ifThenElse(
                CompareSelect::make(
                    n, IntImm::make(cumulative_concat_dim_sizes[i]), kLT),
                inputs[i - 1].load(m, n - cumulative_concat_dim_sizes[i - 1]),
                load);
          }
          return load;
        });
    LoopNest nest({output});
    nest.prepareForCodegen();
    Stmt* s = IRSimplifier::simplify(nest.root_stmt());
    std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
    buf_args.push_back(output);
    LLVMCodeGen cg(s, buf_args);

    std::vector<CodeGen::CallArg> call_args;
    for (auto _ : state) {
      output_ = at::empty_like(ref_);
      call_args.clear();
      for (const auto& inp : inputs_) {
        call_args.push_back(inp.data_ptr<float>());
      }
      call_args.push_back(output_.data_ptr<float>());
      cg.call(call_args);
    }
  }

  // Version 3: one explicit loop nest per input, each copying its input into
  // the corresponding column slice of the output buffer.
  void runNNCLoop(benchmark::State& state) {
    KernelScope ks;

    size_t num_inputs = inputs_.size();
    size_t num_dims = 2;

    TORCH_INTERNAL_ASSERT(concat_dim_ == 1);

    auto output_buf = new Buf(
        new Var("aten_cat", kHandle),
        {new IntImm(output_size_[0]), new IntImm(output_size_[1])},
        kFloat);

    std::vector<Placeholder> inputs;
    std::vector<Stmt*> for_stmts(num_inputs);
    int cumulative_input_sizes = 0;
    for (size_t i = 0; i < num_inputs; ++i) {
      inputs.emplace_back(Placeholder(
          "input" + std::to_string(i),
          kFloat,
          {input_sizes_[i][0], input_sizes_[i][1]}));
      // One loop variable per dimension (the original sized this vector by
      // num_inputs, which was benign but incorrect).
      std::vector<Var*> for_vars(num_dims);
      for (size_t d = 0; d < num_dims; ++d) {
        for_vars[d] =
            new Var("i" + std::to_string(i) + "_" + std::to_string(d), kInt);
      }
      // Write input i into the output at column offset
      // cumulative_input_sizes.
      auto store = new Store(
          output_buf,
          {for_vars[0],
           new Add(for_vars[1], new IntImm(cumulative_input_sizes))},
          new Load(inputs[i].data(), {for_vars[0], for_vars[1]}, new IntImm(1)),
          new IntImm(1));
      auto for_st = new For(
          for_vars[0],
          new IntImm(0),
          new IntImm(input_sizes_[i][0]),
          new For(
              for_vars[1],
              new IntImm(0),
              new IntImm(input_sizes_[i][1]),
              store));
      for_stmts[i] = for_st;
      cumulative_input_sizes += input_sizes_[i][1];
    }
    auto output = new Tensor(output_buf, new Block(for_stmts));

    LoopNest nest({output});
    nest.prepareForCodegen();
    nest.vectorizeInnerLoops();
    Stmt* s = IRSimplifier::simplify(nest.root_stmt());
    std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
    buf_args.push_back(output);
    LLVMCodeGen cg(s, buf_args);

    std::vector<CodeGen::CallArg> call_args;
    for (auto _ : state) {
      output_ = at::empty_like(ref_);
      call_args.clear();
      for (const auto& inp : inputs_) {
        call_args.push_back(inp.data_ptr<float>());
      }
      call_args.push_back(output_.data_ptr<float>());
      cg.call(call_args);
    }
  }

  std::vector<std::vector<int>> input_sizes_;
  int concat_dim_;
  std::vector<at::Tensor> inputs_;
  std::vector<int> output_size_;
  at::Tensor output_;
  at::Tensor ref_;
};

class Concat2D2Input : public ConcatBench {
 public:
  // Args: {M0, N0, M1, N1, concat_dim}.
  void SetUp(const benchmark::State& state) override {
    init(
        {{state.range(0), state.range(1)}, {state.range(2), state.range(3)}},
        state.range(4));
  }
};

} // namespace

BENCHMARK_DEFINE_F(Concat2D2Input, ATen)(benchmark::State& state) {
  runATen(state);
}

BENCHMARK_DEFINE_F(Concat2D2Input, NNC)(benchmark::State& state) {
  runNNC(state);
}

BENCHMARK_DEFINE_F(Concat2D2Input, NNCLoop)(benchmark::State& state) {
  runNNCLoop(state);
}

BENCHMARK_REGISTER_F(Concat2D2Input, ATen)
    ->Args({1, 160, 1, 14, 1})
    ->Args({1, 580, 1, 174, 1})
    ->Args({20, 160, 20, 14, 1})
    ->Args({20, 580, 20, 174, 1})
    ->Args({8, 512, 8, 512, 1});

BENCHMARK_REGISTER_F(Concat2D2Input, NNC)
    ->Args({1, 160, 1, 14, 1})
    ->Args({1, 580, 1, 174, 1})
    ->Args({20, 160, 20, 14, 1})
    ->Args({20, 580, 20, 174, 1})
    ->Args({8, 512, 8, 512, 1});

BENCHMARK_REGISTER_F(Concat2D2Input, NNCLoop)
    ->Args({1, 160, 1, 14, 1})
    ->Args({1, 580, 1, 174, 1})
    ->Args({20, 160, 20, 14, 1})
    ->Args({20, 580, 20, 174, 1})
    ->Args({8, 512, 8, 512, 1});

namespace {

class Concat2D3Input : public ConcatBench {
 public:
  // Args: {M0, N0, M1, N1, M2, N2, concat_dim}.
  void SetUp(const benchmark::State& state) override {
    init(
        {{state.range(0), state.range(1)},
         {state.range(2), state.range(3)},
         {state.range(4), state.range(5)}},
        state.range(6));
  }
};

} // namespace

BENCHMARK_DEFINE_F(Concat2D3Input, ATen)(benchmark::State& state) {
  runATen(state);
}

BENCHMARK_DEFINE_F(Concat2D3Input, NNC)(benchmark::State& state) {
  runNNC(state);
}

BENCHMARK_DEFINE_F(Concat2D3Input, NNCLoop)(benchmark::State& state) {
  runNNCLoop(state);
}

BENCHMARK_REGISTER_F(Concat2D3Input, ATen)->Args({8, 512, 8, 512, 8, 512, 1});

BENCHMARK_REGISTER_F(Concat2D3Input, NNC)->Args({8, 512, 8, 512, 8, 512, 1});

BENCHMARK_REGISTER_F(Concat2D3Input, NNCLoop)
    ->Args({8, 512, 8, 512, 8, 512, 1});

namespace {

class Concat2D7Input : public ConcatBench {
 public:
  // Args: {M0, N0, ..., M6, N6, concat_dim}.
  void SetUp(const benchmark::State& state) override {
    init(
        {{state.range(0), state.range(1)},
         {state.range(2), state.range(3)},
         {state.range(4), state.range(5)},
         {state.range(6), state.range(7)},
         {state.range(8), state.range(9)},
         {state.range(10), state.range(11)},
         {state.range(12), state.range(13)}},
        state.range(14));
  }
};

} // namespace

BENCHMARK_DEFINE_F(Concat2D7Input, ATen)(benchmark::State& state) {
  runATen(state);
}

BENCHMARK_DEFINE_F(Concat2D7Input, NNC)(benchmark::State& state) {
  runNNC(state);
}

BENCHMARK_DEFINE_F(Concat2D7Input, NNCLoop)(benchmark::State& state) {
  runNNCLoop(state);
}

BENCHMARK_REGISTER_F(Concat2D7Input, ATen)
    ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});

BENCHMARK_REGISTER_F(Concat2D7Input, NNC)
    ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});

BENCHMARK_REGISTER_F(Concat2D7Input, NNCLoop)
    ->Args({8, 128, 8, 256, 8, 384, 8, 512, 8, 512, 8, 512, 8, 512, 1});
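Note that this file only defines and registers the fixtures; it has no main(), and in the PyTorch tree it is linked against the benchmark suite's own driver. If you wanted to build it standalone, a minimal driver sketch (assuming only stock Google Benchmark, LibTorch, and the NNC headers are available) might look like:

```cpp
// Hypothetical standalone driver; the PR itself relies on the existing
// benchmark harness in the PyTorch repo rather than on this file.
#include <benchmark/benchmark.h>

BENCHMARK_MAIN(); // expands to Initialize + RunSpecifiedBenchmarks
```

Individual variants can then be selected with Google Benchmark's standard `--benchmark_filter` flag, e.g. `--benchmark_filter='Concat2D2Input/NNCLoop.*'`.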