mirror of https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63778

This is preparation for a switch from raw pointers to shared pointers as the memory model for TE expressions and statements.

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30487425

Pulled By: ZolotukhinM

fbshipit-source-id: 9cbe817b7d4e5fc2f150b29bb9b3bf578868f20c
225 lines
6.2 KiB
C++
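// Benchmarks inference batch normalization on NCHW tensors, comparing the
// stock ATen operator against hand-written NNC (TensorExpr) kernels compiled
// with LLVM, both with and without a fused ReLU.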
#include <benchmark/benchmark.h>

#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>

using namespace torch::jit::tensorexpr;

namespace {
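
// Fixture shared by all variants: builds NCHW inputs from the benchmark
// arguments and computes a reference result with at::batch_norm, which
// TearDown uses to validate each benchmark's output.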
class BatchNorm : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    N_ = state.range(0);
    C_ = state.range(1);
    H_ = state.range(2);
    W_ = state.range(3);
    input_ = torch::ones({N_, C_, H_, W_});
    weight_ = torch::ones({C_});
    bias_ = torch::ones({C_});
    mean_ = torch::ones({C_}) * 0.5f;
    var_ = torch::ones({C_}) * 0.1f;
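    // Reference output, computed once with the stock ATen kernel and used to
    // validate every benchmark variant in TearDown.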
    ref_ = at::batch_norm(
        input_,
        weight_,
        bias_,
        mean_,
        var_,
        training_,
        momentum_,
        eps_,
        cudnn_enabled_);
    output_ = at::empty_like(ref_);
  }
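
  // Checks the benchmark's output against the reference and reports effective
  // memory bandwidth: input bytes read plus output bytes written, per second.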
  void TearDown(benchmark::State& state) override {
    TORCH_CHECK(at::allclose(ref_, output_));
    state.counters["GB/s"] = benchmark::Counter(
        uint64_t(state.iterations()) * (input_.nbytes() + ref_.nbytes()),
        benchmark::Counter::kIsRate);
  }

  int N_;
  int C_;
  int H_;
  int W_;
  at::Tensor input_;
  at::Tensor weight_;
  at::Tensor bias_;
  at::Tensor mean_;
  at::Tensor var_;
  at::Tensor output_;
  at::Tensor ref_;
  bool training_{false};
  float momentum_{0.1f};
  float eps_{1.0e-5f};
  bool cudnn_enabled_{false};
};
} // namespace
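
// Baseline: plain inference batch_norm through the regular ATen operator.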
BENCHMARK_DEFINE_F(BatchNorm, ATen)(benchmark::State& state) {
  for (auto _ : state) {
    output_ = at::batch_norm(
        input_,
        weight_,
        bias_,
        mean_,
        var_,
        training_,
        momentum_,
        eps_,
        cudnn_enabled_);
  }
}
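
// Hand-written NNC (TensorExpr) kernel: batch norm is folded into a single
// per-channel affine transform and compiled with the LLVM backend.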
BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) {
  KernelScope ks;

  Placeholder input("input", kFloat, {N_, C_, H_, W_});
  Placeholder weight("weight", kFloat, {C_});
  Placeholder bias("bias", kFloat, {C_});
  Placeholder mean("mean", kFloat, {C_});
  Placeholder var("var", kFloat, {C_});
  VarHandle eps("eps", kFloat);

  // Shorthand for the loop-axis arguments taken by the Compute lambda.
  using axis = const VarHandle&;
  Tensor* output = Compute(
      "output",
      {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}},
      [&](axis n, axis c, axis h, axis w) {
        // Compute the per-channel affine terms:
        //   alpha = weight / sqrt(var + eps)
        //   beta  = bias - mean * alpha
        auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps);
        auto weight_v = weight.load(c);
        auto bias_v = bias.load(c);
        auto alpha = inv_var * weight_v;
        auto beta = bias_v - mean.load(c) * alpha;

        return input.load(n, c, h, w) * alpha + beta;
      });
  LoopNest nest({output});
  // Flatten H/W and then N/C, leaving a two-level nest, and parallelize the
  // outer (N*C) loop.
  auto loops = nest.getLoopStmtsFor(output);
  LoopNest::flatten({loops[2], loops[3]});
  loops = nest.getLoopStmtsFor(output);
  LoopNest::flatten({loops[0], loops[1]});
  loops = nest.getLoopStmtsFor(output);
  loops[0]->set_parallel();
  nest.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
  LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});

  std::vector<CodeGen::CallArg> args;
  for (auto _ : state) {
    // output_ is reallocated each iteration, so the argument list must be
    // rebuilt with the fresh data pointer.
    args.clear();
    output_ = at::empty_like(input_);
    for (auto const& t : {input_, weight_, bias_, mean_, var_, output_}) {
      args.push_back(t.data_ptr<float>());
    }
    args.push_back(eps_);
    cg.call(args);
  }
}
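
// Baseline with an extra pass over the data: at::batch_norm followed by an
// in-place ReLU.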
BENCHMARK_DEFINE_F(BatchNorm, ATenRelu)(benchmark::State& state) {
  for (auto _ : state) {
    output_ = at::batch_norm(
        input_,
        weight_,
        bias_,
        mean_,
        var_,
        training_,
        momentum_,
        eps_,
        cudnn_enabled_);
    output_.relu_();
  }
}
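
// NNC kernel with the ReLU fused into the same loop nest, so the data is
// touched only once.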
BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) {
  KernelScope ks;

  Placeholder input("input", kFloat, {N_, C_, H_, W_});
  Placeholder weight("weight", kFloat, {C_});
  Placeholder bias("bias", kFloat, {C_});
  Placeholder mean("mean", kFloat, {C_});
  Placeholder var("var", kFloat, {C_});
  VarHandle eps("eps", kFloat);

  // Shorthand for the loop-axis arguments taken by the Compute lambda.
  using axis = const VarHandle&;
  Tensor* output = Compute(
      "output",
      {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}},
      [&](axis n, axis c, axis h, axis w) {
        // Same per-channel affine terms as in the NNC benchmark above.
        auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps);
        auto weight_v = weight.load(c);
        auto bias_v = bias.load(c);
        auto alpha = inv_var * weight_v;
        auto beta = bias_v - mean.load(c) * alpha;

        auto bn = input.load(n, c, h, w) * alpha + beta;
        // Fused ReLU: yield 0 where bn < 0, otherwise bn.
        return CompareSelect::make(bn, 0.f, 0.f, bn, kLT);
      });
  LoopNest nest({output});
  nest.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
  LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});

  std::vector<CodeGen::CallArg> args;
  for (auto _ : state) {
    args.clear();
    output_ = at::empty_like(input_);
    for (auto const& t : {input_, weight_, bias_, mean_, var_, output_}) {
      args.push_back(t.data_ptr<float>());
    }
    args.push_back(eps_);
    cg.call(args);
  }
}
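
// Shapes resembling ResNet-style CNN activations, at batch sizes 1 and 5.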
BENCHMARK_REGISTER_F(BatchNorm, ATen)
    ->Args({1, 64, 112, 112})
    ->Args({1, 256, 14, 14})
    ->Args({1, 128, 28, 28})
    ->Args({1, 64, 56, 56})
    ->Args({1, 512, 7, 7})
    ->Args({5, 64, 112, 112})
    ->Args({5, 256, 14, 14})
    ->Args({5, 128, 28, 28})
    ->Args({5, 64, 56, 56})
    ->Args({5, 512, 7, 7});

BENCHMARK_REGISTER_F(BatchNorm, NNC)
    ->Args({1, 64, 112, 112})
    ->Args({1, 256, 14, 14})
    ->Args({1, 128, 28, 28})
    ->Args({1, 64, 56, 56})
    ->Args({1, 512, 7, 7})
    ->Args({5, 64, 112, 112})
    ->Args({5, 256, 14, 14})
    ->Args({5, 128, 28, 28})
    ->Args({5, 64, 56, 56})
    ->Args({5, 512, 7, 7});

BENCHMARK_REGISTER_F(BatchNorm, ATenRelu)
    ->Args({1, 64, 112, 112})
    ->Args({1, 256, 14, 14})
    ->Args({1, 128, 28, 28})
    ->Args({1, 64, 56, 56})
    ->Args({1, 512, 7, 7})
    ->Args({5, 64, 112, 112})
    ->Args({5, 256, 14, 14})
    ->Args({5, 128, 28, 28})
    ->Args({5, 64, 56, 56})
    ->Args({5, 512, 7, 7});

BENCHMARK_REGISTER_F(BatchNorm, NNCRelu)
    ->Args({1, 64, 112, 112})
    ->Args({1, 256, 14, 14})
    ->Args({1, 128, 28, 28})
    ->Args({1, 64, 56, 56})
    ->Args({1, 512, 7, 7})
    ->Args({5, 64, 112, 112})
    ->Args({5, 256, 14, 14})
    ->Args({5, 128, 28, 28})
    ->Args({5, 64, 56, 56})
    ->Args({5, 512, 7, 7});