pytorch/benchmarks/cpp/tensorexpr/bench_parallel.cpp
Bert Maher 93772792e3 [nnc] Get rid of fuser trigger counters (#57334)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57334

Here's a possibly controversial PR.  These counters got in the way of
generalizing the fuser tests to handle arbitrary devices, and I guess I'm just
generally skeptical that they provide much value.  While true that they let us
observe whether fusion groups were created, we already have assertions based on
the shape of the graph, and I'm not sure that I trust those any less than these
counters.

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D29471484

Pulled By: bertmaher

fbshipit-source-id: f6d76f6e72dbfb581acff1d834b0c74500941b57
2021-06-29 22:22:15 -07:00

73 lines
2.0 KiB
C++

#include <benchmark/benchmark.h>
#include <torch/csrc/jit/tensorexpr/analysis.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
#include <immintrin.h>
namespace torch {
namespace jit {
namespace tensorexpr {
// Benchmark fixture for the parallel elementwise-add kernel below.
// Allocates three M-element float tensors (inputs A, B and output C) and
// pins the intra-op thread count and RNG seed so runs are reproducible.
class ParallelAdd : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    at::set_num_threads(4);          // fixed pool so parallel timings are comparable
    torch::manual_seed(0x12345678);  // deterministic inputs across runs
    M = state.range(0);              // problem size supplied via ->Args({...})
    A = torch::randn({M});
    B = torch::randn({M});
    C = torch::zeros({M});
  }

  void TearDown(benchmark::State& state) override {
    // Report element-adds per second as a rate counter.
    state.counters["tasks"] = benchmark::Counter(
        uint64_t(state.iterations()) * M, benchmark::Counter::kIsRate);
  }

  // Brace-initialized so TearDown never reads an indeterminate value if it
  // runs without a successful SetUp (the original left M uninitialized).
  int M{0};
  at::Tensor A;  // input
  at::Tensor B;  // input
  at::Tensor C;  // output, verified against A + B before timing
};
// Times an LLVM-codegen'd elementwise add (c[m] = a[m] + b[m]) whose single
// loop axis has been marked parallel.  Correctness is checked once against
// the eager inputs before the timed loop starts.
BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
  KernelScope kernel_scope;

  // Build the tensor expression: c = a + b over an M-element axis named "m".
  Placeholder buf_a("a", kFloat, {M});
  Placeholder buf_b("b", kFloat, {M});
  Tensor* result = Compute("c", {{M, "m"}}, [&](const VarHandle& m) {
    return buf_a.load(m) + buf_b.load(m);
  });

  // Lower to a loop nest; parallelize must happen before prepareForCodegen.
  LoopNest nest({result});
  const auto& axes = nest.getLoopStmtsFor(result);
  For* outer = axes[0];
  outer->set_parallel();
  nest.prepareForCodegen();

  Stmt* lowered = nest.root_stmt();
  LLVMCodeGen codegen(lowered, {result, buf_a, buf_b});

  float* raw_a = A.data_ptr<float>();
  float* raw_b = B.data_ptr<float>();
  float* raw_c = C.data_ptr<float>();
  std::vector<void*> call_args({raw_c, raw_a, raw_b});

  // One warm-up invocation, then verify the kernel really computed a + b.
  codegen.value<int>(call_args);
  for (int i = 0; i < M; i++) {
    float delta = fabs(raw_a[i] + raw_b[i] - raw_c[i]);
    TORCH_CHECK(delta < 1e-5);
  }

  for (auto _ : state) {
    codegen.value<int>(call_args);
  }
}

BENCHMARK_REGISTER_F(ParallelAdd, Simple)->Args({1 << 16});
} // namespace tensorexpr
} // namespace jit
} // namespace torch