Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/54122

Test Plan:
* USE_TBB=1 ATEN_THREADING=TBB python setup.py develop --cmake
* USE_TBB=1 ATEN_THREADING=NATIVE python setup.py develop --cmake
* USE_TBB=1 ATEN_THREADING=OMP python setup.py develop --cmake
* cd build; ninja bin/tensorexpr_bench
* bin/test_tensorexpr --gtest_filter="*Parallel*"

Reviewed By: bertmaher
Differential Revision: D27109802
Pulled By: zheng-xq
fbshipit-source-id: db159466d0b46357bcf0fbefb36094bee312368c
76 lines
2.1 KiB
C++
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/tensorexpr/analysis.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>

#include <immintrin.h>

namespace torch {
namespace jit {
namespace tensorexpr {

class ParallelAdd : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    at::set_num_threads(4);
    torch::manual_seed(0x12345678);
    M = state.range(0);
    A = torch::randn({M});
    B = torch::randn({M});
    C = torch::zeros({M});
  }

  void TearDown(benchmark::State& state) override {
    state.counters["tasks"] = benchmark::Counter(
        uint64_t(state.iterations()) * M, benchmark::Counter::kIsRate);
  }

  int M;
  at::Tensor A;
  at::Tensor B;
  at::Tensor C;
};

BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
  KernelScope kernel_scope;
  ExecutionCounter counter(llvm_codegen_parallel_dispatched);
  Placeholder a_buf("a", kFloat, {M});
  Placeholder b_buf("b", kFloat, {M});
  Tensor* c_tensor = Compute("c", {{M, "m"}}, [&](const VarHandle& m) {
    return a_buf.load(m) + b_buf.load(m);
  });
  LoopNest loop_nest({c_tensor});
  auto const& loops = loop_nest.getLoopStmtsFor(c_tensor);
  For* m = loops[0];
  m->set_parallel();
  loop_nest.prepareForCodegen();
  Stmt* stmt = loop_nest.root_stmt();
  LLVMCodeGen cg(stmt, {c_tensor, a_buf, b_buf});

  float* a_ptr = A.data_ptr<float>();
  float* b_ptr = B.data_ptr<float>();
  float* c_ptr = C.data_ptr<float>();
  std::vector<void*> args({c_ptr, a_ptr, b_ptr});
  cg.value<int>(args);
  int count = counter.elapsed_value();
  TORCH_CHECK(count > 0);
  for (int i = 0; i < M; i++) {
    float diff = fabs(a_ptr[i] + b_ptr[i] - c_ptr[i]);
    TORCH_CHECK(diff < 1e-5);
  }

  for (auto _ : state) {
    cg.value<int>(args);
  }
}

BENCHMARK_REGISTER_F(ParallelAdd, Simple)->Args({1 << 16});

} // namespace tensorexpr
} // namespace jit
} // namespace torch
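For reference, Google Benchmark fixture registrations can sweep several problem sizes rather than the single 1 << 16 used above. A minimal sketch (an assumption, not part of this file) that could sit next to the existing BENCHMARK_REGISTER_F call, inside the same namespaces, using Google Benchmark's standard RangeMultiplier/Range API:

// Sketch only (assumption): run the same ParallelAdd fixture over
// tensor lengths 1<<12, 1<<14, ..., 1<<20 instead of a single size.
BENCHMARK_REGISTER_F(ParallelAdd, Simple)
    ->RangeMultiplier(4)
    ->Range(1 << 12, 1 << 20);

Once bin/tensorexpr_bench is built as in the Test Plan above, Google Benchmark's standard --benchmark_filter=ParallelAdd flag can be used to run just this fixture under each threading backend.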