mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/57334 Here's a possibly controversial PR. These counters got in the way of generalizing the fuser tests to handle arbitrary devices, and I guess I'm just generally skeptical that they provide much value. While true that they let us observe whether fusion groups were created, we already have assertions based on the shape of the graph, and I'm not sure that I trust those any less than these counters. Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D29471484 Pulled By: bertmaher fbshipit-source-id: f6d76f6e72dbfb581acff1d834b0c74500941b57
73 lines
2.0 KiB
C++
73 lines
2.0 KiB
C++
#include <benchmark/benchmark.h>
|
|
#include <torch/csrc/jit/tensorexpr/analysis.h>
|
|
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
|
|
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
|
|
#include <torch/csrc/jit/tensorexpr/loopnest.h>
|
|
#include <torch/csrc/jit/tensorexpr/tensor.h>
|
|
#include <torch/torch.h>
|
|
|
|
#include <immintrin.h>
|
|
|
|
namespace torch {
|
|
namespace jit {
|
|
namespace tensorexpr {
|
|
|
|
class ParallelAdd : public benchmark::Fixture {
|
|
public:
|
|
void SetUp(const benchmark::State& state) override {
|
|
at::set_num_threads(4);
|
|
torch::manual_seed(0x12345678);
|
|
M = state.range(0);
|
|
A = torch::randn({M});
|
|
B = torch::randn({M});
|
|
C = torch::zeros({M});
|
|
}
|
|
|
|
void TearDown(benchmark::State& state) override {
|
|
state.counters["tasks"] = benchmark::Counter(uint64_t(state.iterations()) * M,
|
|
benchmark::Counter::kIsRate);
|
|
}
|
|
|
|
int M;
|
|
at::Tensor A;
|
|
at::Tensor B;
|
|
at::Tensor C;
|
|
};
|
|
|
|
// Measures a TE-generated, LLVM-JIT-compiled parallel float add: C = A + B.
BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
  KernelScope kernel_scope;

  // Tensor expression for the elementwise sum: c[m] = a[m] + b[m].
  Placeholder a("a", kFloat, {M});
  Placeholder b("b", kFloat, {M});
  Tensor* c = Compute("c", {{M, "m"}}, [&](const VarHandle& m) {
    return a.load(m) + b.load(m);
  });

  // Lower: mark the only loop axis parallel, then JIT via LLVM.
  LoopNest nest({c});
  auto const& loops = nest.getLoopStmtsFor(c);
  loops[0]->set_parallel();
  nest.prepareForCodegen();
  Stmt* root = nest.root_stmt();
  LLVMCodeGen codegen(root, {c, a, b});

  // Runtime argument order must mirror the codegen buffer list above:
  // output first, then the two inputs.
  float* pa = A.data_ptr<float>();
  float* pb = B.data_ptr<float>();
  float* pc = C.data_ptr<float>();
  std::vector<void*> call_args({pc, pa, pb});

  // One warm-up call, verified element-by-element before timing starts.
  codegen.value<int>(call_args);
  for (int i = 0; i < M; i++) {
    float diff = fabs(pa[i] + pb[i] - pc[i]);
    TORCH_CHECK(diff < 1e-5);
  }

  // Timed region: repeatedly invoke the compiled kernel.
  for (auto _ : state) {
    codegen.value<int>(call_args);
  }
}
|
|
|
|
// Register the benchmark with M = 1 << 16 (65536) elements per tensor.
BENCHMARK_REGISTER_F(ParallelAdd, Simple)->Args({1 << 16});
|
|
|
|
} // namespace tensorexpr
|
|
} // namespace jit
|
|
} // namespace torch
|