mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-07 12:21:27 +01:00)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/50995

This change makes 'Tensor' a thin wrapper over 'Buf' and 'Stmt' and merges it with the recently introduced 'CompoundTensor'. A statement for the tensor is either passed directly to the Tensor constructor (akin to 'CompoundTensor') or is built immediately in the constructor. LoopNest is no longer responsible for constructing statements from tensors; it simply stitches together the already constructed statements contained in the Tensors.

A side effect is that we can no longer construct several loop nests from the same tensors; we need to explicitly clone statements if we want to do that. A special copy constructor was added to LoopNest to make this more convenient (note: this only affects tests; we don't usually create multiple loop nests elsewhere).

Test Plan: Imported from OSS

Reviewed By: bertmaher

Differential Revision: D26038223

Pulled By: ZolotukhinM

fbshipit-source-id: 27a2e5900437cfb0c151e8f89815edec53608e17
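For illustration, the workflow the summary describes might look roughly like this (a minimal sketch; the exact calls and signatures are assumed from the summary rather than verified against this revision):

  te::Tensor* BT = te::Compute(/* ... */);  // a Tensor now carries its own Buf and Stmt
  te::LoopNest first({BT});                 // stitches the statements already held by the Tensors
  te::LoopNest second(first);               // the new copy constructor clones those statements,
                                            // so both nests can be transformed independently

Without the copy constructor, building a second nest from the same tensors would require cloning their statements explicitly.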
446 lines
12 KiB
C++
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/tensorexpr/analysis.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>

#include <immintrin.h>

namespace te = torch::jit::tensorexpr;

namespace {

class Reduce1D : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    at::set_num_threads(1);
    torch::manual_seed(0x12345678);
    M = state.range(0);
    A = torch::randn({M});
    B = torch::zeros({});
  }

  void TearDown(benchmark::State& state) override {
    state.counters["BYTES"] = benchmark::Counter(
        uint64_t(state.iterations()) * M * sizeof(float),
        benchmark::Counter::kIsRate);
  }

  int M;
  at::Tensor A;
  at::Tensor B;
};

} // namespace
BENCHMARK_DEFINE_F(Reduce1D, Torch)(benchmark::State& state) {
  for (auto _ : state) {
    B = torch::sum(A, {0});
  }
}

BENCHMARK_REGISTER_F(Reduce1D, Torch)->Args({1 << 24});
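
// torch::sum also serves as the reference: ValidateFunc below compares every
// hand-written and TE kernel in this file against it before timing.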

#define VALIDATE(F, A, B) ValidateFunc((F), #F, (A), (B))

template <typename Func>
void ValidateFunc(Func func, const std::string& func_name, at::Tensor& A, at::Tensor& B) {
  func(A, B);
  float* pB = B.data_ptr<float>();
  at::Tensor B2 = torch::sum(A, {0});
  float* pB2 = B2.data_ptr<float>();
  int size = A.numel();
  // The rounding error of a float sum grows roughly with sqrt(size), so scale
  // the tolerance accordingly.
  float size_sqrt = std::sqrt(size);
  float natural_noise = size_sqrt * 1e-7;
  if (!torch::allclose(B, B2, natural_noise)) {
    std::ostringstream oss;
    oss << func_name << " failed check: " << std::endl;
    oss << "value: " << B << std::endl;
    oss << "reference: " << B2 << std::endl;
    oss << "threshold: " << natural_noise << std::endl;
    throw std::runtime_error(oss.str());
  }
}

static void reduce1d_naive(at::Tensor& A, at::Tensor& B) {
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  TORCH_CHECK(B.numel() == 1);
  *pB = 0.;
  for (int i = 0; i < size; i++) {
    *pB += pA[i];
  }
}

BENCHMARK_DEFINE_F(Reduce1D, Naive)(benchmark::State& state) {
  VALIDATE(reduce1d_naive, A, B);
  for (auto _ : state) {
    reduce1d_naive(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, Naive)->Args({1 << 24});
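
// "rfactor" by hand: accumulate into kChunkSize independent partial sums and
// combine them at the end. This breaks the single loop-carried dependence of
// the naive version, so the inner loop can be vectorized.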

static void reduce1d_native_rfactor(at::Tensor& A, at::Tensor& B) {
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  constexpr int kChunkSize = 16;
  TORCH_CHECK(B.numel() == 1);
  TORCH_CHECK(size % kChunkSize == 0);
  *pB = 0.;
  float temp[kChunkSize];
  for (int j = 0; j < kChunkSize; j++) {
    temp[j] = 0;
  }

  int chunk_count = size / kChunkSize;
  for (int i = 0; i < chunk_count; i++) {
    for (int j = 0; j < kChunkSize; j++) {
      temp[j] += pA[i * kChunkSize + j];
    }
  }

  for (int j = 0; j < kChunkSize; j++) {
    *pB += temp[j];
  }
}

BENCHMARK_DEFINE_F(Reduce1D, NativeRfactor)(benchmark::State& state) {
  VALIDATE(reduce1d_native_rfactor, A, B);
  for (auto _ : state) {
    reduce1d_native_rfactor(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, NativeRfactor)->Args({1 << 24});

#ifdef USE_AVX2

// x = ( x7, x6, x5, x4, x3, x2, x1, x0 )
inline float sum_f32x8(__m256 x) {
  // hiQuad = ( x7, x6, x5, x4 )
  const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
  // loQuad = ( x3, x2, x1, x0 )
  const __m128 loQuad = _mm256_castps256_ps128(x);
  // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
  const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
  // loDual = ( -, -, x1 + x5, x0 + x4 )
  const __m128 loDual = sumQuad;
  // hiDual = ( -, -, x3 + x7, x2 + x6 )
  const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
  // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
  const __m128 sumDual = _mm_add_ps(loDual, hiDual);
  // lo = ( -, -, -, x0 + x2 + x4 + x6 )
  const __m128 lo = sumDual;
  // hi = ( -, -, -, x1 + x3 + x5 + x7 )
  const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
  // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
  const __m128 sum = _mm_add_ss(lo, hi);
  return _mm_cvtss_f32(sum);
}

static void reduce1d_native_vector(at::Tensor& A, at::Tensor& B) {
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  constexpr int kChunkSize = sizeof(__m256) / sizeof(float);
  TORCH_CHECK(B.numel() == 1);
  TORCH_CHECK(size % kChunkSize == 0);
  *pB = 0.;
  __m256 temp = _mm256_setzero_ps();

  int tile_count = size / kChunkSize;
  for (int i = 0; i < tile_count; i++) {
    __m256 data = _mm256_load_ps(pA + i * kChunkSize);
    temp = _mm256_add_ps(temp, data);
  }

  float result = sum_f32x8(temp);
  *pB = result;
}

BENCHMARK_DEFINE_F(Reduce1D, NativeVector)(benchmark::State& state) {
  VALIDATE(reduce1d_native_vector, A, B);
  for (auto _ : state) {
    reduce1d_native_vector(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, NativeVector)->Args({1 << 24});
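
// The vector version above uses a single accumulator, so consecutive
// _mm256_add_ps calls are serialized by the add latency. The tiled version
// below keeps kTileSize independent accumulators to expose more
// instruction-level parallelism.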

static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) {
  static constexpr int kTileSize = 4;
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  constexpr int kChunkSize = sizeof(__m256) / sizeof(float);
  TORCH_CHECK(B.numel() == 1, "Invalid size: ", B.numel(), " != 1");
  TORCH_CHECK(size % kChunkSize == 0, "Invalid size: ", size, " % ", kChunkSize, " != 0");
  __m256 t[kTileSize];
  for (int j = 0; j < kTileSize; j++) {
    t[j] = _mm256_setzero_ps();
  }

  int tile_count = size / kChunkSize / kTileSize;
  for (int i = 0; i < tile_count; i++) {
#pragma unroll
    for (int j = 0; j < kTileSize; j++) {
      float* p = pA + (i * kTileSize + j) * kChunkSize;
      __m256 data = _mm256_loadu_ps(p);
      t[j] = _mm256_add_ps(t[j], data);
    }
  }

  float result = sum_f32x8(t[0]);
  for (int j = 1; j < kTileSize; j++) {
    result += sum_f32x8(t[j]);
  }
  *pB = result;
}

BENCHMARK_DEFINE_F(Reduce1D, NativeTiled)(benchmark::State& state) {
  VALIDATE(reduce1d_native_tiled, A, B);
  for (auto _ : state) {
    reduce1d_native_tiled(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, NativeTiled)->Args({1 << 24});

#endif // USE_AVX2
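
// The remaining benchmarks build the same reduction with the tensor expression
// (NNC) API: a te::Reduce tensor is lowered through te::LoopNest, simplified,
// and compiled with the LLVM codegen; the resulting kernel is validated
// against torch::sum and then timed.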

BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
  te::KernelScope ks;

  int M = A.numel();

  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
  te::Tensor* BT = te::Reduce(
      "reduce_full",
      {{1, "N"}},
      te::Sum(),
      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
        return AP.load(m);
      },
      {{M, "M"}});

  te::LoopNest loop({BT});
  loop.prepareForCodegen();
  te::Stmt* s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});

  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };

  ValidateFunc(func, "reduce1d_te_naive", A, B);
  for (auto _ : state) {
    func(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, TeNaive)->Args({1 << 24});

BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
  te::KernelScope ks;

  int M = A.numel();

  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
  te::Tensor* BT = te::Reduce(
      "reduce_full",
      {{1, "N"}},
      te::Sum(),
      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
        return AP.load(m);
      },
      {{M, "M"}});

  te::LoopNest loop({BT});
  const int kChunkSize = 8;

  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    te::For* m = loops[1];
    te::For* mo;
    te::For* mi;
    te::For* tail;
    loop.splitWithTail(m, kChunkSize, &mo, &mi, &tail);
  }

  loop.prepareForCodegen();
  te::Stmt* s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});

  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };

  ValidateFunc(func, "reduce1d_te_split_tail", A, B);
  for (auto _ : state) {
    func(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, TeSplitTail)->Args({1 << 24});
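
// The difference between the two split benchmarks: splitWithTail emits an
// explicit remainder loop for iterations that do not fill a whole chunk, while
// splitWithMask guards the inner body with a condition instead of generating a
// separate tail loop.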

BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
  te::KernelScope ks;

  int M = A.numel();

  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
  te::Tensor* BT = te::Reduce(
      "reduce_full",
      {{1, "N"}},
      te::Sum(),
      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
        return AP.load(m);
      },
      {{M, "M"}});

  te::LoopNest loop({BT});
  const int kChunkSize = 8;

  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    te::For* m = loops[1];
    te::For* mo;
    te::For* mi;
    loop.splitWithMask(m, kChunkSize, &mo, &mi);
  }

  loop.prepareForCodegen();
  te::Stmt* s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});

  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };

  ValidateFunc(func, "reduce1d_te_split_mask", A, B);
  for (auto _ : state) {
    func(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, TeSplitMask)->Args({1 << 24});
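
// The rfactor benchmarks split the reduction over the chosen variable into a
// partial reduction into a temporary buffer plus a final reduction over that
// buffer; the now-independent inner loop is what gets vectorized in
// TeRfactorV2.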

BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
  te::KernelScope ks;

  int M = A.numel();
  const int kChunkSize = 8;
  TORCH_CHECK(M % kChunkSize == 0);

  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
  te::Tensor* BT = te::Reduce(
      "reduce_full",
      {},
      te::Sum(),
      [&](const te::ExprHandle& m) {
        return AP.load(m);
      },
      {{M, "M"}});

  te::LoopNest loop({BT});

  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    TORCH_CHECK(loops.size() == 1);
    te::For* m = loops[0];
    te::For* mo;
    te::For* mi;
    loop.splitWithMask(m, kChunkSize, &mo, &mi);
  }

  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    TORCH_CHECK(loops.size() == 2);
    te::For* mo = loops[0];
    te::For* mi = loops[1];
    // TODO: rfactor works on the untransformed var set. This is a problem
    // since we need to look up the loop created by the split before we can
    // rfactor.
    auto bt_body = te::NodeFinder<te::ReduceOp>::find(loop.root_stmt())[0];
    loop.rfactor(bt_body, mi->var());
  }

  loop.prepareForCodegen();
  te::Stmt* s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});

  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };

  ValidateFunc(func, "reduce1d_te_rfactor_v1", A, B);
  for (auto _ : state) {
    func(A, B);
  }
}

// TODO: add this back when the problem is fixed.
// BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV1)->Args({1 << 24});

// Similar to TeRfactorV1, but the split is constructed manually as part of the
// reduction's index expression.
BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV2)(benchmark::State& state) {
  te::KernelScope ks;

  int M = A.numel();
  const int kChunkSize = 8;
  TORCH_CHECK(M % kChunkSize == 0);

  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
  te::Tensor* BT = te::Reduce(
      "reduce_full",
      {},
      te::Sum(),
      [&](const te::ExprHandle& mo, const te::ExprHandle& mi) {
        return AP.load(mo * kChunkSize + mi);
      },
      {{M / kChunkSize, "mo"}, {kChunkSize, "mi"}});

  te::LoopNest loop({BT});

  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    TORCH_CHECK(loops.size() == 2);
    te::For* mo = loops[0];
    te::For* mi = loops[1];
    auto bt_body = te::NodeFinder<te::ReduceOp>::find(loop.root_stmt())[0];
    loop.rfactor(bt_body, mi->var());
  }

  {
    // Find the new For and vectorize it. rfactor doesn't return the newly
    // added For*, so resort to a hack to locate it.
    // TODO: make it easier to find the transformed loop after rfactor.
    auto loops = te::NodeFinder<te::For>::find(loop.root_stmt());
    TORCH_CHECK(loops.size() == 4);
    auto mi = loops[2];
    loop.vectorize(mi);
  }

  loop.prepareForCodegen();
  te::Stmt* s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});

  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };

  ValidateFunc(func, "reduce1d_te_rfactor_v2", A, B);
  for (auto _ : state) {
    func(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV2)->Args({1 << 24});