// pytorch/test/cpp/tensorexpr/test_reductions.cpp

#include <gtest/gtest.h>
#include <limits>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <unordered_map>
#include <test/cpp/tensorexpr/test_base.h>
#include <test/cpp/tensorexpr/padded_buffer.h>
#include <torch/csrc/jit/tensorexpr/analysis.h>
#include <torch/csrc/jit/tensorexpr/eval.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_printer.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/csrc/jit/testing/file_check.h>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
// Sum an array to a single value.
TEST(Reductions, ReduceSum1D) {
KernelScope kernel_scope;
Placeholder b(BufHandle("b", {10}, kFloat));
std::vector<float> in(10);
for (int j = 0; j < 10; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{10, "m"}});
LoopNest loop({c});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 45);
}
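// For reference, the lowered loop for ReduceSum1D above is roughly
// equivalent to the following scalar form (a sketch of the shape of the
// generated IR, not its exact text):
//
//   sum[0] = 0.f;
//   for (int m = 0; m < 10; m++) {
//     sum[0] = (sum[0]) + (b[m]);
//   }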
// Sum a 2D tensor to a 1D tensor with dynamic shapes.
TEST(Reductions, ReduceSum2D) {
KernelScope kernel_scope;
const int M = 3;
const int N = 7;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
Placeholder b(BufHandle("b", {m, n}, kFloat));
std::vector<float> in(M * N);
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
in[i * N + j] = j;
}
}
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
LoopNest loop({c});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, n, m});
cg.call({in, out, 5, 7});
float expected = 0;
for (int i = 0; i < N; ++i) {
expected += i;
}
for (int i = 0; i < M; ++i) {
ASSERT_EQ(out[i], expected);
}
}
// Sum a 3D tensor down to both a 2D and a 1D tensor, then reduce the 2D
// result again to cross-check our work.
TEST(Reductions, ReduceSum3D) {
KernelScope kernel_scope;
const int M = 10;
VarHandle m("m", kInt);
Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}});
LoopNest loop({c});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m});
std::vector<float> bData(2 * 3 * M, 0);
std::vector<float> cData(2 * 3, 6.0f);
std::vector<float> dData(2, 1.0f);
std::vector<float> eData(2, 1.0f);
for (int i = 0; i < 2 * 3; ++i) {
for (int j = 0; j < M; ++j) {
bData[i * M + j] = j;
}
}
cg.call({bData, cData, M});
float expected = 0;
for (int i = 0; i < M; ++i) {
expected += i;
}
for (int i = 0; i < 2 * 3; ++i) {
ASSERT_EQ(cData[i], expected);
}
Tensor* d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}});
LoopNest loop2({d});
loop2.prepareForCodegen();
Stmt* s2 = loop2.root_stmt();
s2 = IRSimplifier::simplify(s2);
SimpleIREvaluator cg2(s2, {b, d, m});
cg2.call({bData, dData, M});
// We're combining an additional dimension of 3, so the sum is 3x.
expected = expected * 3;
for (int i = 0; i < 2; ++i) {
ASSERT_EQ(dData[i], expected);
}
// This is the same as just reducing the original result across that axis.
Placeholder c_buf(BufHandle(c->buf()));
Tensor* e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}});
LoopNest loop3({e});
loop3.prepareForCodegen();
Stmt* s3 = loop3.root_stmt();
s3 = IRSimplifier::simplify(s3);
SimpleIREvaluator cg3(s3, {c, e});
cg3.call({cData, eData});
for (int i = 0; i < 2; ++i) {
ASSERT_EQ(eData[i], expected);
}
}
// Sum a large (10-D) tensor over its five innermost dimensions.
TEST(Reductions, ReduceSum10D) {
KernelScope kernel_scope;
Placeholder in_(BufHandle("in_", {2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, kFloat));
const int InputSize = 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3;
Placeholder out_(BufHandle("out_", {2, 3, 2, 3, 2}, kFloat));
const int OutputSize = 2 * 3 * 2 * 3 * 2;
std::vector<float> in(InputSize, 1.f);
std::vector<float> out(OutputSize, -1.f);
Tensor* c = Reduce(
"sum",
{{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}},
Sum(),
in_,
{{3, "f"}, {2, "g"}, {3, "h"}, {2, "i"}, {3, "j"}});
LoopNest loop({c});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in_, c});
cg.call({in, out});
float expected = InputSize / OutputSize;
for (int i = 0; i < OutputSize; ++i) {
ASSERT_EQ(out[i], expected);
}
}
// Reduce via Mul rather than Add using a custom Reducer.
TEST(Reductions, ReduceProduct) {
KernelScope kernel_scope;
const int M = 4;
const int N = 4;
Placeholder b(BufHandle("b", {M, N}, kFloat));
std::vector<float> in(M * N);
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
in[i * N + j] = 2 + j;
}
}
std::vector<float> out(M, -1.f);
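// A Reducer pairs an initial value (here 1.f, the multiplicative
// identity) with a binary interaction function used to fold each element
// into the accumulator.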
Reducer product(
ExprHandle(1.f), [](ExprHandle a, ExprHandle b) { return a * b; });
Tensor* c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}});
LoopNest loop({c});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
float expected = 1;
for (int i = 0; i < N; ++i) {
expected *= 2 + i;
}
for (int i = 0; i < M; ++i) {
ASSERT_EQ(out[i], expected);
}
}
// Maximum reductions.
TEST(Reductions, ReduceMax) {
KernelScope kernel_scope;
Placeholder in_(BufHandle("b", {10}, kFloat));
std::vector<float> in(10);
std::vector<float> out(1, -1.f);
for (int j = 0; j < 10; ++j) {
in[j] = j;
}
Tensor* dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}});
LoopNest loop({dm1});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in_, dm1});
cg.call({in, out});
ASSERT_EQ(out[0], 9);
Placeholder in2_(BufHandle("b", {2, 5}, kFloat));
std::vector<float> out2(2, -1.f);
Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}});
LoopNest loop2({m2d});
loop2.prepareForCodegen();
s = loop2.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg2(s, {in2_, m2d});
cg2.call({in, out2});
ASSERT_EQ(out2[0], 4);
ASSERT_EQ(out2[1], 9);
}
// Minimum reduction, with custom initialization.
TEST(Reductions, ReduceMinCustomInitializer) {
KernelScope kernel_scope;
VarHandle minInit("minInit", kFloat);
Placeholder in_(BufHandle("b", {10}, kFloat));
std::vector<float> in(10);
std::vector<float> out(1, -1.f);
for (int j = 0; j < 10; ++j) {
in[j] = 10 + j;
}
Tensor* min = Reduce(
"min",
{},
Minimum(ExprHandle(minInit)),
[&](ParameterList& v) { return in_.load(v); },
{{10, "m"}});
LoopNest loop({min});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in_, min, minInit});
// Works normally (note that out data starts lower than the correct
// minimum).
cg.call({in, out, std::numeric_limits<float>::max()});
ASSERT_EQ(out[0], 10);
// With an initializer lower than the data's minimum, the initializer wins.
cg.call({in, out, 5.f});
ASSERT_EQ(out[0], 5);
}
// Example implementation of Any/All.
// TODO: this is very awkward without logical And/Or operators.
TEST(Reductions, ReduceAnyAll) {
KernelScope kernel_scope;
VarHandle searchValue("searchValue", kInt);
Placeholder b(BufHandle("b", {4, 10}, kInt));
Reducer anyEqSV(ExprHandle(0), [](ExprHandle a, ExprHandle b) {
return CompareSelect::make(a, 1, 1, b, kEQ);
});
Tensor* any = Reduce(
"anyEqual",
{{4, "i"}},
anyEqSV,
[&](const auto& i, const auto& j) {
return CompareSelect::make(b.load(i, j), searchValue, kEQ);
},
{{10, "j"}});
LoopNest loop({any});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, any, searchValue});
std::vector<int> in(40, 0);
std::vector<int> out(4, 0);
// Input has 0-39 laid out across 4 rows of 10.
for (int i = 0; i < 40; ++i) {
in[i] = i;
}
cg.call({in, out, 1});
// Only the first row contains the value 1.
ASSERT_EQ(out[0], 1);
ASSERT_EQ(out[1], 0);
ASSERT_EQ(out[2], 0);
ASSERT_EQ(out[3], 0);
cg.call({in, out, 15});
// 15 is in the second row (index 1).
ASSERT_EQ(out[0], 0);
ASSERT_EQ(out[1], 1);
ASSERT_EQ(out[2], 0);
ASSERT_EQ(out[3], 0);
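// allGTSV folds with "if the accumulator is already 0, stay 0; otherwise
// take the incoming value". Over {0, 1} values this is a logical AND.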
Reducer allGTSV(ExprHandle(1), [](ExprHandle a, ExprHandle b) {
return CompareSelect::make(a, 0, 0, b, kEQ);
});
Tensor* allGreaterThan = Reduce(
"allGreaterThan",
{{4, "i"}},
allGTSV,
[&](const auto& i, const auto& j) {
return CompareSelect::make(b.load(i, j), searchValue, kGT);
},
{{10, "j"}});
LoopNest loop2({allGreaterThan});
loop2.prepareForCodegen();
s = loop2.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg2(s, {b, allGreaterThan, searchValue});
cg2.call({in, out, 11});
// Only rows 2 and 3 (values 20-39) are entirely greater than 11.
ASSERT_EQ(out[0], 0);
ASSERT_EQ(out[1], 0);
ASSERT_EQ(out[2], 1);
ASSERT_EQ(out[3], 1);
cg2.call({in, out, -3});
// Every value is greater than -3.
ASSERT_EQ(out[0], 1);
ASSERT_EQ(out[1], 1);
ASSERT_EQ(out[2], 1);
ASSERT_EQ(out[3], 1);
}
TEST(Reductions, ReduceMatmul2D) {
KernelScope kernel_scope;
Placeholder tA(BufHandle("tA", {3, 2}, kFloat));
Placeholder tB(BufHandle("tB", {2, 3}, kFloat));
std::vector<float> tA_(6);
std::vector<float> tB_(6);
std::vector<float> out(9, -1.f);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 2; ++j) {
tA_[i * 2 + j] = i * 2 + j;
tB_[j * 3 + i] = i * 2 + j;
}
}
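// tA is filled as [[0, 1], [2, 3], [4, 5]] and tB as its transpose, so
// mm computes tA * tA^T.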
Tensor* mm = Reduce(
"mm",
{{3, "m"}, {3, "n"}},
Sum(),
[&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) {
return tA.load(m, k) * tB.load(k, n);
},
{{2, "k"}});
LoopNest loop({mm});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {tA, tB, mm});
cg.call({tA_, tB_, out});
std::vector<float> expected(
{1.f, 3.f, 5.f, 3.f, 13.f, 23.f, 5.f, 23.f, 41.f});
for (int i = 0; i < 9; ++i) {
ASSERT_EQ(out[i], expected[i]);
}
}
TEST(Reductions, ReduceRfactorLike) {
KernelScope kernel_scope;
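// Chain two explicit reductions to mimic the two-stage structure rfactor
// produces: l1 reduces each row of the 10x10 input, and l2 reduces the
// vector of row sums.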
Placeholder in(BufHandle("in", {10, 10}, kFloat));
std::vector<float> in_(100);
for (int i = 0; i < 100; ++i) {
in_[i] = i;
}
std::vector<float> in_rf_(10, -2.f);
std::vector<float> out(1, -1.f);
Tensor* l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}});
Placeholder in_rf(BufHandle(l1->buf()));
Tensor* l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}});
LoopNest loop({l1, l2});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in, l1, l2});
cg.call({in_, in_rf_, out});
ASSERT_EQ(out[0], 99 * 50);
}
TEST(Reductions, ReduceAsProducer) {
KernelScope kernel_scope;
const int M = 10;
VarHandle m("m", kInt);
Placeholder a(BufHandle("a", {2, 3}, kFloat));
Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
Tensor* c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}});
Tensor* d = Compute(
"scale",
{{2, "l2"}, {3, "n1"}},
[&](const VarHandle& l, const VarHandle& n) {
return c->call(l, n) * a.load(l, n);
});
LoopNest loop({d});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {a, b, d, m});
std::vector<float> aData(2 * 3, 0);
std::vector<float> bData(2 * 3 * M, 0);
std::vector<float> dData(2 * 3, 6.0f);
for (int i = 0; i < 2 * 3; ++i) {
aData[i] = 6 - i;
for (int j = 0; j < M; ++j) {
bData[i * M + j] = j;
}
}
cg.call({aData, bData, dData, M});
float expected = 0;
for (int i = 0; i < M; ++i) {
expected += i;
}
for (int i = 0; i < 2 * 3; ++i) {
ASSERT_EQ(dData[i], expected * (6 - i));
}
}
TEST(Reductions, ReduceAsConsumer) {
KernelScope kernel_scope;
const int M = 10;
VarHandle m("m", kInt);
Placeholder a(BufHandle("a", {2, 3, m}, kFloat));
Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
Tensor* c = Compute(
"scale",
{{2, "l2"}, {3, "n1"}, {m, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}});
LoopNest loop({d});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {a, b, d, m});
std::vector<float> aData(2 * 3 * M, 0);
std::vector<float> bData(2 * 3 * M, 0);
std::vector<float> dData(2, 6.0f);
for (int i = 0; i < 2 * 3; ++i) {
for (int j = 0; j < M; ++j) {
bData[i * M + j] = j + 1;
aData[i * M + j] = 6 - i;
}
}
cg.call({aData, bData, dData, M});
float expected[2] = {0, 0};
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < M; ++k) {
expected[i] += (k + 1) * (6 - (i * 3 + j));
}
}
}
for (int i = 0; i < 2; ++i) {
ASSERT_EQ(dData[i], expected[i]);
}
}
TEST(Reductions, SplitReduceAxis) {
KernelScope kernel_scope;
Placeholder in(BufHandle("in", {16, 8}, kFloat));
std::vector<float> in_(16 * 8);
for (int i = 0; i < 16; ++i) {
for (int j = 0; j < 8; ++j) {
in_[i * 8 + j] = i;
}
}
std::vector<float> out(16, -1.f);
Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
LoopNest l({tensor});
For* x_outer;
For* x_inner;
For* x_tail;
std::vector<For*> loops = l.getLoopStmtsFor(tensor);
l.splitWithTail(loops[1], 2, &x_outer, &x_inner, &x_tail);
l.prepareForCodegen();
Stmt* s = l.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in, tensor});
cg.call({in_, out});
for (int i = 0; i < 16; ++i) {
ASSERT_EQ(out[i], i * 8);
}
}
TEST(Reductions, SplitNonReduceAxis) {
KernelScope kernel_scope;
Placeholder in(BufHandle("in", {16, 8}, kFloat));
std::vector<float> in_(16 * 8);
for (int i = 0; i < 16; ++i) {
for (int j = 0; j < 8; ++j) {
in_[i * 8 + j] = i;
}
}
std::vector<float> out(16, -1.f);
Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
LoopNest l({tensor});
For* x_outer;
For* x_inner;
For* x_tail;
std::vector<For*> loops = l.getLoopStmtsFor(tensor);
l.splitWithTail(loops[0], 2, &x_outer, &x_inner, &x_tail);
For* x_2;
For* x_1;
For* x_tail_2;
l.splitWithTail(x_outer, 2, &x_2, &x_1, &x_tail_2);
l.prepareForCodegen();
Stmt* s = l.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in, tensor});
cg.call({in_, out});
for (int i = 0; i < 16; ++i) {
ASSERT_EQ(out[i], i * 8);
}
}
TEST(Reductions, ReorderedReductionInitializer) {
KernelScope kernel_scope;
/* From the quip (the test below uses smaller extents, 1 x 12 x 6):
for k in 0..1: // blockIdx
for m in 0..128:
for n in 0..64: // threadIdx
SumOp(c(k, n), 0, a(k, m, n), {m})
*/
Placeholder in(BufHandle("in", {1, 12, 6}, kFloat));
std::vector<float> in_(12 * 6, 1.f);
Tensor* tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
LoopNest l_({tensor_});
l_.prepareForCodegen();
Stmt* s_ = Stmt::clone(l_.root_stmt());
s_ = IRSimplifier::simplify(s_);
Tensor* tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
LoopNest l({tensor});
auto loops = l.getLoopStmtsFor(tensor);
l.setGPUBlockIndex(loops[0], 0);
l.setGPUThreadIndex(loops[1], 0);
l.reorderAxis(loops[1], loops[2]);
Stmt* s = l.root_stmt();
s = IRSimplifier::simplify(s);
l.prepareForCodegen();
s = l.root_stmt();
s = IRSimplifier::simplify(s);
std::vector<float> out1(16, -1.f);
SimpleIREvaluator cg(s_, {in, tensor_});
cg.call({in_, out1});
std::vector<float> out2(16, -1.f);
SimpleIREvaluator cg2(s, {in, tensor});
cg2.call({in_, out2});
for (int i = 0; i < 16; ++i) {
ASSERT_EQ(out1[i], out2[i]);
}
}
TEST(Reductions, ReduceRfactor) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
Placeholder b(BufHandle("b", {m, n}, kFloat));
std::vector<float> in(M * N);
for (int j = 0; j < M * N; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
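// rfactor over the inner "n" axis splits the single reduction into two
// stages: a partial reduction into a temporary buffer indexed by n,
// followed by a final reduction over that buffer. Roughly, with the
// runtime extents bound to 10 as in the call below (a sketch, assuming
// the simplifier keeps this shape):
//
//   for (int n = 0; n < 10; n++) tmp_buf[n] = 0.f;
//   for (int m = 0; m < 10; m++)
//     for (int n = 0; n < 10; n++)
//       tmp_buf[n] = (tmp_buf[n]) + (b[m * 10 + n]);
//   sum[0] = 0.f;
//   for (int n = 0; n < 10; n++) sum[0] = (sum[0]) + (tmp_buf[n]);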
auto v = loops.at(1)->var();
auto c_body = NodeFinder<ReduceOp>::find(loop.root_stmt())[0];
loop.rfactor(c_body, v);
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n});
cg.call({in, out, M, N});
ASSERT_EQ(out[0], 4950);
}
TEST(Reductions, Reduce3DRfactorInternal) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(1)->var();
auto c_body = NodeFinder<ReduceOp>::find(loop.root_stmt())[0];
loop.rfactor(c_body, v);
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 499500);
}
TEST(Reductions, Reduce3DRfactorInner) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(2)->var();
auto c_body = NodeFinder<ReduceOp>::find(loop.root_stmt())[0];
loop.rfactor(c_body, v);
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 499500);
}
TEST(Reductions, Reduce3DRfactorOuter) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(0)->var();
auto c_body = NodeFinder<ReduceOp>::find(loop.root_stmt())[0];
loop.rfactor(c_body, v);
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 499500);
}
TEST(Reductions, Reduce3DRfactorWithOuter) {
KernelScope kernel_scope;
const int L = 5;
const int M = 5;
const int N = 5;
const int K = 5;
VarHandle l("l", kInt);
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {l, m, n, k}, kFloat));
std::vector<float> in(L * M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(L, -1.f);
Tensor* c =
Reduce("sum", {{l, "l"}}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(3)->var();
auto c_body = NodeFinder<ReduceOp>::find(loop.root_stmt())[0];
loop.rfactor(c_body, v);
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, l, m, n, k});
cg.call({in, out, L, M, N, K});
ASSERT_EQ(out[0], 7750);
}
TEST(Reductions, Reduce3DRfactorRepeated) {
KernelScope kernel_scope;
const int M = 5;
const int N = 5;
const int K = 5;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
LoopNest orig_loopnest({c});
for (int rVar1 = 0; rVar1 < 3; ++rVar1) {
for (int rVar2 = 0; rVar2 < 2; ++rVar2) {
std::vector<float> out(1, -1.f);
LoopNest loop(orig_loopnest);
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(reduces.size(), 1);
auto v1 = reduces[0]->reduce_args()[rVar1];
loop.rfactor(reduces[0], v1);
reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(reduces.size(), 2);
auto v2 = reduces[0]->reduce_args()[rVar2];
loop.rfactor(reduces[0], v2);
reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(reduces.size(), 3);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 7750);
}
}
}
TEST(Reductions, ReduceRfactorInsertionPoint) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
Placeholder b(BufHandle("b", {m, n}, kFloat));
std::vector<float> in(M * N);
for (int j = 0; j < M * N; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(0)->var();
auto c_body = NodeFinder<ReduceOp>::find(loop.root_stmt())[0];
loop.rfactor(c_body, v, loops.at(0)->body());
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n});
cg.call({in, out, M, N});
ASSERT_EQ(out[0], 4950);
}
TEST(Reductions, Reduce3DRfactorInsertionPoint) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{m, "m"}}, Sum(), b, {{n, "n"}, {k, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(1)->var();
auto c_body = NodeFinder<ReduceOp>::find(loop.root_stmt())[0];
loop.rfactor(c_body, v, loops.at(1)->body());
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 4950);
}
TEST(Reductions, ReduceRepeatedInternalRfactor) {
KernelScope kernel_scope;
Placeholder in_(BufHandle("in_", {2, 3, 4, 5, 6}, kFloat));
const int InputSize = 2 * 3 * 4 * 5 * 6;
std::vector<float> in(InputSize, 1.f);
std::vector<float> out(1, -1.f);
std::vector<float> ref(1, -1.f);
Tensor* c = Reduce(
"sum",
{},
Sum(),
in_,
{{2, "a"}, {3, "b"}, {4, "c"}, {5, "d"}, {6, "e"}});
LoopNest refloop({c});
LoopNest loop(refloop);
refloop.prepareForCodegen();
SimpleIREvaluator ref_cg(
IRSimplifier::simplify(refloop.root_stmt()), {in_, c});
ref_cg.call({in, ref});
// rfactor out "c".
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args()[3]);
// rfactor out "b".
reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args()[1]);
// rfactor out "d".
reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args()[1]);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in_, c});
cg.call({in, out});
ASSERT_EQ(ref[0], out[0]);
}
// Split a reduction axis with a tail loop.
TEST(Reductions, ReduceSplitTail) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner, *tail;
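// Splitting an extent-10 loop by a factor of 8 yields one full inner
// block of 8 iterations plus a tail loop covering the remaining 2.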
loop.splitWithTail(loops[i], 8, &outer, &inner, &tail);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Split a reduction axis cleanly so there is no tail loop.
TEST(Reductions, ReduceSplitNoTail) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner, *tail;
loop.splitWithTail(loops[i], 5, &outer, &inner, &tail);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Split a reduction axis with only a tail loop (the split loop has size 0
// and is eliminated).
TEST(Reductions, ReduceOverSplitTail) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner, *tail;
loop.splitWithTail(loops[i], 16, &outer, &inner, &tail);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Split a reduction axis with a mask.
TEST(Reductions, ReduceSplitMask) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner;
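// Unlike splitWithTail, splitWithMask rounds the trip count up to a
// multiple of the factor and guards the overflow iterations with a
// conditional mask rather than emitting a separate tail loop.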
loop.splitWithMask(loops[i], 8, &outer, &inner);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Split a reduction axis cleanly not requiring a mask.
TEST(Reductions, ReduceSplitNoMask) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner;
loop.splitWithMask(loops[i], 5, &outer, &inner);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Split a reduction axis with all logic in the mask.
TEST(Reductions, ReduceOverSplitMask) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner;
loop.splitWithMask(loops[i], 16, &outer, &inner);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Test an rfactor when there are two ReduceOps in the graph due to a
// splitWithTail.
TEST(Reductions, ReduceSplitRfactor) {
KernelScope kernel_scope;
const int M = 2;
const int N = 10;
const int K = 10;
const int SPLIT_FACTOR = 4;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int m = 0; m < M; ++m) {
for (int j = 0; j < N * K; ++j) {
in[m * N * K + j] = j;
}
}
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *o, *i, *t;
loop.splitWithTail(loops[2], SPLIT_FACTOR, &o, &i, &t);
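// The tail split leaves two ReduceOps in the IR (main and tail); rfactor
// the first one over its innermost remaining reduce argument.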
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args().back());
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
for (int i = 0; i < M; ++i) {
ASSERT_EQ(out[i], 4950);
}
}
// Test an rfactor which ends up being eliminated since the total loop size is
// smaller than the split factor.
TEST(Reductions, ReduceOverSplitRfactor) {
KernelScope kernel_scope;
const int N = 10;
const int K = 10;
const int SPLIT_FACTOR = 16;
Placeholder b(BufHandle("b", {N, K}, kFloat));
std::vector<float> in(N * K);
for (int j = 0; j < N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *o, *i, *t;
loop.splitWithTail(loops[1], SPLIT_FACTOR, &o, &i, &t);
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args().back());
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
std::ostringstream oss;
oss << *s;
// Check the IR to verify the rfactored reduce is eliminated.
// TODO: The alloc/free pair should be eliminated here since the buffer is
// size 0.
const std::string& verification_pattern =
R"IR(
# CHECK: Allocate(tmp_buf, float, {0});
# CHECK: sum[0] = 0.f;
# CHECK: for (int n = 0; n < 10; n++) {
# CHECK: for (int k_tail = 0; k_tail < 10; k_tail++) {
# CHECK: sum[0] = (sum[0]) + (b[k_tail + 10 * n]);
# CHECK: }
# CHECK: }
# CHECK: Free(tmp_buf);)IR";
// TODO: rfactor output is not consistent yet, will fix (@nickg).
// torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
}
TEST(Reductions, ReduceInlineReduction) {
KernelScope kernel_scope;
const int M = 4;
const int N = 5;
const int K = 6;
Placeholder a_buf("a", kFloat, {M});
Placeholder b_buf("b", kFloat, {M, N, K});
Tensor* x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}});
Tensor* y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) {
return a_buf.load(m) + x->call(m);
});
PaddedBuffer<float> a_v(M);
PaddedBuffer<float> b_v(M, N, K);
for (int i = 0; i < M; i++) {
a_v(i) = i * i;
}
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < K; k++) {
b_v(i, j, k) = j * j * k;
}
}
}
LoopNest l1({y});
// Cannot inline a reduction computation
ASSERT_FALSE(l1.computeInline(x->buf()));
}
TEST(Reductions, ReduceInlineConsumer) {
KernelScope kernel_scope;
const int M = 4;
const int N = 5;
const int K = 6;
Placeholder a_buf("a", kFloat, {M, N, K});
Placeholder b_buf("b", kFloat, {M, N, K});
Tensor* x = Compute(
"x",
{{M, "m1"}, {N, "n1"}, {K, "k1"}},
[&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
return a_buf.load(m, n, k) + b_buf.load(m, n, k);
});
Tensor* y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}});
PaddedBuffer<float> a_v(M, N, K);
PaddedBuffer<float> b_v(M, N, K);
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < K; k++) {
a_v(i, j, k) = i * i + k;
b_v(i, j, k) = j * j + k;
}
}
}
LoopNest l1({y});
LoopNest l2(l1);
l2.computeInline(x->buf());
l1.prepareForCodegen();
l2.prepareForCodegen();
Stmt* stmt1 = IRSimplifier::simplify(l1.root_stmt());
Stmt* stmt2 = IRSimplifier::simplify(l2.root_stmt());
SimpleIREvaluator eval1(stmt1, {a_buf, b_buf, y});
SimpleIREvaluator eval2(stmt2, {a_buf, b_buf, y});
PaddedBuffer<float> y_1(M);
PaddedBuffer<float> y_2(M);
eval1(a_v, b_v, y_1);
eval2(a_v, b_v, y_2);
ExpectAllNear(y_1, y_2, 1e-5);
std::ostringstream oss1, oss2;
oss1 << *stmt1;
oss2 << *stmt2;
ASSERT_GT(oss1.str().size(), oss2.str().size());
}
TEST(Reductions, ReduceInlineReducerInternal) {
KernelScope kernel_scope;
const int M = 4;
const int N = 5;
const int K = 6;
Placeholder a_buf("a", kFloat, {M, N, K});
Placeholder b_buf("b", kFloat, {M, N, K});
Tensor* x = Compute(
"x",
{{M, "m1"}, {N, "n1"}, {K, "k1"}},
[&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
return a_buf.load(m, n, k) + b_buf.load(m, n, k);
});
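// A deliberately non-standard reducer: each step takes the min of the
// accumulator and the incoming value, then adds 1. This exercises
// inlining through the reducer's interaction function.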
Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) {
return Add::make(ExprHandle(1.f), Min::make(a, b, false));
});
Tensor* y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}});
PaddedBuffer<float> a_v(M, N, K);
PaddedBuffer<float> b_v(M, N, K);
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < K; k++) {
a_v(i, j, k) = i * i + k;
b_v(i, j, k) = j * j + k;
}
}
}
LoopNest l1({y});
LoopNest l2(l1);
l2.computeInline(x->buf());
l1.prepareForCodegen();
l2.prepareForCodegen();
Stmt* stmt1 = IRSimplifier::simplify(l1.root_stmt());
Stmt* stmt2 = IRSimplifier::simplify(l2.root_stmt());
SimpleIREvaluator eval1(stmt1, {a_buf, b_buf, y});
SimpleIREvaluator eval2(stmt2, {a_buf, b_buf, y});
PaddedBuffer<float> y_1(M);
PaddedBuffer<float> y_2(M);
eval1(a_v, b_v, y_1);
eval2(a_v, b_v, y_2);
ExpectAllNear(y_1, y_2, 1e-5);
std::ostringstream oss1, oss2;
oss1 << *stmt1;
oss2 << *stmt2;
ASSERT_GT(oss1.str().size(), oss2.str().size());
}
TEST(Reductions, ReductionCacheAccessesOuter) {
KernelScope kernel_scope;
int L = 4;
int N = 3;
int M = 2;
Placeholder a(BufHandle("a", {L, N, M}, kFloat));
Placeholder b(BufHandle("b", {L, N, M}, kFloat));
Tensor* c = Compute(
"scale",
{{L, "l2"}, {N, "n1"}, {M, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
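// cacheAccesses redirects reads and writes of d's buffer within d_loop
// through a small scratch buffer ("d_local"), initializing it before the
// loop body and flushing it back into "sum" afterwards.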
Stmt* d_loop = l.getLoopStmtsFor(d)[1];
l.cacheAccesses(d->buf(), "d_local", d_loop);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: Allocate(d_local, float, {1});
#CHECK: sum[l1] = 0
#CHECK: d_local[0] = 0
#CHECK: for (int n1
#CHECK: for (int m1
#CHECK: d_local[0] = (d_local[0]) + (scale[
#CHECK: }
#CHECK: }
#CHECK: sum[l1] = (sum[l1]) + (d_local[0])
#CHECK: Free(d_local);
#CHECK-NOT: d_local
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
TEST(Reductions, ReductionCacheAccessesInner) {
KernelScope kernel_scope;
int L = 4;
int N = 3;
int M = 2;
Placeholder a(BufHandle("a", {L, N, M}, kFloat));
Placeholder b(BufHandle("b", {L, N, M}, kFloat));
Tensor* c = Compute(
"scale",
{{L, "l2"}, {N, "n1"}, {M, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
Stmt* d_loop = l.getLoopStmtsFor(d)[2];
l.cacheAccesses(d->buf(), "d_local", d_loop);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: sum[l1] = 0
#CHECK: for (int n1
#CHECK: Allocate(d_local, float, {1});
#CHECK: d_local[0] = 0
#CHECK: for (int m1
#CHECK: d_local[0] = (d_local[0]) + (scale[
#CHECK: }
#CHECK: sum[l1] = (sum[l1]) + (d_local[0])
#CHECK: Free(d_local);
#CHECK: }
#CHECK-NOT: d_local
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
TEST(Reductions, ReductionCacheBodyAccess) {
KernelScope kernel_scope;
Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
Tensor* c = Compute(
"scale",
{{24, "l2"}, {32, "n1"}, {12, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
Stmt* d_loop = l.getLoopStmtsFor(d)[1];
l.cacheAccesses(c->buf(), "scale_local", d_loop);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: Allocate(scale_local, float, {384});
#CHECK: for (int j = 0; j < 32; j++) {
#CHECK: for (int k = 0; k < 12; k++) {
#CHECK: scale_local[k + 12 * j] = scale[(k + 384 * l1) + 12 * j];
#CHECK: sum[l1] = (sum[l1]) + (scale_local[12 * n1_1 + m1_1]);
#CHECK: Free(scale_local);
#CHECK: scale_1[l] = (b[l]) * (sum[l]);
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
TEST(Reductions, ReductionCacheConsumerAccess) {
KernelScope kernel_scope;
Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
Tensor* c = Compute(
"scale",
{{24, "l2"}, {32, "n1"}, {12, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
For* outer;
For* inner;
l.splitWithMask(l.getLoopStmtsFor(e)[0], 4, &outer, &inner);
Stmt* e_loop = l.getLoopStmtsFor(e)[1];
l.cacheAccesses(d->buf(), "sum_local", e_loop);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: sum[l1] = (sum[l1]) + (scale[
#CHECK: Allocate(sum_local, float, {4});
#CHECK: for (int i = 0; i < 4
#CHECK: sum_local[i] = sum[i + 4 * l_outer];
#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]);
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
TEST(Reductions, ReductionSplitCacheConsumerAccess) {
KernelScope kernel_scope;
Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
Tensor* c = Compute(
"scale",
{{24, "l2"}, {32, "n1"}, {12, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
For* outer;
For* inner;
// Split outer reduction axis.
l.splitWithMask(l.getLoopStmtsFor(d)[0], 4, &outer, &inner);
// Split reduction consumer.
l.splitWithMask(l.getLoopStmtsFor(e)[0], 4, &outer, &inner);
l.cacheAccesses(d->buf(), "sum_local", inner);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
// The reduction body changes but the cache does not.
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: sum[l1_inner + 4 * l1_outer] = (sum[l1_inner + 4 * l1_outer]) + (scale[((12 * n1_1 + 384 * l1_inner) + m1_1) + 1536 * l1_outer]);
#CHECK: Allocate(sum_local, float, {4});
#CHECK: for (int i = 0; i < 4
#CHECK: sum_local[i] = sum[i + 4 * l_outer];
#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]);
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
TEST(Reductions, ReductionReorderCacheConsumerAccess) {
KernelScope kernel_scope;
Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
Tensor* c = Compute(
"scale",
{{24, "l2"}, {32, "n1"}, {12, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
For* outer;
For* inner;
// reorder outer reduction axes.
auto loops = l.getLoopStmtsFor(d);
l.reorderAxis(loops[0], loops[1]);
// Split reduction consumer.
l.splitWithMask(l.getLoopStmtsFor(e)[0], 4, &outer, &inner);
l.cacheAccesses(d->buf(), "sum_local", inner);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
// Neither the reduction body nor the cache changes.
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: sum[l1] = (sum[l1]) + (scale[(12 * n1_1 + m1_1) + 384 * l1]);
#CHECK: Allocate(sum_local, float, {4});
#CHECK: for (int i = 0; i < 4
#CHECK: sum_local[i] = sum[i + 4 * l_outer];
#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]);
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
TEST(Reductions, ReductionRfactorCacheTempOuter) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("B", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
LoopNest loop({c});
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args()[1]);
auto stores = NodeFinder<Store>::find(loop.root_stmt());
std::vector<For*> loops = NodeFinder<For>::find(loop.root_stmt());
loop.cacheAccesses(stores[1]->buf(), "tmp2", loops[2]);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
std::ostringstream oss;
oss << *s;
const std::string& expected_ir =
R"IR(
#CHECK: Allocate(tmp_buf, float, {n});
#CHECK: for (int a = 0; a < m
#CHECK: Allocate(tmp2, float, {n});
#CHECK: for (int i = 0; i < n
#CHECK: tmp2[i] = 0
#CHECK: }
#CHECK: for (int b = 0; b < n
#CHECK: for (int c
#CHECK: tmp2[b] = (tmp2[b]) + (B[
#CHECK: }
#CHECK: }
#CHECK: for (int i = 0; i < n
#CHECK: tmp_buf[i] = (tmp_buf[i]) + (tmp2[i]);
#CHECK: }
#CHECK: Free(tmp2);
#CHECK-NOT: tmp2
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 499500);
}
TEST(Reductions, ReductionRfactorCacheTempInner) {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("B", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
LoopNest loop({c});
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args()[1]);
auto stores = NodeFinder<Store>::find(loop.root_stmt());
std::vector<For*> loops = NodeFinder<For>::find(loop.root_stmt());
loop.cacheAccesses(stores[1]->buf(), "tmp2", loops[3]);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
std::ostringstream oss;
oss << *s;
const std::string& expected_ir =
R"IR(
#CHECK: Allocate(tmp_buf, float, {n});
#CHECK: for (int a = 0; a < m
#CHECK: for (int b = 0; b < n
#CHECK: Allocate(tmp2, float, {1});
#CHECK: tmp2[0] = 0
#CHECK: for (int c
#CHECK: tmp2[0] = (tmp2[0]) + (B[
#CHECK: }
#CHECK: tmp_buf[b] = (tmp_buf[b]) + (tmp2[0]);
#CHECK: Free(tmp2);
#CHECK-NOT: tmp2
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 499500);
}
TEST(Reductions, ReductionVectorize) {
KernelScope kernel_scope;
std::vector<float> in_(8 * 8);
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
in_[i * 8 + j] = i;
}
}
std::vector<float> out_before(8, -1.f);
std::vector<float> out_after(8, -1.f);
Placeholder in(BufHandle("in", {8, 8}, kFloat));
Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
LoopNest l_before({tensor});
LoopNest l(l_before);
l_before.prepareForCodegen();
SimpleIREvaluator cg_before(l_before.root_stmt(), {in, tensor});
cg_before.call({in_, out_before});
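// Vectorize the non-reduction axis m: the scalar store becomes a single
// vector operation over all 8 lanes, indexed with Ramp/Broadcast (see the
// IR check below).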
l.vectorize(l.getLoopStmtsFor(tensor)[0]);
Stmt* s = l.root_stmt();
s = IRSimplifier::simplify(s);
std::ostringstream oss;
oss << *s;
const std::string& expected_ir =
R"IR(
#CHECK: sum[Ramp(0, 1, 8)] = Broadcast(0.f, 8);
#CHECK: for (int n = 0; n < 8; n++) {
#CHECK: sum[Ramp(0, 1, 8)] = ReduceOp((sum[Ramp(0, 1, 8)]) + (in[Ramp(n, 8, 8)]), reduce_args={n});
#CHECK: }
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
// Vectorizing should not change the result.
l.prepareForCodegen();
s = IRSimplifier::simplify(l.root_stmt());
SimpleIREvaluator cg_after(s, {in, tensor});
cg_after.call({in_, out_after});
for (int i = 0; i < 8; ++i) {
ASSERT_EQ(out_before[i], out_after[i]);
}
}
TEST(Reductions, ReductionVectorizeInner) {
KernelScope kernel_scope;
Placeholder in(BufHandle("in", {8, 8}, kFloat));
Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
LoopNest l({tensor});
ASSERT_THROWS_WITH(
l.vectorize(l.getLoopStmtsFor(tensor)[1]), "reduction axis");
}
TEST(Reductions, ReductionVectorizeRfactor) {
KernelScope kernel_scope;
std::vector<float> in_(8 * 8);
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
in_[i * 8 + j] = i;
}
}
std::vector<float> out_before(1, -1.f);
std::vector<float> out_after(1, -1.f);
Placeholder in(BufHandle("in", {8, 8}, kFloat));
Tensor* tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}});
LoopNest l_before({tensor});
LoopNest l(l_before);
l_before.prepareForCodegen();
SimpleIREvaluator cg_before(l_before.root_stmt(), {in, tensor});
cg_before.call({in_, out_before});
ASSERT_THROWS_WITH(
l.vectorize(l.getLoopStmtsFor(tensor)[1]), "reduction axis");
// But if we rfactor so that the loop is no longer a reduce axis, we can
// vectorize it.
std::vector<For*> loops = l.getLoopStmtsFor(tensor);
auto v = loops.at(1)->var();
auto tensor_body = NodeFinder<ReduceOp>::find(l.root_stmt())[0];
l.rfactor(tensor_body, v);
loops = NodeFinder<For>::find(l.root_stmt());
l.vectorize(loops[2]);
Stmt* s = l.root_stmt();
s = IRSimplifier::simplify(s);
std::ostringstream oss;
oss << *s;
const std::string& expected_ir =
R"IR(
#CHECK: sum = 0.f;
#CHECK: for (int n = 0; n < 8; n++) {
#CHECK: tmp_buf[n] = 0.f;
#CHECK: }
#CHECK: for (int m = 0; m < 8; m++) {
#CHECK: tmp_buf[Ramp(0, 1, 8)] = ReduceOp((tmp_buf[Ramp(0, 1, 8)]) + (in[Ramp(8 * m, 1, 8)]), reduce_args={m});
#CHECK: }
#CHECK: for (int n = 0; n < 8; n++) {
#CHECK: sum = ReduceOp((sum) + (tmp_buf[n]), reduce_args={n});
#CHECK: }
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
// Vectorizing should not change the result.
l.prepareForCodegen();
s = IRSimplifier::simplify(l.root_stmt());
SimpleIREvaluator cg_after(s, {in, tensor});
cg_after.call({in_, out_after});
ASSERT_EQ(out_before[0], out_after[0]);
}
} // namespace jit
} // namespace torch