pytorch/test/cpp/tensorexpr/test_reductions.cpp
Nick Gibson 402abdfdf4 [NNC] cacheAccesses transform (cache_reads + cache_writes) (#45869)
Summary:
Adds a new transform to the NNC compiler that supports caching of buffer accesses. All accesses within a provided scope are redirected to a cache, which is initialized or written back as necessary at the boundaries of that scope. For TVM fans, this is essentially a combination of cache_reads and cache_writes. E.g. it can do this kind of thing:

Before:
```
for (int i = 0; i < 64; i++) {
  for (int j = 0; j < 64; j++) {
    A[i, j] = i * j;
  }
}
for (int i_1 = 0; i_1 < 20; i_1++) {
  for (int j_1 = 0; j_1 < 10; j_1++) {
    B[i_1, j_1] = (A(i_1 + 30, j_1 + 40)) + (A(i_1 + 31, j_1 + 41));
  }
}
```

After `cacheAccesses(A->buf(), "A_local", j_loop);`

```
for (int i = 0; i < 64; i++) {
  for (int j = 0; j < 64; j++) {
    A[i, j] = i * j;
  }
}
for (int i_1 = 0; i_1 < 20; i_1++) {
  for (int i_2 = 0; i_2 < 2; i_2++) {
    for (int j_1 = 0; j_1 < 11; j_1++) {
      A_local[i_2, j_1] = A[(i_2 + i_1) + 30, j_1 + 40];
    }
  }
  for (int j_2 = 0; j_2 < 10; j_2++) {
    B[i_1, j_2] = (A_local[1, j_2 + 1]) + (A_local[0, j_2]);
  }
}
```

Or this reduction:
```
for (int l1 = 0; l1 < 4; l1++) {
  sum[l1] = 0.f;
  for (int n1_1 = 0; n1_1 < 3; n1_1++) {
    for (int m1_1 = 0; m1_1 < 2; m1_1++) {
      sum[l1] = (sum[l1]) + (scale[(6 * l1 + 2 * n1_1) + m1_1]);
    }
  }
}
```

After `l.cacheAccesses(d->buf(), "d_local", n_loop);`:

```
for (int l1 = 0; l1 < 4; l1++) {
  Allocate(d_local, float, {1});
  sum[l1] = 0.f;
  d_local[0] = 0.f;
  for (int n1_1 = 0; n1_1 < 3; n1_1++) {
    for (int m1_1 = 0; m1_1 < 2; m1_1++) {
      d_local[0] = (d_local[0]) + (scale[(6 * l1 + 2 * n1_1) + m1_1]);
    }
  }
  sum[l1] = (sum[l1]) + (d_local[0]);
  Free(d_local);
}
```
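
For reference, the call sequence used by the new tests looks roughly like this (a minimal sketch; the shapes and the names `a`, `d` and `d_local` are illustrative):

```
KernelScope kernel_scope;
// Reduce a {4, 3, 2} buffer over its last two axes.
Placeholder a(BufHandle("a", {4, 3, 2}, kFloat));
Tensor* d = Reduce("sum", {{4, "l1"}}, Sum(), a, {{3, "n1"}, {2, "m1"}});
LoopNest l({d});
// Cache accesses to the accumulator for the scope of the n1 loop.
Stmt* n_loop = l.getLoopStmtsFor(d)[1];
l.cacheAccesses(d->buf(), "d_local", n_loop);
l.prepareForCodegen();
Stmt* s = IRSimplifier::simplify(l.root_stmt());
```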

I had originally planned to write `cacheReads` and `cacheWrites` wrappers so we could use them just like their TVM cousins, but they ended up being big masses of checks that no writes (or reads) were present. That didn't feel too useful, so I removed them, but let me know.

This is based on bounds inference and inherits a few bugs present in that functionality, which I will address in a followup.

While working on this I realized that it overlaps heavily with `computeAt`, which is really just `cacheReads` + `computeInline`. I'm considering refactoring `computeAt` to be a wrapper around those two transforms. ZolotukhinM, opinions on this?

Pull Request resolved: https://github.com/pytorch/pytorch/pull/45869

Reviewed By: mruberry

Differential Revision: D24195276

Pulled By: nickgg

fbshipit-source-id: 36a58ae265f346903187ebc4923637b628048155
2020-10-08 14:13:28 -07:00


#include <limits>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <unordered_map>
#include <test/cpp/tensorexpr/test_base.h>
#include <test/cpp/tensorexpr/padded_buffer.h>
#include <torch/csrc/jit/tensorexpr/analysis.h>
#include <torch/csrc/jit/tensorexpr/eval.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_printer.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/csrc/jit/testing/file_check.h>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
// Sum an array to a single value.
void testReduceSum1D() {
KernelScope kernel_scope;
Placeholder b(BufHandle("b", {10}, kFloat));
std::vector<float> in(10);
for (int j = 0; j < 10; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{10, "m"}});
LoopNest loop({c});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 45);
}
// Sum a 2D tensor to a 1D tensor with dynamic shapes.
void testReduceSum2D() {
KernelScope kernel_scope;
const int M = 3;
const int N = 7;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
Placeholder b(BufHandle("b", {m, n}, kFloat));
std::vector<float> in(M * N);
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
in[i * N + j] = j;
}
}
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
LoopNest loop({c});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, n, m});
cg.call({in, out, 5, 7});
float expected = 0;
for (int i = 0; i < N; ++i) {
expected += i;
}
for (int i = 0; i < M; ++i) {
ASSERT_EQ(out[i], expected);
}
}
// Sum a 3D tensor down to both a 2D and a 1D tensor, then reduce the 2D result
// again to check our work.
void testReduceSum3D() {
KernelScope kernel_scope;
const int M = 10;
VarHandle m("m", kInt);
Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}});
LoopNest loop({c});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m});
std::vector<float> bData(2 * 3 * M, 0);
std::vector<float> cData(2 * 3, 6.0f);
std::vector<float> dData(2, 1.0f);
std::vector<float> eData(2, 1.0f);
for (int i = 0; i < 2 * 3; ++i) {
for (int j = 0; j < M; ++j) {
bData[i * M + j] = j;
}
}
cg.call({bData, cData, M});
float expected = 0;
for (int i = 0; i < M; ++i) {
expected += i;
}
for (int i = 0; i < 2 * 3; ++i) {
ASSERT_EQ(cData[i], expected);
}
Tensor* d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}});
LoopNest loop2({d});
loop2.prepareForCodegen();
Stmt* s2 = loop2.root_stmt();
s2 = IRSimplifier::simplify(s2);
SimpleIREvaluator cg2(s2, {b, d, m});
cg2.call({bData, dData, M});
// We're combining an additional dimension of 3, so the sum is 3x.
expected = expected * 3;
for (int i = 0; i < 2; ++i) {
ASSERT_EQ(dData[i], expected);
}
// This is the same as just reducing the original result across that axis.
Placeholder c_buf(BufHandle(c->buf()));
Tensor* e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}});
LoopNest loop3({e});
loop3.prepareForCodegen();
Stmt* s3 = loop3.root_stmt();
s3 = IRSimplifier::simplify(s3);
SimpleIREvaluator cg3(s3, {c, e});
cg3.call({cData, eData});
for (int i = 0; i < 2; ++i) {
ASSERT_EQ(eData[i], expected);
}
}
// Sum a large (10-D) tensor over its last 5 dimensions.
void testReduceSum10D() {
KernelScope kernel_scope;
Placeholder in_(BufHandle("in_", {2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, kFloat));
const int InputSize = 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3;
Placeholder out_(BufHandle("out_", {2, 3, 2, 3, 2}, kFloat));
const int OutputSize = 2 * 3 * 2 * 3 * 2;
std::vector<float> in(InputSize, 1.f);
std::vector<float> out(OutputSize, -1.f);
Tensor* c = Reduce(
"sum",
{{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}},
Sum(),
in_,
{{3, "f"}, {2, "g"}, {3, "h"}, {2, "i"}, {3, "j"}});
LoopNest loop({c});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in_, c});
cg.call({in, out});
float expected = InputSize / OutputSize;
for (int i = 0; i < OutputSize; ++i) {
ASSERT_EQ(out[i], expected);
}
}
// Reduce via Mul rather than Add using a custom Reducer.
void testReduceProduct() {
KernelScope kernel_scope;
const int M = 4;
const int N = 4;
Placeholder b(BufHandle("b", {M, N}, kFloat));
std::vector<float> in(M * N);
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
in[i * N + j] = 2 + j;
}
}
std::vector<float> out(M, -1.f);
Reducer product(
ExprHandle(1.f), [](ExprHandle a, ExprHandle b) { return a * b; });
Tensor* c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}});
LoopNest loop({c});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
float expected = 1;
for (int i = 0; i < N; ++i) {
expected *= 2 + i;
}
for (int i = 0; i < M; ++i) {
ASSERT_EQ(out[i], expected);
}
}
// Maximum reductions.
void testReduceMax() {
KernelScope kernel_scope;
Placeholder in_(BufHandle("b", {10}, kFloat));
std::vector<float> in(10);
std::vector<float> out(1, -1.f);
for (int j = 0; j < 10; ++j) {
in[j] = j;
}
Tensor* dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}});
LoopNest loop({dm1});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in_, dm1});
cg.call({in, out});
ASSERT_EQ(out[0], 9);
Placeholder in2_(BufHandle("b", {2, 5}, kFloat));
std::vector<float> out2(2, -1.f);
Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}});
loop = LoopNest({m2d});
loop.prepareForCodegen();
s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg2(s, {in2_, m2d});
cg2.call({in, out2});
ASSERT_EQ(out2[0], 4);
ASSERT_EQ(out2[1], 9);
}
// Minimum reduction, with custom initialization.
void testReduceMinCustomInitializer() {
KernelScope kernel_scope;
VarHandle minInit("minInit", kFloat);
Placeholder in_(BufHandle("b", {10}, kFloat));
std::vector<float> in(10);
std::vector<float> out(1, -1.f);
for (int j = 0; j < 10; ++j) {
in[j] = 10 + j;
}
Tensor* min = Reduce(
"min",
{},
Minimum(ExprHandle(minInit)),
[&](ParameterList& v) { return in_.load(v); },
{{10, "m"}});
LoopNest loop({min});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in_, min, minInit});
// Works normally (note that out data starts lower than the correct
// minimum).
cg.call({in, out, std::numeric_limits<float>::max()});
ASSERT_EQ(out[0], 10);
// With an initializer lower than the min, that's the min.
cg.call({in, out, 5.f});
ASSERT_EQ(out[0], 5);
}
// Example implementation of Any/All.
// TODO: this is very awkward without logical And/Or operators.
void testReduceAnyAll() {
KernelScope kernel_scope;
VarHandle searchValue("searchValue", kInt);
Placeholder b(BufHandle("b", {4, 10}, kInt));
Reducer anyEqSV(ExprHandle(0), [](ExprHandle a, ExprHandle b) {
return CompareSelect::make(a, 1, 1, b, kEQ);
});
Tensor* any = Reduce(
"anyEqual",
{{4, "i"}},
anyEqSV,
[&](const auto& i, const auto& j) {
return CompareSelect::make(b.load(i, j), searchValue, kEQ);
},
{{10, "j"}});
LoopNest loop({any});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, any, searchValue});
std::vector<int> in(40, 0);
std::vector<int> out(4, 0);
// input has 0-39 in 4 rows.
for (int i = 0; i < 40; ++i) {
in[i] = i;
}
cg.call({in, out, 1});
// only the first row has 1
ASSERT_EQ(out[0], 1);
ASSERT_EQ(out[1], 0);
ASSERT_EQ(out[2], 0);
ASSERT_EQ(out[3], 0);
cg.call({in, out, 15});
// 15 is in the second row, so only out[1] is set.
ASSERT_EQ(out[0], 0);
ASSERT_EQ(out[1], 1);
ASSERT_EQ(out[2], 0);
ASSERT_EQ(out[3], 0);
Reducer allGTSV(ExprHandle(1), [](ExprHandle a, ExprHandle b) {
return CompareSelect::make(a, 0, 0, b, kEQ);
});
Tensor* allGreaterThan = Reduce(
"allGreaterThan",
{{4, "i"}},
allGTSV,
[&](const auto& i, const auto& j) {
return CompareSelect::make(b.load(i, j), searchValue, kGT);
},
{{10, "j"}});
loop = LoopNest({allGreaterThan});
loop.prepareForCodegen();
s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg2(s, {b, allGreaterThan, searchValue});
cg2.call({in, out, 11});
// Only the last two rows (values 20-39) are entirely greater than 11.
ASSERT_EQ(out[0], 0);
ASSERT_EQ(out[1], 0);
ASSERT_EQ(out[2], 1);
ASSERT_EQ(out[3], 1);
cg2.call({in, out, -3});
// All values are greater than -3.
ASSERT_EQ(out[0], 1);
ASSERT_EQ(out[1], 1);
ASSERT_EQ(out[2], 1);
ASSERT_EQ(out[3], 1);
}
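// Matrix multiply (3x2 times 2x3) expressed as a Reduce over the shared k axis.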
void testReduceMatmul2D() {
KernelScope kernel_scope;
Placeholder tA(BufHandle("tA", {3, 2}, kFloat));
Placeholder tB(BufHandle("tB", {2, 3}, kFloat));
std::vector<float> tA_(6);
std::vector<float> tB_(6);
std::vector<float> out(9, -1.f);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 2; ++j) {
tA_[i * 2 + j] = i * 2 + j;
tB_[j * 3 + i] = i * 2 + j;
}
}
Tensor* mm = Reduce(
"mm",
{{3, "m"}, {3, "n"}},
Sum(),
[&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) {
return tA.load(m, k) * tB.load(k, n);
},
{{2, "k"}});
LoopNest loop({mm});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {tA, tB, mm});
cg.call({tA_, tB_, out});
std::vector<float> expected(
{1.f, 3.f, 5.f, 3.f, 13.f, 23.f, 5.f, 23.f, 41.f});
for (int i = 0; i < 9; ++i) {
ASSERT_EQ(out[i], expected[i]);
}
}
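// Emulate an rfactor by chaining two Reduce ops: 2D -> 1D, then 1D -> scalar.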
void testReduceRfactorLike() {
KernelScope kernel_scope;
Placeholder in(BufHandle("in", {10, 10}, kFloat));
std::vector<float> in_(100);
for (int i = 0; i < 100; ++i) {
in_[i] = i;
}
std::vector<float> in_rf_(10, -2.f);
std::vector<float> out(1, -1.f);
Tensor* l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}});
Placeholder in_rf(BufHandle(l1->buf()));
Tensor* l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}});
LoopNest loop({l1, l2});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in, l1, l2});
cg.call({in_, in_rf_, out});
ASSERT_EQ(out[0], 99 * 50);
}
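// Use a reduction as the producer of a later Compute.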
void testReduceAsProducer() {
KernelScope kernel_scope;
const int M = 10;
VarHandle m("m", kInt);
Placeholder a(BufHandle("a", {2, 3}, kFloat));
Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
Tensor* c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}});
Tensor* d = Compute(
"scale",
{{2, "l2"}, {3, "n1"}},
[&](const VarHandle& l, const VarHandle& n) {
return c->call(l, n) * a.load(l, n);
});
LoopNest loop({d});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {a, b, d, m});
std::vector<float> aData(2 * 3, 0);
std::vector<float> bData(2 * 3 * M, 0);
std::vector<float> dData(2 * 3, 6.0f);
for (int i = 0; i < 2 * 3; ++i) {
aData[i] = 6 - i;
for (int j = 0; j < M; ++j) {
bData[i * M + j] = j;
}
}
cg.call({aData, bData, dData, M});
float expected = 0;
for (int i = 0; i < M; ++i) {
expected += i;
}
for (int i = 0; i < 2 * 3; ++i) {
ASSERT_EQ(dData[i], expected * (6 - i));
}
}
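// Use a Compute as the producer of a reduction (the reduction is the consumer).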
void testReduceAsConsumer() {
KernelScope kernel_scope;
const int M = 10;
VarHandle m("m", kInt);
Placeholder a(BufHandle("a", {2, 3, m}, kFloat));
Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
Tensor* c = Compute(
"scale",
{{2, "l2"}, {3, "n1"}, {m, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}});
LoopNest loop({d});
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {a, b, d, m});
std::vector<float> aData(2 * 3 * M, 0);
std::vector<float> bData(2 * 3 * M, 0);
std::vector<float> dData(2, 6.0f);
for (int i = 0; i < 2 * 3; ++i) {
for (int j = 0; j < M; ++j) {
bData[i * M + j] = j + 1;
aData[i * M + j] = 6 - i;
}
}
cg.call({aData, bData, dData, M});
float expected[2] = {0, 0};
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < M; ++k) {
expected[i] += (k + 1) * (6 - (i * 3 + j));
}
}
}
for (int i = 0; i < 2; ++i) {
ASSERT_EQ(dData[i], expected[i]);
}
}
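// Split the reduce axis with a tail loop; the result must be unchanged.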
void testSplitReduceAxis() {
KernelScope kernel_scope;
Placeholder in(BufHandle("in", {16, 8}, kFloat));
std::vector<float> in_(16 * 8);
for (int i = 0; i < 16; ++i) {
for (int j = 0; j < 8; ++j) {
in_[i * 8 + j] = i;
}
}
std::vector<float> out(16, -1.f);
Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
LoopNest l({tensor});
For* x_outer;
For* x_inner;
For* x_tail;
std::vector<For*> loops = l.getLoopStmtsFor(tensor);
l.splitWithTail(loops[1], 2, &x_outer, &x_inner, &x_tail);
l.prepareForCodegen();
Stmt* s = l.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in, tensor});
cg.call({in_, out});
for (int i = 0; i < 16; ++i) {
ASSERT_EQ(out[i], i * 8);
}
}
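// Split the non-reduce (output) axis twice with tail loops; the result must be
// unchanged.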
void testSplitNonReduceAxis() {
KernelScope kernel_scope;
Placeholder in(BufHandle("in", {16, 8}, kFloat));
std::vector<float> in_(16 * 8);
for (int i = 0; i < 16; ++i) {
for (int j = 0; j < 8; ++j) {
in_[i * 8 + j] = i;
}
}
std::vector<float> out(16, -1.f);
Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
LoopNest l({tensor});
For* x_outer;
For* x_inner;
For* x_tail;
std::vector<For*> loops = l.getLoopStmtsFor(tensor);
l.splitWithTail(loops[0], 2, &x_outer, &x_inner, &x_tail);
For* x_2;
For* x_1;
For* x_tail_2;
l.splitWithTail(x_outer, 2, &x_2, &x_1, &x_tail_2);
l.prepareForCodegen();
Stmt* s = l.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in, tensor});
cg.call({in_, out});
for (int i = 0; i < 16; ++i) {
ASSERT_EQ(out[i], i * 8);
}
}
void testReorderedReductionInitializer() {
KernelScope kernel_scope;
/* From the quip:
for k in 0..1: // blockIdx
for m in 0..128:
for n in 0..64: // threadIdx
SumOp(c(k, n), 0, a(k, m, n), {m})
*/
Placeholder in(BufHandle("in", {1, 12, 6}, kFloat));
std::vector<float> in_(12 * 6, 1.f);
Tensor* tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
LoopNest l_({tensor_});
l_.prepareForCodegen();
Stmt* s_ = Stmt::clone(l_.root_stmt());
s_ = IRSimplifier::simplify(s_);
Tensor* tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
LoopNest l({tensor});
auto loops = l.getLoopStmtsFor(tensor);
l.setGPUBlockIndex(loops[0], 0);
l.setGPUThreadIndex(loops[1], 0);
l.reorderAxis(loops[1], loops[2]);
Stmt* s = l.root_stmt();
s = IRSimplifier::simplify(s);
l.prepareForCodegen();
s = l.root_stmt();
s = IRSimplifier::simplify(s);
std::vector<float> out1(16, -1.f);
SimpleIREvaluator cg(s_, {in, tensor_});
cg.call({in_, out1});
std::vector<float> out2(16, -1.f);
SimpleIREvaluator cg2(s, {in, tensor});
cg2.call({in_, out2});
for (int i = 0; i < 16; ++i) {
ASSERT_EQ(out1[i], out2[i]);
}
}
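// rfactor a 2D scalar reduction over its inner reduce axis.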
void testReduceRfactor() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
Placeholder b(BufHandle("b", {m, n}, kFloat));
std::vector<float> in(M * N);
for (int j = 0; j < M * N; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(1)->var();
loop.rfactor(c->body(), v);
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n});
cg.call({in, out, M, N});
ASSERT_EQ(out[0], 4950);
}
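// rfactor a 3D scalar reduction over its middle reduce axis.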
void testReduce3DRfactorInternal() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(1)->var();
loop.rfactor(c->body(), v);
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 499500);
}
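// rfactor a 3D scalar reduction over its innermost reduce axis.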
void testReduce3DRfactorInner() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(2)->var();
loop.rfactor(c->body(), v);
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 499500);
}
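// rfactor a 3D scalar reduction over its outermost reduce axis.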
void testReduce3DRfactorOuter() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(0)->var();
loop.rfactor(c->body(), v);
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 499500);
}
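// rfactor over the innermost reduce axis of a reduction that also has a
// non-reduce output axis.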
void testReduce3DRfactorWithOuter() {
KernelScope kernel_scope;
const int L = 5;
const int M = 5;
const int N = 5;
const int K = 5;
VarHandle l("l", kInt);
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {l, m, n, k}, kFloat));
std::vector<float> in(L * M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(L, -1.f);
Tensor* c =
Reduce("sum", {{l, "l"}}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(3)->var();
loop.rfactor(c->body(), v);
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, l, m, n, k});
cg.call({in, out, L, M, N, K});
ASSERT_EQ(out[0], 7750);
}
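// Apply rfactor twice in sequence, for several combinations of reduce axes.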
void testReduce3DRfactorRepeated() {
KernelScope kernel_scope;
const int M = 5;
const int N = 5;
const int K = 5;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
for (int rVar1 = 0; rVar1 < 3; ++rVar1) {
for (int rVar2 = 0; rVar2 < 2; ++rVar2) {
std::vector<float> out(1, -1.f);
LoopNest loop({c});
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(reduces.size(), 1);
auto v1 = reduces[0]->reduce_args()[rVar1];
loop.rfactor(reduces[0], v1);
reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(reduces.size(), 2);
auto v2 = reduces[0]->reduce_args()[rVar2];
loop.rfactor(reduces[0], v2);
reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(reduces.size(), 3);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 7750);
}
}
}
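// rfactor over the outer reduce axis, with an explicit insertion point (the
// outer loop body).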
void testReduceRfactorInsertionPoint() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
Placeholder b(BufHandle("b", {m, n}, kFloat));
std::vector<float> in(M * N);
for (int j = 0; j < M * N; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(0)->var();
loop.rfactor(c->body(), v, loops.at(0)->body());
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n});
cg.call({in, out, M, N});
ASSERT_EQ(out[0], 4950);
}
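// rfactor a reduction that has an output axis, using an explicit insertion
// point inside the loop nest.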
void testReduce3DRfactorInsertionPoint() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("b", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{m, "m"}}, Sum(), b, {{n, "n"}, {k, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
auto v = loops.at(1)->var();
loop.rfactor(c->body(), v, loops.at(1)->body());
auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
ASSERT_EQ(rc.size(), 2);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 4950);
}
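// Apply rfactor repeatedly to a 5D scalar reduction and compare against an
// unscheduled reference.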
void testReduceRepeatedInternalRfactor() {
KernelScope kernel_scope;
Placeholder in_(BufHandle("in_", {2, 3, 4, 5, 6}, kFloat));
const int InputSize = 2 * 3 * 4 * 5 * 6;
std::vector<float> in(InputSize, 1.f);
std::vector<float> out(1, -1.f);
std::vector<float> ref(1, -1.f);
Tensor* c = Reduce(
"sum",
{},
Sum(),
in_,
{{2, "a"}, {3, "b"}, {4, "c"}, {5, "d"}, {6, "e"}});
LoopNest refloop({c});
refloop.prepareForCodegen();
SimpleIREvaluator ref_cg(
IRSimplifier::simplify(refloop.root_stmt()), {in_, c});
ref_cg.call({in, ref});
LoopNest loop({c});
// rfactor out "c".
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args()[3]);
// rfactor out "b".
reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args()[1]);
// rfactor out "d".
reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args()[1]);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {in_, c});
cg.call({in, out});
ASSERT_EQ(ref[0], out[0]);
}
// Split a reduction axis with a tail loop.
void testReduceSplitTail() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner, *tail;
loop.splitWithTail(loops[i], 8, &outer, &inner, &tail);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Split a reduction axis cleanly so there is no tail loop.
void testReduceSplitNoTail() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner, *tail;
loop.splitWithTail(loops[i], 5, &outer, &inner, &tail);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Split a reduction axis with only a tail loop (the split loop will have size 0
// and be eliminated).
void testReduceOverSplitTail() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner, *tail;
loop.splitWithTail(loops[i], 16, &outer, &inner, &tail);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Split a reduction axis with a mask.
void testReduceSplitMask() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner;
loop.splitWithMask(loops[i], 8, &outer, &inner);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Split a reduction axis cleanly not requiring a mask.
void testReduceSplitNoMask() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner;
loop.splitWithMask(loops[i], 5, &outer, &inner);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Split a reduction axis with all logic in the mask.
void testReduceOverSplitMask() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
for (int i = 0; i < 3; ++i) {
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *outer, *inner;
loop.splitWithMask(loops[i], 16, &outer, &inner);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
}
}
// Test an rfactor when there are two ReduceOps in the graph due to a
// splitWithTail.
void testReduceSplitRfactor() {
KernelScope kernel_scope;
const int M = 2;
const int N = 10;
const int K = 10;
const int SPLIT_FACTOR = 4;
Placeholder b(BufHandle("b", {M, N, K}, kFloat));
std::vector<float> in(M * N * K);
for (int m = 0; m < M; ++m) {
for (int j = 0; j < N * K; ++j) {
in[m * N * K + j] = j;
}
}
std::vector<float> out(M, -1.f);
Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *o, *i, *t;
loop.splitWithTail(loops[2], SPLIT_FACTOR, &o, &i, &t);
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args().back());
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
for (int i = 0; i < M; ++i) {
ASSERT_EQ(out[i], 4950);
}
}
// Test an rfactor which ends up being eliminated since the total loop size is
// smaller than the split factor.
void testReduceOverSplitRfactor() {
KernelScope kernel_scope;
const int N = 10;
const int K = 10;
const int SPLIT_FACTOR = 16;
Placeholder b(BufHandle("b", {N, K}, kFloat));
std::vector<float> in(N * K);
for (int j = 0; j < N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}});
LoopNest loop({c});
std::vector<For*> loops = loop.getLoopStmtsFor(c);
For *o, *i, *t;
loop.splitWithTail(loops[1], SPLIT_FACTOR, &o, &i, &t);
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args().back());
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
SimpleIREvaluator cg(s, {b, c});
cg.call({in, out});
ASSERT_EQ(out[0], 4950);
std::ostringstream oss;
oss << *s;
// Check the IR to verify the rfactored reduce is eliminated.
// TODO: The alloc free should be eliminated here since it is size 0.
const std::string& verification_pattern =
R"IR(
# CHECK: Allocate(tmp_buf, float, {0});
# CHECK: sum[0] = 0.f;
# CHECK: for (int n = 0; n < 10; n++) {
# CHECK: for (int k_tail = 0; k_tail < 10; k_tail++) {
# CHECK: sum[0] = (sum[0]) + (b[k_tail + 10 * n]);
# CHECK: }
# CHECK: }
# CHECK: Free(tmp_buf);)IR";
// TODO: rfactor output is not consistent yet, will fix (@nickg).
// torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
}
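// Inlining the buffer of a reduction is unsupported and must throw.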
void testReduceInlineReduction() {
KernelScope kernel_scope;
const int M = 4;
const int N = 5;
const int K = 6;
Placeholder a_buf("a", kFloat, {M});
Placeholder b_buf("b", kFloat, {M, N, K});
Tensor* x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}});
Tensor* y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) {
return a_buf.load(m) + x->call(m);
});
PaddedBuffer<float> a_v(M);
PaddedBuffer<float> b_v(M, N, K);
for (int i = 0; i < M; i++) {
a_v(i) = i * i;
}
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < K; k++) {
b_v(i, j, k) = j * j * k;
}
}
}
LoopNest l1({y});
ASSERT_THROWS_WITH(
l1.computeInline(x->buf()), "cannot inline a reduction computation");
}
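// Inline a producer Compute into a reduction and check the result matches the
// uninlined version.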
void testReduceInlineConsumer() {
KernelScope kernel_scope;
const int M = 4;
const int N = 5;
const int K = 6;
Placeholder a_buf("a", kFloat, {M, N, K});
Placeholder b_buf("b", kFloat, {M, N, K});
Tensor* x = Compute(
"x",
{{M, "m1"}, {N, "n1"}, {K, "k1"}},
[&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
return a_buf.load(m, n, k) + b_buf.load(m, n, k);
});
Tensor* y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}});
PaddedBuffer<float> a_v(M, N, K);
PaddedBuffer<float> b_v(M, N, K);
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < K; k++) {
a_v(i, j, k) = i * i + k;
b_v(i, j, k) = j * j + k;
}
}
}
LoopNest l1({y});
LoopNest l2({y});
l2.computeInline(x->buf());
l1.prepareForCodegen();
l2.prepareForCodegen();
Stmt* stmt1 = IRSimplifier::simplify(l1.root_stmt());
Stmt* stmt2 = IRSimplifier::simplify(l2.root_stmt());
SimpleIREvaluator eval1(stmt1, a_buf, b_buf, y);
SimpleIREvaluator eval2(stmt2, a_buf, b_buf, y);
PaddedBuffer<float> y_1(M);
PaddedBuffer<float> y_2(M);
eval1(a_v, b_v, y_1);
eval2(a_v, b_v, y_2);
ExpectAllNear(y_1, y_2, 1e-5);
std::ostringstream oss1, oss2;
oss1 << *stmt1;
oss2 << *stmt2;
ASSERT_GT(oss1.str().size(), oss2.str().size());
}
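// As above, but with a custom Reducer whose combine step is non-trivial.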
void testReduceInlineReducerInternal() {
KernelScope kernel_scope;
const int M = 4;
const int N = 5;
const int K = 6;
Placeholder a_buf("a", kFloat, {M, N, K});
Placeholder b_buf("b", kFloat, {M, N, K});
Tensor* x = Compute(
"x",
{{M, "m1"}, {N, "n1"}, {K, "k1"}},
[&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
return a_buf.load(m, n, k) + b_buf.load(m, n, k);
});
Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) {
return Add::make(ExprHandle(1.f), Min::make(a, b, false));
});
Tensor* y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}});
PaddedBuffer<float> a_v(M, N, K);
PaddedBuffer<float> b_v(M, N, K);
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < K; k++) {
a_v(i, j, k) = i * i + k;
b_v(i, j, k) = j * j + k;
}
}
}
LoopNest l1({y});
LoopNest l2({y});
l2.computeInline(x->buf());
l1.prepareForCodegen();
l2.prepareForCodegen();
Stmt* stmt1 = IRSimplifier::simplify(l1.root_stmt());
Stmt* stmt2 = IRSimplifier::simplify(l2.root_stmt());
SimpleIREvaluator eval1(stmt1, a_buf, b_buf, y);
SimpleIREvaluator eval2(stmt2, a_buf, b_buf, y);
PaddedBuffer<float> y_1(M);
PaddedBuffer<float> y_2(M);
eval1(a_v, b_v, y_1);
eval2(a_v, b_v, y_2);
ExpectAllNear(y_1, y_2, 1e-5);
std::ostringstream oss1, oss2;
oss1 << *stmt1;
oss2 << *stmt2;
ASSERT_GT(oss1.str().size(), oss2.str().size());
}
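// Cache the reduction accumulator over the scope of the outer reduce loop (n1).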
void testReductionCacheAccessesOuter() {
KernelScope kernel_scope;
int L = 4;
int N = 3;
int M = 2;
Placeholder a(BufHandle("a", {L, N, M}, kFloat));
Placeholder b(BufHandle("b", {L, N, M}, kFloat));
Tensor* c = Compute(
"scale",
{{L, "l2"}, {N, "n1"}, {M, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
Stmt* d_loop = l.getLoopStmtsFor(d)[1];
l.cacheAccesses(d->buf(), "d_local", d_loop);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: Allocate(d_local, float, {1});
#CHECK: sum[l1] = 0
#CHECK: d_local[0] = 0
#CHECK: for (int n1
#CHECK: for (int m1
#CHECK: d_local[0] = (d_local[0]) + (scale[
#CHECK: }
#CHECK: }
#CHECK: sum[l1] = (sum[l1]) + (d_local[0])
#CHECK: Free(d_local);
#CHECK-NOT: d_local
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
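// Cache the reduction accumulator over the scope of the inner reduce loop (m1).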
void testReductionCacheAccessesInner() {
KernelScope kernel_scope;
int L = 4;
int N = 3;
int M = 2;
Placeholder a(BufHandle("a", {L, N, M}, kFloat));
Placeholder b(BufHandle("b", {L, N, M}, kFloat));
Tensor* c = Compute(
"scale",
{{L, "l2"}, {N, "n1"}, {M, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
Stmt* d_loop = l.getLoopStmtsFor(d)[2];
l.cacheAccesses(d->buf(), "d_local", d_loop);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: sum[l1] = 0
#CHECK: for (int n1
#CHECK: Allocate(d_local, float, {1});
#CHECK: d_local[0] = 0
#CHECK: for (int m1
#CHECK: d_local[0] = (d_local[0]) + (scale[
#CHECK: }
#CHECK: sum[l1] = (sum[l1]) + (d_local[0])
#CHECK: Free(d_local);
#CHECK: }
#CHECK-NOT: d_local
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
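// Cache reads of the producer buffer (scale) used inside the reduction body.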
void testReductionCacheBodyAccess() {
KernelScope kernel_scope;
Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
Tensor* c = Compute(
"scale",
{{24, "l2"}, {32, "n1"}, {12, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
Stmt* d_loop = l.getLoopStmtsFor(d)[1];
l.cacheAccesses(c->buf(), "scale_local", d_loop);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: Allocate(scale_local, float, {1, 32, 12});
#CHECK: for (int j = 0; j < 32; j++) {
#CHECK: for (int k = 0; k < 12; k++) {
#CHECK: scale_local[k + 12 * j] = scale[(k + 384 * l1) + 12 * j];
#CHECK: sum[l1] = (sum[l1]) + (scale_local[12 * n1_1 + m1_1]);
#CHECK: Free(scale_local);
#CHECK: scale_1[l] = (b[l]) * (sum[l]);
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
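// Cache reads of the reduction result in its consumer, inside the split
// consumer loop.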
void testReductionCacheConsumerAccess() {
KernelScope kernel_scope;
Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
Tensor* c = Compute(
"scale",
{{24, "l2"}, {32, "n1"}, {12, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
For* outer;
For* inner;
l.splitWithMask(l.getLoopStmtsFor(e)[0], 4, &outer, &inner);
Stmt* e_loop = l.getLoopStmtsFor(e)[1];
l.cacheAccesses(d->buf(), "sum_local", e_loop);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: sum[l1] = (sum[l1]) + (scale[
#CHECK: Allocate(sum_local, float, {4});
#CHECK: for (int i = 0; i < 4
#CHECK: sum_local[i] = sum[i + 4 * l_outer];
#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]);
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
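// Also split the reduction's output axis; the consumer-side cache is unaffected.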
void testReductionSplitCacheConsumerAccess() {
KernelScope kernel_scope;
Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
Tensor* c = Compute(
"scale",
{{24, "l2"}, {32, "n1"}, {12, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
For* outer;
For* inner;
// Split outer reduction axis.
l.splitWithMask(l.getLoopStmtsFor(d)[0], 4, &outer, &inner);
// Split reduction consumer.
l.splitWithMask(l.getLoopStmtsFor(e)[0], 4, &outer, &inner);
l.cacheAccesses(d->buf(), "sum_local", inner);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
// The reduction changes but the cache does not.
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: sum[l1_inner + 4 * l1_outer] = (sum[l1_inner + 4 * l1_outer]) + (scale[((12 * n1_1 + 384 * l1_inner) + m1_1) + 1536 * l1_outer]);
#CHECK: Allocate(sum_local, float, {4});
#CHECK: for (int i = 0; i < 4
#CHECK: sum_local[i] = sum[i + 4 * l_outer];
#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]);
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
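// Reorder the reduction's outer loops; neither the reduction body nor the
// consumer-side cache changes.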
void testReductionReorderCacheConsumerAccess() {
KernelScope kernel_scope;
Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
Tensor* c = Compute(
"scale",
{{24, "l2"}, {32, "n1"}, {12, "m1"}},
[&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
return b.load(0, 0, l) * d->call(l);
});
LoopNest l({e});
For* outer;
For* inner;
// reorder outer reduction axes.
auto loops = l.getLoopStmtsFor(d);
l.reorderAxis(loops[0], loops[1]);
// Split reduction consumer.
l.splitWithMask(l.getLoopStmtsFor(e)[0], 4, &outer, &inner);
l.cacheAccesses(d->buf(), "sum_local", inner);
l.prepareForCodegen();
Stmt* result = IRSimplifier::simplify(l.root_stmt());
// Neither the reduction body nor the cache changes.
std::ostringstream oss;
oss << *result;
const std::string& expected_ir =
R"IR(
#CHECK: sum[l1] = (sum[l1]) + (scale[(12 * n1_1 + m1_1) + 384 * l1]);
#CHECK: Allocate(sum_local, float, {4});
#CHECK: for (int i = 0; i < 4
#CHECK: sum_local[i] = sum[i + 4 * l_outer];
#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]);
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
}
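// rfactor, then cache the rfactor temp buffer outside the inner reduce loops
// (one element per b iteration).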
void testReductionRfactorCacheTempOuter() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("B", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
LoopNest loop({c});
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args()[1]);
reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
std::vector<For*> loops = NodeFinder<For>::find(loop.root_stmt());
loop.cacheAccesses(reduces[0]->accumulator(), "tmp2", loops[2]);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
std::ostringstream oss;
oss << *s;
const std::string& expected_ir =
R"IR(
#CHECK: Allocate(tmp_buf, float, {n});
#CHECK: for (int a = 0; a < m
#CHECK: Allocate(tmp2, float, {n});
#CHECK: for (int i = 0; i < n
#CHECK: tmp2[i] = 0
#CHECK: }
#CHECK: for (int b = 0; b < n
#CHECK: for (int c
#CHECK: tmp2[b] = (tmp2[b]) + (B[
#CHECK: }
#CHECK: }
#CHECK: for (int i = 0; i < n
#CHECK: tmp_buf[i] = (tmp_buf[i]) + (tmp2[i]);
#CHECK: }
#CHECK: Free(tmp2);
#CHECK-NOT: tmp2
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 499500);
}
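// rfactor, then cache the rfactor temp buffer inside the b loop (a single
// scalar element).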
void testReductionRfactorCacheTempInner() {
KernelScope kernel_scope;
const int M = 10;
const int N = 10;
const int K = 10;
VarHandle m("m", kInt);
VarHandle n("n", kInt);
VarHandle k("k", kInt);
Placeholder b(BufHandle("B", {m, n, k}, kFloat));
std::vector<float> in(M * N * K);
for (int j = 0; j < M * N * K; ++j) {
in[j] = j;
}
std::vector<float> out(1, -1.f);
Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
LoopNest loop({c});
auto reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
loop.rfactor(reduces[0], reduces[0]->reduce_args()[1]);
reduces = NodeFinder<ReduceOp>::find(loop.root_stmt());
std::vector<For*> loops = NodeFinder<For>::find(loop.root_stmt());
loop.cacheAccesses(reduces[0]->accumulator(), "tmp2", loops[3]);
loop.prepareForCodegen();
Stmt* s = loop.root_stmt();
s = IRSimplifier::simplify(s);
std::ostringstream oss;
oss << *s;
const std::string& expected_ir =
R"IR(
#CHECK: Allocate(tmp_buf, float, {n});
#CHECK: for (int a = 0; a < m
#CHECK: for (int b = 0; b < n
#CHECK: Allocate(tmp2, float, {1});
#CHECK: tmp2[0] = 0
#CHECK: for (int c
#CHECK: tmp2[0] = (tmp2[0]) + (B[
#CHECK: }
#CHECK: tmp_buf[b] = (tmp_buf[b]) + (tmp2[0]);
#CHECK: Free(tmp2);
#CHECK-NOT: tmp2
)IR";
torch::jit::testing::FileCheck().run(expected_ir, oss.str());
SimpleIREvaluator cg(s, {b, c, m, n, k});
cg.call({in, out, M, N, K});
ASSERT_EQ(out[0], 499500);
}
} // namespace jit
} // namespace torch