pytorch/test/cpp/tensorexpr/test_llvm.cpp
PyTorch MergeBot e288c258f7 Revert "Remove tensorexpr tests (#158928)"
This reverts commit d742a2896c.

Reverted https://github.com/pytorch/pytorch/pull/158928 on behalf of https://github.com/yangw-dev because it breaks a number of internal dependencies: some internal tests still use the test files deleted in this PR. The internal reviewer should fix this using codev ([comment](https://github.com/pytorch/pytorch/pull/158928#issuecomment-3134378616))
2025-07-29 23:32:07 +00:00

#ifdef TORCH_ENABLE_LLVM
#include <gtest/gtest.h>
#include <test/cpp/tensorexpr/test_base.h>
#include <c10/util/irange.h>
#include <test/cpp/tensorexpr/padded_buffer.h>
#include <test/cpp/tensorexpr/test_utils.h>
#include <torch/csrc/jit/tensorexpr/eval.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_printer.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <cmath>
#include <numeric>
namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;
using LLVMExprEval = ExprEval<LLVMCodeGen>;
// Typed tests. We can't use gtest's type/value parameterization here because of
// the way these tests are instantiated, so the scalar types are listed in an
// X-macro table instead.
#define TEST_LLVM_SCALAR_TYPES(_) \
_(uint8_t, Byte, 24) \
_(int8_t, Char, -20) \
_(int16_t, Short, 3332) \
_(int, Int, 123456) \
_(int64_t, Long, 2631563121321) \
_(float, Float, 0.122) \
_(double, Double, 0.21312) \
_(at::Half, Half, 0.128f)
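// Each entry above is (C++ type, tensorexpr dtype name, test value); the
// *_TEST macros below expand once per entry to generate a per-dtype test case.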
#define IMM_TEST(Type, Name, Val) \
TEST(LLVM, Name##ImmTest) { \
auto a = Name##Imm::make(Val); \
LLVMExprEval cg(a); \
if (std::is_floating_point<decltype(Val)>()) { \
ASSERT_NEAR(cg.value<Type>(), Val, 0.1); \
} else { \
ASSERT_EQ(cg.value<Type>(), Val); \
} \
}
TEST_LLVM_SCALAR_TYPES(IMM_TEST)
#undef IMM_TEST
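// For reference, IMM_TEST(float, Float, 0.122) expands to roughly the
// following (keeping only the floating-point branch of the check):
//   TEST(LLVM, FloatImmTest) {
//     auto a = FloatImm::make(0.122);
//     LLVMExprEval cg(a);
//     ASSERT_NEAR(cg.value<float>(), 0.122, 0.1);
//   }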
#define ADD_TEST(Type, Name, Val) \
TEST(LLVM, Name##AddTest) { \
auto a = Name##Imm::make(Val); \
auto b = Name##Imm::make(Val * 2); \
auto c = Add::make(a, b); \
LLVMExprEval cg(c); \
if (std::is_floating_point<decltype(Val)>()) { \
ASSERT_NEAR(cg.value<Type>(), Val * 3, 0.1); \
} else { \
ASSERT_EQ(cg.value<Type>(), Val * 3); \
} \
}
TEST_LLVM_SCALAR_TYPES(ADD_TEST)
#undef ADD_TEST
#define SUB_TEST(Type, Name, Val) \
TEST(LLVM, Name##SubTest) { \
auto a = Name##Imm::make(Val * 2); \
auto b = Name##Imm::make(Val); \
auto c = Sub::make(a, b); \
LLVMExprEval cg(c); \
if (std::is_floating_point<decltype(Val)>()) { \
ASSERT_NEAR(cg.value<Type>(), Val, 0.1); \
} else { \
ASSERT_EQ(cg.value<Type>(), Val); \
} \
}
TEST_LLVM_SCALAR_TYPES(SUB_TEST)
#undef SUB_TEST
#define MUL_TEST(Type, Name, Val) \
TEST(LLVM, Name##MulTest) { \
auto a = Name##Imm::make(Val); \
auto b = Name##Imm::make((Type)4); \
auto c = Mul::make(a, b); \
LLVMExprEval cg(c); \
if (std::is_floating_point<decltype(Val)>()) { \
ASSERT_NEAR(cg.value<Type>(), Val * 4, 0.1); \
} else { \
ASSERT_EQ(cg.value<Type>(), Val * 4); \
} \
}
TEST_LLVM_SCALAR_TYPES(MUL_TEST)
#undef MUL_TEST
#define DIV_TEST(Type, Name, Val) \
TEST(LLVM, Name##DivTest) { \
auto a = Name##Imm::make((Type)6); \
auto b = Name##Imm::make((Type)3); \
auto c = Div::make(a, b); \
LLVMExprEval cg(c); \
if (std::is_floating_point<decltype(Val)>()) { \
ASSERT_NEAR(cg.value<Type>(), 2, 0.1); \
} else { \
ASSERT_EQ(cg.value<Type>(), 2); \
} \
}
TEST_LLVM_SCALAR_TYPES(DIV_TEST)
#undef DIV_TEST
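// Cast tests: each builds a single Cast expression between two scalar dtypes
// and checks the JIT-evaluated result against the statically converted value.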
TEST(LLVM, IntToFloatCastTest) {
auto a = IntImm::make(2);
auto b = Cast::make(kFloat, a);
LLVMExprEval cg(b, {});
ASSERT_EQ(cg.value<float>(), 2.0);
}
TEST(LLVM, FloatToIntCastTest) {
auto a = FloatImm::make(2.0);
auto b = Cast::make(kInt, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<int>(), 2);
}
TEST(LLVM, IntToLongCastTest) {
auto a = IntImm::make(12345);
auto b = Cast::make(kLong, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<int64_t>(), 12345);
}
TEST(LLVM, ByteToCharCastTest) {
auto a = ByteImm::make(250);
auto b = Cast::make(kChar, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<int8_t>(), (int8_t)250);
}
TEST(LLVM, HalfToLongCastTest) {
auto a = HalfImm::make(2.0);
auto b = Cast::make(kLong, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<int64_t>(), 2);
}
TEST(LLVM, ByteToDoubleCastTest) {
auto a = ByteImm::make(2);
auto b = Cast::make(kDouble, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<double>(), 2);
}
TEST(LLVM, FloatToByteCastTest) {
auto a = FloatImm::make(254.0);
auto b = Cast::make(kByte, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<uint8_t>(), 254);
}
TEST(LLVM, FloatToCharCastTest) {
auto a = FloatImm::make(-2.0);
auto b = Cast::make(kChar, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<int8_t>(), -2);
}
TEST(LLVM, ByteToFloatCastTest) {
auto a = ByteImm::make(254);
auto b = Cast::make(kFloat, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<float>(), 254.0);
}
TEST(LLVM, CharToFloatCastTest) {
auto a = CharImm::make(-2);
auto b = Cast::make(kFloat, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<float>(), -2.0);
}
TEST(LLVM, BitCast) {
/* constexpr int16_t ref16 = 1337; */
constexpr int32_t ref32 = 1337;
constexpr int64_t ref64 = 1337;
constexpr float reff32 = 1337.0f;
constexpr double reff64 = 1337.0;
// The Half <-> Short bitcast path is currently broken, so that case stays disabled below.
/*{
at::Half k_;
at::Half* k = &k_;
*reinterpret_cast<int16_t*>(k) = ref16;
auto a = HalfImm::make(k);
auto b = BitCast::make(kShort, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<int16_t>(), ref16);
}*/
{
float k = raw_bitcast<float>(ref32);
auto a = FloatImm::make(k);
auto b = BitCast::make(kInt, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<int32_t>(), ref32);
}
{
double k = raw_bitcast<double>(ref64);
auto a = DoubleImm::make(k);
auto b = BitCast::make(kLong, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<int64_t>(), ref64);
}
{
int64_t k = raw_bitcast<int64_t>(reff64);
auto a = LongImm::make(k);
auto b = BitCast::make(kDouble, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<double>(), reff64);
}
{
int32_t k = raw_bitcast<int32_t>(reff32);
auto a = IntImm::make(k);
auto b = BitCast::make(kFloat, a);
LLVMExprEval cg(b);
ASSERT_EQ(cg.value<float>(), reff32);
}
}
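// fastLogFloat: compare the fast_log approximation against std::log on random
// inputs; inputs whose true log is NaN (negative values) are only checked for NaN-ness.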
TEST(LLVM, fastLogFloat) {
const int kTotalSize = 128 * 128;
BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kFloat);
BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kFloat);
VarHandle index = VarHandle("index", kInt);
ExprHandle load_a = a_buf.load(index);
StmtPtr store_b = b_buf.store({index}, fast_log(load_a));
StmtPtr stmt = For::make(index, 0, kTotalSize, store_b);
PaddedBuffer<float> a_v(kTotalSize);
PaddedBuffer<float> b_v(kTotalSize);
for (const auto i : c10::irange(kTotalSize)) {
a_v(i) = at::randn({1}).item().to<float>();
}
LLVMCodeGen ir_eval(stmt, {a_buf, b_buf});
ir_eval.call({a_v, b_v});
for (const auto i : c10::irange(kTotalSize)) {
auto test = b_v(i);
auto ref = std::log(a_v(i));
if (std::isnan(ref)) {
ASSERT_EQ(std::isnan(test), true);
} else {
ASSERT_FLOAT_EQ(test, ref);
}
}
}
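// Let tests: bind scalar variables with Let and use them inside a store
// expression; the result is read back through the output buffer "A".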
TEST(LLVM, LetTest01) {
BufHandle a("A", {1}, kFloat);
std::vector<float> v = {1, 0};
std::vector<void*> args({v.data()});
VarHandle x("x", kFloat);
auto block = Block::make({
Let::make(x, 3.f),
a.store({0}, ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f))),
});
LLVMCodeGen cg(block, {a});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(v[0], 2.f + 3.f * 3.f + 4.f);
}
TEST(LLVM, LetTest02) {
BufHandle a("A", {1}, kFloat);
std::vector<float> v = {1, 0};
std::vector<void*> args({v.data()});
VarHandle x("x", kFloat);
VarHandle y("y", kFloat);
auto block = Block::make(
{Let::make(x, 3.f),
Let::make(y, 6.f),
a.store(
{IntImm::make(0)},
ExprHandle(2.f) + (x * ExprHandle(3.f) + y * ExprHandle(4.f)))});
LLVMCodeGen cg(block, {a});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(v[0], 2.f + 3.f * 3.f + 6.f * 4.f);
}
TEST(LLVM, LetTestMultitype) {
BufHandle a("A", {1}, kDouble);
std::vector<double> v = {1, 0};
std::vector<void*> args({v.data()});
VarHandle x("x", kByte);
VarHandle y("y", kHalf);
auto block = Block::make(
{Let::make(x, 3),
Let::make(y, 6.f),
a.store(
{0},
Cast::make(
kDouble,
ExprHandle(2.f) +
(x * ExprHandle(3.f) + y * ExprHandle(4.f))))});
LLVMCodeGen cg(block, {a});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(v[0], 2.f + 3 * 3.f + 6.f * 4.f);
}
TEST(LLVM, BufferTest) {
BufHandle a("A", {32}, kFloat);
std::vector<int32_t> v(5);
std::vector<void*> args({v.data()});
auto rv = IntImm::make(0);
LLVMExprEval cg(rv, {a});
ASSERT_EQ(cg.value<int>(args), 0);
}
TEST(LLVM, BlockTest) {
BufHandle a("A", {32}, kInt);
std::vector<int32_t> v = {1, 2};
std::vector<void*> args({v.data()});
auto block = Block::make({
a.store({0}, 3),
a.store({1}, 4),
a.store({0}, 4),
});
LLVMCodeGen cg(block, {a});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(v[0], 4);
ASSERT_EQ(v[1], 4);
}
TEST(LLVM, LoadStoreTest) {
BufHandle a("A", {1}, kInt);
BufHandle b("B", {1}, kInt);
std::vector<int32_t> a_buffer = {42};
std::vector<int32_t> b_buffer = {-11};
auto store = b.store({0}, a.load(0));
LLVMCodeGen cg(store, {a, b});
std::vector<void*> args({a_buffer.data(), b_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer[0], 42);
ASSERT_EQ(b_buffer[0], 42);
}
TEST(LLVM, IfThenElseTest) {
BufHandle a("A", {1}, kInt);
BufHandle b("B", {1}, kInt);
BufHandle c("C", {1}, kInt);
std::vector<int32_t> a_buffer = {42};
std::vector<int32_t> b_buffer = {-11};
std::vector<int32_t> c_buffer = {1};
auto store = b.store({0}, IfThenElse::make(c.load(0), a.load(0), 0));
LLVMCodeGen cg(store, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer[0], 42);
ASSERT_EQ(b_buffer[0], 42);
}
// if (x < 10) x = x + 1
TEST(LLVM, CondNoFalseBlockTest) {
BufHandle x("X", {1}, kInt);
auto cmp = CompareSelect::make(x.load(0), 10, CompareSelectOperation::kLT);
auto cond = Cond::make(cmp, x.store({0}, x.load(0) + 1), nullptr);
for (int32_t x_value : {0, 10, 20}) {
std::vector<int32_t> x_buffer = {x_value};
std::vector<void*> args({x_buffer.data()});
LLVMCodeGen cg(cond, {x});
ASSERT_EQ(cg.value<int>(args), 0);
if (x_value < 10) {
ASSERT_EQ(x_buffer[0], x_value + 1);
} else {
ASSERT_EQ(x_buffer[0], x_value);
}
}
}
// if (x < 10) {
// x = x + 1;
// } else {
// x = x - 1;
// }
TEST(LLVM, CondTest) {
BufHandle x("X", {1}, kInt);
auto cmp = CompareSelect::make(x.load(0), 10, CompareSelectOperation::kLT);
auto cond =
Cond::make(cmp, x.store({0}, x.load(0) + 1), x.store({0}, x.load(0) - 1));
auto block = Block::make({
cond,
x.store({0}, x.load(0) * 2),
});
for (int32_t x_value : {0, 10, 20}) {
std::vector<int32_t> x_buffer = {x_value};
std::vector<void*> args({x_buffer.data()});
LLVMCodeGen cg(block, {x});
ASSERT_EQ(cg.value<int>(args), 0);
if (x_value < 10) {
ASSERT_EQ(x_buffer[0], (x_value + 1) * 2);
} else {
ASSERT_EQ(x_buffer[0], (x_value - 1) * 2);
}
}
}
// if (x < 10) {
// if (x > 5) {
// x = x + 1;
// } else {
// x = x - 1;
// }
// } else {
// if (x <= 15) {
// x = x + 2;
// } else {
// x = x - 2;
// }
// }
TEST(LLVM, CondNestedTest) {
BufHandle x("X", {1}, kInt);
auto true_cmp =
CompareSelect::make(x.load(0), 5, CompareSelectOperation::kGT);
auto true_cond = Cond::make(
true_cmp, x.store({0}, x.load(0) + 1), x.store({0}, x.load(0) - 1));
auto false_cmp =
CompareSelect::make(x.load(0), 15, CompareSelectOperation::kLE);
auto false_cond = Cond::make(
false_cmp, x.store({0}, x.load(0) + 2), x.store({0}, x.load(0) - 2));
auto cmp = CompareSelect::make(x.load(0), 10, CompareSelectOperation::kLT);
auto cond = Cond::make(cmp, true_cond, false_cond);
for (int32_t x_value : {0, 8, 15, 20}) {
std::vector<int32_t> x_buffer = {x_value};
std::vector<void*> args({x_buffer.data()});
LLVMCodeGen cg(cond, {x});
ASSERT_EQ(cg.value<int>(args), 0);
if (x_value < 10) {
if (x_value > 5) {
ASSERT_EQ(x_buffer[0], x_value + 1);
} else {
ASSERT_EQ(x_buffer[0], x_value - 1);
}
} else {
if (x_value <= 15) {
ASSERT_EQ(x_buffer[0], x_value + 2);
} else {
ASSERT_EQ(x_buffer[0], x_value - 2);
}
}
}
}
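// Vectorized load/store tests: Ramp indices express contiguous vector lanes,
// so a single Store/Load covers a whole SIMD-width slice of the buffer.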
TEST(LLVM, DirectVectorization) {
constexpr int M = 3;
constexpr int N = 64;
BufHandle a("a", {M, N}, kFloat);
BufHandle b("b", {M, N}, kFloat);
BufHandle c("c", {M, N}, kFloat);
VarHandle m("m", kInt);
VarHandle n("n", kInt);
StmtPtr s = For::make(
m,
0,
M,
Store::make(
c,
{Ramp::make(m * 64, 1, 64)},
Load::make({kFloat, 64}, a, {Ramp::make(m * 64, 1, 64)}) *
Load::make({kFloat, 64}, b, {Ramp::make(m * 64, 1, 64)})));
LLVMCodeGen cg(s, {a, b, c});
}
TEST(LLVM, VecLoadStoreTest) {
BufHandle a("A", {1}, kInt);
BufHandle b("B", {1}, kInt);
std::vector<int32_t> a_buffer = {1, 1, 1, 1};
std::vector<int32_t> b_buffer = {2, 2, 2, 2};
auto store = b.store({Ramp::make(0, 1, 4)}, a.load({Ramp::make(0, 1, 4)}));
LLVMCodeGen cg(store, {a, b});
std::vector<void*> args({a_buffer.data(), b_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer[0], 1);
ASSERT_EQ(a_buffer[1], 1);
ASSERT_EQ(a_buffer[2], 1);
ASSERT_EQ(a_buffer[3], 1);
ASSERT_EQ(b_buffer[0], 1);
ASSERT_EQ(b_buffer[1], 1);
ASSERT_EQ(b_buffer[2], 1);
ASSERT_EQ(b_buffer[3], 1);
}
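// Intrinsics tests: the macros below generate vectorized tests for
// transcendental functions (erf, tanh, lgamma, ...) at several lane widths;
// they mainly check that codegen succeeds and that the input buffer is untouched.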
#define FLOAT_INTRINSICS_TEST(Name, Lanes) \
TEST(LLVM, VecFloat_##Name##Lane##Lanes##Test) { \
BufHandle a("A", {1}, kFloat); \
BufHandle b("B", {1}, kFloat); \
float val = 0.5f; \
std::vector<float> a_buffer(Lanes, val); \
std::vector<float> b_buffer(Lanes, val); \
auto store = b.store( \
{Ramp::make(0, 1, Lanes)}, Name(a.load({Ramp::make(0, 1, Lanes)}))); \
LLVMCodeGen cg(store, {a, b}); \
std::vector<void*> args({a_buffer.data(), b_buffer.data()}); \
ASSERT_EQ(cg.value<int>(args), 0); \
for (const auto i : c10::irange(Lanes)) { \
ASSERT_FLOAT_EQ(a_buffer[i], val); \
} \
}
FLOAT_INTRINSICS_TEST(erf, 4)
FLOAT_INTRINSICS_TEST(erfc, 4)
FLOAT_INTRINSICS_TEST(acos, 4)
FLOAT_INTRINSICS_TEST(asin, 4)
FLOAT_INTRINSICS_TEST(atan, 4)
FLOAT_INTRINSICS_TEST(cosh, 4)
FLOAT_INTRINSICS_TEST(sinh, 4)
FLOAT_INTRINSICS_TEST(tanh, 4)
FLOAT_INTRINSICS_TEST(expm1, 4)
FLOAT_INTRINSICS_TEST(lgamma, 4)
FLOAT_INTRINSICS_TEST(erf, 8)
FLOAT_INTRINSICS_TEST(erfc, 8)
FLOAT_INTRINSICS_TEST(acos, 8)
FLOAT_INTRINSICS_TEST(asin, 8)
FLOAT_INTRINSICS_TEST(atan, 8)
FLOAT_INTRINSICS_TEST(cosh, 8)
FLOAT_INTRINSICS_TEST(sinh, 8)
FLOAT_INTRINSICS_TEST(tanh, 8)
FLOAT_INTRINSICS_TEST(expm1, 8)
FLOAT_INTRINSICS_TEST(lgamma, 8)
#undef FLOAT_INTRINSICS_TEST
#define DOUBLE_INTRINSICS_TEST(Name, Lanes) \
TEST(LLVM, VecDouble_##Name##Lane##Lanes##Test) { \
BufHandle a("A", {1}, kDouble); \
BufHandle b("B", {1}, kDouble); \
double val = 0.5; \
std::vector<double> a_buffer(Lanes, val); \
std::vector<double> b_buffer(Lanes, val); \
auto store = b.store( \
{Ramp::make(0, 1, Lanes)}, Name(a.load({Ramp::make(0, 1, Lanes)}))); \
LLVMCodeGen cg(store, {a, b}); \
std::vector<void*> args({a_buffer.data(), b_buffer.data()}); \
ASSERT_EQ(cg.value<int>(args), 0); \
for (const auto i : c10::irange(Lanes)) { \
ASSERT_FLOAT_EQ(a_buffer[i], val); \
} \
}
DOUBLE_INTRINSICS_TEST(erf, 2)
DOUBLE_INTRINSICS_TEST(erfc, 2)
DOUBLE_INTRINSICS_TEST(acos, 2)
DOUBLE_INTRINSICS_TEST(asin, 2)
DOUBLE_INTRINSICS_TEST(atan, 2)
DOUBLE_INTRINSICS_TEST(cosh, 2)
DOUBLE_INTRINSICS_TEST(sinh, 2)
DOUBLE_INTRINSICS_TEST(tanh, 2)
DOUBLE_INTRINSICS_TEST(expm1, 2)
DOUBLE_INTRINSICS_TEST(lgamma, 2)
DOUBLE_INTRINSICS_TEST(erf, 4)
DOUBLE_INTRINSICS_TEST(erfc, 4)
DOUBLE_INTRINSICS_TEST(acos, 4)
DOUBLE_INTRINSICS_TEST(asin, 4)
DOUBLE_INTRINSICS_TEST(atan, 4)
DOUBLE_INTRINSICS_TEST(cosh, 4)
DOUBLE_INTRINSICS_TEST(sinh, 4)
DOUBLE_INTRINSICS_TEST(tanh, 4)
DOUBLE_INTRINSICS_TEST(expm1, 4)
DOUBLE_INTRINSICS_TEST(lgamma, 4)
#undef DOUBLE_INTRINSICS_TEST
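// LoopNest::vectorize tests: vectorizing the innermost For should replace it
// entirely, hence the check that the block's front statement is no longer a For.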
TEST(LLVM, VectorizerLoadStoreTest) {
BufHandle a("A", {1}, kInt);
Tensor c = Compute("c", {4}, [&](const VarHandle& i) { return a.load(i); });
BufHandle c_buf(c.buf());
LoopNest l({c});
StmtPtr s = l.root_stmt();
ASSERT_TRUE(LoopNest::vectorize(to<For>(to<Block>(s)->front())));
ASSERT_TRUE(to<For>(to<Block>(s)->front()) == nullptr);
LLVMCodeGen cg(s, {a, c_buf});
std::vector<int> a_vec(4, 21);
std::vector<int> c_vec(4, 0);
std::vector<void*> args({a_vec.data(), c_vec.data()});
ASSERT_EQ(cg.value<int>(args), 0);
assertAllEqual(c_vec, 21);
}
TEST(LLVM, VectorizeBitCast) {
BufHandle a("A", {128}, kInt);
Tensor c = Compute("c", {128}, [&](const VarHandle& i) {
return bitcast<float>(a.load(i));
});
BufHandle c_buf(c.buf());
LoopNest l({c});
StmtPtr s = l.root_stmt();
ASSERT_TRUE(LoopNest::vectorize(to<For>(to<Block>(s)->front())));
ASSERT_TRUE(to<For>(to<Block>(s)->front()) == nullptr);
LLVMCodeGen cg(s, {a, c_buf});
std::vector<int> a_vec(128);
std::vector<float> c_vec(128);
for (const auto i : c10::irange(128)) {
a_vec[i] = raw_bitcast<int>(1337.f);
}
std::vector<void*> args({a_vec.data(), c_vec.data()});
ASSERT_EQ(cg.value<int>(args), 0);
assertAllEqual(c_vec, 1337.f);
}
TEST(LLVM, MemcpyTest) {
constexpr int N = 32;
BufHandle a("A", {N}, kInt);
BufHandle b("B", {N}, kInt);
std::vector<int32_t> a_buffer(N, 42);
std::vector<int32_t> b_buffer(N, 0);
VarHandle i("i", kInt);
auto expr = For::make(i, 0, N, b.store({i}, a.load(i)));
LLVMCodeGen cg(expr, {a, b});
std::vector<void*> args({a_buffer.data(), b_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
assertAllEqual(a_buffer, 42);
assertAllEqual(b_buffer, 42);
}
TEST(LLVM, BzeroTest) {
constexpr int N = 32;
BufHandle b("B", {N}, kInt);
std::vector<int32_t> b_buffer(N, 11);
VarHandle i("i", kInt);
auto expr = For::make(i, 0, N, b.store({i}, 0));
LLVMCodeGen cg(expr, {b});
std::vector<void*> args({b_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(b_buffer.size(), N);
assertAllEqual(b_buffer, 0);
}
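// Elementwise tests: a single For loop applies a binary op (Add, Max, Min,
// Mod, ...) or a unary intrinsic per element, and the outputs are checked
// against constant expectations.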
TEST(LLVM, ElemwiseAdd) {
constexpr int N = 1024;
BufHandle a("A", {N}, kInt);
BufHandle b("B", {N}, kInt);
BufHandle c("C", {N}, kInt);
std::vector<int32_t> a_buffer(N, 41);
std::vector<int32_t> b_buffer(N, 1);
std::vector<int32_t> c_buffer(N, 1);
VarHandle i("i", kInt);
auto expr = For::make(i, 0, N, c.store({i}, Add::make(a.load(i), b.load(i))));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(a_buffer, 41);
assertAllEqual(b_buffer, 1);
assertAllEqual(c_buffer, 42);
}
TEST(LLVM, ElemwiseAddFloat) {
constexpr int N = 1024;
BufHandle a("A", {N}, kFloat);
BufHandle b("B", {N}, kFloat);
BufHandle c("C", {N}, kFloat);
std::vector<float> a_buffer(N, 41);
std::vector<float> b_buffer(N, 1);
std::vector<float> c_buffer(N, 1);
VarHandle i("i", kInt);
auto expr = For::make(i, 0, N, c.store({i}, a.load(i) + b.load(i)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(a_buffer, 41.0f);
assertAllEqual(b_buffer, 1.0f);
assertAllEqual(c_buffer, 42.0f);
}
TEST(LLVM, ElemwiseLog10Float) {
constexpr int N = 1024;
BufHandle a("A", {N}, kFloat);
BufHandle b("B", {N}, kFloat);
std::vector<float> a_buffer(N, 10.0f);
std::vector<float> b_buffer(N, 2.0f);
VarHandle i("i", kInt);
auto expr = For::make(
i,
0,
N / 4,
b.store(
{Ramp::make(i * 4, 1, 4)}, log10(a.load({Ramp::make(i * 4, 1, 4)}))));
LLVMCodeGen cg(expr, {a, b});
std::vector<void*> args({a_buffer.data(), b_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
assertAllEqual(a_buffer, 10.0f);
assertAllEqual(b_buffer, 1.0f);
}
TEST(LLVM, ElemwiseLog1pFloat) {
constexpr int N = 1024;
BufHandle a("A", {N}, kFloat);
BufHandle b("B", {N}, kFloat);
std::vector<float> a_buffer(N, expf(3.0f) - 1);
std::vector<float> b_buffer(N, 42.0f);
VarHandle i("i", kInt);
auto expr = For::make(
i,
0,
N / 4,
b.store(
{Ramp::make(i * 4, 1, 4)}, log1p(a.load({Ramp::make(i * 4, 1, 4)}))));
LLVMCodeGen cg(expr, {a, b});
std::vector<void*> args({a_buffer.data(), b_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
assertAllEqual(a_buffer, expf(3.0f) - 1);
ExpectAllNear(b_buffer, 3.0f, 1e-5f);
}
TEST(LLVM, ElemwiseMaxInt) {
constexpr int N = 1024;
BufHandle a("A", {N}, kInt);
BufHandle b("B", {N}, kInt);
BufHandle c("C", {N}, kInt);
std::vector<int> a_buffer(N, 41);
std::vector<int> b_buffer(N, 1);
std::vector<int> c_buffer(N, 1);
VarHandle i("i", kInt);
auto expr =
For::make(i, 0, N, c.store({i}, Max::make(a.load(i), b.load(i), false)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(a_buffer, 41);
assertAllEqual(b_buffer, 1);
assertAllEqual(c_buffer, 41);
}
TEST(LLVM, ElemwiseMinInt) {
constexpr int N = 1024;
BufHandle a("A", {N}, kInt);
BufHandle b("B", {N}, kInt);
BufHandle c("C", {N}, kInt);
std::vector<int> a_buffer(N, 41);
std::vector<int> b_buffer(N, 1);
std::vector<int> c_buffer(N, 1);
VarHandle i("i", kInt);
auto expr =
For::make(i, 0, N, c.store({i}, Min::make(a.load(i), b.load(i), false)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(a_buffer, 41);
assertAllEqual(b_buffer, 1);
assertAllEqual(c_buffer, 1);
}
TEST(LLVM, ElemwiseMaxFloat) {
constexpr int N = 1024;
BufHandle a("A", {N}, kFloat);
BufHandle b("B", {N}, kFloat);
BufHandle c("C", {N}, kFloat);
std::vector<float> a_buffer(N, 41);
std::vector<float> b_buffer(N, 1);
std::vector<float> c_buffer(N, 1);
VarHandle i("i", kInt);
auto expr =
For::make(i, 0, N, c.store({i}, Max::make(a.load(i), b.load(i), false)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(a_buffer, 41.0f);
assertAllEqual(b_buffer, 1.0f);
assertAllEqual(c_buffer, 41.0f);
}
TEST(LLVM, ElemwiseMaxNaNFloat) {
constexpr int N = 1024;
BufHandle a("A", {N}, kFloat);
BufHandle b("B", {N}, kFloat);
BufHandle c("C", {N}, kFloat);
std::vector<float> a_buffer(N, NAN);
std::vector<float> b_buffer(N, 1);
std::vector<float> c_buffer(N, 1);
VarHandle i("i", kInt);
auto expr =
For::make(i, 0, N, c.store({i}, Max::make(a.load(i), b.load(i), false)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(b_buffer, 1.0f);
for (auto const& elt : c_buffer) {
ASSERT_TRUE(std::isnan(elt));
}
}
TEST(LLVM, ElemwiseMinFloat) {
constexpr int N = 1024;
BufHandle a("A", {N}, kFloat);
BufHandle b("B", {N}, kFloat);
BufHandle c("C", {N}, kFloat);
std::vector<float> a_buffer(N, 41);
std::vector<float> b_buffer(N, 1);
std::vector<float> c_buffer(N, 1);
VarHandle i("i", kInt);
auto expr =
For::make(i, 0, N, c.store({i}, Min::make(a.load(i), b.load(i), false)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(a_buffer, 41.0f);
assertAllEqual(b_buffer, 1.0f);
assertAllEqual(c_buffer, 1.0f);
}
TEST(LLVM, ElemwiseMinNaNFloat) {
constexpr int N = 1024;
BufHandle a("A", {N}, kFloat);
BufHandle b("B", {N}, kFloat);
BufHandle c("C", {N}, kFloat);
std::vector<float> a_buffer(N, NAN);
std::vector<float> b_buffer(N, 1);
std::vector<float> c_buffer(N, 1);
VarHandle i("i", kInt);
auto expr =
For::make(i, 0, N, c.store({i}, Min::make(a.load(i), b.load(i), false)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(b_buffer, 1.0f);
for (auto const& elt : c_buffer) {
ASSERT_TRUE(std::isnan(elt));
}
}
TEST(LLVM, ElemwiseMod) {
constexpr int N = 1024;
BufHandle a("A", {N}, kInt);
BufHandle b("B", {N}, kInt);
BufHandle c("C", {N}, kInt);
std::vector<int32_t> a_buffer(N, 41);
std::vector<int32_t> b_buffer(N, 23);
std::vector<int32_t> c_buffer(N, 18);
VarHandle i("i", kInt);
auto expr = For::make(i, 0, N, c.store({i}, Mod::make(a.load(i), b.load(i))));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(a_buffer, 41);
assertAllEqual(b_buffer, 23);
assertAllEqual(c_buffer, 18);
}
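// CompareSelect tests: with no explicit return values the op yields 1 when the
// comparison holds and 0 otherwise, stored into an int output buffer and
// compared against a reference.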
TEST(LLVM, CompareSelectIntEQ) {
constexpr int N = 1024;
BufHandle a("A", {N}, kInt);
BufHandle b("B", {N}, kInt);
BufHandle c("C", {N}, kInt);
std::vector<int> a_buffer(N, 1);
std::vector<int> b_buffer(N, 1);
std::vector<int> c_buffer(N, 0);
std::vector<int> c_ref(N, 1);
for (int i = 0; i < N / 2; i++) {
b_buffer[i] = 0;
c_ref[i] = 0;
}
VarHandle i("i", kInt);
auto expr = For::make(
i,
0,
N,
c.store(
{i},
CompareSelect::make(
a.load(i), b.load(i), CompareSelectOperation::kEQ)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(a_buffer, 1);
for (const auto i : c10::irange(N)) {
ASSERT_EQ(c_ref[i], c_buffer[i]);
}
}
TEST(LLVM, CompareSelectFloatEQ) {
constexpr int N = 1024;
BufHandle a("A", {N}, kFloat);
BufHandle b("B", {N}, kFloat);
BufHandle c("C", {N}, kInt);
std::vector<float> a_buffer(N, 1.0f);
std::vector<float> b_buffer(N, 1.0f);
std::vector<int> c_buffer(N, 0);
VarHandle i("i", kInt);
auto expr = For::make(
i,
0,
N,
c.store(
{i},
CompareSelect::make(
a.load(i), b.load(i), CompareSelectOperation::kEQ)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(a_buffer, 1.0f);
assertAllEqual(b_buffer, 1.0f);
assertAllEqual(c_buffer, 1);
}
TEST(LLVM, CompareSelectByteGT) {
constexpr int N = 1024;
BufHandle a("A", {N}, kByte);
BufHandle b("B", {N}, kByte);
BufHandle c("C", {N}, kInt);
std::vector<uint8_t> a_buffer(N, 0);
std::vector<uint8_t> b_buffer(N, 0);
std::vector<int> c_buffer(N, 0);
std::vector<int> c_ref(N, 0);
for (int i = 0; i < N / 2; i++) {
a_buffer[i] = 128;
c_ref[i] = 1;
}
VarHandle i("i", kInt);
auto expr = For::make(
i,
0,
N,
c.store(
{i},
CompareSelect::make(
a.load(i), b.load(i), CompareSelectOperation::kGT)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(b_buffer, uint8_t(0));
for (const auto i : c10::irange(N)) {
ASSERT_EQ(c_ref[i], c_buffer[i]);
}
}
TEST(LLVM, CompareSelectByteGE) {
constexpr int N = 1024;
BufHandle a("A", {N}, kByte);
BufHandle b("B", {N}, kByte);
BufHandle c("C", {N}, kInt);
std::vector<uint8_t> a_buffer(N, 0);
std::vector<uint8_t> b_buffer(N, 0);
std::vector<int> c_buffer(N, 0);
std::vector<int> c_ref(N, 1);
VarHandle i("i", kInt);
auto expr = For::make(
i,
0,
N,
c.store(
{i},
CompareSelect::make(
a.load(i), b.load(i), CompareSelectOperation::kGE)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(b_buffer, uint8_t(0));
for (const auto i : c10::irange(N)) {
ASSERT_EQ(c_ref[i], c_buffer[i]);
}
}
TEST(LLVM, CompareSelectByteLT) {
constexpr int N = 1024;
BufHandle a("A", {N}, kByte);
BufHandle b("B", {N}, kByte);
BufHandle c("C", {N}, kInt);
std::vector<uint8_t> a_buffer(N, 0);
std::vector<uint8_t> b_buffer(N, 128);
std::vector<int> c_buffer(N, 0);
std::vector<int> c_ref(N, 1);
for (int i = 0; i < N / 2; i++) {
a_buffer[i] = 128;
c_ref[i] = 0;
}
VarHandle i("i", kInt);
auto expr = For::make(
i,
0,
N,
c.store(
{i},
CompareSelect::make(
a.load(i), b.load(i), CompareSelectOperation::kLT)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(b_buffer, uint8_t(128));
for (const auto i : c10::irange(N)) {
ASSERT_EQ(c_ref[i], c_buffer[i]);
}
}
TEST(LLVM, CompareSelectByteLE) {
constexpr int N = 1024;
BufHandle a("A", {N}, kByte);
BufHandle b("B", {N}, kByte);
BufHandle c("C", {N}, kInt);
std::vector<uint8_t> a_buffer(N, 0);
std::vector<uint8_t> b_buffer(N, 128);
std::vector<int> c_buffer(N, 0);
std::vector<int> c_ref(N, 1);
VarHandle i("i", kInt);
auto expr = For::make(
i,
0,
N,
c.store(
{i},
CompareSelect::make(
a.load(i), b.load(i), CompareSelectOperation::kLE)));
LLVMCodeGen cg(expr, {a, b, c});
std::vector<void*> args({a_buffer.data(), b_buffer.data(), c_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(a_buffer.size(), N);
ASSERT_EQ(b_buffer.size(), N);
ASSERT_EQ(c_buffer.size(), N);
assertAllEqual(b_buffer, uint8_t(128));
for (const auto i : c10::irange(N)) {
ASSERT_EQ(c_ref[i], c_buffer[i]);
}
}
TEST(LLVM, StoreFloat) {
BufHandle result("result", {1}, kFloat);
std::vector<float> result_buffer = {0.0f};
auto expr = result.store({0}, FloatImm::make(3.14f));
LLVMCodeGen cg(expr, {result});
std::vector<void*> args({result_buffer.data()});
ASSERT_EQ(cg.value<int>(args), 0);
ASSERT_EQ(result_buffer[0], 3.14f);
}
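// Tensor/Compute tests: kernels are defined with Compute, lowered through
// LoopNest, and the generated code is checked against a host-side reference.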
TEST(LLVM, SimpleMath01) {
const int N = 1024;
Tensor tensor = Compute(
"f", {N}, [](const VarHandle& i) { return cast<float>(i * i + 1); });
LoopNest l({tensor});
StmtPtr stmt = l.root_stmt();
BufHandle f_buf(tensor.buf());
LLVMCodeGen cg(stmt, {f_buf});
PaddedBuffer<float> f_v(N, "f_v");
std::vector<void*> args({f_v.data()});
int value = cg.value<int>(args);
ASSERT_EQ(value, 0);
PaddedBuffer<float> f_ref(N, "f_ref");
for (const auto i : c10::irange(N)) {
f_ref(i) = i * i + 1;
}
ExpectAllNear(f_v, f_ref, 1e-5);
}
TEST(LLVM, ComputeMul) {
const int N = 1024;
BufHandle a("a", {N}, kFloat);
BufHandle b("b", {N}, kFloat);
Tensor c = Compute(
"c", {N}, [&](const VarHandle& i) { return a.load(i) * b.load(i); });
BufHandle c_buf(c.buf());
LoopNest l({c});
StmtPtr s = l.root_stmt();
LLVMCodeGen cg(s, {a, b, c_buf});
std::vector<float> a_vec(N, 21.0f);
std::vector<float> b_vec(N, 2.0f);
std::vector<float> c_vec(N, 0.0f);
std::vector<void*> args({a_vec.data(), b_vec.data(), c_vec.data()});
ASSERT_EQ(cg.value<int>(args), 0);
assertAllEqual(c_vec, 42.0f);
}
TEST(LLVM, BroadcastAdd) {
const int M = 32;
const int N = 1024;
BufHandle a("a", {M, N}, kFloat);
BufHandle b("b", {N}, kFloat);
Tensor c = Compute("c", {M, N}, [&](const VarHandle& i, const VarHandle& j) {
return a.load(i, j) + b.load(j);
});
BufHandle c_buf(c.buf());
LoopNest l({c});
l.prepareForCodegen();
StmtPtr s = l.root_stmt();
LLVMCodeGen cg(s, {a, b, c_buf});
std::vector<float> av(M * N);
std::iota(av.begin(), av.end(), 0);
std::vector<float> bv(N);
std::iota(bv.begin(), bv.end(), 0);
std::vector<float> cv(M * N, 0);
std::vector<void*> args({av.data(), bv.data(), cv.data()});
ASSERT_EQ(cg.value<int>(args), 0);
for (const auto i : c10::irange(M)) {
for (const auto j : c10::irange(N)) {
ASSERT_EQ(cv[i * N + j], av[i * N + j] + bv[j]);
}
}
}
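// Scalar bitwise/shift tests: evaluated directly as expressions, with
// arithmetic vs. logical right shift distinguished by signedness (kChar vs kByte).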
TEST(LLVM, BitwiseOps) {
auto a = IntImm::make(59);
auto b = IntImm::make(11);
auto c = IntImm::make(101);
auto d = IntImm::make(2);
ExprHandle f = (((a ^ (b << 1)) & c) >> 2) | d;
LLVMExprEval cg(f);
ASSERT_EQ(cg.value<int>(), 11);
}
TEST(LLVM, ArithmeticRightShift) {
auto a = CharImm::make(-4);
auto b = CharImm::make(1);
ExprHandle f = a >> b;
LLVMExprEval cg(f);
ASSERT_EQ(cg.value<int8_t>(), -2);
}
TEST(LLVM, LogicalRightShift) {
auto a = ByteImm::make(0xfc);
auto b = ByteImm::make(1);
ExprHandle f = a >> b;
LLVMExprEval cg(f);
ASSERT_EQ(cg.value<uint8_t>(), 0x7e);
}
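// Dynamic shape tests: the buffer extent is a VarHandle ("n") passed as an
// extra kernel argument, so one compiled kernel handles several runtime sizes.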
TEST(LLVM, DynamicShapeAdd) {
auto testWithSize = [](int32_t size) {
VarHandle n("n", kInt);
BufHandle a("a", {n}, kFloat);
BufHandle b("b", {n}, kFloat);
BufHandle c("c", {n}, kFloat);
VarHandle i("i", kInt);
StmtPtr s = For::make(i, 0, n, c.store({i}, a.load(i) + b.load(i)));
std::vector<float> aData(size, 1.0f);
std::vector<float> bData(size, 2.0f);
std::vector<float> cData(size, 0.0f);
LLVMCodeGen cg(s, {a, b, c, n});
std::vector<void*> args({aData.data(), bData.data(), cData.data(), &size});
cg.value<float>(args);
ExpectAllNear(cData, std::vector<float>(size, 3.0f), 1e-7);
};
testWithSize(1);
testWithSize(16);
testWithSize(37);
}
TEST(LLVM, BindDynamicShapeAdd) {
auto testWithSize = [](int32_t size) {
VarHandle n("n", kInt);
BufHandle a("a", {n}, kFloat);
BufHandle b("b", {n}, kFloat);
BufHandle c("c", {n}, kFloat);
VarHandle i("i", kInt);
StmtPtr s = For::make(i, 0, n, c.store({i}, a.load(i) + b.load(i)));
std::vector<float> aData(size, 1.0f);
std::vector<float> bData(size, 2.0f);
std::vector<float> cData(size, 0.0f);
LLVMCodeGen cg(s, {a, b, c, n});
cg.call({aData, bData, cData, size});
ExpectAllNear(cData, std::vector<float>(size, 3.0f), 1e-7);
};
testWithSize(1);
testWithSize(16);
testWithSize(37);
}
TEST(LLVM, TensorDynamicShapeAdd) {
auto testWithSize = [](int32_t size) {
VarHandle n("n", kInt);
BufHandle a("a", {n}, kFloat);
BufHandle b("b", {n}, kFloat);
Tensor c = Compute(
"c", {n}, [&](const VarHandle& i) { return a.load(i) + b.load(i); });
LoopNest l({c});
StmtPtr s = l.root_stmt();
LLVMCodeGen cg(s, {a, b, c, n});
std::vector<float> aData(size, 1.0f);
std::vector<float> bData(size, 2.0f);
std::vector<float> cData(size, 0.0f);
cg.call({aData, bData, cData, size});
ExpectAllNear(cData, std::vector<float>(size, 3.0f), 1e-7);
};
testWithSize(1);
testWithSize(16);
testWithSize(37);
}
TEST(LLVM, DynamicShape2D) {
auto testWithSize = [](int32_t M, int32_t N) {
VarHandle m("m", kInt);
VarHandle n("n", kInt);
BufHandle a("a", {m, n}, kFloat);
BufHandle b("b", {m, n}, kFloat);
Tensor c =
Compute("c", {m, n}, [&](const VarHandle& i, const VarHandle& j) {
return a.load(i, j) + b.load(i, j);
});
LoopNest l({c});
l.prepareForCodegen();
StmtPtr s = l.root_stmt();
LLVMCodeGen cg(s, {a, b, c, m, n});
std::vector<float> aData(M * N, 1.0f);
std::vector<float> bData(M * N, 2.0f);
std::vector<float> cData(M * N, 0.0f);
cg.call({aData, bData, cData, M, N});
ExpectAllNear(cData, std::vector<float>(M * N, 3.0f), 1e-7);
};
testWithSize(1, 8);
testWithSize(16, 32);
testWithSize(37, 11);
}
TEST(LLVM, EmptyStmt) {
StmtPtr s = alloc<Block>(std::vector<StmtPtr>({}));
LLVMCodeGen cg(s, {});
cg.call({});
// Just don't crash.
}
TEST(LLVM, EliminatedStmt) {
BufHandle a("a", {1}, kFloat);
Tensor c = Compute("c", {0}, [&](const VarHandle& m) { return m; });
LoopNest l({c});
l.prepareForCodegen();
StmtPtr s = l.root_stmt();
s = IRSimplifier::simplify(s);
LLVMCodeGen cg(s, {a, c});
std::vector<float> aData(1, 1.0f);
std::vector<float> cData(0, 0.0f);
cg.call({aData, cData});
}
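// Reduction tests: Reduce(..., Sum(), ...) sums over the reduction axes; the
// rfactor variants additionally split the reduction through a partial-sum
// buffer (rfac_buf) that is reduced again at the end.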
TEST(LLVM, SimpleReduction) {
int M = 128;
int N = 64;
BufHandle a("a", {1, M, N}, kFloat);
Tensor b = Reduce("sum", {1}, Sum(), a, {M, N});
LoopNest loop({b});
loop.prepareForCodegen();
StmtPtr s = loop.root_stmt();
s = IRSimplifier::simplify(s);
LLVMCodeGen cg(s, {a, b});
PaddedBuffer<float> a_v(1, M, N, "a_v");
PaddedBuffer<float> b_v(1, "b_v");
PaddedBuffer<float> b_ref(1, "b_ref");
b_ref(0) = 0;
for (const auto i : c10::irange(M)) {
for (const auto j : c10::irange(N)) {
int v = i + j;
a_v(0, i, j) = v;
b_ref(0) += v;
}
}
cg.call({a_v, b_v});
ExpectAllNear(b_v, b_ref, 1e-5);
}
TEST(LLVM, RFactorReduction) {
int M = 128;
int N = 64;
BufHandle a("a", {1, M, N}, kFloat);
Tensor b = Reduce("sum", {1}, Sum(), a, {M, N});
LoopNest loop({b});
std::vector<ForPtr> loops = loop.getLoopStmtsFor(b);
ForPtr loop_m = loops.at(1);
ForPtr loop_n = loops.at(2);
loop.reorderAxis(loop_m, loop_n);
loops = loop.getLoopStmtsFor(b);
loop_m = loops.at(2);
loop_n = loops.at(1);
auto b_body = loop.getAllWritesToBuf(b.buf())[1];
ASSERT_TRUE(loop.rfactor(b_body, loop_n));
loop.prepareForCodegen();
StmtPtr s = loop.root_stmt();
s = IRSimplifier::simplify(s);
LLVMCodeGen cg(s, {a, b});
PaddedBuffer<float> a_v(1, M, N, "a_v");
PaddedBuffer<float> b_v(1, "b_v");
PaddedBuffer<float> b_ref(1, "b_ref");
b_ref(0) = 0;
for (const auto i : c10::irange(M)) {
for (const auto j : c10::irange(N)) {
int v = i + j;
a_v(0, i, j) = v;
b_ref(0) += v;
}
}
cg.call({a_v, b_v});
ExpectAllNear(b_v, b_ref, 1e-5);
}
TEST(LLVM, RFactorVectorizedReduction) {
int M = 128;
int N = 64;
BufHandle a("a", {1, M, N}, kFloat);
Tensor b = Reduce("sum", {1}, Sum(), a, {M, N});
LoopNest loopnest({b});
std::vector<ForPtr> loops = loopnest.getLoopStmtsFor(b);
// Reorder n and m loops
loopnest.reorderAxis(loops.at(1), loops.at(2));
auto b_body = loopnest.getAllWritesToBuf(b.buf()).at(1);
auto all_loops = loopnest.getAllLoopNestsWritingToBuf(b.buf());
ASSERT_TRUE(all_loops.size() == 2 && all_loops[1].size() == 3);
ASSERT_TRUE(loopnest.rfactor(b_body, all_loops[1][1]));
auto distributed_loops = loopnest.distributeLoop(all_loops[1][1]);
// Vectorize initializer of rfac_buf
ASSERT_TRUE(LoopNest::vectorize(distributed_loops[0]));
// Vectorize producer of rfac_buf
ASSERT_TRUE(LoopNest::vectorize(distributed_loops[1]));
loopnest.simplify();
loopnest.prepareForCodegen();
StmtPtr s = IRSimplifier::simplify(loopnest.root_stmt());
LLVMCodeGen cg(s, {a, b});
PaddedBuffer<float> a_v(1, M, N, "a_v");
PaddedBuffer<float> b_v(1, "b_v");
PaddedBuffer<float> b_ref(1, "b_ref");
b_ref(0) = 0;
for (const auto i : c10::irange(M)) {
for (const auto j : c10::irange(N)) {
int v = i + j;
a_v(0, i, j) = v;
b_ref(0) += v;
}
}
cg.call({a_v, b_v});
ExpectAllNear(b_v, b_ref, 1e-5);
}
template <bool outer, bool inner>
static void testSimpleParallel() {
// Compute a simple operation, and try every combination of parallel and
// sequential loop axes.
const int M = 4;
const int N = 6;
Tensor f = Compute("f", {M, N}, [](const VarHandle& m, const VarHandle& n) {
return cast<float>(m + n);
});
LoopNest loop_nest({f});
auto const& loops = loop_nest.getLoopStmtsFor(f);
ForPtr m = loops[0];
ForPtr n = loops[1];
if (outer) {
m->set_parallel();
}
if (inner) {
n->set_parallel();
}
loop_nest.prepareForCodegen();
StmtPtr stmt = loop_nest.root_stmt();
LLVMCodeGen cg(stmt, {f});
PaddedBuffer<float> f_v(M, N, "f_v");
std::vector<void*> args({f_v.data()});
int value = cg.value<int>(args);
ASSERT_EQ(value, 0);
PaddedBuffer<float> f_ref(M, N, "f_ref");
for (const auto m : c10::irange(M)) {
for (const auto n : c10::irange(N)) {
f_ref(m, n) = m + n;
}
}
ExpectAllNear(f_v, f_ref, 1e-5);
}
TEST(LLVM, SimpleParallelSS) {
testSimpleParallel<false, false>();
}
TEST(LLVM, SimpleParallelSP) {
testSimpleParallel<false, true>();
}
TEST(LLVM, SimpleParallelPS) {
testSimpleParallel<true, false>();
}
TEST(LLVM, SimpleParallelPP) {
testSimpleParallel<true, true>();
}
TEST(LLVM, CompositeParallel) {
int loop_count = 6;
int test_count = 1 << loop_count;
// Compute a composite operation, and try every combination of parallel and
// sequential loop axes.
for (const auto test_cfg : c10::irange(test_count)) {
int M = 5;
int N = 7;
Tensor t1 = Compute("t1", {M}, [](const VarHandle& m) { return m + 1.f; });
Tensor t2 = Compute("t2", {N}, [](const VarHandle& n) { return n + 2.f; });
Tensor t3 =
Compute("t3", {M, N}, [=](const VarHandle& m, const VarHandle& n) {
return t1.load(m) * t2.load(n);
});
Tensor t4 =
Compute("t4", {M, N}, [=](const VarHandle& m, const VarHandle& n) {
return t3.load(m, n) + m + n;
});
LoopNest loop_nest({t4}, {t1, t2, t3, t4});
std::vector<ForPtr> loop_list;
{
auto const& loops = loop_nest.getLoopStmtsFor(t1);
loop_list.push_back(loops[0]);
}
{
auto const& loops = loop_nest.getLoopStmtsFor(t2);
loop_list.push_back(loops[0]);
}
{
auto const& loops = loop_nest.getLoopStmtsFor(t3);
loop_list.push_back(loops[0]);
loop_list.push_back(loops[1]);
}
{
auto const& loops = loop_nest.getLoopStmtsFor(t4);
loop_list.push_back(loops[0]);
loop_list.push_back(loops[1]);
}
ASSERT_EQ(loop_list.size(), loop_count);
for (const auto i : c10::irange(loop_count)) {
if (test_cfg & (1 << i)) {
loop_list[i]->set_parallel();
}
}
loop_nest.prepareForCodegen();
StmtPtr stmt = loop_nest.root_stmt();
LLVMCodeGen cg(stmt, {t4});
PaddedBuffer<float> t4_v(M, N, "t4_v");
std::vector<void*> args({t4_v.data()});
int value = cg.value<int>(args);
ASSERT_EQ(value, 0);
PaddedBuffer<float> t4_ref(M, N, "t4_ref");
for (const auto m : c10::irange(M)) {
for (const auto n : c10::irange(N)) {
t4_ref(m, n) = (m + 1) * (n + 2) + m + n;
}
}
ExpectAllNear(t4_v, t4_ref, 1e-5);
}
}
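// VectorizedGEMM: a 32x32x48 matmul scheduled by splitting m and n by 16,
// reordering so the k loop sits above the mi/ni tile loops, and vectorizing
// the inner tile loops.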
TEST(LLVM, VectorizedGEMM) {
int M = 32;
int N = 32;
int K = 48;
BufHandle AP("A", {M, K}, kFloat);
BufHandle BP("B", {K, N}, kFloat);
Tensor CT = Reduce(
"gemm",
{M, N},
Sum(),
[&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) {
return AP.load(m, k) * BP.load(k, n);
},
{K});
LoopNest loop({CT});
{
auto const& loops = loop.getLoopStmtsFor(CT);
ForPtr m = loops[0];
loop.splitWithMask(m, 16);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
ForPtr n = loops[2];
loop.splitWithMask(n, 16);
}
// mo, mi, no, ni, k ->
// mo, no, mi, ni, k
{
auto const& loops = loop.getLoopStmtsFor(CT);
ForPtr mi = loops[1];
ForPtr no = loops[2];
loop.reorderAxis(mi, no);
}
// mo, no, mi, ni, k ->
// mo, no, mi, k, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
ForPtr ni = loops[3];
ForPtr k = loops[4];
loop.reorderAxis(ni, k);
}
// mo, no, mi, k, ni ->
// mo, no, k, mi, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
ForPtr mi = loops[2];
ForPtr k = loops[3];
loop.reorderAxis(mi, k);
}
{
auto loops = NodeFinder<For>::find(loop.root_stmt());
ASSERT_TRUE(LoopNest::vectorize(loops[3]));
ASSERT_TRUE(LoopNest::vectorize(loops.back()));
}
loop.prepareForCodegen();
StmtPtr s = loop.root_stmt();
s = IRSimplifier::simplify(s);
LLVMCodeGen cg(s, {AP, BP, CT});
PaddedBuffer<float> a_v(M, K, "a_v");
PaddedBuffer<float> b_v(K, N, "b_v");
PaddedBuffer<float> c_v(M, N, "c_v");
PaddedBuffer<float> c_ref(M, N, "c_ref");
for (const auto m : c10::irange(M)) {
for (const auto n : c10::irange(N)) {
c_ref(m, n) = 0.f;
for (const auto k : c10::irange(K)) {
c_ref(m, n) += a_v(m, k) * b_v(k, n);
}
}
}
cg.call({a_v, b_v, c_v});
ExpectAllNear(c_v, c_ref, 1e-5);
}
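// CallRaw: exercise call_raw(), which takes raw void* arguments (buffer
// pointers plus a pointer to the dynamic dimension N) for both LLVMCodeGen
// and SimpleIREvaluator.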
TEST(LLVM, CallRaw) {
const int M = 32;
VarHandle N("N", kInt);
BufHandle a("a", {M, N}, kFloat);
BufHandle b("b", {N}, kFloat);
Tensor c = Compute("c", {M, N}, [&](const VarHandle& i, const VarHandle& j) {
return a.load(i, j) + b.load(j);
});
LoopNest l({c});
l.prepareForCodegen();
StmtPtr s = l.root_stmt();
int32_t N_value = 1024;
std::vector<float> av(M * N_value);
std::iota(av.begin(), av.end(), 0);
std::vector<float> bv(N_value);
std::iota(bv.begin(), bv.end(), 0);
std::vector<float> cv(M * N_value, 0);
std::vector<void*> args({av.data(), bv.data(), cv.data(), &N_value});
LLVMCodeGen cg(s, {a, b, BufHandle(c.buf()), N});
cg.call_raw(args);
for (const auto i : c10::irange(M)) {
for (const auto j : c10::irange(N_value)) {
ASSERT_EQ(cv[i * N_value + j], av[i * N_value + j] + bv[j]);
}
}
SimpleIREvaluator eval(s, {a, b, BufHandle(c.buf()), N});
eval.call_raw(args);
for (const auto i : c10::irange(M)) {
for (const auto j : c10::irange(N_value)) {
ASSERT_EQ(cv[i * N_value + j], av[i * N_value + j] + bv[j]);
}
}
}
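// CustomTarget: cross-compile for an i686-elf/i386 target and run FileCheck
// over the emitted assembly, expecting fadds/fmuls and no vfmadd
// (i.e. no fused multiply-add).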
TEST(LLVM, CustomTarget) {
constexpr int M = 16;
BufHandle a("a", {M}, kFloat);
BufHandle b("b", {M}, kFloat);
BufHandle c("c", {M}, kFloat);
Tensor d = Compute("d", {M}, [&](const VarHandle& m) {
return a.load(m) * b.load(m) + c.load(m);
});
LoopNest nest({d});
nest.prepareForCodegen();
auto cg = LLVMCodeGenBuilder(nest.root_stmt(), {a, b, c, d})
.triple("i686-elf")
.cpu("i386")
.build();
std::ostringstream ss;
ss << cg->getCodeText("asm");
torch::jit::testing::FileCheck()
.check("fadds")
->check("fmuls")
->check_not("vfmadd")
->run(ss.str());
}
TEST(LLVM, CodeGenKernelFuncName) {
BufHandle a("A", {1}, kInt);
BufHandle b("B", {1}, kInt);
std::vector<int32_t> a_buffer = {42};
std::vector<int32_t> b_buffer = {-11};
auto store = b.store({0}, a.load(0));
{
LLVMCodeGen cg(store, {a, b});
// Check that the kernel function name used by LLVMCodeGen
// is not empty.
ASSERT_NE(cg.kernel_func_name(), "");
}
{
LLVMCodeGen cg(store, {a, b}, at::kCPU, "new_func");
// Check that the kernel function name used by LLVMCodeGen
// is the one that was given above.
ASSERT_EQ(cg.kernel_func_name(), "new_func");
}
}
} // namespace jit
} // namespace torch
#endif // TORCH_ENABLE_LLVM