diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp
deleted file mode 100644
index c9783300a54..00000000000
--- a/benchmarks/cpp/tensorexpr/bench_approx.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-#include <benchmark/benchmark.h>
-#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
-#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
-#include <torch/csrc/jit/tensorexpr/loopnest.h>
-#include <torch/csrc/jit/tensorexpr/tensor.h>
-#include <torch/torch.h>
-
-using namespace torch::jit::tensorexpr;
-
-static void log_sleef(benchmark::State& state) {
-  KernelScope ks;
-  auto N = VarHandle("N", kInt);
-  Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
-      Compute("B", {N}, [&](const VarHandle& i) {
-        return log(A.load(i));
-      });
-  LoopNest ln({B});
-  ln.prepareForCodegen();
-  ln.vectorizeInnerLoops();
-  Stmt* s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto B_ref = at::log(A_t);
-  cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)});
-  assert(at::allclose(B_t, B_ref));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)});
-  }
-  state.counters["log/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
-}
-
-static void log_fast(benchmark::State& state) {
-  KernelScope ks;
-  auto N = VarHandle("N", kInt);
-  Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
-      Compute("B", {N}, [&](const VarHandle& i) {
-        return fast_log(A.load(i));
-      });
-  LoopNest ln({B});
-  ln.prepareForCodegen();
-  ln.vectorizeInnerLoops();
-  Stmt* s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto B_ref = at::log(A_t);
-  cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)});
-  assert(at::allclose(B_t, B_ref));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)});
-  }
-  state.counters["log/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
-}
-
-static void log_aten(benchmark::State& state) {
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  for (auto _ : state) {
-    at::native::log_out(B_t, A_t);
-  }
-  state.counters["log/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
-}
-
-static void logit_fast(benchmark::State& state) {
-  KernelScope ks;
-  auto N = VarHandle("N", kInt);
-  Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
-      Compute("B", {N}, [&](const VarHandle& i) {
-        auto A_elem = A.load(i);
-        return fast_log(A_elem / (FloatImm::make(1.0f) - A_elem));
-      });
-  LoopNest ln({B});
-  ln.prepareForCodegen();
-  ln.vectorizeInnerLoops();
-  Stmt* s = ln.root_stmt();
-  s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(B);
-  args.emplace_back(A);
-  args.emplace_back(N);
-  LLVMCodeGen cg(s, args);
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  auto B_ref = at::logit(A_t);
-  cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)});
-  assert(at::allclose(B_t, B_ref));
-  for (auto _ : state) {
-    cg.call({B_t.data_ptr(), A_t.data_ptr(), state.range(0)});
-  }
-  state.counters["logit/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
-}
-
-static void logit_aten(benchmark::State& state) {
-  at::Tensor A_t = torch::abs(torch::randn({state.range(0)}));
-  at::Tensor B_t = torch::randn({state.range(0)});
-  for (auto _ : state) {
-    at::native::logit_out(B_t, A_t);
-  }
-  state.counters["logit/s"] = benchmark::Counter(
-      uint64_t(state.range(0) * state.iterations()), benchmark::Counter::kIsRate);
-}
-
-BENCHMARK(log_sleef)
-    ->Args({2<<5})
-    ->Args({2<<8})
-    ->Args({2<<12})
-    ->Args({2<<14});
-BENCHMARK(log_fast)
-    ->Args({2<<5})
-    ->Args({2<<8})
-    ->Args({2<<12})
-    ->Args({2<<14});
-BENCHMARK(log_aten)
-    ->Args({2<<5})
-    ->Args({2<<8})
-    ->Args({2<<12})
-    ->Args({2<<14});
-BENCHMARK(logit_fast)
-    ->Args({2<<5})
-    ->Args({2<<8})
-    ->Args({2<<12})
-    ->Args({2<<14});
-BENCHMARK(logit_aten)
-    ->Args({2<<5})
-    ->Args({2<<8})
-    ->Args({2<<12})
-    ->Args({2<<14});
diff --git a/test/cpp/tensorexpr/test_aten.cpp b/test/cpp/tensorexpr/test_aten.cpp
index a87de8143ba..39ddeb7822a 100644
--- a/test/cpp/tensorexpr/test_aten.cpp
+++ b/test/cpp/tensorexpr/test_aten.cpp
@@ -733,38 +733,6 @@ TEST(ATen, logFloat) {
   }
 }
 
-TEST(ATen, fastLogFloat) {
-  KernelScope kernel_scope;
-  const int kTotalSize = 128 * 128;
-  Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
-  Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
-
-  VarHandle index = VarHandle("index", kInt);
-  ExprHandle load_a = a_buf.load(index);
-  Stmt* store_b = b_buf.store({index}, fast_log(load_a));
-  Stmt* stmt = For::make(index, 0, kTotalSize, store_b);
-
-  PaddedBuffer<float> a_v(kTotalSize);
-  PaddedBuffer<float> b_v(kTotalSize);
-
-  for (int i = 0; i < kTotalSize; ++i) {
-    a_v(i) = at::randn({1}).item().to<float>();
-  }
-
-  SimpleIREvaluator ir_eval(stmt, a_buf, b_buf);
-  ir_eval(a_v, b_v);
-
-  for (int i = 0; i < kTotalSize; ++i) {
-    auto test = b_v(i);
-    auto ref = std::log(a_v(i));
-    if (std::isnan(ref)) {
-      ASSERT_EQ(std::isnan(test), true);
-    } else {
-      ASSERT_FLOAT_EQ(test, ref);
-    }
-  }
-}
-
 TEST(ATen, log10Float) {
   KernelScope kernel_scope;
   const int kTotalSize = 128;
diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp
index 975ef962f51..c1d3392fff3 100644
--- a/test/cpp/tensorexpr/test_llvm.cpp
+++ b/test/cpp/tensorexpr/test_llvm.cpp
@@ -217,38 +217,6 @@ TEST(LLVM, BitCast) {
   }
 }
 
-TEST(LLVM, fastLogFloat) {
-  KernelScope kernel_scope;
-  const int kTotalSize = 128 * 128;
-  Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
-  Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
-
-  VarHandle index = VarHandle("index", kInt);
-  ExprHandle load_a = a_buf.load(index);
-  Stmt* store_b = b_buf.store({index}, fast_log(load_a));
-  Stmt* stmt = For::make(index, 0, kTotalSize, store_b);
-
-  PaddedBuffer<float> a_v(kTotalSize);
-  PaddedBuffer<float> b_v(kTotalSize);
-
-  for (int i = 0; i < kTotalSize; ++i) {
-    a_v(i) = at::randn({1}).item().to<float>();
-  }
-
-  LLVMCodeGen ir_eval(stmt, {a_buf, b_buf});
-  ir_eval.call({a_v, b_v});
-
-  for (int i = 0; i < kTotalSize; ++i) {
-    auto test = b_v(i);
-    auto ref = std::log(a_v(i));
-    if (std::isnan(ref)) {
-      ASSERT_EQ(std::isnan(test), true);
-    } else {
-      ASSERT_FLOAT_EQ(test, ref);
-    }
-  }
-}
-
 TEST(LLVM, LetTest01) {
   KernelScope kernel_scope;
diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h
index a9b04c57d82..f01c4969285 100644
--- a/torch/csrc/jit/tensorexpr/eval.h
+++ b/torch/csrc/jit/tensorexpr/eval.h
@@ -337,12 +337,9 @@ class SimpleIREvaluator : public CodeGen, public IRVisitor {
     std::vector<T> result_v(lhs_v.size());
     for (size_t i = 0; i < lhs_v.size(); i++) {
       switch (op_type) {
-        case IRNodeType::kLshift: {
-          typename std::make_unsigned<T>::type a =
-              static_cast<typename std::make_unsigned<T>::type>(lhs_v[i]);
-          result_v[i] = a << rhs_v[i];
+        case IRNodeType::kLshift:
+          result_v[i] = lhs_v[i] << rhs_v[i];
           break;
-        }
         case IRNodeType::kRshift:
           result_v[i] = lhs_v[i] >> rhs_v[i];
           break;
diff --git a/torch/csrc/jit/tensorexpr/expr.cpp b/torch/csrc/jit/tensorexpr/expr.cpp
index 267120d8a70..f8a9bfed3b6 100644
--- a/torch/csrc/jit/tensorexpr/expr.cpp
+++ b/torch/csrc/jit/tensorexpr/expr.cpp
@@ -128,40 +128,6 @@ ExprHandle fabs(const ExprHandle& v) {
   return Intrinsics::make(kFabs, v);
 }
 
-ExprHandle fast_log(const ExprHandle& v) {
-  // this implementation is taken from sleef:
-  // https://github.com/shibatch/sleef/blob/master/src/libm/sleefsp.c#L1131
-  // to generate coefficients, this tool is provided
-  // https://github.com/shibatch/sleef/blob/master/src/gencoef/gencoef.txt
-  auto ilogb2kf = [](ExprHandle x) {
-    auto y = (bitcast<int32_t>(x) >> IntImm::make(23)) & IntImm::make(0xff);
-    return y - IntImm::make(0x7f);
-  };
-
-  auto ldexp3kf = [](ExprHandle x, ExprHandle e) {
-    return bitcast<float>(bitcast<int32_t>(x) + (e << IntImm::make(23)));
-  };
-  auto e = ilogb2kf(v * FloatImm::make(1.0 / 0.75));
-  auto m = ldexp3kf(v, IntImm::make(-1) * e);
-  auto one = FloatImm::make(1.0f);
-  auto x = (m - one) / (m + one);
-  auto x2 = x * x;
-
-  auto mlaf = [](ExprHandle x, ExprHandle y, float z) {
-    return x * y + FloatImm::make(z);
-  };
-
-  auto t = FloatImm::make(0.2392828464508056640625);
-  t = mlaf(t, x2, 0.28518211841583251953125);
-  t = mlaf(t, x2, 0.400005877017974853515625);
-  t = mlaf(t, x2, 0.666666686534881591796875);
-  t = mlaf(t, x2, 2.0);
-  x = x * t + FloatImm::make(0.693147180559945286226764) * e;
-  x = IfThenElse::make(v < FloatImm::make(0), FloatImm::make(std::numeric_limits<float>::quiet_NaN()), x);
-  x = IfThenElse::make(v == FloatImm::make(0), FloatImm::make(-std::numeric_limits<float>::infinity()), x);
-  return x;
-}
-
 ExprHandle log(const ExprHandle& v) {
   return Intrinsics::make(kLog, v);
 }
diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h
index b2693497583..8ba9966cb6b 100644
--- a/torch/csrc/jit/tensorexpr/expr.h
+++ b/torch/csrc/jit/tensorexpr/expr.h
@@ -290,7 +290,6 @@ TORCH_API ExprHandle exp(const ExprHandle& v);
 TORCH_API ExprHandle expm1(const ExprHandle& v);
 TORCH_API ExprHandle fabs(const ExprHandle& v);
 TORCH_API ExprHandle log(const ExprHandle& v);
-TORCH_API ExprHandle fast_log(const ExprHandle& v);
 TORCH_API ExprHandle log2(const ExprHandle& v);
 TORCH_API ExprHandle log10(const ExprHandle& v);
 TORCH_API ExprHandle log1p(const ExprHandle& v);
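Note on the removed approximation: fast_log() lowered the SLEEF single-precision log kernel (sleefsp.c) into TE expression nodes. Below is a minimal standalone C++ sketch of the same scalar math, for reference only. The range-reduction constant, polynomial coefficients, and the NaN / -infinity special cases are taken from the deleted expr.cpp code above; the helper names (ilogb2kf, ldexp3kf, fast_log_scalar) and the memcpy-based bit reinterpretation are illustrative stand-ins for the removed bitcast/IfThenElse nodes and are not part of this diff.

// Standalone scalar sketch (not part of this diff) of the approximation the
// deleted fast_log() encoded as TE IR. Coefficients and special cases match
// the removed expr.cpp code; helper names follow sleefsp.c.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

namespace {

// Unbiased binary exponent of x, read straight from the float bit pattern.
int ilogb2kf(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  return static_cast<int>((bits >> 23) & 0xff) - 0x7f;
}

// Scale x by 2^e by adding e directly to the exponent field.
float ldexp3kf(float x, int e) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits += static_cast<uint32_t>(e) << 23;
  float r;
  std::memcpy(&r, &bits, sizeof(r));
  return r;
}

float fast_log_scalar(float v) {
  // Range reduction: v = m * 2^e with m near 1, so log(v) = log(m) + e*ln(2).
  int e = ilogb2kf(v * (1.0f / 0.75f));
  float m = ldexp3kf(v, -e);

  // Polynomial in x = (m - 1) / (m + 1), an atanh-style expansion of log(m).
  float x = (m - 1.0f) / (m + 1.0f);
  float x2 = x * x;
  float t = 0.2392828464508056640625f;
  t = t * x2 + 0.28518211841583251953125f;
  t = t * x2 + 0.400005877017974853515625f;
  t = t * x2 + 0.666666686534881591796875f;
  t = t * x2 + 2.0f;
  x = x * t + 0.693147180559945286226764f * static_cast<float>(e);

  // Special cases the removed code expressed as IfThenElse selects.
  if (v < 0.0f) {
    return std::numeric_limits<float>::quiet_NaN();
  }
  if (v == 0.0f) {
    return -std::numeric_limits<float>::infinity();
  }
  return x;
}

} // namespace

int main() {
  // Sanity check against libm, mirroring what the deleted fastLogFloat tests
  // asserted: NaN for negative inputs, -inf at zero, close match elsewhere.
  const float inputs[] = {0.0f, 0.5f, 1.0f, 4.0f, 1e6f, -1.0f};
  for (float v : inputs) {
    std::printf("v=%-10g fast_log=%-14g std::log=%-14g\n",
                static_cast<double>(v),
                static_cast<double>(fast_log_scalar(v)),
                static_cast<double>(std::log(v)));
  }
  return 0;
}

The TE version computed the polynomial unconditionally and folded the two special cases in with IfThenElse at the end, which presumably keeps the loop produced by vectorizeInnerLoops() free of data-dependent branches; the scalar sketch uses ordinary ifs for readability.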