Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/52165

Apparently bitwise identicality is too high a bar (I'm seeing differences at this level depending on the HW platform, e.g., Broadwell is bitwise accurate but Skylake is 1 ulp off). But anyways VML is accurate to 1 ulp, so let's allow that.

ghstack-source-id: 121815001

Test Plan: test_approx

Reviewed By: asuhan

Differential Revision: D26408079

fbshipit-source-id: 46cbd1487c72ae7bc40567f2f72ed2b919707d0d
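As context for the tolerance change, here is a minimal, illustrative sketch (not part of the file below; plain standard C++, no PyTorch) of what a 1 ulp relative tolerance means when written as rtol = std::numeric_limits<float>::epsilon(), the same tolerance the test below passes to torch::allclose:

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  float ref = std::log(1.5f);
  // A hypothetical kernel result exactly one ulp above the reference.
  float approx = std::nextafter(ref, 2.0f);
  float eps = std::numeric_limits<float>::epsilon();
  // With rtol = eps and atol = 0, the allclose-style check
  //   |approx - ref| <= eps * |ref|
  // admits a discrepancy of about 1 ulp for normal floats.
  assert(std::fabs(approx - ref) <= eps * std::fabs(ref));
  return 0;
}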
98 lines
3.0 KiB
C++
#ifdef TORCH_ENABLE_LLVM

#include <gtest/gtest.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
#include <cstring>
#include <sstream> // for std::stringstream used in diffs() below

using namespace torch::indexing;
namespace te = torch::jit::tensorexpr;

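// Helper: split the loop over `target` into chunks of `width` iterations and
// vectorize the inner loop; any remainder is handled by the generated tail loop.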
static void vectorize(te::LoopNest* ln, te::Tensor* target, int width) {
  auto loops = ln->getLoopStmtsFor(target);
  te::For *outer, *inner, *tail;
  ln->splitWithTail(loops[0], width, &outer, &inner, &tail);
  ln->vectorize(inner);
}

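// Helper: report how many elements of `a` and `b` differ and the largest
// absolute difference, for use in assertion failure messages.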
std::string diffs(const at::Tensor& a, const at::Tensor& b) {
  auto diff = torch::abs(a.flatten() - b.flatten());
  auto count_diffs = torch::sum(diff > 0.f);
  auto greatest_diff_index = torch::argmax(diff);
  std::stringstream ss;
  ss << "Found " << count_diffs << " unequal element(s). "
     << "The greatest difference was " << diff.index({greatest_diff_index})
     << " at index " << greatest_diff_index;
  return ss.str();
}

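// Compare the TE-generated, vectorized log_vml kernel against at::log across
// full sweeps of mantissa values at several exponents, plus inf/NaN and
// denormal inputs.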
TEST(Approx, log_vml) {
  te::KernelScope ks;
  te::VarHandle N("N", te::kInt);
  te::Placeholder A("A", te::kFloat, {N});
  te::Tensor* B = te::Compute(
      "B", {N}, [&](const te::VarHandle& i) { return log_vml(A.load(i)); });

  te::LoopNest ln({B});
  ln.prepareForCodegen();
  vectorize(&ln, B, 8);
  te::Stmt* s = ln.root_stmt();
  s = te::IRSimplifier::simplify(s);
  te::LLVMCodeGen cg(s, {A, B, N});

  auto eps = std::numeric_limits<float>::epsilon();
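  // Run the generated kernel on A_t and check it against at::log elementwise.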
  auto test = [&](const at::Tensor& A_t) {
    at::Tensor B_ref = at::log(A_t);
    at::Tensor B_t = at::empty_like(A_t);
    auto ap = A_t.data_ptr<float>();
    auto bp = B_t.data_ptr<float>();
    cg.call({ap, bp, A_t.numel()});
    // Results should match the reference to within 1 ulp (rtol = eps, atol = 0);
    // exact bitwise equality is not required since it varies across HW platforms.
    ASSERT_TRUE(torch::allclose(
        B_t, B_ref, /*rtol=*/eps, /*atol=*/0.0f, /*equal_nan=*/true))
        << "Input[:8]\n"
        << A_t.index({Slice(0, 8)}) << "\n"
        << "Test[:8]\n"
        << B_t.index({Slice(0, 8)}) << "\n"
        << "Ref[:8]\n"
        << B_ref.index({Slice(0, 8)}) << diffs(B_t, B_ref);
  };

  // Generate every single-precision FP value in [1.0, 2.0).
  at::Tensor A_t = torch::arange(1.0f, 2.0f, eps);
  ASSERT_EQ(A_t.numel(), 1 << 23);

  test(A_t);

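  // Multiplying by an exact power of two only shifts the exponent, so these
  // sweeps reuse the full mantissa coverage above at other binades.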
  test(A_t * 2.0f);
  test(A_t * 0.5f);

  test(A_t * 4.0f);
  test(A_t * 0.25f);

  test(A_t * powf(2.0f, 16));
  test(A_t * powf(2.0f, -16));

  test(A_t * powf(2.0f, 126));
  test(A_t * powf(2.0f, -126));

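  // Special values should propagate: log(+inf) is +inf and log(NaN) is NaN
  // (the NaN comparison passes because allclose is called with equal_nan=true).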
  test(torch::full({32}, INFINITY));
  test(torch::full({32}, NAN));

  auto min = std::numeric_limits<float>::min();
  auto denorm_min = std::numeric_limits<float>::denorm_min();

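  // Sweep zero and the denormal range: arange(0, FLT_MIN, denorm_min)
  // yields 2^23 values, as the ASSERT_EQ below confirms.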
  // Denormals aren't bit-precise, because sleef isn't bit-precise either.
  A_t = torch::arange(0.0f, min, denorm_min);
  ASSERT_EQ(A_t.numel(), 1 << 23);
  auto B_ref = at::log(A_t);
  auto B_t = at::empty_like(B_ref);
  cg.call({A_t.data_ptr<float>(), B_t.data_ptr<float>(), A_t.numel()});
  ASSERT_TRUE(torch::allclose(B_t, B_ref));
}

#endif // TORCH_ENABLE_LLVM