diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc index e08ce45186d..3d49ec34d9b 100644 --- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc +++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc @@ -100,7 +100,7 @@ operator()(llvm::Module& module) const { CHECK(!llvm::verifyModule(module, &llvm::dbgs())); - runtime::RewriteIRRuntimeFunctions(&module); + runtime::RewriteIRRuntimeFunctions(&module, enable_fast_math_); // Buffer for holding machine code prior to constructing the ObjectFile. llvm::SmallVector stream_buffer; diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.h b/tensorflow/compiler/xla/service/cpu/compiler_functor.h index 2fbcc83f538..e9dbd416c5a 100644 --- a/tensorflow/compiler/xla/service/cpu/compiler_functor.h +++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.h @@ -42,7 +42,7 @@ class CompilerFunctor { explicit CompilerFunctor( llvm::TargetMachine* target_machine, const Disassembler* disassembler, - int opt_level, bool optimize_for_size, + int opt_level, bool optimize_for_size, bool enable_fast_math, const VectorIntrinsics& available_intrinsics, LLVMCompiler::ModuleHook pre_optimization_hook = nullptr, LLVMCompiler::ModuleHook post_optimization_hook = nullptr) @@ -50,6 +50,7 @@ class CompilerFunctor { disassembler_(CHECK_NOTNULL(disassembler)), opt_level_(opt_level), optimize_for_size_(optimize_for_size), + enable_fast_math_(enable_fast_math), available_intrinsics_(available_intrinsics), pre_optimization_hook_(pre_optimization_hook), post_optimization_hook_(post_optimization_hook) {} @@ -72,6 +73,7 @@ class CompilerFunctor { const Disassembler* disassembler_; const unsigned opt_level_; const bool optimize_for_size_; + const bool enable_fast_math_; const VectorIntrinsics available_intrinsics_; LLVMCompiler::ModuleHook pre_optimization_hook_; LLVMCompiler::ModuleHook post_optimization_hook_; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 19d96933bd9..eca9b0f4bef 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -442,6 +442,7 @@ StatusOr> CpuCompiler::Compile( CompilerTargetOptions(module->config()), CodeGenOptLevel(module->config()), options::OptimizeForSizeRequested(module->config()), + module->config().debug_options().xla_enable_fast_math(), pre_optimization_ir_hook, post_optimization_ir_hook); llvm_module->setDataLayout(jit->data_layout()); llvm_module->setTargetTriple(jit->target_triple().getTriple()); @@ -794,6 +795,7 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, CompilerFunctor compiler_functor( target_machine.get(), &disassembler, opt_level, options::OptimizeForSizeRequested(module->config()), + module->config().debug_options().xla_enable_fast_math(), CompilerFunctor::AllIntrinsics(), pre_optimization_ir_dump_hook, post_optimization_ir_dump_hook); llvm::object::OwningBinary object_file = diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc index 77e4425aa28..424306a194b 100644 --- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc @@ -30,9 +30,33 @@ const char* const kTanhV4F32SymbolName = "__xla_cpu_runtime_TanhV4F32"; const char* const kTanhV8F32SymbolName = "__xla_cpu_runtime_TanhV8F32"; namespace { +llvm::Value* EmitFMinOrMax(llvm::IRBuilder<>* ir_builder, llvm::Module* module, + llvm::Type* vector_type, llvm::Value* lhs, + llvm::Value* rhs, bool is_min, + bool enable_fast_math) { + if (enable_fast_math) { + // Using an unordered comparison lets LLVM generate a vminps / vmaxps + // instruction on x86. vminps/vmaxps choose the second operand if either + // operand is a NaN and thus don't accurately implement the semantics of the + // minnum and maxnum intrinsics, necessitating different IR emission. + // + // We can _probably_ do this even when fast math is disabled, but we can + // certainly do this if fast math is enabled (and nnan applies). + auto* compare = ir_builder->CreateFCmp( + is_min ? llvm::FCmpInst::FCMP_ULE : llvm::FCmpInst::FCMP_UGE, lhs, rhs); + return ir_builder->CreateSelect(compare, lhs, rhs); + } else { + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration( + module, is_min ? llvm::Intrinsic::minnum : llvm::Intrinsic::maxnum, + vector_type); + return ir_builder->CreateCall(intrinsic, {lhs, rhs}); + } +} + llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module, llvm::StringRef function_name, - int vector_width) { + int vector_width, + bool enable_fast_math) { llvm::Function* vector_tanh_function = module->getFunction(function_name); if (vector_tanh_function == nullptr) { // If the function declaration is not present in the module, there can't be @@ -45,11 +69,6 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module, llvm::VectorType* vector_type = llvm::VectorType::get(float_type, vector_width); - llvm::Function* min_intrinsic = llvm::Intrinsic::getDeclaration( - module, llvm::Intrinsic::minnum, vector_type); - llvm::Function* max_intrinsic = llvm::Intrinsic::getDeclaration( - module, llvm::Intrinsic::maxnum, vector_type); - llvm::BasicBlock* vector_tanh_body = llvm::BasicBlock::Create(*context, "body", vector_tanh_function); @@ -59,15 +78,24 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module, fast_math_flags.setUnsafeAlgebra(); ir_builder.setFastMathFlags(fast_math_flags); + auto emit_fmin = [&](llvm::Value* lhs, llvm::Value* rhs) { + return EmitFMinOrMax(&ir_builder, module, vector_type, lhs, rhs, + /*is_min=*/true, + /*enable_fast_math=*/enable_fast_math); + }; + auto emit_fmax = [&](llvm::Value* lhs, llvm::Value* rhs) { + return EmitFMinOrMax(&ir_builder, module, vector_type, lhs, rhs, + /*is_min=*/false, + /*enable_fast_math=*/enable_fast_math); + }; + llvm::Value* input = &*vector_tanh_function->arg_begin(); CHECK_EQ(input->getType(), vector_type); // This implements the same rational interpolant as implemented in Eigen3. - llvm::Value* input_clamped = ir_builder.CreateCall( - min_intrinsic, - {ir_builder.CreateCall(max_intrinsic, - {input, llvm::ConstantFP::get(vector_type, -9.0)}), - llvm::ConstantFP::get(vector_type, 9.0)}); + llvm::Value* input_clamped = + emit_fmin(emit_fmax(input, llvm::ConstantFP::get(vector_type, -9.0)), + llvm::ConstantFP::get(vector_type, 9.0)); std::array numerator_coeffs( {{-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, @@ -105,11 +133,13 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module, } } // namespace -void RewriteIRRuntimeFunctions(llvm::Module* module) { - auto* tanh_v4f32 = EmitVectorF32TanhIfNeeded(module, kTanhV4F32SymbolName, - /*vector_width=*/4); - auto* tanh_v8f32 = EmitVectorF32TanhIfNeeded(module, kTanhV8F32SymbolName, - /*vector_width=*/8); +void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math) { + auto* tanh_v4f32 = + EmitVectorF32TanhIfNeeded(module, kTanhV4F32SymbolName, + /*vector_width=*/4, enable_fast_math); + auto* tanh_v8f32 = + EmitVectorF32TanhIfNeeded(module, kTanhV8F32SymbolName, + /*vector_width=*/8, enable_fast_math); // Gather all the call sites, force inline them and then delete the vector // function bodies. diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h index 4a0c9d89469..3082b39b634 100644 --- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h @@ -33,7 +33,7 @@ extern const char* const kTanhV8F32SymbolName; // |LinkIRRuntimeFunctions| rewrites calls to these functions into generic LLVM // IR. -void RewriteIRRuntimeFunctions(llvm::Module* module); +void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math); } // namespace runtime } // namespace cpu diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index b527af287aa..f45e30ce0d6 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -171,7 +171,7 @@ CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() { SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options, llvm::CodeGenOpt::Level opt_level, - bool optimize_for_size, + bool optimize_for_size, bool enable_fast_math, LLVMCompiler::ModuleHook pre_optimization_hook, LLVMCompiler::ModuleHook post_optimization_hook) : target_machine_( @@ -186,12 +186,12 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options, data_layout_(target_machine_->createDataLayout()), object_layer_( [] { return std::make_shared(); }), - compile_layer_( - object_layer_, - CompilerFunctor(target_machine_.get(), &disassembler_, opt_level, - optimize_for_size, GetAvailableIntrinsics(), - std::move(pre_optimization_hook), - std::move(post_optimization_hook))) { + compile_layer_(object_layer_, + CompilerFunctor(target_machine_.get(), &disassembler_, + opt_level, optimize_for_size, + enable_fast_math, GetAvailableIntrinsics(), + std::move(pre_optimization_hook), + std::move(post_optimization_hook))) { VLOG(1) << "CPU target: " << target_machine_->getTargetCPU().str() << " features: " << target_machine_->getTargetFeatureString().str(); } diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h index 1e084879e1c..331e18bc8b3 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h @@ -63,6 +63,7 @@ class SimpleOrcJIT { // level optimizations are applied. SimpleOrcJIT(const llvm::TargetOptions& target_options, llvm::CodeGenOpt::Level opt_level, bool optimize_for_size, + bool enable_fast_math, LLVMCompiler::ModuleHook pre_optimization_hook, LLVMCompiler::ModuleHook post_optimization_hook); diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 557c7868410..c11c7faadf3 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -292,6 +292,7 @@ StatusOr ElementalIrEmitter::EmitFloatBinaryOp( llvm::Value* ElementalIrEmitter::EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value) const { + // TODO(b/64580527): We can do better here if fast-math is enabled. return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::maxnum, {lhs_value, rhs_value}, {lhs_value->getType()}, ir_builder_); @@ -299,6 +300,7 @@ llvm::Value* ElementalIrEmitter::EmitFloatMax(llvm::Value* lhs_value, llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value) const { + // TODO(b/64580527): We can do better here if fast-math is enabled. return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::minnum, {lhs_value, rhs_value}, {lhs_value->getType()}, ir_builder_);