diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h index 84287fb5d76..3a1809a80a0 100644 --- a/include/llvm/Target/TargetOptions.h +++ b/include/llvm/Target/TargetOptions.h @@ -30,12 +30,20 @@ namespace llvm { }; } + namespace FPOpFusion { + enum FPOpFusionMode { + Fast, // Enable fusion of FP ops wherever it's profitable. + Standard, // Only allow fusion of 'blessed' ops (currently just fmuladd). + Strict // Never fuse FP-ops. + }; + } + class TargetOptions { public: TargetOptions() : PrintMachineCode(false), NoFramePointerElim(false), NoFramePointerElimNonLeaf(false), LessPreciseFPMADOption(false), - AllowExcessFPPrecision(false), UnsafeFPMath(false), NoInfsFPMath(false), + UnsafeFPMath(false), NoInfsFPMath(false), NoNaNsFPMath(false), HonorSignDependentRoundingFPMathOption(false), UseSoftFloat(false), NoZerosInBSS(false), JITExceptionHandling(false), JITEmitDebugInfo(false), JITEmitDebugInfoToDisk(false), @@ -43,7 +51,8 @@ namespace llvm { StackAlignmentOverride(0), RealignStack(true), DisableJumpTables(false), EnableFastISel(false), PositionIndependentExecutable(false), EnableSegmentedStacks(false), - UseInitArray(false), TrapFuncName(""), FloatABIType(FloatABI::Default) + UseInitArray(false), TrapFuncName(""), FloatABIType(FloatABI::Default), + AllowFPOpFusion(FPOpFusion::Standard) {} /// PrintMachineCode - This flag is enabled when the -print-machineinstrs @@ -74,14 +83,6 @@ namespace llvm { unsigned LessPreciseFPMADOption : 1; bool LessPreciseFPMAD() const; - /// AllowExcessFPPrecision - This flag is enabled when the - /// -enable-excess-fp-precision flag is specified on the command line. This - /// flag is OFF by default. When it is turned on, the code generator is - /// allowed to produce results that are "more precise" than IEEE allows. - /// This includes use of FMA-like operations and use of the X86 FP registers - /// without rounding all over the place. 
- unsigned AllowExcessFPPrecision : 1; - /// UnsafeFPMath - This flag is enabled when the /// -enable-unsafe-fp-math flag is specified on the command line. When /// this flag is off (the default), the code generator is not allowed to @@ -189,6 +190,25 @@ namespace llvm { /// Such a combination is unfortunately popular (e.g. arm-apple-darwin). /// Hard presumes that the normal FP ABI is used. FloatABI::ABIType FloatABIType; + + /// AllowFPOpFusion - This flag is set by the -fuse-fp-ops=xxx option. + /// This controls the creation of fused FP ops that store intermediate + /// results in higher precision than IEEE allows (e.g. FMAs). + /// + /// Fast mode - allows formation of fused FP ops whenever they're + /// profitable. + /// Standard mode - allow fusion only for 'blessed' FP ops. At present the + /// only blessed op is the fmuladd intrinsic. In the future more blessed ops + /// may be added. + /// Strict mode - allow fusion only if/when it can be proven that the excess + /// precision won't affect the result. + /// + /// Note: This option only controls formation of fused ops by the optimizers. + /// Fused operations that are explicitly specified (e.g. FMA via the + /// llvm.fma.* intrinsic) will always be honored, regardless of the value of + /// this option. 
+ FPOpFusion::FPOpFusionMode AllowFPOpFusion; + }; } // End llvm namespace diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7babf4ab953..0bdd8b85497 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5644,7 +5644,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { N0.getOperand(1), N1)); // FADD -> FMA combines: - if ((DAG.getTarget().Options.AllowExcessFPPrecision || + if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || DAG.getTarget().Options.UnsafeFPMath) && DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) && TLI.isOperationLegal(ISD::FMA, VT)) { @@ -5721,7 +5721,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } // FSUB -> FMA combines: - if ((DAG.getTarget().Options.AllowExcessFPPrecision || + if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || DAG.getTarget().Options.UnsafeFPMath) && DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) && TLI.isOperationLegal(ISD::FMA, VT)) { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 4152aa1ae16..50fd45e88bc 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4934,7 +4934,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; case Intrinsic::fmuladd: { EVT VT = TLI.getValueType(I.getType()); - if (TLI.isOperationLegal(ISD::FMA, VT) && TLI.isFMAFasterThanMulAndAdd(VT)){ + if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && + TLI.isOperationLegal(ISD::FMA, VT) && + TLI.isFMAFasterThanMulAndAdd(VT)){ setValue(&I, DAG.getNode(ISD::FMA, dl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 81e3527a6f0..67f050131bf 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ 
b/lib/Target/ARM/ARMInstrInfo.td @@ -236,7 +236,8 @@ def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; // Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available. // But only select them if more precision in FP computation is allowed. // Do not use them for Darwin platforms. -def UseFusedMAC : Predicate<"TM.Options.AllowExcessFPPrecision && " +def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" + " FPOpFusion::Fast) && " "!Subtarget->isTargetDarwin()">; def DontUseFusedMAC : Predicate<"!Subtarget->hasVFP4() || " "Subtarget->isTargetDarwin()">; diff --git a/test/CodeGen/ARM/fusedMAC.ll b/test/CodeGen/ARM/fusedMAC.ll index 0cc1cddf218..725dd274e37 100644 --- a/test/CodeGen/ARM/fusedMAC.ll +++ b/test/CodeGen/ARM/fusedMAC.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 -enable-excess-fp-precision | FileCheck %s +; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 -fuse-fp-ops=fast | FileCheck %s ; Check generated fused MAC and MLS. 
define double @fusedMACTest1(double %d1, double %d2, double %d3) { diff --git a/test/CodeGen/PowerPC/a2-fp-basic.ll b/test/CodeGen/PowerPC/a2-fp-basic.ll index a47e662cc87..a4370fa452a 100644 --- a/test/CodeGen/PowerPC/a2-fp-basic.ll +++ b/test/CodeGen/PowerPC/a2-fp-basic.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=ppc64 -mcpu=a2 -enable-excess-fp-precision | FileCheck %s +; RUN: llc < %s -march=ppc64 -mcpu=a2 -fuse-fp-ops=fast | FileCheck %s %0 = type { double, double } diff --git a/test/CodeGen/PowerPC/fma.ll b/test/CodeGen/PowerPC/fma.ll index 02847147edb..4e05c279b79 100644 --- a/test/CodeGen/PowerPC/fma.ll +++ b/test/CodeGen/PowerPC/fma.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=ppc32 -enable-excess-fp-precision | \ +; RUN: llc < %s -march=ppc32 -fuse-fp-ops=fast | \ ; RUN: egrep {fn?madd|fn?msub} | count 8 define double @test_FMADD1(double %A, double %B, double %C) { diff --git a/test/CodeGen/PowerPC/ppc440-fp-basic.ll b/test/CodeGen/PowerPC/ppc440-fp-basic.ll index 25ec5f892c5..6884570a8aa 100644 --- a/test/CodeGen/PowerPC/ppc440-fp-basic.ll +++ b/test/CodeGen/PowerPC/ppc440-fp-basic.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=ppc32 -mcpu=440 -enable-excess-fp-precision | FileCheck %s +; RUN: llc < %s -march=ppc32 -mcpu=440 -fuse-fp-ops=fast | FileCheck %s %0 = type { double, double } diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp index b303cec3b51..e08bad93b07 100644 --- a/tools/llc/llc.cpp +++ b/tools/llc/llc.cpp @@ -155,11 +155,6 @@ DisableFPElimNonLeaf("disable-non-leaf-fp-elim", cl::desc("Disable frame pointer elimination optimization for non-leaf funcs"), cl::init(false)); -static cl::opt -EnableExcessPrecision("enable-excess-fp-precision", - cl::desc("Enable optimizations that may increase FP precision"), - cl::init(false)); - static cl::opt EnableUnsafeFPMath("enable-unsafe-fp-math", cl::desc("Enable optimizations that may decrease FP precision"), @@ -199,6 +194,19 @@ FloatABIForCalls("float-abi", "Hard float ABI (uses FP registers)"), 
clEnumValEnd)); +static cl::opt<FPOpFusion::FPOpFusionMode> +FuseFPOps("fuse-fp-ops", + cl::desc("Enable aggressive formation of fused FP ops"), + cl::init(FPOpFusion::Standard), + cl::values( + clEnumValN(FPOpFusion::Fast, "fast", + "Fuse FP ops whenever profitable"), + clEnumValN(FPOpFusion::Standard, "standard", + "Only fuse 'blessed' FP ops."), + clEnumValN(FPOpFusion::Strict, "strict", + "Only fuse FP ops when the result won't be affected."), + clEnumValEnd)); + static cl::opt DontPlaceZerosInBSS("nozero-initialized-in-bss", cl::desc("Don't place zero-initialized symbols into bss section"), @@ -404,7 +412,7 @@ int main(int argc, char **argv) { Options.LessPreciseFPMADOption = EnableFPMAD; Options.NoFramePointerElim = DisableFPElim; Options.NoFramePointerElimNonLeaf = DisableFPElimNonLeaf; - Options.AllowExcessFPPrecision = EnableExcessPrecision; + Options.AllowFPOpFusion = FuseFPOps; Options.UnsafeFPMath = EnableUnsafeFPMath; Options.NoInfsFPMath = EnableNoInfsFPMath; Options.NoNaNsFPMath = EnableNoNaNsFPMath;