diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 65e5342fe4b..72a9df77076 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -71,6 +71,12 @@ static cl::opt<bool> ExperimentalVectorShuffleLowering( cl::desc("Enable an experimental vector shuffle lowering code path."), cl::Hidden); +static cl::opt<int> ReciprocalEstimateRefinementSteps( + "x86-recip-refinement-steps", cl::init(1), + cl::desc("Specify the number of Newton-Raphson iterations applied to the " + "result of the hardware reciprocal estimate instruction."), + cl::NotHidden); + // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -14543,9 +14549,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, // along with FMA, this could be a throughput win. if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || (Subtarget->hasAVX() && VT == MVT::v8f32)) { - // TODO: Expose this as a user-configurable parameter to allow for - // speed vs. accuracy flexibility. 
- RefinementSteps = 1; + RefinementSteps = ReciprocalEstimateRefinementSteps; return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); } return SDValue(); diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll index dd5563c965f..83b86accdb3 100644 --- a/test/CodeGen/X86/recip-fastmath.ll +++ b/test/CodeGen/X86/recip-fastmath.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+use-recip-est,+avx -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE ; If the target's divss/divps instructions are substantially ; slower than rcpss/rcpps with a Newton-Raphson refinement, @@ -21,11 +22,23 @@ define float @reciprocal_estimate(float %x) #0 { ; BTVER2-LABEL: reciprocal_estimate: ; BTVER2: vrcpss -; BTVER2-NEXT: vmulss -; BTVER2-NEXT: vsubss -; BTVER2-NEXT: vmulss -; BTVER2-NEXT: vaddss +; BTVER2: vmulss +; BTVER2: vsubss +; BTVER2: vmulss +; BTVER2: vaddss ; BTVER2-NEXT: retq + +; REFINE-LABEL: reciprocal_estimate: +; REFINE: vrcpss +; REFINE: vmulss +; REFINE: vsubss +; REFINE: vmulss +; REFINE: vaddss +; REFINE: vmulss +; REFINE: vsubss +; REFINE: vmulss +; REFINE: vaddss +; REFINE-NEXT: retq } define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 { @@ -40,11 +53,23 @@ define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 { ; BTVER2-LABEL: reciprocal_estimate_v4f32: ; BTVER2: vrcpps -; BTVER2-NEXT: vmulps -; BTVER2-NEXT: vsubps -; BTVER2-NEXT: vmulps -; BTVER2-NEXT: vaddps +; BTVER2: vmulps +; BTVER2: vsubps +; BTVER2: vmulps +; BTVER2: vaddps ; BTVER2-NEXT: retq + +; REFINE-LABEL: reciprocal_estimate_v4f32: +; REFINE: vrcpps +; REFINE: vmulps +; REFINE: vsubps +; REFINE: vmulps +; REFINE: vaddps +; REFINE: vmulps +; REFINE: vsubps +; REFINE: vmulps +; REFINE: vaddps +; REFINE-NEXT: retq } define <8 x float> 
@reciprocal_estimate_v8f32(<8 x float> %x) #0 { @@ -62,11 +87,23 @@ define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 { ; BTVER2-LABEL: reciprocal_estimate_v8f32: ; BTVER2: vrcpps -; BTVER2-NEXT: vmulps -; BTVER2-NEXT: vsubps -; BTVER2-NEXT: vmulps -; BTVER2-NEXT: vaddps +; BTVER2: vmulps +; BTVER2: vsubps +; BTVER2: vmulps +; BTVER2: vaddps ; BTVER2-NEXT: retq + +; REFINE-LABEL: reciprocal_estimate_v8f32: +; REFINE: vrcpps +; REFINE: vmulps +; REFINE: vsubps +; REFINE: vmulps +; REFINE: vaddps +; REFINE: vmulps +; REFINE: vsubps +; REFINE: vmulps +; REFINE: vaddps +; REFINE-NEXT: retq } attributes #0 = { "unsafe-fp-math"="true" }