R600: Implement getRecipEstimate

This requires a new hook to prevent expanding sqrt in terms of rsqrt and reciprocal. v_rcp_f32, v_rsq_f32, and v_sqrt_f32 all execute at the same rate, so this expansion would just double the number of instructions and cycles. (git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225828 91177308-0d34-0410-b5e6-96231b3b80d8)
2025-11-01 15:17:25 +00:00 · 2015-01-13 20:53:23 +00:00
parent 8603a3d1c5
commit 7c06364dc0
5 changed files with 42 additions and 2 deletions
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -403,6 +403,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
  // large sequence of instructions.
  setIntDivIsCheap(false);
  setPow2SDivIsCheap(false);
+  setFsqrtIsCheap(true);

  // FIXME: Need to really handle these.
  MaxStoresPerMemcpy  = 4096;
@@ -2585,6 +2586,28 @@ SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
  return SDValue();
 }

+SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
+                                               DAGCombinerInfo &DCI,
+                                               unsigned &RefinementSteps) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = Operand.getValueType();
+
+  if (VT == MVT::f32) {
+    // Reciprocal, < 1 ulp error.
+    //
+    // This reciprocal approximation converges to < 0.5 ulp error with one
+    // Newton-Raphson step performed with two fused multiply-adds (FMAs).
+    // That step is not asked for here: RefinementSteps is set to 0 below.
+    RefinementSteps = 0;
+    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
+  }
+
+  // TODO: There is also an f64 rcp instruction, but the documentation is less
+  // clear on its precision.
+
+  return SDValue();
+}
+
 static void computeKnownBitsForMinMax(const SDValue Op0,
                                      const SDValue Op1,
                                      APInt &KnownZero,