From d02540a1d7d367f4f7616637606b9eaf321cf30a Mon Sep 17 00:00:00 2001
From: Cameron Esfahani
Date: Thu, 5 Feb 2015 02:09:33 +0000
Subject: [PATCH] Value soft float calls as more expensive in the inliner.

Summary: When evaluating floating point instructions in the inliner, ask
the TTI whether the operation is expensive.  By default, it is not an
expensive operation; this keeps the default behavior the same as before.
The ARM TTI has been updated to return TCC_Expensive for targets that
don't have hardware floating point.

Reviewers: chandlerc, echristo

Reviewed By: echristo

Subscribers: t.p.northover, aemerson, llvm-commits

Differential Revision: http://reviews.llvm.org/D6936

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@228263 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h   |  10 ++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   2 +
 include/llvm/CodeGen/BasicTTIImpl.h           |   6 +
 lib/Analysis/IPA/InlineCost.cpp               |  19 +++
 lib/Analysis/TargetTransformInfo.cpp          |   4 +
 lib/Target/ARM/ARMSubtarget.h                 |   3 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp     |  19 +++
 lib/Target/ARM/ARMTargetTransformInfo.h       |   2 +
 test/Transforms/Inline/inline-fp.ll           | 136 ++++++++++++++++++
 9 files changed, 200 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/Inline/inline-fp.ll

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 53da46019b3..1c3bcdbdf74 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -325,6 +325,10 @@ public:
   /// \brief Return true if the hardware has a fast square-root instruction.
   bool haveFastSqrt(Type *Ty) const;
 
+  /// \brief Return the expected cost of supporting the floating point operation
+  /// of the specified type.
+  unsigned getFPOpCost(Type *Ty) const;
+
   /// \brief Return the expected cost of materializing for the given integer
   /// immediate of the specified type.
   unsigned getIntImmCost(const APInt &Imm, Type *Ty) const;
@@ -516,6 +520,7 @@ public:
   virtual bool shouldBuildLookupTables() = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
+  virtual unsigned getFPOpCost(Type *Ty) = 0;
   virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) = 0;
   virtual unsigned getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
                                  Type *Ty) = 0;
@@ -631,6 +636,11 @@ public:
     return Impl.getPopcntSupport(IntTyWidthInBit);
   }
   bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
+
+  unsigned getFPOpCost(Type *Ty) override {
+    return Impl.getFPOpCost(Ty);
+  }
+
   unsigned getIntImmCost(const APInt &Imm, Type *Ty) override {
     return Impl.getIntImmCost(Imm, Ty);
   }
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 488f96e6ade..b90f6888238 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -239,6 +239,8 @@ public:
   bool haveFastSqrt(Type *Ty) { return false; }
 
+  unsigned getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Basic; }
+
   unsigned getIntImmCost(const APInt &Imm, Type *Ty) { return TTI::TCC_Basic; }
 
   unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index 61664327244..53cdcac7258 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -167,6 +167,12 @@ public:
            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
   }
 
+  unsigned getFPOpCost(Type *Ty) {
+    // By default, assume FP instructions are implemented in HW and cost no
+    // more than a basic operation.  Target-specific TTI can override this.
+    return TargetTransformInfo::TCC_Basic;
+  }
+
   void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) {
     // This unrolling functionality is target independent, but to provide some
     // motivation for its intended use, for x86:
diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp
index 166488bf67e..c180f36b923 100644
--- a/lib/Analysis/IPA/InlineCost.cpp
+++ b/lib/Analysis/IPA/InlineCost.cpp
@@ -907,6 +907,25 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
       ++NumVectorInstructions;
 
+    // If the instruction is floating point, and the target says this operation is
+    // expensive or the function has the "use-soft-float" attribute, this may
+    // eventually become a library call.  Treat the cost as such.
+    if (I->getType()->isFloatingPointTy()) {
+      bool hasSoftFloatAttr = false;
+
+      // If the function has the "use-soft-float" attribute, mark it as expensive.
+      if (F.hasFnAttribute("use-soft-float")) {
+        Attribute Attr = F.getFnAttribute("use-soft-float");
+        StringRef Val = Attr.getValueAsString();
+        if (Val == "true")
+          hasSoftFloatAttr = true;
+      }
+
+      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
+          hasSoftFloatAttr)
+        Cost += InlineConstants::CallPenalty;
+    }
+
     // If the instruction simplified to a constant, there is no cost to this
     // instruction.  Visit the instructions using our InstVisitor to account for
     // all of the per-instruction logic.  The visit tree returns true if we
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 5a50d36ee8b..b5440e2a2c3 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -148,6 +148,10 @@ bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
   return TTIImpl->haveFastSqrt(Ty);
 }
 
+unsigned TargetTransformInfo::getFPOpCost(Type *Ty) const {
+  return TTIImpl->getFPOpCost(Ty);
+}
+
 unsigned TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const {
   return TTIImpl->getIntImmCost(Imm, Ty);
 }
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index dd1ab3aabcf..64d499a4dc5 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -310,7 +310,8 @@ public:
   bool hasCRC() const { return HasCRC; }
   bool hasVirtualization() const { return HasVirtualization; }
   bool useNEONForSinglePrecisionFP() const {
-    return hasNEON() && UseNEONForSinglePrecisionFP; }
+    return hasNEON() && UseNEONForSinglePrecisionFP;
+  }
   bool hasDivide() const { return HasHardwareDivide; }
   bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 1cb1efb1924..4e1b371640b 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -314,6 +314,25 @@ unsigned ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
   return 1;
 }
 
+unsigned ARMTTIImpl::getFPOpCost(Type *Ty) {
+  // Use similar logic to that in ARMISelLowering:
+  // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access
+  // to VFP.
+
+  if (ST->hasVFP2() && !ST->isThumb1Only()) {
+    if (Ty->isFloatTy()) {
+      return TargetTransformInfo::TCC_Basic;
+    }
+
+    if (Ty->isDoubleTy()) {
+      return ST->isFPOnlySP() ? TargetTransformInfo::TCC_Expensive :
+        TargetTransformInfo::TCC_Basic;
+    }
+  }
+
+  return TargetTransformInfo::TCC_Expensive;
+}
+
 unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                     Type *SubTp) {
   // We only handle costs of reverse and alternate shuffles for now.
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 6167fae9c8a..97590f60893 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -114,6 +114,8 @@ public:
   unsigned getAddressComputationCost(Type *Val, bool IsComplex);
 
+  unsigned getFPOpCost(Type *Ty);
+
   unsigned getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
diff --git a/test/Transforms/Inline/inline-fp.ll b/test/Transforms/Inline/inline-fp.ll
new file mode 100644
index 00000000000..4d18ce87a5a
--- /dev/null
+++ b/test/Transforms/Inline/inline-fp.ll
@@ -0,0 +1,136 @@
+; RUN: opt -S -inline < %s | FileCheck %s
+; Make sure that soft float implementations are considered more expensive
+; by the inliner.
+
+define i32 @test_nofp() #0 {
+; f_nofp() has the "use-soft-float" attribute, so it should never get inlined.
+; CHECK-LABEL: test_nofp
+; CHECK: call float @f_nofp
+entry:
+  %responseX = alloca i32, align 4
+  %responseY = alloca i32, align 4
+  %responseZ = alloca i32, align 4
+  %valueX = alloca i8, align 1
+  %valueY = alloca i8, align 1
+  %valueZ = alloca i8, align 1
+
+  call void @getX(i32* %responseX, i8* %valueX)
+  call void @getY(i32* %responseY, i8* %valueY)
+  call void @getZ(i32* %responseZ, i8* %valueZ)
+
+  %0 = load i32* %responseX
+  %1 = load i8* %valueX
+  %call = call float @f_nofp(i32 %0, i8 zeroext %1)
+  %2 = load i32* %responseZ
+  %3 = load i8* %valueZ
+  %call2 = call float @f_nofp(i32 %2, i8 zeroext %3)
+  %call3 = call float @fabsf(float %call)
+  %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
+  br i1 %cmp, label %if.end12, label %if.else
+
+if.else:                                          ; preds = %entry
+  %4 = load i32* %responseY
+  %5 = load i8* %valueY
+  %call1 = call float @f_nofp(i32 %4, i8 zeroext %5)
+  %call4 = call float @fabsf(float %call1)
+  %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
+  br i1 %cmp5, label %if.end12, label %if.else7
+
+if.else7:                                         ; preds = %if.else
+  %call8 = call float @fabsf(float %call2)
+  %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
+  br i1 %cmp9, label %if.then10, label %if.end12
+
+if.then10:                                        ; preds = %if.else7
+  br label %if.end12
+
+if.end12:                                         ; preds = %if.else, %entry, %if.then10, %if.else7
+  %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
+  ret i32 %success.0
+}
+
+define i32 @test_hasfp() #0 {
+; f_hasfp() does not have the "use-soft-float" attribute, so it should get inlined.
+; CHECK-LABEL: test_hasfp
+; CHECK-NOT: call float @f_hasfp
+entry:
+  %responseX = alloca i32, align 4
+  %responseY = alloca i32, align 4
+  %responseZ = alloca i32, align 4
+  %valueX = alloca i8, align 1
+  %valueY = alloca i8, align 1
+  %valueZ = alloca i8, align 1
+
+  call void @getX(i32* %responseX, i8* %valueX)
+  call void @getY(i32* %responseY, i8* %valueY)
+  call void @getZ(i32* %responseZ, i8* %valueZ)
+
+  %0 = load i32* %responseX
+  %1 = load i8* %valueX
+  %call = call float @f_hasfp(i32 %0, i8 zeroext %1)
+  %2 = load i32* %responseZ
+  %3 = load i8* %valueZ
+  %call2 = call float @f_hasfp(i32 %2, i8 zeroext %3)
+  %call3 = call float @fabsf(float %call)
+  %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
+  br i1 %cmp, label %if.end12, label %if.else
+
+if.else:                                          ; preds = %entry
+  %4 = load i32* %responseY
+  %5 = load i8* %valueY
+  %call1 = call float @f_hasfp(i32 %4, i8 zeroext %5)
+  %call4 = call float @fabsf(float %call1)
+  %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
+  br i1 %cmp5, label %if.end12, label %if.else7
+
+if.else7:                                         ; preds = %if.else
+  %call8 = call float @fabsf(float %call2)
+  %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
+  br i1 %cmp9, label %if.then10, label %if.end12
+
+if.then10:                                        ; preds = %if.else7
+  br label %if.end12
+
+if.end12:                                         ; preds = %if.else, %entry, %if.then10, %if.else7
+  %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
+  ret i32 %success.0
+}
+
+declare void @getX(i32*, i8*) #0
+
+declare void @getY(i32*, i8*) #0
+
+declare void @getZ(i32*, i8*) #0
+
+define internal float @f_hasfp(i32 %response, i8 zeroext %value1) #0 {
+entry:
+  %conv = zext i8 %value1 to i32
+  %sub = add nsw i32 %conv, -1
+  %conv1 = sitofp i32 %sub to float
+  %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+  %mul = fmul float %0, 2.620000e+03
+  %conv2 = sitofp i32 %response to float
+  %sub3 = fsub float %conv2, %mul
+  %div = fdiv float %sub3, %mul
+  ret float %div
+}
+
+define internal float @f_nofp(i32 %response, i8 zeroext %value1) #1 {
+entry:
+  %conv = zext i8 %value1 to i32
+  %sub = add nsw i32 %conv, -1
+  %conv1 = sitofp i32 %sub to float
+  %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+  %mul = fmul float %0, 2.620000e+03
+  %conv2 = sitofp i32 %response to float
+  %sub3 = fsub float %conv2, %mul
+  %div = fdiv float %sub3, %mul
+  ret float %div
+}
+
+declare float @fabsf(float) optsize minsize
+
+declare float @llvm.pow.f32(float, float) optsize minsize
+
+attributes #0 = { minsize optsize }
+attributes #1 = { minsize optsize "use-soft-float"="true" }
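
Post-patch note: the snippet below is a minimal sketch, not part of the change, of how a cost-model client can consume the getFPOpCost() hook introduced above. It relies only on interfaces visible in this patch (TargetTransformInfo::getFPOpCost, the TCC_* cost constants, and InlineConstants::CallPenalty); the free function fpOverheadCost() and its standalone framing are hypothetical, and the "use-soft-float" attribute path from InlineCost.cpp is deliberately omitted for brevity.

// Minimal sketch (hypothetical helper, not part of the patch): charge an FP
// instruction like a call when the target reports its type as expensive,
// e.g. soft-float targets or f64 on an FP-only-SP ARM core.
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static unsigned fpOverheadCost(const TargetTransformInfo &TTI,
                               const Instruction &I) {
  // Only floating point results are of interest here.
  if (!I.getType()->isFloatingPointTy())
    return 0;
  // getFPOpCost() defaults to TCC_Basic; targets without hardware FP can
  // return TCC_Expensive, in which case the operation likely lowers to a
  // library call and is charged accordingly.
  if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
    return InlineConstants::CallPenalty;
  return 0;
}

Keeping the query type-based rather than per-opcode keeps the hook cheap for callers like the inliner, which only need a coarse basic-versus-expensive signal per floating point type.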