LoopVectorize: Vectorize math builtin calls.

This properly asks TargetLibraryInfo whether a call is available and, if it
is, translates it into the corresponding LLVM builtin. We don't vectorize sqrt()
yet because I'm not sure about the semantics for negative numbers. The other
intrinsics should be exact equivalents of the libm functions.

Differential Revision: http://llvm-reviews.chandlerc.com/D465

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176188 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Benjamin Kramer 2013-02-27 15:24:19 +00:00
parent f404449e23
commit 2fe71f8f7e
2 changed files with 161 additions and 50 deletions

View File

@ -79,6 +79,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
@ -138,10 +139,11 @@ class LoopVectorizationCostModel;
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
DominatorTree *DT, DataLayout *DL, unsigned VecWidth,
DominatorTree *DT, DataLayout *DL,
const TargetLibraryInfo *TLI, unsigned VecWidth,
unsigned UnrollFactor)
: OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth),
UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
: OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
OldInduction(0), WidenMap(UnrollFactor) {}
// Perform the actual loop widening (vectorization).
@ -268,6 +270,9 @@ private:
DominatorTree *DT;
/// Data Layout.
DataLayout *DL;
/// Target Library Info.
const TargetLibraryInfo *TLI;
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
unsigned VF;
@ -320,8 +325,9 @@ class LoopVectorizationLegality {
public:
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
DominatorTree *DT, TargetTransformInfo* TTI,
AliasAnalysis* AA)
: TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), Induction(0) {}
AliasAnalysis *AA, TargetLibraryInfo *TLI)
: TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
Induction(0) {}
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
@ -504,6 +510,8 @@ private:
TargetTransformInfo *TTI;
/// Alias Analysis.
AliasAnalysis *AA;
/// Target Library Info.
TargetLibraryInfo *TLI;
// --- vectorization state --- //
@ -540,8 +548,8 @@ public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
DataLayout *DL)
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL) {}
DataLayout *DL, const TargetLibraryInfo *TLI)
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
/// Information about vectorization costs
struct VectorizationFactor {
@ -614,6 +622,8 @@ private:
const TargetTransformInfo &TTI;
/// Target data layout information.
DataLayout *DL;
/// Target Library Info.
const TargetLibraryInfo *TLI;
};
/// The LoopVectorize Pass.
@ -631,6 +641,7 @@ struct LoopVectorize : public LoopPass {
TargetTransformInfo *TTI;
DominatorTree *DT;
AliasAnalysis *AA;
TargetLibraryInfo *TLI;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
@ -643,19 +654,20 @@ struct LoopVectorize : public LoopPass {
TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTree>();
AA = getAnalysisIfAvailable<AliasAnalysis>();
TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
// Check if it is legal to vectorize the loop.
LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA);
LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
// Use the cost model.
LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL);
LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
// Check the function attributes to find out if this function should be
// optimized for size.
@ -689,7 +701,7 @@ struct LoopVectorize : public LoopPass {
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
// If we decided that it is *legal* to vectorize the loop then do it.
InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF.Width, UF);
InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
@ -1438,34 +1450,108 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
}
}
static bool
isTriviallyVectorizableIntrinsic(Instruction *Inst) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
if (!II)
return false;
switch (II->getIntrinsicID()) {
case Intrinsic::sqrt:
case Intrinsic::sin:
case Intrinsic::cos:
case Intrinsic::exp:
case Intrinsic::exp2:
case Intrinsic::log:
case Intrinsic::log10:
case Intrinsic::log2:
case Intrinsic::fabs:
case Intrinsic::floor:
case Intrinsic::ceil:
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::pow:
case Intrinsic::fma:
case Intrinsic::fmuladd:
return true;
default:
return false;
/// Classify a call instruction by the intrinsic that corresponds to it.
///
/// \param CI  The call to classify.
/// \param TLI Target library information used to recognize math library
///            functions by name. May be null, in which case only direct
///            intrinsic calls are recognized.
/// \returns The ID of the matching intrinsic, or Intrinsic::not_intrinsic if
///          the call is neither one of the intrinsics listed below nor a
///          recognized library function that has a case in the second switch.
static Intrinsic::ID
getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
// If we have an intrinsic call, check if it is trivially vectorizable.
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
switch (II->getIntrinsicID()) {
case Intrinsic::sqrt:
case Intrinsic::sin:
case Intrinsic::cos:
case Intrinsic::exp:
case Intrinsic::exp2:
case Intrinsic::log:
case Intrinsic::log10:
case Intrinsic::log2:
case Intrinsic::fabs:
case Intrinsic::floor:
case Intrinsic::ceil:
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::pow:
case Intrinsic::fma:
case Intrinsic::fmuladd:
// Already one of the accepted intrinsics: hand its ID back unchanged.
return II->getIntrinsicID();
default:
// Any other intrinsic is rejected.
return Intrinsic::not_intrinsic;
}
}
// NOTE(review): the next line looks like residue of the removed
// isTriviallyVectorizableIntrinsic() body in this diff rendering; as written
// it would make everything below unreachable -- confirm against the real file.
return false;
// Without library info we cannot recognize library calls by name.
if (!TLI)
return Intrinsic::not_intrinsic;
LibFunc::Func Func;
Function *F = CI->getCalledFunction();
// We're going to make assumptions on the semantics of the functions, check
// that the target knows that it's available in this environment.
// A null F (no callee returned by getCalledFunction) is rejected as well.
if (!F || !TLI->getLibFunc(F->getName(), Func))
return Intrinsic::not_intrinsic;
// Otherwise check if we have a call to a function that can be turned into a
// vector intrinsic.
// Each group below lists the plain, `f`-suffixed and `l`-suffixed variant of
// one library function. sqrt, fma and fmuladd appear in the intrinsic list
// above but have no library-function case here.
switch (Func) {
default:
break;
case LibFunc::sin:
case LibFunc::sinf:
case LibFunc::sinl:
return Intrinsic::sin;
case LibFunc::cos:
case LibFunc::cosf:
case LibFunc::cosl:
return Intrinsic::cos;
case LibFunc::exp:
case LibFunc::expf:
case LibFunc::expl:
return Intrinsic::exp;
case LibFunc::exp2:
case LibFunc::exp2f:
case LibFunc::exp2l:
return Intrinsic::exp2;
case LibFunc::log:
case LibFunc::logf:
case LibFunc::logl:
return Intrinsic::log;
case LibFunc::log10:
case LibFunc::log10f:
case LibFunc::log10l:
return Intrinsic::log10;
case LibFunc::log2:
case LibFunc::log2f:
case LibFunc::log2l:
return Intrinsic::log2;
case LibFunc::fabs:
case LibFunc::fabsf:
case LibFunc::fabsl:
return Intrinsic::fabs;
case LibFunc::floor:
case LibFunc::floorf:
case LibFunc::floorl:
return Intrinsic::floor;
case LibFunc::ceil:
case LibFunc::ceilf:
case LibFunc::ceill:
return Intrinsic::ceil;
case LibFunc::trunc:
case LibFunc::truncf:
case LibFunc::truncl:
return Intrinsic::trunc;
case LibFunc::rint:
case LibFunc::rintf:
case LibFunc::rintl:
return Intrinsic::rint;
case LibFunc::nearbyint:
case LibFunc::nearbyintf:
case LibFunc::nearbyintl:
return Intrinsic::nearbyint;
case LibFunc::pow:
case LibFunc::powf:
case LibFunc::powl:
return Intrinsic::pow;
}
// A recognized library function with no case above falls through to here.
return Intrinsic::not_intrinsic;
}
/// This function translates the reduction kind to an LLVM binary operator.
@ -1991,17 +2077,17 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
}
case Instruction::Call: {
assert(isTriviallyVectorizableIntrinsic(it));
Module *M = BB->getParent()->getParent();
IntrinsicInst *II = cast<IntrinsicInst>(it);
Intrinsic::ID ID = II->getIntrinsicID();
CallInst *CI = cast<CallInst>(it);
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
assert(ID && "Not an intrinsic call!");
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value*, 4> Args;
for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) {
VectorParts &Arg = getVectorValue(II->getArgOperand(i));
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
Args.push_back(Arg[Part]);
}
Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
Function *F = Intrinsic::getDeclaration(M, ID, Tys);
Entry[Part] = Builder.CreateCall(F, Args);
}
@ -2222,7 +2308,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// We still don't handle functions.
CallInst *CI = dyn_cast<CallInst>(it);
if (CI && !isTriviallyVectorizableIntrinsic(it)) {
if (CI && !getIntrinsicIDForCall(CI, TLI)) {
DEBUG(dbgs() << "LV: Found a call site.\n");
return false;
}
@ -3305,13 +3391,14 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
assert(isTriviallyVectorizableIntrinsic(I));
IntrinsicInst *II = cast<IntrinsicInst>(I);
Type *RetTy = ToVectorTy(II->getType(), VF);
CallInst *CI = cast<CallInst>(I);
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
assert(ID && "Not an intrinsic call!");
Type *RetTy = ToVectorTy(CI->getType(), VF);
SmallVector<Type*, 4> Tys;
for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
return TTI.getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
}
default: {
// We are scalarizing the instruction. Return the cost of the scalar

View File

@ -902,6 +902,30 @@ for.end: ; preds = %for.body, %entry
ret void
}
; Regression test: a loop that calls the library function fabsf (not the
; llvm.fabs intrinsic) on every element of %x and stores the result back in
; place. The expected output is a call to the <4 x float> fabs intrinsic.
; CHECK: fabs_libm
; CHECK: call <4 x float> @llvm.fabs.v4f32
; CHECK: ret void
define void @fabs_libm(float* nocapture %x) nounwind {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
; Induction variable: starts at 0 and is incremented by 1 each iteration.
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float* %x, i64 %indvars.iv
%0 = load float* %arrayidx, align 4
; Plain call to the external fabsf declaration; this is the call under test.
%call = tail call float @fabsf(float %0) nounwind readnone
store float %call, float* %arrayidx, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
; Exit once the incremented index, truncated to i32, equals 1024.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 1024
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}
declare float @fabsf(float) nounwind readnone
declare double @llvm.pow.f64(double, double) nounwind readnone
!0 = metadata !{metadata !"float", metadata !1}