diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4e89ac7c093..40e00984c85 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20,6 +20,7 @@ #include "VecUtils.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/ADT/MapVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -47,7 +48,7 @@ namespace { /// The SLPVectorizer Pass. struct SLPVectorizer : public FunctionPass { - typedef std::map StoreListMap; + typedef MapVector StoreListMap; /// Pass identification, replacement for typeid static char ID; diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index 50d2af0f653..21e6cdde307 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -46,7 +46,7 @@ namespace llvm { BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) : - BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { + Builder(S->getContext()), BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { numberInstructions(); } @@ -121,6 +121,7 @@ bool BoUpSLP::vectorizeStoreChain(ArrayRef Chain, int CostThreshold) { DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < CostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Operands,VF))); vectorizeTree(Operands, VF); i += VF - 1; Changed = true; @@ -131,7 +132,7 @@ bool BoUpSLP::vectorizeStoreChain(ArrayRef Chain, int CostThreshold) { } bool BoUpSLP::vectorizeStores(ArrayRef Stores, int costThreshold) { - ValueSet Heads, Tails; + SetVector Heads, Tails; SmallDenseMap ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the @@ -152,7 +153,8 @@ bool BoUpSLP::vectorizeStores(ArrayRef Stores, int costThreshold) { } // For stores that start but don't end a link in the chain: - for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) { + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { if (Tails.count(*it)) continue; // We found a store instr that starts a chain. Now follow the chain and try @@ -224,9 +226,14 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { } void BoUpSLP::vectorizeArith(ArrayRef Operands) { + int LastIdx = getLastIndex(Operands, Operands.size()); + Instruction *Loc = getInsertionPoint(LastIdx); + Builder.SetInsertPoint(Loc); + + assert(getFirstUserIndex(Operands, Operands.size()) > LastIdx && + "Vectorizing with in-tree users"); + Value *Vec = vectorizeTree(Operands, Operands.size()); - BasicBlock::iterator Loc = cast(Vec); - IRBuilder<> Builder(++Loc); // After vectorizing the operands we need to generate extractelement // instructions and replace all of the uses of the scalar values with // the values that we extracted from the vectorized tree. @@ -246,7 +253,13 @@ int BoUpSLP::getTreeCost(ArrayRef VL) { MustExtract.clear(); // Find the location of the last root. - unsigned LastRootIndex = InstrIdx[GetLastInstr(VL, VL.size())]; + int LastRootIndex = getLastIndex(VL, VL.size()); + int FirstUserIndex = getFirstUserIndex(VL, VL.size()); + + // Don't vectorize if there are users of the tree roots inside the tree + // itself. + if (LastRootIndex > FirstUserIndex) + return max_cost; // Scan the tree and find which value is used by which lane, and which values // must be scalarized. @@ -254,7 +267,7 @@ int BoUpSLP::getTreeCost(ArrayRef VL) { // Check that instructions with multiple users can be vectorized. Mark unsafe // instructions. - for (ValueSet::iterator it = MultiUserVals.begin(), + for (SetVector::iterator it = MultiUserVals.begin(), e = MultiUserVals.end(); it != e; ++it) { // Check that all of the users of this instr are within the tree // and that they are all from the same lane. @@ -267,18 +280,21 @@ int BoUpSLP::getTreeCost(ArrayRef VL) { // We don't have an ordering problem if the user is not in this basic // block. Instruction *Inst = cast(*I); - if (Inst->getParent() == BB) { - // We don't have an ordering problem if the user is after the last - // root. - unsigned Idx = InstrIdx[Inst]; - if (Idx < LastRootIndex) { - MustScalarize.insert(*it); - DEBUG(dbgs()<<"SLP: Adding to MustScalarize " - "because of an unsafe out of tree usage.\n"); - break; - } + if (Inst->getParent() != BB) { + MustExtract.insert(*it); + continue; } + // We don't have an ordering problem if the user is after the last root. + int Idx = InstrIdx[Inst]; + if (Idx < LastRootIndex) { + MustScalarize.insert(*it); + DEBUG(dbgs()<<"SLP: Adding to MustScalarize " + "because of an unsafe out of tree usage.\n"); + break; + } + + DEBUG(dbgs()<<"SLP: Adding to MustExtract " "because of a safe out of tree usage.\n"); MustExtract.insert(*it); @@ -332,14 +348,6 @@ void BoUpSLP::getTreeUses_rec(ArrayRef VL, unsigned Depth) { if (!I || Opcode != I->getOpcode()) return; } - // Mark instructions with multiple users. - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast(VL[i]); - // Remember to check if all of the users of this instr are vectorized - // within our tree. - if (I && I->getNumUses() > 1) MultiUserVals.insert(I); - } - for (int i = 0, e = VL.size(); i < e; ++i) { // Check that the instruction is only used within // one lane. @@ -348,6 +356,19 @@ void BoUpSLP::getTreeUses_rec(ArrayRef VL, unsigned Depth) { LaneMap[VL[i]] = i; } + // Mark instructions with multiple users. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast(VL[i]); + // Remember to check if all of the users of this instr are vectorized + // within our tree. At depth zero we have no local users, only external + // users that we don't care about. + if (Depth && I && I->getNumUses() > 1) { + DEBUG(dbgs()<<"SLP: Adding to MultiUserVals " + "because it has multiple users:" << *I << " \n"); + MultiUserVals.insert(I); + } + } + switch (Opcode) { case Instruction::ZExt: case Instruction::SExt: @@ -461,11 +482,9 @@ int BoUpSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { // Check if it is safe to sink the loads or the stores. if (Opcode == Instruction::Load || Opcode == Instruction::Store) { - int MaxIdx = InstrIdx[VL0]; - for (unsigned i = 1, e = VL.size(); i < e; ++i ) - MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); - + int MaxIdx = getLastIndex(VL, VL.size()); Instruction *Last = InstrVec[MaxIdx]; + for (unsigned i = 0, e = VL.size(); i < e; ++i ) { if (VL[i] == Last) continue; Value *Barrier = isUnsafeToSink(cast(VL[i]), Last); @@ -592,15 +611,40 @@ int BoUpSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { } } -Instruction *BoUpSLP::GetLastInstr(ArrayRef VL, unsigned VF) { +int BoUpSLP::getLastIndex(ArrayRef VL, unsigned VF) { int MaxIdx = InstrIdx[BB->getFirstNonPHI()]; for (unsigned i = 0; i < VF; ++i ) MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); - return InstrVec[MaxIdx + 1]; + return MaxIdx; +} + +int BoUpSLP::getFirstUserIndex(ArrayRef VL, unsigned VF) { + // Find the first user of the values. + int FirstUser = InstrVec.size(); + for (unsigned i = 0; i < VF; ++i) { + for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); + U != UE; ++U) { + Instruction *Instr = dyn_cast(*U); + if (!Instr || Instr->getParent() != BB) + continue; + + FirstUser = std::min(FirstUser, InstrIdx[Instr]); + } + } + return FirstUser; +} + +int BoUpSLP::getLastIndex(Instruction *I, Instruction *J) { + assert(I->getParent() == BB && "Invalid parent for instruction I"); + assert(J->getParent() == BB && "Invalid parent for instruction J"); + return std::max(InstrIdx[I],InstrIdx[J]); +} + +Instruction *BoUpSLP::getInsertionPoint(unsigned Index) { + return InstrVec[Index + 1]; } Value *BoUpSLP::Scalarize(ArrayRef VL, VectorType *Ty) { - IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements())); Value *Vec = UndefValue::get(Ty); for (unsigned i=0; i < Ty->getNumElements(); ++i) { // Generate the 'InsertElement' instruction. @@ -620,18 +664,26 @@ Value *BoUpSLP::Scalarize(ArrayRef VL, VectorType *Ty) { Value *BoUpSLP::vectorizeTree(ArrayRef VL, int VF) { Value *V = vectorizeTree_rec(VL, VF); - Instruction *LastInstr = GetLastInstr(VL, VL.size()); - int LastInstrIdx = InstrIdx[LastInstr]; - IRBuilder<> Builder(LastInstr); - for (ValueSet::iterator it = MustExtract.begin(), e = MustExtract.end(); - it != e; ++it) { + int LastInstrIdx = getLastIndex(VL, VL.size()); + for (SetVector::iterator it = MustExtract.begin(), + e = MustExtract.end(); it != e; ++it) { Instruction *I = cast(*it); + + // This is a scalarized value, so we can use the original value. + // No need to extract from the vector. + if (!LaneMap.count(I)) + continue; + Value *Vec = VectorizedValues[I]; - assert(LaneMap.count(I) && "Unable to find the lane for the external use"); + // We decided not to vectorize I because one of its users was not + // vectorizerd. This is okay. + if (!Vec) + continue; + Value *Idx = Builder.getInt32(LaneMap[I]); Value *Extract = Builder.CreateExtractElement(Vec, Idx); bool Replaced = false; - for (Value::use_iterator U = I->use_begin(), UE = U->use_end(); U != UE; + for (Value::use_iterator U = I->use_begin(), UE = I->use_end(); U != UE; ++U) { Instruction *UI = cast(*U); if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx) @@ -670,19 +722,27 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef VL, int VF) { } // Check that this is a simple vector constant. - if (AllConst || AllSameScalar) return Scalarize(VL, VecTy); + if (AllConst || AllSameScalar) + return Scalarize(VL, VecTy); // Scalarize unknown structures. Instruction *VL0 = dyn_cast(VL[0]); - if (!VL0) return Scalarize(VL, VecTy); + if (!VL0) + return Scalarize(VL, VecTy); - if (VectorizedValues.count(VL0)) return VectorizedValues[VL0]; + if (VectorizedValues.count(VL0)) { + Value * Vec = VectorizedValues[VL0]; + for (int i = 0; i < VF; ++i) + VectorizedValues[VL[i]] = Vec; + return Vec; + } unsigned Opcode = VL0->getOpcode(); for (unsigned i = 0, e = VF; i < e; ++i) { Instruction *I = dyn_cast(VL[i]); // If not all of the instructions are identical then we have to scalarize. - if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy); + if (!I || Opcode != I->getOpcode()) + return Scalarize(VL, VecTy); } switch (Opcode) { @@ -702,7 +762,6 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef VL, int VF) { for (int i = 0; i < VF; ++i) INVL.push_back(cast(VL[i])->getOperand(0)); Value *InVec = vectorizeTree_rec(INVL, VF); - IRBuilder<> Builder(GetLastInstr(VL, VF)); CastInst *CI = dyn_cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); @@ -737,7 +796,6 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef VL, int VF) { Value *LHS = vectorizeTree_rec(LHSVL, VF); Value *RHS = vectorizeTree_rec(RHSVL, VF); - IRBuilder<> Builder(GetLastInstr(VL, VF)); BinaryOperator *BinOp = cast(VL0); Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS,RHS); @@ -755,10 +813,13 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef VL, int VF) { if (!isConsecutiveAccess(VL[i-1], VL[i])) return Scalarize(VL, VecTy); - IRBuilder<> Builder(GetLastInstr(VL, VF)); - Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo()); - LI = Builder.CreateLoad(VecPtr); + // Loads are inserted at the head of the tree because we don't want to sink + // them all the way down past store instructions. + Instruction *Loc = getInsertionPoint(getLastIndex(VL, VL.size())); + IRBuilder<> LoadBuilder(Loc); + Value *VecPtr = LoadBuilder.CreateBitCast(LI->getPointerOperand(), + VecTy->getPointerTo()); + LI = LoadBuilder.CreateLoad(VecPtr); LI->setAlignment(Alignment); for (int i = 0; i < VF; ++i) @@ -775,8 +836,6 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef VL, int VF) { ValueOp.push_back(cast(VL[i])->getValueOperand()); Value *VecValue = vectorizeTree_rec(ValueOp, VF); - - IRBuilder<> Builder(GetLastInstr(VL, VF)); Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h index abb35840e93..d41d2ed63c8 100644 --- a/lib/Transforms/Vectorize/VecUtils.h +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -16,9 +16,11 @@ #define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/IR/IRBuilder.h" #include namespace llvm { @@ -107,9 +109,19 @@ private: /// \returns the pointer to the barrier instruction if we can't sink. Value *isUnsafeToSink(Instruction *Src, Instruction *Dst); - /// \returns the instruction that appears last in the BB from \p VL. + /// \returns the index of the last instrucion in the BB from \p VL. /// Only consider the first \p VF elements. - Instruction *GetLastInstr(ArrayRef VL, unsigned VF); + int getLastIndex(ArrayRef VL, unsigned VF); + + /// \returns the index of the first User of \p VL. + /// Only consider the first \p VF elements. + int getFirstUserIndex(ArrayRef VL, unsigned VF); + + /// \returns the instruction \p I or \p Jt hat appears last in the BB . + int getLastIndex(Instruction *I, Instruction *J); + + /// \returns the insertion point for \p Index. + Instruction *getInsertionPoint(unsigned Index); /// \returns a vector from a collection of scalars in \p VL. Value *Scalarize(ArrayRef VL, VectorType *Ty); @@ -130,17 +142,17 @@ private: /// Contains values that have users outside of the vectorized graph. /// We need to generate extract instructions for these values. /// NOTICE: The vectorization methods also use this set. - ValueSet MustExtract; + SetVector MustExtract; /// Contains a list of values that are used outside the current tree. This /// set must be reset between runs. - ValueSet MultiUserVals; + SetVector MultiUserVals; /// Maps values in the tree to the vector lanes that uses them. This map must /// be reset between runs of getCost. std::map LaneMap; /// A list of instructions to ignore while sinking /// memory instructions. This map must be reset between runs of getCost. - SmallPtrSet MemBarrierIgnoreList; + ValueSet MemBarrierIgnoreList; // -- Containers that are used during vectorizeTree -- // @@ -155,6 +167,9 @@ private: /// Iterating over this list is faster than calling LICM. ValueList GatherInstructions; + /// Instruction builder to construct the vectorized tree. + IRBuilder<> Builder; + // Analysis and block reference. BasicBlock *BB; ScalarEvolution *SE; diff --git a/test/Transforms/SLPVectorizer/X86/crash_povray.ll b/test/Transforms/SLPVectorizer/X86/crash_povray.ll new file mode 100644 index 00000000000..7ef8df49f05 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_povray.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.hoge = type { double, double, double} + +define void @zot(%struct.hoge* %arg) { +bb: + %tmp = load double* undef, align 8 + %tmp1 = fsub double %tmp, undef + %tmp2 = load double* undef, align 8 + %tmp3 = fsub double %tmp2, undef + %tmp4 = fmul double %tmp3, undef + %tmp5 = fmul double %tmp3, undef + %tmp6 = fsub double %tmp5, undef + %tmp7 = getelementptr inbounds %struct.hoge* %arg, i64 0, i32 1 + store double %tmp6, double* %tmp7, align 8 + %tmp8 = fmul double %tmp1, undef + %tmp9 = fsub double %tmp8, undef + %tmp10 = getelementptr inbounds %struct.hoge* %arg, i64 0, i32 2 + store double %tmp9, double* %tmp10, align 8 + br i1 undef, label %bb11, label %bb12 + +bb11: ; preds = %bb + br label %bb14 + +bb12: ; preds = %bb + %tmp13 = fmul double undef, %tmp2 + br label %bb14 + +bb14: ; preds = %bb12, %bb11 + ret void +} diff --git a/test/Transforms/SLPVectorizer/X86/in-tree-user.ll b/test/Transforms/SLPVectorizer/X86/in-tree-user.ll new file mode 100644 index 00000000000..69dc8897d63 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/in-tree-user.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.7.0" + +@.str = private unnamed_addr constant [6 x i8] c"bingo\00", align 1 + +; We can't vectorize when the roots are used inside the tree. +;CHECK: @in_tree_user +;CHECK-NOT: load <2 x double> +;CHECK: ret +define void @in_tree_user(double* nocapture %A, i32 %n) { +entry: + %conv = sitofp i32 %n to double + br label %for.body + +for.body: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds double* %A, i64 %0 + %1 = load double* %arrayidx, align 8 + %mul1 = fmul double %conv, %1 + %mul2 = fmul double %mul1, 7.000000e+00 + %add = fadd double %mul2, 5.000000e+00 + %BadValue = fadd double %add, %add ; <------------------ In tree user. + %2 = or i64 %0, 1 + %arrayidx6 = getelementptr inbounds double* %A, i64 %2 + %3 = load double* %arrayidx6, align 8 + %mul8 = fmul double %conv, %3 + %mul9 = fmul double %mul8, 4.000000e+00 + %add10 = fadd double %mul9, 9.000000e+00 + %cmp11 = fcmp ogt double %add, %add10 + br i1 %cmp11, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0)) + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.inc + ret void +} + +declare i32 @printf(i8* nocapture, ...) + diff --git a/test/Transforms/SLPVectorizer/X86/multi_user.ll b/test/Transforms/SLPVectorizer/X86/multi_user.ll index aaa6063fded..d4d4d289502 100644 --- a/test/Transforms/SLPVectorizer/X86/multi_user.ll +++ b/test/Transforms/SLPVectorizer/X86/multi_user.ll @@ -12,8 +12,8 @@ target triple = "x86_64-apple-macosx10.7.0" ;} ;CHECK: @foo -;CHECK: insertelement <4 x i32> ;CHECK: load <4 x i32> +;CHECK: insertelement <4 x i32> ;CHECK: add <4 x i32> ;CHECK: store <4 x i32> ;CHECK: ret diff --git a/test/Transforms/SLPVectorizer/X86/ordering.ll b/test/Transforms/SLPVectorizer/X86/ordering.ll new file mode 100644 index 00000000000..588e115f5f9 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/ordering.ll @@ -0,0 +1,19 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define void @updateModelQPFrame(i32 %m_Bits) { +entry: + %0 = load double* undef, align 8 + %mul = fmul double undef, %0 + %mul2 = fmul double undef, %mul + %mul4 = fmul double %0, %mul2 + %mul5 = fmul double undef, 4.000000e+00 + %mul7 = fmul double undef, %mul5 + %conv = sitofp i32 %m_Bits to double + %mul8 = fmul double %conv, %mul7 + %add = fadd double %mul4, %mul8 + %cmp11 = fcmp olt double %add, 0.000000e+00 + ret void +}