diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index 3efaaf6edd1..d8be0ae721c 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -6,7 +6,7 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "VecUtils" +#define DEBUG_TYPE "SLP" #include "VecUtils.h" #include "llvm/ADT/DenseMap.h" @@ -37,6 +37,10 @@ using namespace llvm; +static const unsigned MinVecRegSize = 128; + +static const unsigned RecursionMaxDepth = 6; + namespace llvm { BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, @@ -98,9 +102,39 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { return ((-Offset) == Sz); } +bool BoUpSLP::vectorizeStoreChain(ValueList &Chain, int CostThreshold) { + Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); + unsigned Sz = DL->getTypeSizeInBits(StoreTy); + unsigned VF = MinVecRegSize / Sz; + + if (!isPowerOf2_32(Sz) || VF < 2) return false; + + bool Changed = false; + for (unsigned i = 0, e = Chain.size(); i < e; ++i) { + if (i + VF > e) return Changed; + DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n"); + ValueList Operands(&Chain[i], &Chain[i] + VF); + + int Cost = getTreeCost(Operands); + DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + vectorizeTree(Operands, VF); + i += VF; + Changed = true; + } + } + + return Changed; +} + bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) { ValueSet Heads, Tails; SmallDenseMap ConsecutiveChain; + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we vectorized so that we don't visit the same store twice. + ValueSet VectorizedStores; bool Changed = false; // Do a quadratic search on all of the given stores and find @@ -123,27 +157,17 @@ bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) { // to vectorize it. ValueList Operands; Value *I = *it; - int MinCost = 0, MinVF = 0; + // Collect the chain into a list. while (Tails.count(I) || Heads.count(I)) { + if (VectorizedStores.count(I)) break; Operands.push_back(I); - unsigned VF = Operands.size(); - if (isPowerOf2_32(VF) && VF > 1) { - int cost = getTreeRollCost(Operands, 0); - DEBUG(dbgs() << "Found cost=" << cost << " for VF=" << VF << "\n"); - if (cost < MinCost) { MinCost = cost; MinVF = VF; } - } // Move to the next value in the chain. I = ConsecutiveChain[I]; } - if (MinCost <= costThreshold && MinVF > 1) { - DEBUG(dbgs() << "Decided to vectorize cost=" << MinCost << "\n"); - vectorizeTree(Operands, MinVF); - Stores.clear(); - // The current numbering is invalid because we added and removed instrs. - numberInstructions(); - Changed = true; - } + bool Vectorized = vectorizeStoreChain(Operands, costThreshold); + if (Vectorized) VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed |= Vectorized; } return Changed; @@ -184,8 +208,138 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { return 0; } -int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) { - if (Depth == 6) return max_cost; +int BoUpSLP::getTreeCost(ValueList &VL) { + // Get rid of the list of stores that were removed, and from the + // lists of instructions with multiple users. + MemBarrierIgnoreList.clear(); + LaneMap.clear(); + MultiUserVals.clear(); + MustScalarize.clear(); + + // Scan the tree and find which value is used by which lane, and which values + // must be scalarized. + getTreeUses_rec(VL, 0); + + // Check that instructions with multiple users can be vectorized. Mark unsafe + // instructions. + for (ValueSet::iterator it = MultiUserVals.begin(), + e = MultiUserVals.end(); it != e; ++it) { + // Check that all of the users of this instr are within the tree + // and that they are all from the same lane. + int Lane = -1; + for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); + I != E; ++I) { + if (LaneMap.find(*I) == LaneMap.end()) { + MustScalarize.insert(*it); + DEBUG(dbgs()<<"SLP: Adding " << **it << + " to MustScalarize because of an out of tree usage.\n"); + break; + } + if (Lane == -1) Lane = LaneMap[*I]; + if (Lane != LaneMap[*I]) { + MustScalarize.insert(*it); + DEBUG(dbgs()<<"Adding " << **it << + " to MustScalarize because multiple lane use it: " + << Lane << " and " << LaneMap[*I] << ".\n"); + break; + } + } + } + + // Now calculate the cost of vectorizing the tree. + return getTreeCost_rec(VL, 0); +} + +void BoUpSLP::getTreeUses_rec(ValueList &VL, unsigned Depth) { + if (Depth == RecursionMaxDepth) return; + + // Don't handle vectors. + if (VL[0]->getType()->isVectorTy()) return; + if (StoreInst *SI = dyn_cast(VL[0])) + if (SI->getValueOperand()->getType()->isVectorTy()) return; + + // Check if all of the operands are constants. + bool AllConst = true; + bool AllSameScalar = true; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + AllConst &= isa(VL[i]); + AllSameScalar &= (VL[0] == VL[i]); + Instruction *I = dyn_cast(VL[i]); + // If one of the instructions is out of this BB, we need to scalarize all. + if (I && I->getParent() != BB) return; + } + + // If all of the operands are identical or constant we have a simple solution. + if (AllConst || AllSameScalar) return; + + // Scalarize unknown structures. + Instruction *VL0 = dyn_cast(VL[0]); + if (!VL0) return; + + unsigned Opcode = VL0->getOpcode(); + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast(VL[i]); + // If not all of the instructions are identical then we have to scalarize. + if (!I || Opcode != I->getOpcode()) return; + } + + // Mark instructions with multiple users. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast(VL[i]); + // Remember to check if all of the users of this instr are vectorized + // within our tree. + if (I && I->getNumUses() > 1) MultiUserVals.insert(I); + } + + for (int i = 0, e = VL.size(); i < e; ++i) { + // Check that the instruction is only used within + // one lane. + if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return; + // Make this instruction as 'seen' and remember the lane. + LaneMap[VL[i]] = i; + } + + switch (Opcode) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + getTreeUses_rec(Operands, Depth+1); + } + } + case Instruction::Store: { + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(0)); + getTreeUses_rec(Operands, Depth+1); + return; + } + default: + return; + } +} + +int BoUpSLP::getTreeCost_rec(ValueList &VL, unsigned Depth) { Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) @@ -193,9 +347,10 @@ int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) { /// Don't mess with vectors. if (ScalarTy->isVectorTy()) return max_cost; - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy); + // Check if all of the operands are constants. bool AllConst = true; bool AllSameScalar = true; @@ -204,8 +359,8 @@ int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) { AllSameScalar &= (VL[0] == VL[i]); // Must have a single use. Instruction *I = dyn_cast(VL[i]); - // Need to scalarize instructions with multiple users or from other BBs. - if (I && ((I->getNumUses() > 1) || (I->getParent() != BB))) + // This instruction is outside the basic block or if it is a known hazard. + if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB)) return getScalarizationCost(VecTy); } @@ -239,7 +394,7 @@ int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) { if (VL[i] == Last) continue; Value *Barrier = isUnsafeToSink(cast(VL[i]), Last); if (Barrier) { - DEBUG(dbgs() << "LR: Can't sink " << *VL[i] << "\n down to " << + DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last << "\n because of " << *Barrier << "\n"); return max_cost; } @@ -265,20 +420,22 @@ int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - ValueList Operands; int Cost = 0; // Calculate the cost of all of the operands. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; // Prepare the operand vector. for (unsigned j = 0; j < VL.size(); ++j) Operands.push_back(cast(VL[j])->getOperand(i)); - Cost += getTreeRollCost(Operands, Depth+1); - Operands.clear(); + + Cost += getTreeCost_rec(Operands, Depth+1); + if (Cost >= max_cost) return max_cost; } // Calculate the cost of this instruction. int ScalarCost = VecTy->getNumElements() * TTI->getArithmeticInstrCost(Opcode, ScalarTy); + int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); Cost += (VecCost - ScalarCost); return Cost; @@ -308,8 +465,7 @@ int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) { MemBarrierIgnoreList.insert(VL[j]); } - int TotalCost = StoreCost + getTreeRollCost(Operands, Depth + 1); - MemBarrierIgnoreList.clear(); + int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1); return TotalCost; } default: @@ -334,6 +490,15 @@ Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) { } Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) { + Value *V = vectorizeTree_rec(VL, VF); + // We moved some instructions around. We have to number them again + // before we can do any analysis. + numberInstructions(); + MustScalarize.clear(); + return V; +} + +Value *BoUpSLP::vectorizeTree_rec(ValueList &VL, int VF) { Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); @@ -345,19 +510,21 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) { for (unsigned i = 0, e = VF; i < e; ++i) { AllConst &= !!dyn_cast(VL[i]); AllSameScalar &= (VL[0] == VL[i]); - // Must have a single use. + // The instruction must be in the same BB, and it must be vectorizable. Instruction *I = dyn_cast(VL[i]); - if (I && (I->getNumUses() > 1 || I->getParent() != BB)) + if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB)) return Scalarize(VL, VecTy); } - // Is this a simple vector constant. + // Check that this is a simple vector constant. if (AllConst || AllSameScalar) return Scalarize(VL, VecTy); // Scalarize unknown structures. Instruction *VL0 = dyn_cast(VL[0]); if (!VL0) return Scalarize(VL, VecTy); + if (VectorizedValues.count(VL0)) return VectorizedValues[VL0]; + unsigned Opcode = VL0->getOpcode(); for (unsigned i = 0, e = VF; i < e; ++i) { Instruction *I = dyn_cast(VL[i]); @@ -390,11 +557,13 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) { LHSVL.push_back(cast(VL[i])->getOperand(1)); } - Value *RHS = vectorizeTree(RHSVL, VF); - Value *LHS = vectorizeTree(LHSVL, VF); + Value *RHS = vectorizeTree_rec(RHSVL, VF); + Value *LHS = vectorizeTree_rec(LHSVL, VF); IRBuilder<> Builder(GetLastInstr(VL, VF)); BinaryOperator *BinOp = dyn_cast(VL0); - return Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS); + VectorizedValues[VL0] = V; + return V; } case Instruction::Load: { LoadInst *LI = dyn_cast(VL0); @@ -410,6 +579,7 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) { VecTy->getPointerTo()); LI = Builder.CreateLoad(VecPtr); LI->setAlignment(Alignment); + VectorizedValues[VL0] = LI; return LI; } case Instruction::Store: { @@ -420,7 +590,7 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) { for (int i = 0; i < VF; ++i) ValueOp.push_back(cast(VL[i])->getValueOperand()); - Value *VecValue = vectorizeTree(ValueOp, VF); + Value *VecValue = vectorizeTree_rec(ValueOp, VF); IRBuilder<> Builder(GetLastInstr(VL, VF)); Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), @@ -432,7 +602,9 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) { return 0; } default: - return Scalarize(VL, VecTy); + Value *S = Scalarize(VL, VecTy); + VectorizedValues[VL0] = S; + return S; } } diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h index b1c2b0c649b..808fb10f9f4 100644 --- a/lib/Transforms/Vectorize/VecUtils.h +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -42,6 +42,14 @@ struct BoUpSLP { BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl, TargetTransformInfo *Tti, AliasAnalysis *Aa); + /// \brief Take the pointer operand from the Load/Store instruction. + /// \returns NULL if this is not a valid Load/Store instruction. + static Value *getPointerOperand(Value *I); + + /// \brief Take the address space operand from the Load/Store instruction. + /// \returns -1 if this is not a valid Load/Store instruction. + static unsigned getAddressSpaceOperand(Value *I); + /// \returns true if the memory operations A and B are consecutive. bool isConsecutiveAccess(Value *A, Value *B); @@ -51,25 +59,31 @@ struct BoUpSLP { /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - int getTreeRollCost(ValueList &VL, unsigned Depth); - - /// \brief Take the pointer operand from the Load/Store instruction. - /// \returns NULL if this is not a valid Load/Store instruction. - static Value *getPointerOperand(Value *I); - - /// \brief Take the address space operand from the Load/Store instruction. - /// \returns -1 if this is not a valid Load/Store instruction. - static unsigned getAddressSpaceOperand(Value *I); + int getTreeCost(ValueList &VL); /// \brief Attempts to order and vectorize a sequence of stores. This /// function does a quadratic scan of the given stores. /// \returns true if the basic block was modified. bool vectorizeStores(StoreList &Stores, int costThreshold); +private: + /// \returns This method contains the recursive part of getTreeCost. + int getTreeCost_rec(ValueList &VL, unsigned Depth); + + /// \returns This recursive method looks for vectorization hazards such as + /// values that are used by multiple users and checks that values are used + /// by only one vector lane. It updates the variables LaneMap, MultiUserVals. + void getTreeUses_rec(ValueList &VL, unsigned Depth); + + /// \brief This method contains the recursive part of vectorizeTree. + Value *vectorizeTree_rec(ValueList &VL, int VF); + /// \brief Number all of the instructions in the block. void numberInstructions(); -private: + /// \brief Vectorize a sorted sequence of stores. + bool vectorizeStoreChain(ValueList &Chain, int CostThreshold); + /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getScalarizationCost(Type *Ty); @@ -89,12 +103,34 @@ private: /// \returns a vector from a collection of scalars in \p VL. Value *Scalarize(ValueList &VL, VectorType *Ty); +private: // Maps instructions to numbers and back. SmallDenseMap InstrIdx; + // Maps integers to Instructions. std::vector InstrVec; + + // -- containers that are used during getTreeCost -- // + + /// Contains values that must be scalarized because they are used + /// by multiple lanes, or by users outside the tree. + /// NOTICE: The vectorization methods also use this set. + ValueSet MustScalarize; + + // Contains a list of values that are used outside the current tree. This + // set must be reset between runs. + ValueSet MultiUserVals; + // Maps values in the tree to the vector lanes that uses them. This map must + // be reset between runs of getCost. + std::map LaneMap; // A list of instructions to ignore while sinking - // memory instructions. + // memory instructions. This map must be reset between runs of getCost. SmallSet MemBarrierIgnoreList; + + // -- containers that are used during vectorizeTree -- // + // Maps between the first scalar to the vector. This map must be reset between + // runs. + DenseMap VectorizedValues; + // Analysis and block reference. BasicBlock *BB; ScalarEvolution *SE; diff --git a/test/Transforms/SLPVectorizer/X86/diamond.ll b/test/Transforms/SLPVectorizer/X86/diamond.ll new file mode 100644 index 00000000000..83b4fdce4e5 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/diamond.ll @@ -0,0 +1,83 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; int foo(int * restrict B, int * restrict A, int n, int m) { +; B[0] = n * A[0] + m * A[0]; +; B[1] = n * A[1] + m * A[1]; +; B[2] = n * A[2] + m * A[2]; +; B[3] = n * A[3] + m * A[3]; +; return 0; +; } + +; CHECK: @foo +; CHECK: load <4 x i32> +; CHECK: mul <4 x i32> +; CHECK: store <4 x i32> +; CHECK: ret +define i32 @foo(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) #0 { +entry: + %0 = load i32* %A, align 4, !tbaa !0 + %mul238 = add i32 %m, %n + %add = mul i32 %0, %mul238 + store i32 %add, i32* %B, align 4, !tbaa !0 + %arrayidx4 = getelementptr inbounds i32* %A, i64 1 + %1 = load i32* %arrayidx4, align 4, !tbaa !0 + %add8 = mul i32 %1, %mul238 + %arrayidx9 = getelementptr inbounds i32* %B, i64 1 + store i32 %add8, i32* %arrayidx9, align 4, !tbaa !0 + %arrayidx10 = getelementptr inbounds i32* %A, i64 2 + %2 = load i32* %arrayidx10, align 4, !tbaa !0 + %add14 = mul i32 %2, %mul238 + %arrayidx15 = getelementptr inbounds i32* %B, i64 2 + store i32 %add14, i32* %arrayidx15, align 4, !tbaa !0 + %arrayidx16 = getelementptr inbounds i32* %A, i64 3 + %3 = load i32* %arrayidx16, align 4, !tbaa !0 + %add20 = mul i32 %3, %mul238 + %arrayidx21 = getelementptr inbounds i32* %B, i64 3 + store i32 %add20, i32* %arrayidx21, align 4, !tbaa !0 + ret i32 0 +} + + +; int foo_fail(int * restrict B, int * restrict A, int n, int m) { +; B[0] = n * A[0] + m * A[0]; +; B[1] = n * A[1] + m * A[1]; +; B[2] = n * A[2] + m * A[2]; +; B[3] = n * A[3] + m * A[3]; +; return A[0]; +; } + +; CHECK: @foo_fail +; CHECK-NOT: load <4 x i32> +; CHECK: ret +define i32 @foo_fail(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) #0 { +entry: + %0 = load i32* %A, align 4, !tbaa !0 + %mul238 = add i32 %m, %n + %add = mul i32 %0, %mul238 + store i32 %add, i32* %B, align 4, !tbaa !0 + %arrayidx4 = getelementptr inbounds i32* %A, i64 1 + %1 = load i32* %arrayidx4, align 4, !tbaa !0 + %add8 = mul i32 %1, %mul238 + %arrayidx9 = getelementptr inbounds i32* %B, i64 1 + store i32 %add8, i32* %arrayidx9, align 4, !tbaa !0 + %arrayidx10 = getelementptr inbounds i32* %A, i64 2 + %2 = load i32* %arrayidx10, align 4, !tbaa !0 + %add14 = mul i32 %2, %mul238 + %arrayidx15 = getelementptr inbounds i32* %B, i64 2 + store i32 %add14, i32* %arrayidx15, align 4, !tbaa !0 + %arrayidx16 = getelementptr inbounds i32* %A, i64 3 + %3 = load i32* %arrayidx16, align 4, !tbaa !0 + %add20 = mul i32 %3, %mul238 + %arrayidx21 = getelementptr inbounds i32* %B, i64 3 + store i32 %add20, i32* %arrayidx21, align 4, !tbaa !0 + ret i32 %0 ;<--------- This value has multiple users and can't be vectorized. +} + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"}