From 369cc938d261de3295eb70d0738f54ef1a82806c Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 7 Jul 2013 06:57:07 +0000 Subject: [PATCH] SLPVectorizer: Implement DCE as part of vectorization. This is a complete re-write of the bottom-up vectorization class. Before this commit we scanned the instruction tree 3 times. First in search of merge points for the trees. Second, for estimating the cost. And finally for vectorization. There was a lot of code duplication and adding the DCE exposed bugs. The new design is simpler and DCE was a part of the design. In this implementation we build the tree once. After that we estimate the cost by scanning the different entries in the constructed tree (in any order). The vectorization phase also works on the built tree. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185774 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 2070 +++++++++-------- .../SLPVectorizer/X86/crash_7zip.ll | 38 + .../SLPVectorizer/X86/crash_bullet.ll | 38 + .../SLPVectorizer/X86/crash_bullet2.ll | 38 + .../SLPVectorizer/X86/crash_dequeue.ll | 40 + .../SLPVectorizer/X86/crash_flop7.ll | 46 + .../SLPVectorizer/X86/crash_lame.ll | 24 + .../SLPVectorizer/X86/crash_lencod.ll | 66 + .../SLPVectorizer/X86/crash_lencod2.ll | 23 + .../SLPVectorizer/X86/crash_mandeltext.ll | 53 + .../Transforms/SLPVectorizer/X86/crash_rc4.ll | 28 + .../SLPVectorizer/X86/crash_sim4b1.ll | 113 + .../SLPVectorizer/X86/crash_smallpt.ll | 65 + .../SLPVectorizer/X86/crash_smallpt2.ll | 46 + test/Transforms/SLPVectorizer/X86/diamond.ll | 6 +- .../SLPVectorizer/X86/long_chains.ll | 13 +- test/Transforms/SLPVectorizer/X86/saxpy.ll | 16 + 17 files changed, 1695 insertions(+), 1028 deletions(-) create mode 100644 test/Transforms/SLPVectorizer/X86/crash_7zip.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_bullet.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_bullet2.ll create mode 100644 
test/Transforms/SLPVectorizer/X86/crash_dequeue.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_flop7.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_lame.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_lencod.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_lencod2.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_rc4.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_smallpt.ll create mode 100644 test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 85c01bdeff9..d2e7450d248 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -59,7 +59,7 @@ static const unsigned RecursionMaxDepth = 12; class BuilderLocGuard { public: BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()) {} - ~BuilderLocGuard() { Builder.SetInsertPoint(Loc); } + ~BuilderLocGuard() { if (Loc) Builder.SetInsertPoint(Loc); } private: // Prevent copying. @@ -91,6 +91,7 @@ struct BlockNumbering { } int getIndex(Instruction *I) { + assert(I->getParent() == BB && "Invalid instruction"); if (!Valid) numberInstructions(); assert(InstrIdx.count(I) && "Unknown instruction"); @@ -117,26 +118,162 @@ private: std::vector InstrVec; }; -class FuncSLP { +/// \returns the parent basic block if all of the instructions in \p VL +/// are in the same block or null otherwise. 
+static BasicBlock *getSameBlock(ArrayRef VL) { + Instruction *I0 = dyn_cast(VL[0]); + if (!I0) + return 0; + BasicBlock *BB = I0->getParent(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast(VL[i]); + if (!I) + return 0; + + if (BB != I->getParent()) + return 0; + } + return BB; +} + +/// \returns True if all of the values in \p VL are constants. +static bool allConstant(ArrayRef VL) { + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (!isa(VL[i])) + return false; + return true; +} + +/// \returns True if all of the values in \p VL are identical. +static bool isSplat(ArrayRef VL) { + for (unsigned i = 1, e = VL.size(); i < e; ++i) + if (VL[i] != VL[0]) + return false; + return true; +} + +/// \returns The opcode if all of the Instructions in \p VL have the same +/// opcode, or zero. +static unsigned getSameOpcode(ArrayRef VL) { + Instruction *I0 = dyn_cast(VL[0]); + if (!I0) + return 0; + unsigned Opcode = I0->getOpcode(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast(VL[i]); + if (!I || Opcode != I->getOpcode()) + return 0; + } + return Opcode; +} + +/// \returns The type that all of the values in \p VL have or null if there +/// are different types. +static Type* getSameType(ArrayRef VL) { + Type *Ty = VL[0]->getType(); + for (int i = 1, e = VL.size(); i < e; i++) + if (VL[i]->getType() != Ty) + return 0; + + return Ty; +} + +/// \returns True if the ExtractElement instructions in VL can be vectorized +/// to use the original vector. +static bool CanReuseExtract(ArrayRef VL) { + assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); + // Check if all of the extracts come from the same vector and from the + // correct offset. + Value *VL0 = VL[0]; + ExtractElementInst *E0 = cast(VL0); + Value *Vec = E0->getOperand(0); + + // We have to extract from the same vector type. 
+ unsigned NElts = Vec->getType()->getVectorNumElements(); + + if (NElts != VL.size()) + return false; + + // Check that all of the indices extract from the correct offset. + ConstantInt *CI = dyn_cast(E0->getOperand(1)); + if (!CI || CI->getZExtValue()) + return false; + + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + ExtractElementInst *E = cast(VL[i]); + ConstantInt *CI = dyn_cast(E->getOperand(1)); + + if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) + return false; + } + + return true; +} + +/// Bottom Up SLP Vectorizer. +class BoUpSLP { +public: typedef SmallVector ValueList; typedef SmallVector InstrList; typedef SmallPtrSet ValueSet; typedef SmallVector StoreList; -public: - static const int MAX_COST = INT_MIN; - - FuncSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, + BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt) : F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt), Builder(Se->getContext()) { - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BasicBlock *BB = it; - BlocksNumbers[BB] = BlockNumbering(BB); + // Setup the block numbering utility for all of the blocks in the + // function. + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { + BasicBlock *BB = it; + BlocksNumbers[BB] = BlockNumbering(BB); + } } + + /// \brief Vectorize the tree that starts with the elements in \p VL. + void vectorizeTree(); + + /// \returns the vectorization cost of the subtree that starts at \p VL. + /// A negative number means that this is profitable. + int getTreeCost(); + + /// Construct a vectorizable tree that starts at \p Roots. + void buildTree(ArrayRef Roots); + + /// Clear the internal data structures that are created by 'buildTree'. 
+ void deleteTree() { + VectorizableTree.clear(); + ScalarToTreeEntry.clear(); + MustGather.clear(); + MemBarrierIgnoreList.clear(); } + /// \returns the scalarization cost for this list of values. Assuming that + /// this subtree gets vectorized, we may need to extract the values from the + /// roots. This method calculates the cost of extracting the values. + int getGatherCost(ArrayRef VL); + + /// \returns true if the memory operations A and B are consecutive. + bool isConsecutiveAccess(Value *A, Value *B); + + /// \brief Perform LICM and CSE on the newly generated gather sequences. + void optimizeGatherSequence(); +private: + struct TreeEntry; + + /// \returns the cost of the vectorizable entry. + int getEntryCost(TreeEntry *E); + + /// This is the recursive part of buildTree. + void buildTree_rec(ArrayRef Roots, unsigned Depth); + + /// Vectorize a single entry in the tree. + Value *vectorizeTree(TreeEntry *E); + + /// Vectorize a single entry in the tree, starting in \p VL. + Value *vectorizeTree(ArrayRef VL); + /// \brief Take the pointer operand from the Load/Store instruction. /// \returns NULL if this is not a valid Load/Store instruction. static Value *getPointerOperand(Value *I); @@ -145,45 +282,6 @@ public: /// \returns -1 if this is not a valid Load/Store instruction. static unsigned getAddressSpaceOperand(Value *I); - /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B); - - /// \brief Vectorize the tree that starts with the elements in \p VL. - /// \returns the vectorized value. - Value *vectorizeTree(ArrayRef VL); - - /// \returns the vectorization cost of the subtree that starts at \p VL. - /// A negative number means that this is profitable. - int getTreeCost(ArrayRef VL); - - /// \returns the scalarization cost for this list of values. Assuming that - /// this subtree gets vectorized, we may need to extract the values from the - /// roots. 
This method calculates the cost of extracting the values. - int getGatherCost(ArrayRef VL); - - /// \brief Attempts to order and vectorize a sequence of stores. This - /// function does a quadratic scan of the given stores. - /// \returns true if the basic block was modified. - bool vectorizeStores(ArrayRef Stores, int costThreshold); - - /// \brief Vectorize a group of scalars into a vector tree. - /// \returns the vectorized value. - Value *vectorizeArith(ArrayRef Operands); - - /// \brief This method contains the recursive part of getTreeCost. - int getTreeCost_rec(ArrayRef VL, unsigned Depth); - - /// \brief This recursive method looks for vectorization hazards such as - /// values that are used by multiple users and checks that values are used - /// by only one vector lane. It updates the variables LaneMap, MultiUserVals. - void getTreeUses_rec(ArrayRef VL, unsigned Depth); - - /// \brief This method contains the recursive part of vectorizeTree. - Value *vectorizeTree_rec(ArrayRef VL); - - /// \brief Vectorize a sorted sequence of stores. - bool vectorizeStoreChain(ArrayRef Chain, int CostThreshold); - /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getGatherCost(Type *Ty); @@ -211,58 +309,66 @@ public: /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef VL, VectorType *Ty); - /// \brief Perform LICM and CSE on the newly generated gather sequences. - void optimizeGatherSequence(); + struct TreeEntry { + TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0), + NeedToGather(0) {} - bool needToGatherAny(ArrayRef VL) { - for (int i = 0, e = VL.size(); i < e; ++i) - if (MustGather.count(VL[i])) - return true; - return false; - } + /// \returns true if the scalars in VL are equal to this entry. 
+ bool isSame(ArrayRef VL) { + assert(VL.size() == Scalars.size() && "Invalid size"); + for (int i = 0, e = VL.size(); i != e; ++i) + if (VL[i] != Scalars[i]) + return false; + return true; + } - void forgetNumbering() { - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) - BlocksNumbers[it].forget(); + /// A vector of scalars. + ValueList Scalars; + + /// The Scalars are vectorized into this value. It is initialized to Null. + Value *VectorizedValue; + + /// The index in the basic block of the last scalar. + int LastScalarIndex; + + /// Do we need to gather this sequence ? + bool NeedToGather; + }; + + /// Create a new VectorizableTree entry. + TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized) { + VectorizableTree.push_back(TreeEntry()); + int idx = VectorizableTree.size() - 1; + TreeEntry *Last = &VectorizableTree[idx]; + Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); + Last->NeedToGather = !Vectorized; + if (Vectorized) { + Last->LastScalarIndex = getLastIndex(VL); + for (int i = 0, e = VL.size(); i != e; ++i) { + assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); + ScalarToTreeEntry[VL[i]] = idx; + } + } else { + Last->LastScalarIndex = 0; + MustGather.insert(VL.begin(), VL.end()); + } + return Last; } /// -- Vectorization State -- + /// Holds all of the tree entries. + std::vector VectorizableTree; + + /// Maps a specific scalar to its tree entry. + SmallDenseMap ScalarToTreeEntry; + + /// A list of scalars that we found that we need to keep as scalars. + ValueSet MustGather; - /// Maps values in the tree to the vector lanes that uses them. This map must - /// be reset between runs of getCost. - std::map LaneMap; /// A list of instructions to ignore while sinking /// memory instructions. This map must be reset between runs of getCost. ValueSet MemBarrierIgnoreList; - /// Maps between the first scalar to the vector. This map must be reset - /// between runs. 
- DenseMap VectorizedValues; - - /// Contains values that must be gathered because they are used - /// by multiple lanes, or by users outside the tree. - /// NOTICE: The vectorization methods also use this set. - ValueSet MustGather; - - /// Contains PHINodes that are being processed. We use this data structure - /// to stop cycles in the graph. - ValueSet VisitedPHIs; - - /// Contains a list of values that are used outside the current tree, the - /// first element in the bundle and the insertion point for extracts. This - /// set must be reset between runs. - struct UseInfo{ - UseInfo(Instruction *VL0, int I) : - Leader(VL0), LastIndex(I) {} - UseInfo() : Leader(0), LastIndex(0) {} - /// The first element in the bundle. - Instruction *Leader; - /// The insertion index. - int LastIndex; - }; - MapVector MultiUserVals; - SetVector ExtractedLane; - /// Holds all of the instructions that we gathered. SetVector GatherSeq; @@ -281,14 +387,478 @@ public: IRBuilder<> Builder; }; -int FuncSLP::getGatherCost(Type *Ty) { +void BoUpSLP::buildTree(ArrayRef Roots) { + deleteTree(); + buildTree_rec(Roots, 0); +} + + +void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { + bool SameTy = getSameType(VL); (void)SameTy; + assert(SameTy && "Invalid types!"); + + if (Depth == RecursionMaxDepth) { + DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); + newTreeEntry(VL, false); + return; + } + + // Don't handle vectors. + if (VL[0]->getType()->isVectorTy()) { + DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); + newTreeEntry(VL, false); + return; + } + + if (StoreInst *SI = dyn_cast(VL[0])) + if (SI->getValueOperand()->getType()->isVectorTy()) { + DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); + newTreeEntry(VL, false); + return; + } + + // If all of the operands are identical or constant we have a simple solution. 
+ if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || + !getSameOpcode(VL)) { + DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); + newTreeEntry(VL, false); + return; + } + + // We now know that this is a vector of instructions of the same type from + // the same block. + + // Check if this is a duplicate of another entry. + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); + if (E->Scalars[i] != VL[i]) { + DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + newTreeEntry(VL, false); + return; + } + } + DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); + return; + } + + // Check that none of the instructions in the bundle are already in the tree. + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + if (ScalarToTreeEntry.count(VL[i])) { + DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << + ") is already in tree.\n"); + newTreeEntry(VL, false); + return; + } + } + + // If any of the scalars appears in the table OR it is marked as a value that + // needs to stay scalar then we need to gather the scalars. + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) { + DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n"); + newTreeEntry(VL, false); + return; + } + } + + // Check that all of the users of the scalars that we want to vectorize are + // schedulable. + Instruction *VL0 = cast(VL[0]); + int MyLastIndex = getLastIndex(VL); + BasicBlock *BB = cast(VL0)->getParent(); + + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *Scalar = cast(VL[i]); + DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n"); + for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end(); + U != UE; ++U) { + DEBUG(dbgs() << "SLP: \tUser " << **U << ". 
\n"); + Instruction *User = dyn_cast(*U); + if (!User) { + DEBUG(dbgs() << "SLP: Gathering due unknown user. \n"); + newTreeEntry(VL, false); + return; + } + + // We don't care if the user is in a different basic block. + BasicBlock *UserBlock = User->getParent(); + if (UserBlock != BB) { + DEBUG(dbgs() << "SLP: User from a different basic block " + << *User << ". \n"); + continue; + } + + // If this is a PHINode within this basic block then we can place the + // extract wherever we want. + if (isa(*User)) { + DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n"); + continue; + } + + // Check if this is a safe in-tree user. + if (ScalarToTreeEntry.count(User)) { + int Idx = ScalarToTreeEntry[User]; + int VecLocation = VectorizableTree[Idx].LastScalarIndex; + if (VecLocation <= MyLastIndex) { + DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n"); + newTreeEntry(VL, false); + return; + } + DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" << + VecLocation << " vector value (" << *Scalar << ") at #" + << MyLastIndex << ".\n"); + continue; + } + + // Make sure that we can schedule this unknown user. + BlockNumbering &BN = BlocksNumbers[BB]; + int UserIndex = BN.getIndex(User); + if (UserIndex < MyLastIndex) { + + DEBUG(dbgs() << "SLP: Can't schedule extractelement for " + << *User << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + + // Check that every instructions appears once in this bundle. + for (unsigned i = 0, e = VL.size(); i < e; ++i) + for (unsigned j = i+1; j < e; ++j) + if (VL[i] == VL[j]) { + DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, false); + return; + } + + // Check that instructions in this bundle don't reference other instructions. + // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4. 
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) { + for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); + U != UE; ++U) { + for (unsigned j = 0; j < e; ++j) { + if (i != j && *U == VL[j]) { + DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + } + + DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); + + unsigned Opcode = getSameOpcode(VL); + + // Check if it is safe to sink the loads or the stores. + if (Opcode == Instruction::Load || Opcode == Instruction::Store) { + Instruction *Last = getLastInstruction(VL); + + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (VL[i] == Last) + continue; + Value *Barrier = getSinkBarrier(cast(VL[i]), Last); + if (Barrier) { + DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last + << "\n because of " << *Barrier << ". Gathering.\n"); + newTreeEntry(VL, false); + return; + } + } + } + + switch (Opcode) { + case Instruction::PHI: { + PHINode *PH = dyn_cast(VL0); + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getIncomingValue(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } + case Instruction::ExtractElement: { + bool Reuse = CanReuseExtract(VL); + if (Reuse) { + DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); + } + newTreeEntry(VL, Reuse); + return; + } + case Instruction::Load: { + // Check if the loads are consecutive or if we need to swizzle them. 
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Need to swizzle loads.\n"); + return; + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + return; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + for (unsigned i = 0; i < VL.size(); ++i) { + Type *Ty = cast(VL[i])->getOperand(0)->getType(); + if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); + return; + } + } + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Check that all of the compares have the same predicate. + CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + CmpInst *Cmp = cast(VL[i]); + if (Cmp->getPredicate() != P0) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); + return; + } + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of compares.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. 
+ for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Store: { + // Check if the stores are consecutive or if we need to swizzle them. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Non consecutive store.\n"); + return; + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(0)); + + // We can ignore these values because we are sinking them down. 
+ MemBarrierIgnoreList.insert(VL.begin(), VL.end()); + buildTree_rec(Operands, Depth + 1); + return; + } + default: + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); + return; + } +} + +int BoUpSLP::getEntryCost(TreeEntry *E) { + ArrayRef VL = E->Scalars; + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (E->NeedToGather) { + if (allConstant(VL)) + return 0; + if (isSplat(VL)) { + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + } + return getGatherCost(E->Scalars); + } + + assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) && + "Invalid VL"); + Instruction *VL0 = cast(VL[0]); + unsigned Opcode = VL0->getOpcode(); + switch (Opcode) { + case Instruction::PHI: { + return 0; + } + case Instruction::ExtractElement: { + if (CanReuseExtract(VL)) + return 0; + return getGatherCost(VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + + // Calculate the cost of this instruction. 
+ int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), + VL0->getType(), SrcTy); + + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + return VecCost - ScalarCost; + } + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Calculate the cost of this instruction. + int ScalarCost = 0; + int VecCost = 0; + if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || + Opcode == Instruction::Select) { + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + ScalarCost = VecTy->getNumElements() * + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); + VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + } else { + ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + } + return VecCost - ScalarCost; + } + case Instruction::Load: { + // Cost of wide load - cost of scalar loads. + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0); + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { + // We know that we can merge the stores. Calculate the cost. 
+ int ScalarStCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); + return VecStCost - ScalarStCost; + } + default: + llvm_unreachable("Unknown instruction"); + } +} + +int BoUpSLP::getTreeCost() { + int Cost = 0; + DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << + VectorizableTree.size() << ".\n"); + + for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { + int C = getEntryCost(&VectorizableTree[i]); + DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " + << *VectorizableTree[i].Scalars[0] << " .\n"); + Cost += C; + } + DEBUG(dbgs() << "SLP: Total Cost " << Cost << ".\n"); + return Cost; +} + +int BoUpSLP::getGatherCost(Type *Ty) { int Cost = 0; for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); return Cost; } -int FuncSLP::getGatherCost(ArrayRef VL) { +int BoUpSLP::getGatherCost(ArrayRef VL) { // Find the type of the operands in VL. 
Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) @@ -298,7 +868,7 @@ int FuncSLP::getGatherCost(ArrayRef VL) { return getGatherCost(VecTy); } -AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) { +AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { if (StoreInst *SI = dyn_cast(I)) return AA->getLocation(SI); if (LoadInst *LI = dyn_cast(I)) @@ -306,7 +876,7 @@ AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) { return AliasAnalysis::Location(); } -Value *FuncSLP::getPointerOperand(Value *I) { +Value *BoUpSLP::getPointerOperand(Value *I) { if (LoadInst *LI = dyn_cast(I)) return LI->getPointerOperand(); if (StoreInst *SI = dyn_cast(I)) @@ -314,7 +884,7 @@ Value *FuncSLP::getPointerOperand(Value *I) { return 0; } -unsigned FuncSLP::getAddressSpaceOperand(Value *I) { +unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { if (LoadInst *L = dyn_cast(I)) return L->getPointerAddressSpace(); if (StoreInst *S = dyn_cast(I)) @@ -322,7 +892,7 @@ unsigned FuncSLP::getAddressSpaceOperand(Value *I) { return -1; } -bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) { +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { Value *PtrA = getPointerOperand(A); Value *PtrB = getPointerOperand(B); unsigned ASA = getAddressSpaceOperand(A); @@ -354,7 +924,7 @@ bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) { return ((-Offset) == Sz); } -Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { +Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { assert(Src->getParent() == Dst->getParent() && "Not the same BB"); BasicBlock::iterator I = Src, E = Dst; /// Scan all of the instruction from SRC to DST and check if @@ -379,234 +949,7 @@ Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { return 0; } -static BasicBlock *getSameBlock(ArrayRef VL) { - BasicBlock *BB = 0; - for (int i = 0, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast(VL[i]); - if (!I) - return 0; - - if (!BB) { 
- BB = I->getParent(); - continue; - } - - if (BB != I->getParent()) - return 0; - } - return BB; -} - -static bool allConstant(ArrayRef VL) { - for (unsigned i = 0, e = VL.size(); i < e; ++i) - if (!isa(VL[i])) - return false; - return true; -} - -static bool isSplat(ArrayRef VL) { - for (unsigned i = 1, e = VL.size(); i < e; ++i) - if (VL[i] != VL[0]) - return false; - return true; -} - -static unsigned getSameOpcode(ArrayRef VL) { - unsigned Opcode = 0; - for (int i = 0, e = VL.size(); i < e; i++) { - if (Instruction *I = dyn_cast(VL[i])) { - if (!Opcode) { - Opcode = I->getOpcode(); - continue; - } - if (Opcode != I->getOpcode()) - return 0; - } - } - return Opcode; -} - -static bool CanReuseExtract(ArrayRef VL, unsigned VF, - VectorType *VecTy) { - assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); - // Check if all of the extracts come from the same vector and from the - // correct offset. - Value *VL0 = VL[0]; - ExtractElementInst *E0 = cast(VL0); - Value *Vec = E0->getOperand(0); - - // We have to extract from the same vector type. - if (Vec->getType() != VecTy) - return false; - - // Check that all of the indices extract from the correct offset. - ConstantInt *CI = dyn_cast(E0->getOperand(1)); - if (!CI || CI->getZExtValue()) - return false; - - for (unsigned i = 1, e = VF; i < e; ++i) { - ExtractElementInst *E = cast(VL[i]); - ConstantInt *CI = dyn_cast(E->getOperand(1)); - - if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) - return false; - } - - return true; -} - -void FuncSLP::getTreeUses_rec(ArrayRef VL, unsigned Depth) { - if (Depth == RecursionMaxDepth) - return MustGather.insert(VL.begin(), VL.end()); - - // Don't handle vectors. - if (VL[0]->getType()->isVectorTy()) - return; - - if (StoreInst *SI = dyn_cast(VL[0])) - if (SI->getValueOperand()->getType()->isVectorTy()) - return; - - // If all of the operands are identical or constant we have a simple solution. 
- if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL)) - return MustGather.insert(VL.begin(), VL.end()); - - // Stop the scan at unknown IR. - Instruction *VL0 = dyn_cast(VL[0]); - assert(VL0 && "Invalid instruction"); - - // Mark instructions with multiple users. - int LastIndex = getLastIndex(VL); - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - if (PHINode *PN = dyn_cast(VL[i])) { - unsigned NumUses = 0; - // Check that PHINodes have only one external (non-self) use. - for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); - U != UE; ++U) { - // Don't count self uses. - if (*U == PN) - continue; - NumUses++; - } - if (NumUses > 1) { - DEBUG(dbgs() << "SLP: Adding PHI to MultiUserVals " - "because it has " << NumUses << " users:" << *PN << " \n"); - UseInfo UI(VL0, 0); - MultiUserVals[PN] = UI; - } - continue; - } - - Instruction *I = dyn_cast(VL[i]); - // Remember to check if all of the users of this instruction are vectorized - // within our tree. At depth zero we have no local users, only external - // users that we don't care about. - if (Depth && I && I->getNumUses() > 1) { - DEBUG(dbgs() << "SLP: Adding to MultiUserVals " - "because it has " << I->getNumUses() << " users:" << *I << " \n"); - UseInfo UI(VL0, LastIndex); - MultiUserVals[I] = UI; - } - } - - // Check that the instruction is only used within one lane. - for (int i = 0, e = VL.size(); i < e; ++i) { - if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) { - DEBUG(dbgs() << "SLP: Value used by multiple lanes:" << *VL[i] << "\n"); - return MustGather.insert(VL.begin(), VL.end()); - } - // Make this instruction as 'seen' and remember the lane. - LaneMap[VL[i]] = i; - } - - unsigned Opcode = getSameOpcode(VL); - if (!Opcode) - return MustGather.insert(VL.begin(), VL.end()); - - switch (Opcode) { - case Instruction::PHI: { - PHINode *PH = dyn_cast(VL0); - - // Stop self cycles. 
- if (VisitedPHIs.count(PH)) - return; - - VisitedPHIs.insert(PH); - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getIncomingValue(i)); - - getTreeUses_rec(Operands, Depth + 1); - } - return; - } - case Instruction::ExtractElement: { - VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size()); - // No need to follow ExtractElements that are going to be optimized away. - if (CanReuseExtract(VL, VL.size(), VecTy)) - return; - // Fall through. - } - case Instruction::Load: - return; - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: - case Instruction::Select: - case Instruction::ICmp: - case Instruction::FCmp: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. 
- for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getOperand(i)); - - getTreeUses_rec(Operands, Depth + 1); - } - return; - } - case Instruction::Store: { - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getOperand(0)); - getTreeUses_rec(Operands, Depth + 1); - return; - } - default: - return MustGather.insert(VL.begin(), VL.end()); - } -} - -int FuncSLP::getLastIndex(ArrayRef VL) { +int BoUpSLP::getLastIndex(ArrayRef VL) { BasicBlock *BB = cast(VL[0])->getParent(); assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); BlockNumbering &BN = BlocksNumbers[BB]; @@ -617,7 +960,7 @@ int FuncSLP::getLastIndex(ArrayRef VL) { return MaxIdx; } -Instruction *FuncSLP::getLastInstruction(ArrayRef VL) { +Instruction *BoUpSLP::getLastInstruction(ArrayRef VL) { BasicBlock *BB = cast(VL[0])->getParent(); assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); BlockNumbering &BN = BlocksNumbers[BB]; @@ -625,15 +968,17 @@ Instruction *FuncSLP::getLastInstruction(ArrayRef VL) { int MaxIdx = BN.getIndex(cast(VL[0])); for (unsigned i = 1, e = VL.size(); i < e; ++i) MaxIdx = std::max(MaxIdx, BN.getIndex(cast(VL[i]))); - return BN.getInstruction(MaxIdx); + Instruction *I = BN.getInstruction(MaxIdx); + assert(I && "bad location"); + return I; } -Instruction *FuncSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) { +Instruction *BoUpSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) { BlockNumbering &BN = BlocksNumbers[BB]; return BN.getInstruction(Index); } -int FuncSLP::getFirstUserIndex(ArrayRef VL) { +int BoUpSLP::getFirstUserIndex(ArrayRef VL) { BasicBlock *BB = getSameBlock(VL); assert(BB && "All instructions must come from the same block"); BlockNumbering &BN = BlocksNumbers[BB]; @@ -654,444 +999,7 @@ int FuncSLP::getFirstUserIndex(ArrayRef VL) { return FirstUser; } -int FuncSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { - Type *ScalarTy = 
VL[0]->getType(); - - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - - /// Don't mess with vectors. - if (ScalarTy->isVectorTy()) - return FuncSLP::MAX_COST; - - if (allConstant(VL)) - return 0; - - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - - if (isSplat(VL)) - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); - - int GatherCost = getGatherCost(VecTy); - if (Depth == RecursionMaxDepth || needToGatherAny(VL)) - return GatherCost; - - BasicBlock *BB = getSameBlock(VL); - unsigned Opcode = getSameOpcode(VL); - assert(Opcode && BB && "Invalid Instruction Value"); - - // Check if it is safe to sink the loads or the stores. - if (Opcode == Instruction::Load || Opcode == Instruction::Store) { - int MaxIdx = getLastIndex(VL); - Instruction *Last = getInstructionForIndex(MaxIdx, BB); - - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - if (VL[i] == Last) - continue; - Value *Barrier = getSinkBarrier(cast(VL[i]), Last); - if (Barrier) { - DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last - << "\n because of " << *Barrier << "\n"); - return MAX_COST; - } - } - } - - // Calculate the extract cost. - unsigned ExternalUserExtractCost = 0; - for (unsigned i = 0, e = VL.size(); i < e; ++i) - if (ExtractedLane.count(cast(VL[i]))) - ExternalUserExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); - - Instruction *VL0 = cast(VL[0]); - switch (Opcode) { - case Instruction::PHI: { - PHINode *PH = dyn_cast(VL0); - - // Stop self cycles. - if (VisitedPHIs.count(PH)) - return 0; - - VisitedPHIs.insert(PH); - int TotalCost = 0; - // Calculate the cost of all of the operands. - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. 
- for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getIncomingValue(i)); - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - TotalCost += TotalCost; - } - - if (TotalCost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return TotalCost + ExternalUserExtractCost; - } - case Instruction::ExtractElement: { - if (CanReuseExtract(VL, VL.size(), VecTy)) - return 0; - return getGatherCost(VecTy); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - ValueList Operands; - Type *SrcTy = VL0->getOperand(0)->getType(); - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast(VL[j])->getOperand(0)); - // Check that the casted type is the same for all users. - if (cast(VL[j])->getOperand(0)->getType() != SrcTy) - return getGatherCost(VecTy); - } - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - - // Calculate the cost of this instruction. - int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy); - - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); - Cost += (VecCost - ScalarCost); - - if (Cost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return Cost + ExternalUserExtractCost; - } - case Instruction::FCmp: - case Instruction::ICmp: { - // Check that all of the compares have the same predicate. 
- CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); - if (Cmp->getPredicate() != P0) - return getGatherCost(VecTy); - } - // Fall through. - } - case Instruction::Select: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - int TotalCost = 0; - // Calculate the cost of all of the operands. - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getOperand(i)); - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - TotalCost += Cost; - } - - // Calculate the cost of this instruction. 
- int ScalarCost = 0; - int VecCost = 0; - if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || - Opcode == Instruction::Select) { - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); - ScalarCost = - VecTy->getNumElements() * - TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); - VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); - } else { - ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy); - VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); - } - TotalCost += (VecCost - ScalarCost); - - if (TotalCost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return TotalCost + ExternalUserExtractCost; - } - case Instruction::Load: { - // If we are scalarize the loads, add the cost of forming the vector. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1])) - return getGatherCost(VecTy); - - // Cost of wide load - cost of scalar loads. - int ScalarLdCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - int TotalCost = VecLdCost - ScalarLdCost; - - if (TotalCost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return TotalCost + ExternalUserExtractCost; - } - case Instruction::Store: { - // We know that we can merge the stores. Calculate the cost. 
- int ScalarStCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); - int StoreCost = VecStCost - ScalarStCost; - - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast(VL[j])->getOperand(0)); - MemBarrierIgnoreList.insert(VL[j]); - } - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - - int TotalCost = StoreCost + Cost; - return TotalCost + ExternalUserExtractCost; - } - default: - // Unable to vectorize unknown instructions. - return getGatherCost(VecTy); - } -} - -int FuncSLP::getTreeCost(ArrayRef VL) { - // Get rid of the list of stores that were removed, and from the - // lists of instructions with multiple users. - MemBarrierIgnoreList.clear(); - LaneMap.clear(); - MultiUserVals.clear(); - ExtractedLane.clear(); - MustGather.clear(); - VisitedPHIs.clear(); - - if (!getSameBlock(VL)) - return MAX_COST; - - // Find the location of the last root. - int LastRootIndex = getLastIndex(VL); - int FirstUserIndex = getFirstUserIndex(VL); - - // Don't vectorize if there are users of the tree roots inside the tree - // itself. - if (LastRootIndex > FirstUserIndex) - return MAX_COST; - - // Scan the tree and find which value is used by which lane, and which values - // must be scalarized. - getTreeUses_rec(VL, 0); - - // Check that instructions with multiple users can be vectorized. Mark - // unsafe instructions. - for (MapVector::iterator UI = MultiUserVals.begin(), - e = MultiUserVals.end(); UI != e; ++UI) { - Instruction *Scalar = UI->first; - - if (MustGather.count(Scalar)) - continue; - - assert(LaneMap.count(Scalar) && "Unknown scalar"); - int ScalarLane = LaneMap[Scalar]; - - bool ExternalUse = false; - // Check that all of the users of this instr are within the tree. 
- for (Value::use_iterator Usr = Scalar->use_begin(), - UE = Scalar->use_end(); Usr != UE; ++Usr) { - // If this user is within the tree, make sure it is from the same lane. - // Notice that we have both in-tree and out-of-tree users. - if (LaneMap.count(*Usr)) { - if (LaneMap[*Usr] != ScalarLane) { - DEBUG(dbgs() << "SLP: Adding to MustExtract " - "because of an out-of-lane usage.\n"); - MustGather.insert(Scalar); - break; - } - continue; - } - - // We have an out-of-tree user. Check if we can place an 'extract'. - Instruction *User = cast(*Usr); - // We care about the order only if the user is in the same block. - if (User->getParent() == Scalar->getParent()) { - int LastLoc = UI->second.LastIndex; - BlockNumbering &BN = BlocksNumbers[User->getParent()]; - int UserIdx = BN.getIndex(User); - if (UserIdx <= LastLoc) { - DEBUG(dbgs() << "SLP: Adding to MustExtract because of an external " - "user that we can't schedule.\n"); - MustGather.insert(Scalar); - break; - } - } - // We have an external user. - ExternalUse = true; - } - - if (ExternalUse) { - // Items that are left in MultiUserVals are to be extracted. - // ExtractLane is used for the lookup. - ExtractedLane.insert(Scalar); - } - - } - - // Now calculate the cost of vectorizing the tree. - return getTreeCost_rec(VL, 0); -} -bool FuncSLP::vectorizeStoreChain(ArrayRef Chain, int CostThreshold) { - unsigned ChainLen = Chain.size(); - DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen - << "\n"); - Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; - - if (!isPowerOf2_32(Sz) || VF < 2) - return false; - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. 
- for (unsigned i = 0, e = ChainLen; i < e; ++i) { - if (i + VF > e) - break; - DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - ArrayRef Operands = Chain.slice(i, VF); - - int Cost = getTreeCost(Operands); - if (Cost == FuncSLP::MAX_COST) - continue; - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - vectorizeTree(Operands); - - // Remove the scalar stores. - for (int j = 0, e = VF; j < e; ++j) - cast(Operands[j])->eraseFromParent(); - - // Move to the next bundle. - i += VF - 1; - Changed = true; - } - } - - if (Changed || ChainLen > VF) - return Changed; - - // Handle short chains. This helps us catch types such as <3 x float> that - // are smaller than vector size. - int Cost = getTreeCost(Chain); - if (Cost == FuncSLP::MAX_COST) - return false; - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost - << " for size = " << ChainLen << "\n"); - vectorizeTree(Chain); - - // Remove all of the scalar stores. - for (int i = 0, e = Chain.size(); i < e; ++i) - cast(Chain[i])->eraseFromParent(); - - return true; - } - - return false; -} - -bool FuncSLP::vectorizeStores(ArrayRef Stores, int costThreshold) { - SetVector Heads, Tails; - SmallDenseMap ConsecutiveChain; - - // We may run into multiple chains that merge into a single chain. We mark the - // stores that we vectorized so that we don't visit the same store twice. - ValueSet VectorizedStores; - bool Changed = false; - - // Do a quadratic search on all of the given stores and find - // all of the pairs of loads that follow each other. 
- for (unsigned i = 0, e = Stores.size(); i < e; ++i) - for (unsigned j = 0; j < e; ++j) { - if (i == j) - continue; - - if (isConsecutiveAccess(Stores[i], Stores[j])) { - Tails.insert(Stores[j]); - Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; - } - } - - // For stores that start but don't end a link in the chain: - for (SetVector::iterator it = Heads.begin(), e = Heads.end(); - it != e; ++it) { - if (Tails.count(*it)) - continue; - - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. - ValueList Operands; - Value *I = *it; - // Collect the chain into a list. - while (Tails.count(I) || Heads.count(I)) { - if (VectorizedStores.count(I)) - break; - Operands.push_back(I); - // Move to the next value in the chain. - I = ConsecutiveChain[I]; - } - - bool Vectorized = vectorizeStoreChain(Operands, costThreshold); - - // Mark the vectorized stores so that we don't vectorize them again. - if (Vectorized) - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed |= Vectorized; - } - - return Changed; -} - -Value *FuncSLP::Gather(ArrayRef VL, VectorType *Ty) { +Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. 
for (unsigned i = 0; i < Ty->getNumElements(); ++i) { @@ -1103,282 +1011,292 @@ Value *FuncSLP::Gather(ArrayRef VL, VectorType *Ty) { return Vec; } -Value *FuncSLP::vectorizeTree_rec(ArrayRef VL) { - BuilderLocGuard Guard(Builder); +Value *BoUpSLP::vectorizeTree(ArrayRef VL) { + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + if (E->isSame(VL)) + return vectorizeTree(E); + } Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - if (needToGatherAny(VL)) - return Gather(VL, VecTy); + return Gather(VL, VecTy); +} - if (VectorizedValues.count(VL[0])) { - DEBUG(dbgs() << "SLP: Diamond merged at depth.\n"); - return VectorizedValues[VL[0]]; +Value *BoUpSLP::vectorizeTree(TreeEntry *E) { + BuilderLocGuard Guard(Builder); + + if (E->VectorizedValue) { + DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + return E->VectorizedValue; } - Instruction *VL0 = cast(VL[0]); + Type *ScalarTy = E->Scalars[0]->getType(); + if (StoreInst *SI = dyn_cast(E->Scalars[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + + if (E->NeedToGather) { + return Gather(E->Scalars, VecTy); + } + + Instruction *VL0 = cast(E->Scalars[0]); unsigned Opcode = VL0->getOpcode(); - assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); + assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); switch (Opcode) { - case Instruction::PHI: { - PHINode *PH = dyn_cast(VL0); - Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt()); - PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); - VectorizedValues[VL0] = NewPhi; + case Instruction::PHI: { + PHINode *PH = dyn_cast(VL0); + Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt()); + PHINode *NewPhi = Builder.CreatePHI(VecTy, 
PH->getNumIncomingValues()); + E->VectorizedValue = NewPhi; - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - ValueList Operands; - BasicBlock *IBB = PH->getIncomingBlock(i); + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + BasicBlock *IBB = PH->getIncomingBlock(i); - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getIncomingValueForBlock(IBB)); + // Prepare the operand vector. + for (unsigned j = 0; j < E->Scalars.size(); ++j) + Operands.push_back(cast(E->Scalars[j])-> + getIncomingValueForBlock(IBB)); - Builder.SetInsertPoint(IBB->getTerminator()); - Value *Vec = vectorizeTree_rec(Operands); - NewPhi->addIncoming(Vec, IBB); + Builder.SetInsertPoint(IBB->getTerminator()); + Value *Vec = vectorizeTree(Operands); + NewPhi->addIncoming(Vec, IBB); + } + + assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && + "Invalid number of incoming values"); + return NewPhi; } - assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && - "Invalid number of incoming values"); - return NewPhi; - } - - case Instruction::ExtractElement: { - if (CanReuseExtract(VL, VL.size(), VecTy)) - return VL0->getOperand(0); - return Gather(VL, VecTy); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - ValueList INVL; - for (int i = 0, e = VL.size(); i < e; ++i) - INVL.push_back(cast(VL[i])->getOperand(0)); - - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *InVec = vectorizeTree_rec(INVL); - CastInst *CI = dyn_cast(VL0); - Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - VectorizedValues[VL0] = V; - return V; - } - case 
Instruction::FCmp: - case Instruction::ICmp: { - // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); - if (Cmp->getPredicate() != P0) - return Gather(VL, VecTy); + case Instruction::ExtractElement: { + if (CanReuseExtract(E->Scalars)) { + Value *V = VL0->getOperand(0); + E->VectorizedValue = V; + return V; + } + return Gather(E->Scalars, VecTy); } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList INVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + INVL.push_back(cast(E->Scalars[i])->getOperand(0)); - ValueList LHSV, RHSV; - for (int i = 0, e = VL.size(); i < e; ++i) { - LHSV.push_back(cast(VL[i])->getOperand(0)); - RHSV.push_back(cast(VL[i])->getOperand(1)); + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Value *InVec = vectorizeTree(INVL); + CastInst *CI = dyn_cast(VL0); + Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + E->VectorizedValue = V; + return V; } + case Instruction::FCmp: + case Instruction::ICmp: { + ValueList LHSV, RHSV; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSV.push_back(cast(E->Scalars[i])->getOperand(0)); + RHSV.push_back(cast(E->Scalars[i])->getOperand(1)); + } - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *L = vectorizeTree_rec(LHSV); - Value *R = vectorizeTree_rec(RHSV); - Value *V; + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Value *L = vectorizeTree(LHSV); + Value *R = vectorizeTree(RHSV); + Value *V; - if (Opcode == Instruction::FCmp) - V = Builder.CreateFCmp(P0, L, R); - else - V = Builder.CreateICmp(P0, L, R); + 
CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); + if (Opcode == Instruction::FCmp) + V = Builder.CreateFCmp(P0, L, R); + else + V = Builder.CreateICmp(P0, L, R); - VectorizedValues[VL0] = V; - return V; - } - case Instruction::Select: { - ValueList TrueVec, FalseVec, CondVec; - for (int i = 0, e = VL.size(); i < e; ++i) { - CondVec.push_back(cast(VL[i])->getOperand(0)); - TrueVec.push_back(cast(VL[i])->getOperand(1)); - FalseVec.push_back(cast(VL[i])->getOperand(2)); + E->VectorizedValue = V; + return V; } + case Instruction::Select: { + ValueList TrueVec, FalseVec, CondVec; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + CondVec.push_back(cast(E->Scalars[i])->getOperand(0)); + TrueVec.push_back(cast(E->Scalars[i])->getOperand(1)); + FalseVec.push_back(cast(E->Scalars[i])->getOperand(2)); + } - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *True = vectorizeTree_rec(TrueVec); - Value *False = vectorizeTree_rec(FalseVec); - Value *Cond = vectorizeTree_rec(CondVec); - Value *V = Builder.CreateSelect(Cond, True, False); - VectorizedValues[VL0] = V; - return V; - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - ValueList LHSVL, RHSVL; - for (int i = 0, e = VL.size(); i < e; ++i) { - LHSVL.push_back(cast(VL[i])->getOperand(0)); - RHSVL.push_back(cast(VL[i])->getOperand(1)); + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Value *Cond = vectorizeTree(CondVec); + Value *True = vectorizeTree(TrueVec); + Value *False = vectorizeTree(FalseVec); + Value *V = Builder.CreateSelect(Cond, True, False); + E->VectorizedValue = V; + 
return V; } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + ValueList LHSVL, RHSVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSVL.push_back(cast(E->Scalars[i])->getOperand(0)); + RHSVL.push_back(cast(E->Scalars[i])->getOperand(1)); + } - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *LHS = vectorizeTree_rec(LHSVL); - Value *RHS = vectorizeTree_rec(RHSVL); + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Value *LHS = vectorizeTree(LHSVL); + Value *RHS = vectorizeTree(RHSVL); - if (LHS == RHS) { - assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); + if (LHS == RHS && isa(LHS)) { + assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); + } + + BinaryOperator *BinOp = cast(VL0); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); + E->VectorizedValue = V; + return V; } + case Instruction::Load: { + // Loads are inserted at the head of the tree because we don't want to + // sink them all the way down past store instructions. 
+ Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + LoadInst *LI = cast(VL0); + Value *VecPtr = + Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo()); + unsigned Alignment = LI->getAlignment(); + LI = Builder.CreateLoad(VecPtr); + LI->setAlignment(Alignment); + E->VectorizedValue = LI; + return LI; + } + case Instruction::Store: { + StoreInst *SI = cast(VL0); + unsigned Alignment = SI->getAlignment(); - BinaryOperator *BinOp = cast(VL0); - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); - VectorizedValues[VL0] = V; - return V; - } - case Instruction::Load: { - // Check if all of the loads are consecutive. - for (unsigned i = 1, e = VL.size(); i < e; ++i) - if (!isConsecutiveAccess(VL[i - 1], VL[i])) - return Gather(VL, VecTy); - - // Loads are inserted at the head of the tree because we don't want to - // sink them all the way down past store instructions. - Builder.SetInsertPoint(getLastInstruction(VL)); - LoadInst *LI = cast(VL0); - Value *VecPtr = - Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo()); - unsigned Alignment = LI->getAlignment(); - LI = Builder.CreateLoad(VecPtr); - LI->setAlignment(Alignment); - - VectorizedValues[VL0] = LI; - return LI; - } - case Instruction::Store: { - StoreInst *SI = cast(VL0); - unsigned Alignment = SI->getAlignment(); - - ValueList ValueOp; - for (int i = 0, e = VL.size(); i < e; ++i) - ValueOp.push_back(cast(VL[i])->getValueOperand()); - - Value *VecValue = vectorizeTree_rec(ValueOp); - - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *VecPtr = - Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); - Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); - return 0; - } - default: - return Gather(VL, VecTy); + ValueList ValueOp; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + ValueOp.push_back(cast(E->Scalars[i])->getValueOperand()); + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Value *VecValue = 
vectorizeTree(ValueOp); + Value *VecPtr = + Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); + StoreInst *S = Builder.CreateStore(VecValue, VecPtr); + S->setAlignment(Alignment); + E->VectorizedValue = S; + return S; + } + default: + llvm_unreachable("unknown inst"); } + return 0; } -Value *FuncSLP::vectorizeTree(ArrayRef VL) { - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *V = vectorizeTree_rec(VL); +void BoUpSLP::vectorizeTree() { + vectorizeTree(&VectorizableTree[0]); - DEBUG(dbgs() << "SLP: Placing 'extracts'\n"); - for (SetVector::iterator it = ExtractedLane.begin(), e = - ExtractedLane.end(); it != e; ++it) { - Instruction *Scalar = *it; - DEBUG(dbgs() << "SLP: Looking at " << *Scalar); + // For each vectorized value: + for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { + TreeEntry *Entry = &VectorizableTree[EIdx]; - if (!Scalar) - continue; + // For each lane: + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; - Instruction *Loc = 0; - - assert(MultiUserVals.count(Scalar) && "Can't find the lane to extract"); - Instruction *Leader = MultiUserVals[Scalar].Leader; - - // This value is gathered so we don't need to extract from anywhere. - if (!VectorizedValues.count(Leader)) - continue; - - Value *Vec = VectorizedValues[Leader]; - if (PHINode *PN = dyn_cast(Vec)) { - Loc = PN->getParent()->getFirstInsertionPt(); - } else { - Instruction *I = cast(Vec); - BasicBlock::iterator L = *I; - Loc = ++L; - } - - Builder.SetInsertPoint(Loc); - assert(LaneMap.count(Scalar) && "Can't find the extracted lane."); - int Lane = LaneMap[Scalar]; - Value *Idx = Builder.getInt32(Lane); - Value *Extract = Builder.CreateExtractElement(Vec, Idx); - - bool Replaced = false;; - for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end(); - U != UE; ++U) { - Instruction *UI = cast(*U); - // No need to replace instructions that are inside our lane map. 
- if (LaneMap.count(UI)) + // No need to handle users of gathered values. + if (Entry->NeedToGather) continue; - UI->replaceUsesOfWith(Scalar ,Extract); - Replaced = true; + Value *Vec = Entry->VectorizedValue; + assert(Vec && "Can't find vectorizable value"); + + SmallVector Users(Scalar->use_begin(), Scalar->use_end()); + + for (SmallVector::iterator User = Users.begin(), + UE = Users.end(); User != UE; ++User) { + DEBUG(dbgs() << "SLP: \tupdating user " << **User << ".\n"); + + bool Gathered = MustGather.count(*User); + + // Skip in-tree scalars that become vectors. + if (ScalarToTreeEntry.count(*User) && !Gathered) { + DEBUG(dbgs() << "SLP: \tUser will be removed soon:" << + **User << ".\n"); + int Idx = ScalarToTreeEntry[*User]; (void) Idx; + assert(!VectorizableTree[Idx].NeedToGather && "bad state ?"); + continue; + } + + if (!isa(*User)) + continue; + + // Generate extracts for out-of-tree users. + // Find the insertion point for the extractelement lane. + Instruction *Loc = 0; + if (PHINode *PN = dyn_cast(Vec)) { + Loc = PN->getParent()->getFirstInsertionPt(); + } else if (Instruction *Iv = dyn_cast(Vec)){ + Loc = ++((BasicBlock::iterator)*Iv); + } else { + Loc = F->getEntryBlock().begin(); + } + + Builder.SetInsertPoint(Loc); + Value *Ex = Builder.CreateExtractElement(Vec, Builder.getInt32(Lane)); + (*User)->replaceUsesOfWith(Scalar, Ex); + DEBUG(dbgs() << "SLP: \tupdated user:" << **User << ".\n"); + } + + Type *Ty = Scalar->getType(); + if (!Ty->isVoidTy()) { + for (Value::use_iterator User = Scalar->use_begin(), UE = Scalar->use_end(); + User != UE; ++User) { + DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n"); + assert(!MustGather.count(*User) && + "Replacing gathered value with undef"); + assert(ScalarToTreeEntry.count(*User) && + "Replacing out-of-tree value with undef"); + } + Value *Undef = UndefValue::get(Ty); + Scalar->replaceAllUsesWith(Undef); + } + DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); + 
cast(Scalar)->eraseFromParent(); } - assert(Replaced && "Must replace at least one outside user"); - (void)Replaced; } - // We moved some instructions around. We have to number them again - // before we can do any analysis. - forgetNumbering(); - - // Clear the state. - MustGather.clear(); - VisitedPHIs.clear(); - VectorizedValues.clear(); - MemBarrierIgnoreList.clear(); - return V; -} - -Value *FuncSLP::vectorizeArith(ArrayRef Operands) { - Instruction *LastInst = getLastInstruction(Operands); - Value *Vec = vectorizeTree(Operands); - // After vectorizing the operands we need to generate extractelement - // instructions and replace all of the uses of the scalar values with - // the values that we extracted from the vectorized tree. - Builder.SetInsertPoint(LastInst); - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); - Operands[i]->replaceAllUsesWith(S); + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { + BlocksNumbers[it].forget(); } - - forgetNumbering(); - return Vec; } -void FuncSLP::optimizeGatherSequence() { +void BoUpSLP::optimizeGatherSequence() { + DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() + << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. for (SetVector::iterator it = GatherSeq.begin(), e = GatherSeq.end(); it != e; ++it) { @@ -1449,8 +1367,6 @@ void FuncSLP::optimizeGatherSequence() { assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses"); (*v)->eraseFromParent(); } - - forgetNumbering(); } /// The SLPVectorizer Pass. @@ -1492,7 +1408,7 @@ struct SLPVectorizer : public FunctionPass { // Use the bollom up slp vectorizer to construct chains that start with // he store instructions. - FuncSLP R(&F, SE, DL, TTI, AA, LI, DT); + BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT); // Scan the blocks in the function in post order. 
for (po_iterator it = po_begin(&F.getEntryBlock()), @@ -1536,31 +1452,146 @@ private: /// object. We sort the stores to their base objects to reduce the cost of the /// quadratic search on the stores. TODO: We can further reduce this cost /// if we flush the chain creation every time we run into a memory barrier. - unsigned collectStores(BasicBlock *BB, FuncSLP &R); + unsigned collectStores(BasicBlock *BB, BoUpSLP &R); /// \brief Try to vectorize a chain that starts at two arithmetic instrs. - bool tryToVectorizePair(Value *A, Value *B, FuncSLP &R); + bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); /// \brief Try to vectorize a list of operands. If \p NeedExtracts is true /// then we calculate the cost of extracting the scalars from the vector. /// \returns true if a value was vectorized. - bool tryToVectorizeList(ArrayRef VL, FuncSLP &R, bool NeedExtracts); + bool tryToVectorizeList(ArrayRef VL, BoUpSLP &R, bool NeedExtracts); /// \brief Try to vectorize a chain that may start at the operands of \V; - bool tryToVectorize(BinaryOperator *V, FuncSLP &R); + bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); /// \brief Vectorize the stores that were collected in StoreRefs. - bool vectorizeStoreChains(FuncSLP &R); + bool vectorizeStoreChains(BoUpSLP &R); /// \brief Scan the basic block and look for patterns that are likely to start /// a vectorization chain. 
- bool vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R); + bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); + bool vectorizeStoreChain(ArrayRef Chain, int CostThreshold, + BoUpSLP &R); + + bool vectorizeStores(ArrayRef Stores, int costThreshold, + BoUpSLP &R); private: StoreListMap StoreRefs; }; -unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) { +bool SLPVectorizer::vectorizeStoreChain(ArrayRef Chain, + int CostThreshold, BoUpSLP &R) { + unsigned ChainLen = Chain.size(); + DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + << "\n"); + Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); + unsigned Sz = DL->getTypeSizeInBits(StoreTy); + unsigned VF = MinVecRegSize / Sz; + + if (!isPowerOf2_32(Sz) || VF < 2) + return false; + + bool Changed = false; + // Look for profitable vectorizable trees at all offsets, starting at zero. + for (unsigned i = 0, e = ChainLen; i < e; ++i) { + if (i + VF > e) + break; + DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i + << "\n"); + ArrayRef Operands = Chain.slice(i, VF); + + R.buildTree(Operands); + + int Cost = R.getTreeCost(); + + DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + R.vectorizeTree(); + + // Move to the next bundle. + i += VF - 1; + Changed = true; + } + } + + if (Changed || ChainLen > VF) + return Changed; + + // Handle short chains. This helps us catch types such as <3 x float> that + // are smaller than vector size. 
+ R.buildTree(Chain); + + int Cost = R.getTreeCost(); + + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost + << " for size = " << ChainLen << "\n"); + R.vectorizeTree(); + return true; + } + + return false; +} + +bool SLPVectorizer::vectorizeStores(ArrayRef Stores, + int costThreshold, BoUpSLP &R) { + SetVector Heads, Tails; + SmallDenseMap ConsecutiveChain; + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we vectorized so that we don't visit the same store twice. + BoUpSLP::ValueSet VectorizedStores; + bool Changed = false; + + // Do a quadratic search on all of the given stores and find + // all of the pairs of loads that follow each other. + for (unsigned i = 0, e = Stores.size(); i < e; ++i) + for (unsigned j = 0; j < e; ++j) { + if (i == j) + continue; + + if (R.isConsecutiveAccess(Stores[i], Stores[j])) { + Tails.insert(Stores[j]); + Heads.insert(Stores[i]); + ConsecutiveChain[Stores[i]] = Stores[j]; + } + } + + // For stores that start but don't end a link in the chain: + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to vectorize it. + BoUpSLP::ValueList Operands; + Value *I = *it; + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (VectorizedStores.count(I)) + break; + Operands.push_back(I); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); + + // Mark the vectorized stores so that we don't vectorize them again. 
+ if (Vectorized) + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed |= Vectorized; + } + + return Changed; +} + + +unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { unsigned count = 0; StoreRefs.clear(); for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { @@ -1585,14 +1616,14 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) { return count; } -bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, FuncSLP &R) { +bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; Value *VL[] = { A, B }; return tryToVectorizeList(VL, R, true); } -bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, FuncSLP &R, +bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, bool NeedExtracts) { if (VL.size() < 2) return false; @@ -1615,9 +1646,8 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, FuncSLP &R, return 0; } - int Cost = R.getTreeCost(VL); - if (Cost == FuncSLP::MAX_COST) - return false; + R.buildTree(VL); + int Cost = R.getTreeCost(); int ExtrCost = NeedExtracts ? 
R.getGatherCost(VL) : 0; DEBUG(dbgs() << "SLP: Cost of pair:" << Cost @@ -1625,11 +1655,11 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, FuncSLP &R, if ((Cost + ExtrCost) >= -SLPCostThreshold) return false; DEBUG(dbgs() << "SLP: Vectorizing pair.\n"); - R.vectorizeArith(VL); + R.vectorizeTree(); return true; } -bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) { +bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { if (!V) return false; @@ -1669,7 +1699,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) { return 0; } -bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) { +bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { if (isa(it)) @@ -1737,7 +1767,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) { return Changed; } -bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) { +bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { bool Changed = false; // Attempt to sort and vectorize each of the store-groups. 
for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); @@ -1748,7 +1778,7 @@ bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) { DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << it->second.size() << ".\n"); - Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); + Changed |= vectorizeStores(it->second, -SLPCostThreshold, R); } return Changed; } diff --git a/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/test/Transforms/SLPVectorizer/X86/crash_7zip.ll new file mode 100644 index 00000000000..51b1c08fb36 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_7zip.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334 = type { %struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333, i16*, i8*, i8*, i32, i32, i64, i64, i32, i32, i32, [4 x i32], i32, i32, i32, i32, i32, [20 x i8] } +%struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333 = type { i32, i32, i32, i32 } + +define fastcc void @LzmaDec_DecodeReal2(%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p) { +entry: + %range20.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 4 + %code21.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 5 + br label %do.body66.i + +do.body66.i: ; preds = %do.cond.i, %entry + %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ undef, %entry ] + %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ undef, %entry ] + %.range.2.i = select i1 undef, i32 undef, i32 %range.2.i + %.code.2.i = select i1 undef, i32 undef, i32 %code.2.i + br i1 undef, label 
%do.cond.i, label %if.else.i + +if.else.i: ; preds = %do.body66.i + %sub91.i = sub i32 %.range.2.i, undef + %sub92.i = sub i32 %.code.2.i, undef + br label %do.cond.i + +do.cond.i: ; preds = %if.else.i, %do.body66.i + %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ undef, %do.body66.i ] + %code.4.i = phi i32 [ %sub92.i, %if.else.i ], [ %.code.2.i, %do.body66.i ] + br i1 undef, label %do.body66.i, label %do.end1006.i + +do.end1006.i: ; preds = %do.cond.i + %.range.4.i = select i1 undef, i32 undef, i32 %range.4.i + %.code.4.i = select i1 undef, i32 undef, i32 %code.4.i + store i32 %.range.4.i, i32* %range20.i, align 4 + store i32 %.code.4.i, i32* %code21.i, align 4 + ret void +} diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet.ll new file mode 100644 index 00000000000..565905df2c6 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_bullet.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960" = type { i32, i32 } + +define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E(%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* nocapture %info) { +entry: + br i1 undef, label %if.else, label %if.then + +if.then: ; preds = %entry + ret void + +if.else: ; preds = %entry + %m_numConstraintRows4 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* %info, i64 0, i32 0 + %nub5 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* %info, i64 0, i32 1 + br i1 undef, label %land.lhs.true.i.1, label %if.then7.1 + 
+land.lhs.true.i.1: ; preds = %if.else + br i1 undef, label %for.inc.1, label %if.then7.1 + +if.then7.1: ; preds = %land.lhs.true.i.1, %if.else + %inc.1 = add nsw i32 0, 1 + store i32 %inc.1, i32* %m_numConstraintRows4, align 4 + %dec.1 = add nsw i32 6, -1 + store i32 %dec.1, i32* %nub5, align 4 + br label %for.inc.1 + +for.inc.1: ; preds = %if.then7.1, %land.lhs.true.i.1 + %0 = phi i32 [ %dec.1, %if.then7.1 ], [ 6, %land.lhs.true.i.1 ] + %1 = phi i32 [ %inc.1, %if.then7.1 ], [ 0, %land.lhs.true.i.1 ] + %inc.2 = add nsw i32 %1, 1 + store i32 %inc.2, i32* %m_numConstraintRows4, align 4 + %dec.2 = add nsw i32 %0, -1 + store i32 %dec.2, i32* %nub5, align 4 + unreachable +} diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet2.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet2.ll new file mode 100644 index 00000000000..df026d12cb9 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_bullet2.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332 = type { float, [3 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [3 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], %class.btVector4.7.32.67.92.117.142.177.187.262.282.331, %class.btVector4.7.32.67.92.117.142.177.187.262.282.331, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, [4 x float], float, float, [4 x float], float, float, [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [16 
x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330] } +%class.btVector3.5.30.65.90.115.140.175.185.260.280.330 = type { [4 x float] } +%class.btVector4.7.32.67.92.117.142.177.187.262.282.331 = type { %class.btVector3.5.30.65.90.115.140.175.185.260.280.330 } + +define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA(%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this) { +entry: + %arrayidx26 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 1 + %arrayidx36 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 2 + %0 = load float* %arrayidx36, align 4 + %add587 = fadd float undef, undef + %sub600 = fsub float %add587, undef + store float %sub600, float* undef, align 4 + %sub613 = fsub float %add587, %sub600 + store float %sub613, float* %arrayidx26, align 4 + %add626 = fadd float %0, undef + %sub639 = fsub float %add626, undef + %sub652 = fsub float %add626, %sub639 + store float %sub652, float* %arrayidx36, align 4 + br i1 undef, label %if.else1609, label %if.then1595 + +if.then1595: ; preds = %entry + br i1 undef, label %return, label %for.body.lr.ph.i.i1702 + +for.body.lr.ph.i.i1702: ; preds = %if.then1595 + unreachable + +if.else1609: ; preds = %entry + unreachable + +return: ; preds = %if.then1595 + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll new file mode 100644 index 00000000000..ce0159071c6 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" +%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731" = type { double*, double*, double*, double** } + +; Function Attrs: nounwind ssp uwtable +define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* nocapture %__last) { +entry: + %_M_cur2.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 0 + %0 = load double** %_M_cur2.i.i, align 8 + %_M_first3.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 1 + %_M_cur2.i.i81 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__last, i64 0, i32 0 + %1 = load double** %_M_cur2.i.i81, align 8 + %_M_first3.i.i83 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__last, i64 0, i32 1 + %2 = load double** %_M_first3.i.i83, align 8 + br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i.preheader + +while.cond.i.preheader: ; preds = %entry + br label %while.cond.i + +while.cond.i: ; preds = %while.body.i, %while.cond.i.preheader + br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.body.i + +while.body.i: ; preds = %while.cond.i + br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i + +_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: ; preds = %while.body.i, %while.cond.i, %entry + %3 = phi double* [ %2, %entry ], [ %2, %while.cond.i ], [ undef, %while.body.i ] + %4 = phi double* [ %0, %entry ], [ %1, %while.cond.i ], [ undef, %while.body.i ] + store 
double* %4, double** %_M_cur2.i.i, align 8 + store double* %3, double** %_M_first3.i.i, align 8 + br i1 undef, label %if.then.i55, label %while.cond + +if.then.i55: ; preds = %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit + br label %while.cond + +while.cond: ; preds = %while.cond, %if.then.i55, %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit + br label %while.cond +} diff --git a/test/Transforms/SLPVectorizer/X86/crash_flop7.ll b/test/Transforms/SLPVectorizer/X86/crash_flop7.ll new file mode 100644 index 00000000000..e11be488f79 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_flop7.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Function Attrs: nounwind ssp uwtable +define void @main() #0 { +entry: + br i1 undef, label %while.body, label %while.end + +while.body: ; preds = %entry + unreachable + +while.end: ; preds = %entry + br i1 undef, label %for.end80, label %for.body75.lr.ph + +for.body75.lr.ph: ; preds = %while.end + br label %for.body75 + +for.body75: ; preds = %for.body75, %for.body75.lr.ph + br label %for.body75 + +for.end80: ; preds = %while.end + br i1 undef, label %for.end300, label %for.body267.lr.ph + +for.body267.lr.ph: ; preds = %for.end80 + br label %for.body267 + +for.body267: ; preds = %for.body267, %for.body267.lr.ph + %s.71010 = phi double [ 0.000000e+00, %for.body267.lr.ph ], [ %add297, %for.body267 ] + %mul269 = fmul double undef, undef + %mul270 = fmul double %mul269, %mul269 + %add282 = fadd double undef, undef + %mul283 = fmul double %mul269, %add282 + %add293 = fadd double undef, undef + %mul294 = fmul double %mul270, %add293 + %add295 = fadd double undef, %mul294 + %div296 = fdiv double %mul283, %add295 + 
%add297 = fadd double %s.71010, %div296 + br i1 undef, label %for.body267, label %for.end300 + +for.end300: ; preds = %for.body267, %for.end80 + unreachable +} + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/Transforms/SLPVectorizer/X86/crash_lame.ll b/test/Transforms/SLPVectorizer/X86/crash_lame.ll new file mode 100644 index 00000000000..cfc3fa35a1e --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_lame.ll @@ -0,0 +1,24 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Function Attrs: nounwind ssp uwtable +define fastcc void @dct36(double* %inbuf) #0 { +entry: + %arrayidx41 = getelementptr inbounds double* %inbuf, i64 2 + %arrayidx44 = getelementptr inbounds double* %inbuf, i64 1 + %0 = load double* %arrayidx44, align 8, !tbaa !0 + %add46 = fadd double %0, undef + store double %add46, double* %arrayidx41, align 8, !tbaa !0 + %1 = load double* %inbuf, align 8, !tbaa !0 + %add49 = fadd double %1, %0 + store double %add49, double* %arrayidx44, align 8, !tbaa !0 + ret void +} + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata !{metadata !"double", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/Transforms/SLPVectorizer/X86/crash_lencod.ll 
b/test/Transforms/SLPVectorizer/X86/crash_lencod.ll new file mode 100644 index 00000000000..b35a5d74e2f --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Function Attrs: nounwind ssp uwtable +define void @RCModelEstimator() { +entry: + br i1 undef, label %for.body.lr.ph, label %for.end.thread + +for.end.thread: ; preds = %entry + unreachable + +for.body.lr.ph: ; preds = %entry + br i1 undef, label %for.end, label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %for.body.lr.ph + br i1 undef, label %for.body3, label %if.end103 + +for.cond14.preheader: ; preds = %for.inc11 + br i1 undef, label %for.body16.lr.ph, label %if.end103 + +for.body16.lr.ph: ; preds = %for.cond14.preheader + br label %for.body16 + +for.body3: ; preds = %for.inc11, %for.end + br i1 undef, label %if.then7, label %for.inc11 + +if.then7: ; preds = %for.body3 + br label %for.inc11 + +for.inc11: ; preds = %if.then7, %for.body3 + br i1 false, label %for.cond14.preheader, label %for.body3 + +for.body16: ; preds = %for.body16, %for.body16.lr.ph + br i1 undef, label %for.end39, label %for.body16 + +for.end39: ; preds = %for.body16 + br i1 undef, label %if.end103, label %for.cond45.preheader + +for.cond45.preheader: ; preds = %for.end39 + br i1 undef, label %if.then88, label %if.else + +if.then88: ; preds = %for.cond45.preheader + %mul89 = fmul double 0.000000e+00, 0.000000e+00 + %mul90 = fmul double 0.000000e+00, 0.000000e+00 + %sub91 = fsub double %mul89, %mul90 + %div92 = fdiv double %sub91, undef + %mul94 = fmul double 0.000000e+00, 0.000000e+00 + %mul95 = fmul 
double 0.000000e+00, 0.000000e+00 + %sub96 = fsub double %mul94, %mul95 + %div97 = fdiv double %sub96, undef + br label %if.end103 + +if.else: ; preds = %for.cond45.preheader + br label %if.end103 + +if.end103: ; preds = %if.else, %if.then88, %for.end39, %for.cond14.preheader, %for.end + %0 = phi double [ 0.000000e+00, %for.end39 ], [ %div97, %if.then88 ], [ 0.000000e+00, %if.else ], [ 0.000000e+00, %for.cond14.preheader ], [ 0.000000e+00, %for.end ] + %1 = phi double [ undef, %for.end39 ], [ %div92, %if.then88 ], [ undef, %if.else ], [ 0.000000e+00, %for.cond14.preheader ], [ 0.000000e+00, %for.end ] + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/crash_lencod2.ll b/test/Transforms/SLPVectorizer/X86/crash_lencod2.ll new file mode 100644 index 00000000000..d1e719c160a --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_lencod2.ll @@ -0,0 +1,23 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Function Attrs: nounwind ssp uwtable +define void @intrapred_luma() #0 { +entry: + %conv153 = trunc i32 undef to i16 + %arrayidx154 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 12 + store i16 %conv153, i16* %arrayidx154, align 8, !tbaa !0 + %arrayidx155 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 11 + store i16 %conv153, i16* %arrayidx155, align 2, !tbaa !0 + %arrayidx156 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 10 + store i16 %conv153, i16* %arrayidx156, align 4, !tbaa !0 + ret void +} + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata 
!{metadata !"short", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll new file mode 100644 index 00000000000..b3ca235c2c7 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define void @main() { +entry: + br label %for.body + +for.body: ; preds = %for.end44, %entry + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %if.then25, %for.body + br label %for.body6 + +for.body6: ; preds = %for.inc21, %for.cond4.preheader + br label %for.body12 + +for.body12: ; preds = %if.end, %for.body6 + %fZImg.069 = phi double [ undef, %for.body6 ], [ %add19, %if.end ] + %fZReal.068 = phi double [ undef, %for.body6 ], [ %add20, %if.end ] + %mul13 = fmul double %fZReal.068, %fZReal.068 + %mul14 = fmul double %fZImg.069, %fZImg.069 + %add15 = fadd double %mul13, %mul14 + %cmp16 = fcmp ogt double %add15, 4.000000e+00 + br i1 %cmp16, label %for.inc21, label %if.end + +if.end: ; preds = %for.body12 + %mul18 = fmul double undef, %fZImg.069 + %add19 = fadd double undef, %mul18 + %sub = fsub double %mul13, %mul14 + %add20 = fadd double undef, %sub + br i1 undef, label %for.body12, label %for.inc21 + +for.inc21: ; preds = %if.end, %for.body12 + br i1 undef, label %for.end23, label %for.body6 + +for.end23: ; preds = %for.inc21 + br i1 undef, label %if.then25, label %if.then26 + +if.then25: ; preds = %for.end23 + br i1 undef, label %for.end44, label %for.cond4.preheader + +if.then26: ; preds = %for.end23 + unreachable + +for.end44: ; preds = 
%if.then25 + br i1 undef, label %for.end48, label %for.body + +for.end48: ; preds = %for.end44 + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/crash_rc4.ll b/test/Transforms/SLPVectorizer/X86/crash_rc4.ll new file mode 100644 index 00000000000..20374706d2b --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_rc4.ll @@ -0,0 +1,28 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.rc4_state.0.24 = type { i32, i32, [256 x i32] } + +define void @rc4_crypt(%struct.rc4_state.0.24* nocapture %s) { +entry: + %x1 = getelementptr inbounds %struct.rc4_state.0.24* %s, i64 0, i32 0 + %y2 = getelementptr inbounds %struct.rc4_state.0.24* %s, i64 0, i32 1 + br i1 undef, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %x.045 = phi i32 [ %conv4, %for.body ], [ undef, %entry ] + %conv4 = and i32 undef, 255 + %conv7 = and i32 undef, 255 + %idxprom842 = zext i32 %conv7 to i64 + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %x.0.lcssa = phi i32 [ undef, %entry ], [ %conv4, %for.body ] + %y.0.lcssa = phi i32 [ undef, %entry ], [ %conv7, %for.body ] + store i32 %x.0.lcssa, i32* %x1, align 4 + store i32 %y.0.lcssa, i32* %y2, align 4 + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll new file mode 100644 index 00000000000..05415456cf0 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll @@ -0,0 +1,113 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171 = type { i32, i32, i32, i32, i32, i32, [8 x i8] } + +define void @SIM4() { +entry: + br i1 undef, label %return, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + br i1 undef, label %return, label %if.end + +if.end: ; preds = %lor.lhs.false + br i1 undef, label %for.end605, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %if.end + br label %for.body + +for.body: ; preds = %for.inc603, %for.body.lr.ph + br i1 undef, label %for.inc603, label %if.end12 + +if.end12: ; preds = %for.body + br i1 undef, label %land.lhs.true, label %land.lhs.true167 + +land.lhs.true: ; preds = %if.end12 + br i1 undef, label %if.then17, label %land.lhs.true167 + +if.then17: ; preds = %land.lhs.true + br i1 undef, label %if.end98, label %land.rhs.lr.ph + +land.rhs.lr.ph: ; preds = %if.then17 + unreachable + +if.end98: ; preds = %if.then17 + %from299 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1 + br i1 undef, label %land.lhs.true167, label %if.then103 + +if.then103: ; preds = %if.end98 + %.sub100 = select i1 undef, i32 250, i32 undef + %mul114 = shl nsw i32 %.sub100, 2 + %from1115 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 + %cond125 = select i1 undef, i32 undef, i32 %mul114 + br label %for.cond.i + +for.cond.i: ; preds = %land.rhs.i874, %if.then103 
+ %row.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %.sub100, %if.then103 ] + %col.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %cond125, %if.then103 ] + br i1 undef, label %land.rhs.i874, label %for.end.i + +land.rhs.i874: ; preds = %for.cond.i + br i1 undef, label %for.cond.i, label %for.end.i + +for.end.i: ; preds = %land.rhs.i874, %for.cond.i + br i1 undef, label %if.then.i, label %if.end.i + +if.then.i: ; preds = %for.end.i + %add14.i = add nsw i32 %row.0.i, undef + %add15.i = add nsw i32 %col.0.i, undef + br label %extend_bw.exit + +if.end.i: ; preds = %for.end.i + %add16.i = add i32 %cond125, %.sub100 + %cmp26514.i = icmp slt i32 %add16.i, 0 + br i1 %cmp26514.i, label %for.end33.i, label %for.body28.lr.ph.i + +for.body28.lr.ph.i: ; preds = %if.end.i + br label %for.end33.i + +for.end33.i: ; preds = %for.body28.lr.ph.i, %if.end.i + br i1 undef, label %for.end58.i, label %for.body52.lr.ph.i + +for.body52.lr.ph.i: ; preds = %for.end33.i + br label %for.end58.i + +for.end58.i: ; preds = %for.body52.lr.ph.i, %for.end33.i + br label %while.cond260.i + +while.cond260.i: ; preds = %land.rhs263.i, %for.end58.i + br i1 undef, label %land.rhs263.i, label %while.end275.i + +land.rhs263.i: ; preds = %while.cond260.i + br i1 undef, label %while.cond260.i, label %while.end275.i + +while.end275.i: ; preds = %land.rhs263.i, %while.cond260.i + br label %extend_bw.exit + +extend_bw.exit: ; preds = %while.end275.i, %if.then.i + %add14.i1262 = phi i32 [ %add14.i, %if.then.i ], [ undef, %while.end275.i ] + %add15.i1261 = phi i32 [ %add15.i, %if.then.i ], [ undef, %while.end275.i ] + br i1 false, label %if.then157, label %land.lhs.true167 + +if.then157: ; preds = %extend_bw.exit + %add158 = add nsw i32 %add14.i1262, 1 + store i32 %add158, i32* %from299, align 4 + %add160 = add nsw i32 %add15.i1261, 1 + store i32 %add160, i32* %from1115, align 4 + br label %land.lhs.true167 + +land.lhs.true167: ; preds = %if.then157, %extend_bw.exit, %if.end98, %land.lhs.true, %if.end12 + unreachable + 
+for.inc603: ; preds = %for.body + br i1 undef, label %for.body, label %for.end605 + +for.end605: ; preds = %for.inc603, %if.end + unreachable + +return: ; preds = %lor.lhs.false, %entry + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll new file mode 100644 index 00000000000..ac7e412a534 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -0,0 +1,65 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.Ray.5.11.53.113.119.137.149.185.329.389.415 = type { %struct.Vec.0.6.48.108.114.132.144.180.324.384.414, %struct.Vec.0.6.48.108.114.132.144.180.324.384.414 } +%struct.Vec.0.6.48.108.114.132.144.180.324.384.414 = type { double, double, double } + +; Function Attrs: ssp uwtable +define void @main() #0 { +entry: + br i1 undef, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + unreachable + +cond.end: ; preds = %entry + br label %invoke.cont + +invoke.cont: ; preds = %invoke.cont, %cond.end + br i1 undef, label %arrayctor.cont, label %invoke.cont + +arrayctor.cont: ; preds = %invoke.cont + %agg.tmp99208.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 0, i32 0 + %agg.tmp99208.sroa.1.8.idx388 = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 0, i32 1 + %agg.tmp101211.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 1, i32 0 + %agg.tmp101211.sroa.1.8.idx390 = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 1, i32 1 + br label %for.cond36.preheader + +for.cond36.preheader: ; 
preds = %_Z5clampd.exit.1, %arrayctor.cont + br i1 undef, label %for.body42.lr.ph.us, label %_Z5clampd.exit.1 + +cond.false51.us: ; preds = %for.body42.lr.ph.us + unreachable + +cond.true48.us: ; preds = %for.body42.lr.ph.us + br i1 undef, label %cond.true63.us, label %cond.false66.us + +cond.false66.us: ; preds = %cond.true48.us + %add.i276.us = fadd double 0.000000e+00, undef + %add.i264.us = fadd double %add.i276.us, 0.000000e+00 + %add4.i267.us = fadd double undef, 0xBFA5CC2D1960285F + %mul.i254.us = fmul double %add.i264.us, 1.400000e+02 + %mul2.i256.us = fmul double %add4.i267.us, 1.400000e+02 + %add.i243.us = fadd double %mul.i254.us, 5.000000e+01 + %add4.i246.us = fadd double %mul2.i256.us, 5.200000e+01 + %mul.i.i.us = fmul double undef, %add.i264.us + %mul2.i.i.us = fmul double undef, %add4.i267.us + store double %add.i243.us, double* %agg.tmp99208.sroa.0.0.idx, align 8 + store double %add4.i246.us, double* %agg.tmp99208.sroa.1.8.idx388, align 8 + store double %mul.i.i.us, double* %agg.tmp101211.sroa.0.0.idx, align 8 + store double %mul2.i.i.us, double* %agg.tmp101211.sroa.1.8.idx390, align 8 + unreachable + +cond.true63.us: ; preds = %cond.true48.us + unreachable + +for.body42.lr.ph.us: ; preds = %for.cond36.preheader + br i1 undef, label %cond.true48.us, label %cond.false51.us + +_Z5clampd.exit.1: ; preds = %for.cond36.preheader + br label %for.cond36.preheader +} + +attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll b/test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll new file mode 100644 index 00000000000..84c7b3a0460 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + 
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601 = type { %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600, %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 } +%struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 = type { double, double, double } + +; Function Attrs: ssp uwtable +define void @_Z8radianceRK3RayiPt() #0 { +entry: + br i1 undef, label %if.then78, label %if.then38 + +if.then38: ; preds = %entry + %mul.i.i790 = fmul double undef, undef + %mul3.i.i792 = fmul double undef, undef + %mul.i764 = fmul double undef, %mul3.i.i792 + %mul4.i767 = fmul double undef, undef + %sub.i768 = fsub double %mul.i764, %mul4.i767 + %mul6.i770 = fmul double undef, %mul.i.i790 + %mul9.i772 = fmul double undef, %mul3.i.i792 + %sub10.i773 = fsub double %mul6.i770, %mul9.i772 + %mul.i736 = fmul double undef, %sub.i768 + %mul2.i738 = fmul double undef, %sub10.i773 + %mul.i727 = fmul double undef, %mul.i736 + %mul2.i729 = fmul double undef, %mul2.i738 + %add.i716 = fadd double undef, %mul.i727 + %add4.i719 = fadd double undef, %mul2.i729 + %add.i695 = fadd double undef, %add.i716 + %add4.i698 = fadd double undef, %add4.i719 + %mul.i.i679 = fmul double undef, %add.i695 + %mul2.i.i680 = fmul double undef, %add4.i698 + %agg.tmp74663.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0 + store double %mul.i.i679, double* %agg.tmp74663.sroa.0.0.idx, align 8 + %agg.tmp74663.sroa.1.8.idx943 = getelementptr inbounds 
%struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 1 + store double %mul2.i.i680, double* %agg.tmp74663.sroa.1.8.idx943, align 8 + br label %return + +if.then78: ; preds = %entry + br label %return + +return: ; preds = %if.then78, %if.then38 + ret void +} + +attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/Transforms/SLPVectorizer/X86/diamond.ll b/test/Transforms/SLPVectorizer/X86/diamond.ll index 008f09db454..2a237eaffdf 100644 --- a/test/Transforms/SLPVectorizer/X86/diamond.ll +++ b/test/Transforms/SLPVectorizer/X86/diamond.ll @@ -50,7 +50,8 @@ entry: ; } ; CHECK: @extr_user -; CHECK: load i32* +; CHECK: load <4 x i32> +; CHECK-NEXT: extractelement <4 x i32> ; CHECK: store <4 x i32> ; CHECK-NEXT: ret define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) { @@ -79,7 +80,8 @@ entry: ; In this example we have an external user that is not the first element in the vector. ; CHECK: @extr_user1 -; CHECK: load i32* +; CHECK: load <4 x i32> +; CHECK-NEXT: extractelement <4 x i32> ; CHECK: store <4 x i32> ; CHECK-NEXT: ret define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) { diff --git a/test/Transforms/SLPVectorizer/X86/long_chains.ll b/test/Transforms/SLPVectorizer/X86/long_chains.ll index 0a2ace3f21f..5af3e6d6e90 100644 --- a/test/Transforms/SLPVectorizer/X86/long_chains.ll +++ b/test/Transforms/SLPVectorizer/X86/long_chains.ll @@ -3,12 +3,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" +; At this point we can't vectorize only parts of the tree. 
+ ; CHECK: test -; CHECK: sitofp i8 -; CHECK-NEXT: sitofp i8 -; CHECK-NEXT: insertelement -; CHECK-NEXT: insertelement -; CHECK-NEXT: fmul <2 x double> +; CHECK: insertelement <2 x i8> +; CHECK: insertelement <2 x i8> +; CHECK: sitofp <2 x i8> +; CHECK: fmul <2 x double> ; CHECK: ret define i32 @test(double* nocapture %A, i8* nocapture %B) { entry: @@ -18,7 +19,7 @@ entry: %add = add i8 %0, 3 %add4 = add i8 %1, 3 %conv6 = sitofp i8 %add to double - %conv7 = sitofp i8 %add4 to double ; <--- This is inefficient. The chain stops here. + %conv7 = sitofp i8 %add4 to double %mul = fmul double %conv6, %conv6 %add8 = fadd double %mul, 1.000000e+00 %mul9 = fmul double %conv7, %conv7 diff --git a/test/Transforms/SLPVectorizer/X86/saxpy.ll b/test/Transforms/SLPVectorizer/X86/saxpy.ll index b520913a398..46263416a90 100644 --- a/test/Transforms/SLPVectorizer/X86/saxpy.ll +++ b/test/Transforms/SLPVectorizer/X86/saxpy.ll @@ -43,3 +43,19 @@ define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a, ret void } +; Make sure we don't crash on this one. +define void @SAXPY_crash(i32* noalias nocapture %x, i32* noalias nocapture %y, i64 %i) { + %1 = add i64 %i, 1 + %2 = getelementptr inbounds i32* %x, i64 %1 + %3 = getelementptr inbounds i32* %y, i64 %1 + %4 = load i32* %3, align 4 + %5 = add nsw i32 undef, %4 + store i32 %5, i32* %2, align 4 + %6 = add i64 %i, 2 + %7 = getelementptr inbounds i32* %x, i64 %6 + %8 = getelementptr inbounds i32* %y, i64 %6 + %9 = load i32* %8, align 4 + %10 = add nsw i32 undef, %9 + store i32 %10, i32* %7, align 4 + ret void +}