From e2d124d396d23b784611e8d17b6076579b0a4c60 Mon Sep 17 00:00:00 2001 From: Arnold Schwaighofer Date: Thu, 10 Apr 2014 13:41:35 +0000 Subject: [PATCH] Reapply "SLPVectorizer: Ignore users that are insertelements we can reschedule them" This commit reapplies 205018. After 205855 we should correctly vectorize intrinsics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205965 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 93 +++++++++++++------ .../X86/insert-element-build-vector.ll | 24 +++++ 2 files changed, 88 insertions(+), 29 deletions(-) diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 6a78d756ccc..58ec5867efb 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -366,13 +366,13 @@ public: /// A negative number means that this is profitable. int getTreeCost(); - /// Construct a vectorizable tree that starts at \p Roots and is possibly - /// used by a reduction of \p RdxOps. - void buildTree(ArrayRef Roots, ValueSet *RdxOps = 0); + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for + /// the purpose of scheduling and extraction in the \p UserIgnoreLst. + void buildTree(ArrayRef Roots, + ArrayRef UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { - RdxOps = 0; VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -528,8 +528,8 @@ private: /// Numbers instructions in different blocks. DenseMap BlocksNumbers; - /// Reduction operators. - ValueSet *RdxOps; + /// List of users to ignore during scheduling and that don't need extracting. + ArrayRef UserIgnoreList; // Analysis and block reference. Function *F; @@ -543,9 +543,10 @@ private: IRBuilder<> Builder; }; -void BoUpSLP::buildTree(ArrayRef Roots, ValueSet *Rdx) { +void BoUpSLP::buildTree(ArrayRef Roots, + ArrayRef UserIgnoreLst) { deleteTree(); - RdxOps = Rdx; + UserIgnoreList = UserIgnoreLst; if (!getSameType(Roots)) return; buildTree_rec(Roots, 0); @@ -577,8 +578,9 @@ void BoUpSLP::buildTree(ArrayRef Roots, ValueSet *Rdx) { if (!UserInst) continue; - // Ignore uses that are part of the reduction. - if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end()) + // Ignore users in the user ignore list. + if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != + UserIgnoreList.end()) continue; DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << @@ -709,8 +711,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { continue; } - // This user is part of the reduction. - if (RdxOps && RdxOps->count(UI)) + // Ignore users in the user ignore list. + if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UI) != + UserIgnoreList.end()) continue; // Make sure that we can schedule this unknown user. @@ -1749,8 +1752,9 @@ Value *BoUpSLP::vectorizeTree() { DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); assert((ScalarToTreeEntry.count(U) || - // It is legal to replace the reduction users by undef. - (RdxOps && RdxOps->count(U))) && + // It is legal to replace users in the ignorelist by undef. + (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != + UserIgnoreList.end())) && "Replacing out-of-tree value with undef"); } #endif @@ -1954,8 +1958,11 @@ private: bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); /// \brief Try to vectorize a list of operands. + /// \@param BuildVector A list of users to ignore for the purpose of + /// scheduling and that don't need extracting. /// \returns true if a value was vectorized. - bool tryToVectorizeList(ArrayRef VL, BoUpSLP &R); + bool tryToVectorizeList(ArrayRef VL, BoUpSLP &R, + ArrayRef BuildVector = None); /// \brief Try to vectorize a chain that may start at the operands of \V; bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); @@ -2128,7 +2135,8 @@ bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { return tryToVectorizeList(VL, R); } -bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { +bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, + ArrayRef BuildVector) { if (VL.size() < 2) return false; @@ -2178,13 +2186,33 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { << "\n"); ArrayRef Ops = VL.slice(i, OpsWidth); - R.buildTree(Ops); + ArrayRef BuildVectorSlice; + if (!BuildVector.empty()) + BuildVectorSlice = BuildVector.slice(i, OpsWidth); + + R.buildTree(Ops, BuildVectorSlice); int Cost = R.getTreeCost(); if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); - R.vectorizeTree(); + Value *VectorizedRoot = R.vectorizeTree(); + // Reconstruct the build vector by extracting the vectorized root. This + // way we handle the case where some elements of the vector are undefined. + // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2)) + if (!BuildVectorSlice.empty()) { + Instruction *InsertAfter = cast(VectorizedRoot); + for (auto &V : BuildVectorSlice) { + InsertElementInst *IE = cast(V); + IRBuilder<> Builder(++BasicBlock::iterator(InsertAfter)); + Instruction *Extract = cast( + Builder.CreateExtractElement(VectorizedRoot, IE->getOperand(2))); + IE->setOperand(1, Extract); + IE->removeFromParent(); + IE->insertAfter(Extract); + InsertAfter = IE; + } + } // Move to the next bundle. i += VF - 1; Changed = true; @@ -2293,7 +2321,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, /// *p = /// class HorizontalReduction { - SmallPtrSet ReductionOps; + SmallVector ReductionOps; SmallVector ReducedVals; BinaryOperator *ReductionRoot; @@ -2387,7 +2415,7 @@ public: // We need to be able to reassociate the adds. if (!TreeN->isAssociative()) return false; - ReductionOps.insert(TreeN); + ReductionOps.push_back(TreeN); } // Retract. Stack.pop_back(); @@ -2424,7 +2452,7 @@ public: for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { ArrayRef ValsToReduce(&ReducedVals[i], ReduxWidth); - V.buildTree(ValsToReduce, &ReductionOps); + V.buildTree(ValsToReduce, ReductionOps); // Estimate cost. int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); @@ -2543,13 +2571,16 @@ private: /// /// Returns true if it matches /// -static bool findBuildVector(InsertElementInst *IE, - SmallVectorImpl &Ops) { - if (!isa(IE->getOperand(0))) +static bool findBuildVector(InsertElementInst *FirstInsertElem, + SmallVectorImpl &BuildVector, + SmallVectorImpl &BuildVectorOpds) { + if (!isa(FirstInsertElem->getOperand(0))) return false; + InsertElementInst *IE = FirstInsertElem; while (true) { - Ops.push_back(IE->getOperand(1)); + BuildVector.push_back(IE); + BuildVectorOpds.push_back(IE->getOperand(1)); if (IE->use_empty()) return false; @@ -2719,12 +2750,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Try to vectorize trees that start at insertelement instructions. - if (InsertElementInst *IE = dyn_cast(it)) { - SmallVector Ops; - if (!findBuildVector(IE, Ops)) + if (InsertElementInst *FirstInsertElem = dyn_cast(it)) { + SmallVector BuildVector; + SmallVector BuildVectorOpds; + if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) continue; - if (tryToVectorizeList(Ops, R)) { + // Vectorize starting with the build vector operands ignoring the + // BuildVector instructions for the purpose of scheduling and user + // extraction. + if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { Changed = true; it = BB->begin(); e = BB->end(); diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll index 7537ea3b053..620292bdaf3 100644 --- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -195,6 +195,30 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, ret <4 x float> %rb } +; Make sure that vectorization happens even if insertelements operations +; must be rescheduled. The case here is from compiling Julia. +define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @reschedule_extract( +; CHECK: %1 = fadd <4 x float> %a, %b + %a0 = extractelement <4 x float> %a, i32 0 + %b0 = extractelement <4 x float> %b, i32 0 + %c0 = fadd float %a0, %b0 + %v0 = insertelement <4 x float> undef, float %c0, i32 0 + %a1 = extractelement <4 x float> %a, i32 1 + %b1 = extractelement <4 x float> %b, i32 1 + %c1 = fadd float %a1, %b1 + %v1 = insertelement <4 x float> %v0, float %c1, i32 1 + %a2 = extractelement <4 x float> %a, i32 2 + %b2 = extractelement <4 x float> %b, i32 2 + %c2 = fadd float %a2, %b2 + %v2 = insertelement <4 x float> %v1, float %c2, i32 2 + %a3 = extractelement <4 x float> %a, i32 3 + %b3 = extractelement <4 x float> %b, i32 3 + %c3 = fadd float %a3, %b3 + %v3 = insertelement <4 x float> %v2, float %c3, i32 3 + ret <4 x float> %v3 +} + ; Check that cost model for vectorization takes credit for ; instructions that are erased. define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {