diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index 9b9436683b1..55adf8a8161 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -243,6 +243,10 @@ int BoUpSLP::getTreeCost(ArrayRef VL) { LaneMap.clear(); MultiUserVals.clear(); MustScalarize.clear(); + MustExtract.clear(); + + // Find the location of the last root. + unsigned LastRootIndex = InstrIdx[GetLastInstr(VL, VL.size())]; // Scan the tree and find which value is used by which lane, and which values // must be scalarized. @@ -258,15 +262,31 @@ int BoUpSLP::getTreeCost(ArrayRef VL) { for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); I != E; ++I) { if (LaneMap.find(*I) == LaneMap.end()) { - MustScalarize.insert(*it); - DEBUG(dbgs()<<"SLP: Adding " << **it << - " to MustScalarize because of an out of tree usage.\n"); - break; + DEBUG(dbgs()<<"SLP: Instr " << **it << " has multiple users.\n"); + + // We don't have an ordering problem if the user is not in this basic + // block. + Instruction *Inst = cast(*I); + if (Inst->getParent() == BB) { + // We don't have an ordering problem if the user is after the last + // root. + unsigned Idx = InstrIdx[Inst]; + if (Idx < LastRootIndex) { + MustScalarize.insert(*it); + DEBUG(dbgs()<<"SLP: Adding to MustScalarize " + "because of an unsafe out of tree usage.\n"); + break; + } + } + + DEBUG(dbgs()<<"SLP: Adding to MustExtract " + "because of a safe out of tree usage.\n"); + MustExtract.insert(*it); } if (Lane == -1) Lane = LaneMap[*I]; if (Lane != LaneMap[*I]) { MustScalarize.insert(*it); - DEBUG(dbgs()<<"Adding " << **it << + DEBUG(dbgs()<<"SLP: Adding " << **it << " to MustScalarize because multiple lane use it: " << Lane << " and " << LaneMap[*I] << ".\n"); break; @@ -456,6 +476,13 @@ int BoUpSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { } } + // Calculate the extract cost. + unsigned ExternalUserExtractCost = 0; + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (MustExtract.count(VL[i])) + ExternalUserExtractCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); + switch (Opcode) { case Instruction::ZExt: case Instruction::SExt: @@ -469,7 +496,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - int Cost = 0; + int Cost = ExternalUserExtractCost; ValueList Operands; Type *SrcTy = VL0->getOperand(0)->getType(); // Prepare the operand vector. @@ -510,7 +537,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - int Cost = 0; + int Cost = ExternalUserExtractCost; // Calculate the cost of all of the operands. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; @@ -540,7 +567,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { int ScalarLdCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - return VecLdCost - ScalarLdCost; + return VecLdCost - ScalarLdCost + ExternalUserExtractCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. @@ -556,7 +583,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { } int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1); - return TotalCost; + return TotalCost + ExternalUserExtractCost; } default: // Unable to vectorize unknown instructions. @@ -588,10 +615,24 @@ Value *BoUpSLP::Scalarize(ArrayRef VL, VectorType *Ty) { Value *BoUpSLP::vectorizeTree(ArrayRef VL, int VF) { Value *V = vectorizeTree_rec(VL, VF); + + Instruction *LastInstr = GetLastInstr(VL, VL.size()); + IRBuilder<> Builder(LastInstr); + for (ValueSet::iterator it = MustExtract.begin(), e = MustExtract.end(); + it != e; ++it) { + Instruction *I = cast(*it); + Value *Vec = VectorizedValues[I]; + assert(LaneMap.count(I) && "Unable to find the lane for the external use"); + Value *Idx = Builder.getInt32(LaneMap[I]); + Value *Extract = Builder.CreateExtractElement(Vec, Idx); + I->replaceAllUsesWith(Extract); + } + // We moved some instructions around. We have to number them again // before we can do any analysis. numberInstructions(); MustScalarize.clear(); + MustExtract.clear(); return V; } diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h index 5456c6c7795..abb35840e93 100644 --- a/lib/Transforms/Vectorize/VecUtils.h +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -127,6 +127,11 @@ private: /// NOTICE: The vectorization methods also use this set. ValueSet MustScalarize; + /// Contains values that have users outside of the vectorized graph. + /// We need to generate extract instructions for these values. + /// NOTICE: The vectorization methods also use this set. + ValueSet MustExtract; + /// Contains a list of values that are used outside the current tree. This /// set must be reset between runs. ValueSet MultiUserVals; diff --git a/test/Transforms/SLPVectorizer/X86/diamond.ll b/test/Transforms/SLPVectorizer/X86/diamond.ll index 8e85cb6c9b8..49c8712d202 100644 --- a/test/Transforms/SLPVectorizer/X86/diamond.ll +++ b/test/Transforms/SLPVectorizer/X86/diamond.ll @@ -41,7 +41,7 @@ entry: } -; int foo_fail(int * restrict B, int * restrict A, int n, int m) { +; int extr_user(int * restrict B, int * restrict A, int n, int m) { ; B[0] = n * A[0] + m * A[0]; ; B[1] = n * A[1] + m * A[1]; ; B[2] = n * A[2] + m * A[2]; @@ -49,10 +49,11 @@ entry: ; return A[0]; ; } -; CHECK: @foo_fail -; CHECK-NOT: load <4 x i32> +; CHECK: @extr_user +; CHECK: store <4 x i32> +; CHECK-NEXT: extractelement <4 x i32> ; CHECK: ret -define i32 @foo_fail(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) { +define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) { entry: %0 = load i32* %A, align 4 %mul238 = add i32 %m, %n