diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 6e36ff7b5d2..4653a7d7c8c 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -58,6 +58,11 @@ static cl::opt<unsigned> ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, cl::desc("The required chain depth for vectorization")); +static cl::opt<bool> +UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), + cl::Hidden, cl::desc("Use the chain depth requirement with" + " target information")); + static cl::opt<unsigned> SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, cl::desc("The maximum search distance for instruction pairs")); @@ -227,6 +232,9 @@ namespace { DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len); + // FIXME: The current implementation does not account for pairs that + // are connected in multiple ways. For example: + // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) enum PairConnectionType { PairConnectionDirect, PairConnectionSwap, @@ -1665,20 +1673,39 @@ namespace { int EffSize = 0; if (VTTI) { + DenseSet<Value *> PrunedTreeInstrs; + for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) { + PrunedTreeInstrs.insert(S->first); + PrunedTreeInstrs.insert(S->second); + } + + // The set of pairs that have already contributed to the total cost. + DenseSet<ValuePair> IncomingPairs; + // The node weights represent the cost savings associated with // fusing the pair of instructions. for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), E = PrunedTree.end(); S != E; ++S) { - if (getDepthFactor(S->first)) - EffSize += CandidatePairCostSavings.find(*S)->second; + bool FlipOrder = false; - // The edge weights contribute in a negative sense: they represent - // the cost of shuffles. + if (getDepthFactor(S->first)) { + int ESContrib = CandidatePairCostSavings.find(*S)->second; + DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" + << *S->first << " <-> " << *S->second << "} = " << + ESContrib << "\n"); + EffSize += ESContrib; + } + + // The edge weights contribute in a negative sense: they represent + // the cost of shuffles. VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S); if (IP.first != ConnectedPairDeps.end()) { unsigned NumDepsDirect = 0, NumDepsSwap = 0; for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; Q != IP.second; ++Q) { + if (!PrunedTree.count(Q->second)) + continue; DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(VPPair(Q->second, Q->first)); assert(R != PairConnectionTypes.end() && @@ -1692,12 +1719,14 @@ namespace { // If there are more swaps than direct connections, then // the pair order will be flipped during fusion. So the real // number of swaps is the minimum number. - bool FlipOrder = !FixedOrderPairs.count(*S) && + FlipOrder = !FixedOrderPairs.count(*S) && ((NumDepsSwap > NumDepsDirect) || FixedOrderPairs.count(ValuePair(S->second, S->first))); for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; Q != IP.second; ++Q) { + if (!PrunedTree.count(Q->second)) + continue; DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(VPPair(Q->second, Q->first)); assert(R != PairConnectionTypes.end() && @@ -1707,9 +1736,161 @@ namespace { Type *VTy = getVecTypeForPair(Ty1, Ty2); if ((R->second == PairConnectionDirect && FlipOrder) || (R->second == PairConnectionSwap && !FlipOrder) || - R->second == PairConnectionSplat) - EffSize -= (int) getInstrCost(Instruction::ShuffleVector, - VTy, VTy); + R->second == PairConnectionSplat) { + int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, VTy); + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *Q->second.first << " <-> " << *Q->second.second << + "} -> {" << + *S->first << " <-> " << *S->second << "} = " << + ESContrib << "\n"); + EffSize -= ESContrib; + } + } + } + + // Compute the cost of outgoing edges. We assume that edges outgoing + // to shuffles, inserts or extracts can be merged, and so contribute + // no additional cost. + if (!S->first->getType()->isVoidTy()) { + Type *Ty1 = S->first->getType(), + *Ty2 = S->second->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + + bool NeedsExtraction = false; + for (Value::use_iterator I = S->first->use_begin(), + IE = S->first->use_end(); I != IE; ++I) { + if (isa<ShuffleVectorInst>(*I) || + isa<InsertElementInst>(*I) || + isa<ExtractElementInst>(*I)) + continue; + if (PrunedTreeInstrs.count(*I)) + continue; + NeedsExtraction = true; + break; + } + + if (NeedsExtraction) { + int ESContrib; + if (Ty1->isVectorTy()) + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + Ty1, VTy); + else + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::ExtractElement, VTy, 0); + + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *S->first << "} = " << ESContrib << "\n"); + EffSize -= ESContrib; + } + + NeedsExtraction = false; + for (Value::use_iterator I = S->second->use_begin(), + IE = S->second->use_end(); I != IE; ++I) { + if (isa<ShuffleVectorInst>(*I) || + isa<InsertElementInst>(*I) || + isa<ExtractElementInst>(*I)) + continue; + if (PrunedTreeInstrs.count(*I)) + continue; + NeedsExtraction = true; + break; + } + + if (NeedsExtraction) { + int ESContrib; + if (Ty2->isVectorTy()) + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + Ty2, VTy); + else + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::ExtractElement, VTy, 1); + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *S->second << "} = " << ESContrib << "\n"); + EffSize -= ESContrib; + } + } + + // Compute the cost of incoming edges. + if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { + Instruction *S1 = cast<Instruction>(S->first), + *S2 = cast<Instruction>(S->second); + for (unsigned o = 0; o < S1->getNumOperands(); ++o) { + Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); + + // Combining constants into vector constants (or small vector + // constants into larger ones are assumed free). + if (isa<Constant>(O1) && isa<Constant>(O2)) + continue; + + if (FlipOrder) + std::swap(O1, O2); + + ValuePair VP = ValuePair(O1, O2); + ValuePair VPR = ValuePair(O2, O1); + + // Internal edges are not handled here. + if (PrunedTree.count(VP) || PrunedTree.count(VPR)) + continue; + + Type *Ty1 = O1->getType(), + *Ty2 = O2->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + + // Combining vector operations of the same type is also assumed + // folded with other operations. + if (Ty1 == Ty2 && + (isa<ShuffleVectorInst>(O1) || + isa<InsertElementInst>(O1) || + isa<InsertElementInst>(O1)) && + (isa<ShuffleVectorInst>(O2) || + isa<InsertElementInst>(O2) || + isa<InsertElementInst>(O2))) + continue; + + int ESContrib; + // This pair has already been formed. + if (IncomingPairs.count(VP)) { + continue; + } else if (IncomingPairs.count(VPR)) { + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, VTy); + } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, VTy, 0); + ESContrib += (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, VTy, 1); + } else if (!Ty1->isVectorTy()) { + // O1 needs to be inserted into a vector of size O2, and then + // both need to be shuffled together. + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, Ty2, 0); + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + VTy, Ty2); + } else if (!Ty2->isVectorTy()) { + // O2 needs to be inserted into a vector of size O1, and then + // both need to be shuffled together. + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, Ty1, 0); + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + VTy, Ty1); + } else { + Type *TyBig = Ty1, *TySmall = Ty2; + if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) + std::swap(TyBig, TySmall); + + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, TyBig); + if (TyBig != TySmall) + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + TyBig, TySmall); + } + + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" + << *O1 << " <-> " << *O2 << "} = " << + ESContrib << "\n"); + EffSize -= ESContrib; + IncomingPairs.insert(VP); } } } @@ -1724,7 +1905,8 @@ namespace { << *J->first << " <-> " << *J->second << "} of depth " << MaxDepth << " and size " << PrunedTree.size() << " (effective size: " << EffSize << ")\n"); - if (MaxDepth >= Config.ReqChainDepth && + if (((VTTI && !UseChainDepthWithTI) || + MaxDepth >= Config.ReqChainDepth) && EffSize > 0 && EffSize > BestEffSize) { BestMaxDepth = MaxDepth; BestEffSize = EffSize; diff --git a/test/Transforms/BBVectorize/X86/simple-ldstr.ll b/test/Transforms/BBVectorize/X86/simple-ldstr.ll new file mode 100644 index 00000000000..0124399bad9 --- /dev/null +++ b/test/Transforms/BBVectorize/X86/simple-ldstr.ll @@ -0,0 +1,29 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +; Simple 3-pair chain with loads and stores +define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { +entry: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + store double %mul, double* %c, align 8 + %arrayidx5 = getelementptr inbounds double* %c, i64 1 + store double %mul5, double* %arrayidx5, align 8 + ret void +; CHECK: @test1 +; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* +; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* +; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8 +; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8 +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %0 = bitcast double* %c to <2 x double>* +; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8 +; CHECK: ret void +} + diff --git a/test/Transforms/BBVectorize/X86/simple.ll b/test/Transforms/BBVectorize/X86/simple.ll index d11c9b92f0d..0113e38bb1c 100644 --- a/test/Transforms/BBVectorize/X86/simple.ll +++ b/test/Transforms/BBVectorize/X86/simple.ll @@ -3,25 +3,84 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Basic depth-3 chain define double @test1(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R ; CHECK: @test1 +; CHECK-NOT: fmul <2 x double> +; CHECK: ret double %R +} + +; Basic chain +define double @test1a(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %W1 = fadd double %Y1, %Z1 + %W2 = fadd double %Y2, %Z2 + %V1 = fadd double %W1, %Z1 + %V2 = fadd double %W2, %Z2 + %Q1 = fadd double %W1, %V1 + %Q2 = fadd double %W2, %V2 + %S1 = fadd double %W1, %Q1 + %S2 = fadd double %W2, %Q2 + %R = fmul double %S1, %S2 + ret double %R +; CHECK: @test1a ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 +; CHECK: %W1 = fadd <2 x double> %Y1, %Z1 +; CHECK: %V1 = fadd <2 x double> %W1, %Z1 +; CHECK: %Q1 = fadd <2 x double> %W1, %V1 +; CHECK: %S1 = fadd <2 x double> %W1, %Q1 +; CHECK: %S1.v.r1 = extractelement <2 x double> %S1, i32 0 +; CHECK: %S1.v.r2 = extractelement <2 x double> %S1, i32 1 +; CHECK: %R = fmul double %S1.v.r1, %S1.v.r2 +; CHECK: ret double %R +} + +; Basic depth-3 chain (last pair permuted) +define double @test2(double %A1, double %A2, double %B1, double %B2) { %X1 = fsub double %A1, %B1 %X2 = fsub double %A2, %B2 -; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 %Y1 = fmul double %X1, %A1 %Y2 = fmul double %X2, %A2 -; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 -; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 + %Z1 = fadd double %Y2, %B1 + %Z2 = fadd double %Y1, %B2 %R = fmul double %Z1, %Z2 -; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 -; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 -; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 ret double %R +; CHECK: @test2 +; CHECK-NOT: fmul <2 x double> +; CHECK: ret double %R +} + +; Basic depth-4 chain (internal permutation) +define double @test4(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %Z1 = fadd double %Y2, %B1 + %Z2 = fadd double %Y1, %B2 + %W1 = fadd double %Y2, %Z1 + %W2 = fadd double %Y1, %Z2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test4 +; CHECK-NOT: fmul <2 x double> ; CHECK: ret double %R } @@ -37,8 +96,8 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) { %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1> %R = mul <8 x i8> %Q1, %Q2 ret <8 x i8> %R -; CHECK-TI: @test6 -; CHECK-TI-NOT: sub <16 x i8> -; CHECK-TI: ret <8 x i8> +; CHECK: @test6 +; CHECK-NOT: sub <16 x i8> +; CHECK: ret <8 x i8> }