Allow BBVectorize to form non-2^n-length vectors.

The original algorithm only used recursive pair fusion of equal-length types. This is now extended to allow pairing of any types that share the same underlying scalar type. Because we would still generally prefer the 2^n-length types, those are formed first. Then a second set of iterations form the non-2^n-length types. Also, a call to SimplifyInstructionsInBlock has been added after each pairing iteration. This takes care of DCE (and a few other things) that make the following iterations execute somewhat faster. For the same reason, some of the simple shuffle-combination cases are now handled internally. There is some additional refactoring work to be done, but I've had many requests for this feature, so additional refactoring will come soon in future commits (as will additional test cases). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@159330 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-05 17:39:16 +00:00 · 2012-06-28 05:42:42 +00:00 · 2012-06-28 05:42:42 +00:00 · 64e1b28643
commit 64e1b28643
parent ec4e85e336
4 changed files with 571 additions and 134 deletions
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@ -86,6 +86,9 @@ struct VectorizeConfig {
  /// @brief The maximum number of pairing iterations.
  unsigned MaxIter;

+  /// @brief Don't try to form odd-length vectors.
+  bool Pow2LenOnly;
+
  /// @brief Don't boost the chain-depth contribution of loads and stores.
  bool NoMemOpBoost;

--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@ -42,6 +42,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
 #include <map>
@ -67,6 +68,10 @@ static cl::opt<unsigned>
 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
  cl::desc("The maximum number of pairing iterations"));

+static cl::opt<bool>
+Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to form non-2^n-length vectors"));
+
 static cl::opt<unsigned>
 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
  cl::desc("The maximum number of pairable instructions per group"));
@ -191,12 +196,12 @@ namespace {

    // FIXME: const correct?

-    bool vectorizePairs(BasicBlock &BB);
+    bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);

    bool getCandidatePairs(BasicBlock &BB,
                       BasicBlock::iterator &Start,
                       std::multimap<Value *, Value *> &CandidatePairs,
-                       std::vector<Value *> &PairableInsts);
+                       std::vector<Value *> &PairableInsts, bool NonPow2Len);

    void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
                       std::vector<Value *> &PairableInsts,
@ -220,7 +225,7 @@ namespace {
    bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);

    bool areInstsCompatible(Instruction *I, Instruction *J,
-                       bool IsSimpleLoadStore);
+                       bool IsSimpleLoadStore, bool NonPow2Len);

    bool trackUsesOfI(DenseSet<Value *> &Users,
                      AliasSetTracker &WriteSet, Instruction *I,
@ -275,12 +280,18 @@ namespace {
                     Instruction *J, unsigned o, bool &FlipMemInputs);

    void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
-                     unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
-                     unsigned IdxOffset, std::vector<Constant*> &Mask);
+                     unsigned MaskOffset, unsigned NumInElem,
+                     unsigned NumInElem1, unsigned IdxOffset,
+                     std::vector<Constant*> &Mask);

    Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
                     Instruction *J);

+    bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
+                       unsigned o, Value *&LOp, unsigned numElemL,
+                       Type *ArgTypeL, Type *ArgTypeR,
+                       unsigned IdxOff = 0);
+
    Value *getReplacementInput(LLVMContext& Context, Instruction *I,
                     Instruction *J, unsigned o, bool FlipMemInputs);

@ -319,7 +330,8 @@ namespace {
      // Iterate a sufficient number of times to merge types of size 1 bit,
      // then 2 bits, then 4, etc. up to half of the target vector width of the
      // target vector register.
-      for (unsigned v = 2, n = 1;
+      unsigned n = 1;
+      for (unsigned v = 2;
           v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter);
           v *= 2, ++n) {
        DEBUG(dbgs() << "BBV: fusing loop #" << n <<
@ -331,6 +343,16 @@ namespace {
          break;
      }

+      if (changed && !Pow2LenOnly) {
+        ++n;
+        for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
+          DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
+                n << " for " << BB.getName() << " in " <<
+                BB.getParent()->getName() << "...\n");
+          if (!vectorizePairs(BB, true)) break;
+        }
+      }
+
      DEBUG(dbgs() << "BBV: done!\n");
      return changed;
    }
@ -352,15 +374,43 @@ namespace {
      AU.setPreservesCFG();
    }

-    // This returns the vector type that holds a pair of the provided type.
-    // If the provided type is already a vector, then its length is doubled.
-    static inline VectorType *getVecTypeForPair(Type *ElemTy) {
+    static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
+      assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
+             "Cannot form vector from incompatible scalar types");
+      Type *STy = ElemTy->getScalarType();
+
+      unsigned numElem;
      if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
-        unsigned numElem = VTy->getNumElements();
-        return VectorType::get(ElemTy->getScalarType(), numElem*2);
+        numElem = VTy->getNumElements();
+      } else {
+        numElem = 1;
      }

-      return VectorType::get(ElemTy, 2);
+      if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
+        numElem += VTy->getNumElements();
+      } else {
+        numElem += 1;
+      }
+
+      return VectorType::get(STy, numElem);
+    }
+
+    static inline void getInstructionTypes(Instruction *I,
+                                           Type *&T1, Type *&T2) {
+      if (isa<StoreInst>(I)) {
+        // For stores, it is the value type, not the pointer type that matters
+        // because the value is what will come from a vector register.
+  
+        Value *IVal = cast<StoreInst>(I)->getValueOperand();
+        T1 = IVal->getType();
+      } else {
+        T1 = I->getType();
+      }
+  
+      if (I->isCast())
+        T2 = cast<CastInst>(I)->getSrcTy();
+      else
+        T2 = T1;
    }

    // Returns the weight associated with the provided value. A chain of
@ -396,8 +446,7 @@ namespace {
    // true if the offset could be determined to be some constant value.
    // For example, if OffsetInElmts == 1, then J accesses the memory directly
    // after I; if OffsetInElmts == -1 then I accesses the memory
-    // directly after J. This function assumes that both instructions
-    // have the same type.
+    // directly after J.
    bool getPairPtrInfo(Instruction *I, Instruction *J,
        Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
        int64_t &OffsetInElmts) {
@ -429,7 +478,12 @@ namespace {
        Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
        int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);

-        assert(VTy == cast<PointerType>(JPtr->getType())->getElementType());
+        Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType();
+        if (VTy != VTy2 && Offset < 0) {
+          int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
+          OffsetInElmts = Offset/VTy2TSS;
+          return (abs64(Offset) % VTy2TSS) == 0;
+        }

        OffsetInElmts = Offset/VTyTSS;
        return (abs64(Offset) % VTyTSS) == 0;
@ -482,7 +536,7 @@ namespace {

  // This function implements one vectorization iteration on the provided
  // basic block. It returns true if the block is changed.
-  bool BBVectorize::vectorizePairs(BasicBlock &BB) {
+  bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
    bool ShouldContinue;
    BasicBlock::iterator Start = BB.getFirstInsertionPt();

@ -493,7 +547,7 @@ namespace {
      std::vector<Value *> PairableInsts;
      std::multimap<Value *, Value *> CandidatePairs;
      ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
-                                         PairableInsts);
+                                         PairableInsts, NonPow2Len);
      if (PairableInsts.empty()) continue;

      // Now we have a map of all of the pairable instructions and we need to
@ -540,6 +594,10 @@ namespace {
    // passes should coalesce the build/extract combinations.

    fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs);
+
+    // It is important to cleanup here so that future iterations of this
+    // function have less work to do.
+    (void) SimplifyInstructionsInBlock(&BB, TD);
    return true;
  }

@ -598,20 +656,7 @@ namespace {
      return false;

    Type *T1, *T2;
-    if (isa<StoreInst>(I)) {
-      // For stores, it is the value type, not the pointer type that matters
-      // because the value is what will come from a vector register.
-
-      Value *IVal = cast<StoreInst>(I)->getValueOperand();
-      T1 = IVal->getType();
-    } else {
-      T1 = I->getType();
-    }
-
-    if (I->isCast())
-      T2 = cast<CastInst>(I)->getSrcTy();
-    else
-      T2 = T1;
+    getInstructionTypes(I, T1, T2);

    // Not every type can be vectorized...
    if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
@ -642,8 +687,8 @@ namespace {
         T2->getScalarType()->isPointerTy()))
      return false;

-    if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 ||
-        T2->getPrimitiveSizeInBits() > Config.VectorBits/2)
+    if (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
+        T2->getPrimitiveSizeInBits() >= Config.VectorBits)
      return false;

    return true;
@ -654,13 +699,23 @@ namespace {
  // that I has already been determined to be vectorizable and that J is not
  // in the use tree of I.
  bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
-                       bool IsSimpleLoadStore) {
+                       bool IsSimpleLoadStore, bool NonPow2Len) {
    DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
                     " <-> " << *J << "\n");

    // Loads and stores can be merged if they have different alignments,
    // but are otherwise the same.
-    if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment))
+    if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
+                      (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
+      return false;
+
+    Type *IT1, *IT2, *JT1, *JT2;
+    getInstructionTypes(I, IT1, IT2);
+    getInstructionTypes(J, JT1, JT2);
+    unsigned MaxTypeBits = std::max(
+      IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
+      IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
+    if (MaxTypeBits > Config.VectorBits)
      return false;

    // FIXME: handle addsub-type operations!
@ -672,8 +727,11 @@ namespace {
      if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
            OffsetInElmts) && abs64(OffsetInElmts) == 1) {
        if (Config.AlignedOnly) {
-          Type *aType = isa<StoreInst>(I) ?
+          Type *aTypeI = isa<StoreInst>(I) ?
            cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
+          Type *aTypeJ = isa<StoreInst>(J) ?
+            cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
+
          // An aligned load or store is possible only if the instruction
          // with the lower offset has an alignment suitable for the
          // vector type.
@ -681,7 +739,7 @@ namespace {
          unsigned BottomAlignment = IAlignment;
          if (OffsetInElmts < 0) BottomAlignment = JAlignment;

-          Type *VType = getVecTypeForPair(aType);
+          Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
          unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
          if (BottomAlignment < VecAlignment)
            return false;
@ -776,7 +834,7 @@ namespace {
  bool BBVectorize::getCandidatePairs(BasicBlock &BB,
                       BasicBlock::iterator &Start,
                       std::multimap<Value *, Value *> &CandidatePairs,
-                       std::vector<Value *> &PairableInsts) {
+                       std::vector<Value *> &PairableInsts, bool NonPow2Len) {
    BasicBlock::iterator E = BB.end();
    if (Start == E) return false;

@ -812,7 +870,7 @@ namespace {

        // J does not use I, and comes before the first use of I, so it can be
        // merged with I if the instructions are compatible.
-        if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue;
+        if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue;

        // J is a candidate for merging with I.
        if (!PairableInsts.size() ||
@ -1450,8 +1508,9 @@ namespace {
      VPtr = JPtr;
    }

-    Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType();
-    Type *VArgType = getVecTypeForPair(ArgType);
+    Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
+    Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
+    Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
    Type *VArgPtrType = PointerType::get(VArgType,
      cast<PointerType>(IPtr->getType())->getAddressSpace());
    return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
@ -1459,15 +1518,17 @@ namespace {
  }

  void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
-                     unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
-                     unsigned IdxOffset, std::vector<Constant*> &Mask) {
-    for (unsigned v = 0; v < NumElem/2; ++v) {
+                     unsigned MaskOffset, unsigned NumInElem,
+                     unsigned NumInElem1, unsigned IdxOffset,
+                     std::vector<Constant*> &Mask) {
+    unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements();
+    for (unsigned v = 0; v < NumElem1; ++v) {
      int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
      if (m < 0) {
        Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
      } else {
        unsigned mm = m + (int) IdxOffset;
-        if (m >= (int) NumInElem)
+        if (m >= (int) NumInElem1)
          mm += (int) NumInElem;

        Mask[v+MaskOffset] =
@ -1483,8 +1544,11 @@ namespace {
    // This is the shuffle mask. We need to append the second
    // mask to the first, and the numbers need to be adjusted.

-    Type *ArgType = I->getType();
-    Type *VArgType = getVecTypeForPair(ArgType);
+    Type *ArgTypeI = I->getType();
+    Type *ArgTypeJ = J->getType();
+    Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+
+    unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements();

    // Get the total number of elements in the fused vector type.
    // By definition, this must equal the number of elements in
@ -1492,19 +1556,81 @@ namespace {
    unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
    std::vector<Constant*> Mask(NumElem);

-    Type *OpType = I->getOperand(0)->getType();
-    unsigned NumInElem = cast<VectorType>(OpType)->getNumElements();
+    Type *OpTypeI = I->getOperand(0)->getType();
+    unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements();
+    Type *OpTypeJ = J->getOperand(0)->getType();
+    unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements();
+
+    // The fused vector will be:
+    // -----------------------------------------------------
+    // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
+    // -----------------------------------------------------
+    // from which we'll extract NumElem total elements (where the first NumElemI
+    // of them come from the mask in I and the remainder come from the mask
+    // in J.

    // For the mask from the first pair...
-    fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask);
+    fillNewShuffleMask(Context, I, 0,        NumInElemJ, NumInElemI,
+                       0,          Mask);

    // For the mask from the second pair...
-    fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem,
-                       Mask);
+    fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
+                       NumInElemI, Mask);

    return ConstantVector::get(Mask);
  }

+  bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
+                                  Instruction *J, unsigned o, Value *&LOp,
+                                  unsigned numElemL,
+                                  Type *ArgTypeL, Type *ArgTypeH,
+                                  unsigned IdxOff) {
+    bool ExpandedIEChain = false;
+    if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
+      // If we have a pure insertelement chain, then this can be rewritten
+      // into a chain that directly builds the larger type.
+      bool PureChain = true;
+      InsertElementInst *LIENext = LIE;
+      do {
+        if (!isa<UndefValue>(LIENext->getOperand(0)) &&
+            !isa<InsertElementInst>(LIENext->getOperand(0))) {
+          PureChain = false;
+          break;
+        }
+      } while ((LIENext =
+                 dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
+
+      if (PureChain) {
+        SmallVector<Value *, 8> VectElemts(numElemL,
+          UndefValue::get(ArgTypeL->getScalarType()));
+        InsertElementInst *LIENext = LIE;
+        do {
+          unsigned Idx =
+            cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
+          VectElemts[Idx] = LIENext->getOperand(1);
+        } while ((LIENext =
+                   dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
+
+        LIENext = 0;
+        Value *LIEPrev = UndefValue::get(ArgTypeH);
+        for (unsigned i = 0; i < numElemL; ++i) {
+          if (isa<UndefValue>(VectElemts[i])) continue;
+          LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
+                             ConstantInt::get(Type::getInt32Ty(Context),
+                                              i + IdxOff),
+                             getReplacementName(I, true, o, i+1));
+          LIENext->insertBefore(J);
+          LIEPrev = LIENext;
+        }
+
+        LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
+        ExpandedIEChain = true;
+      }
+    }
+
+    return ExpandedIEChain;
+  }
+
  // Returns the value to be used as the specified operand of the vector
  // instruction that fuses I with J.
  Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
@ -1512,84 +1638,333 @@ namespace {
    Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
    Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);

-      // Compute the fused vector type for this operand
-    Type *ArgType = I->getOperand(o)->getType();
-    VectorType *VArgType = getVecTypeForPair(ArgType);
+    // Compute the fused vector type for this operand
+    Type *ArgTypeI = I->getOperand(o)->getType();
+    Type *ArgTypeJ = J->getOperand(o)->getType();
+    VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);

    Instruction *L = I, *H = J;
+    Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
    if (FlipMemInputs) {
      L = J;
      H = I;
+      ArgTypeL = ArgTypeJ;
+      ArgTypeH = ArgTypeI;
+    }
+
+    unsigned numElemL;
+    if (ArgTypeL->isVectorTy())
+      numElemL = cast<VectorType>(ArgTypeL)->getNumElements();
+    else
+      numElemL = 1;
+
+    unsigned numElemH;
+    if (ArgTypeH->isVectorTy())
+      numElemH = cast<VectorType>(ArgTypeH)->getNumElements();
+    else
+      numElemH = 1;
+
+    Value *LOp = L->getOperand(o);
+    Value *HOp = H->getOperand(o);
+    unsigned numElem = VArgType->getNumElements();
+
+    // First, we check if we can reuse the "original" vector outputs (if these
+    // exist). We might need a shuffle.
+    ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
+    ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
+    ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
+    ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
+
+    // FIXME: If we're fusing shuffle instructions, then we can't apply this
+    // optimization. The input vectors to the shuffle might be a different
+    // length from the shuffle outputs. Unfortunately, the replacement
+    // shuffle mask has already been formed, and the mask entries are sensitive
+    // to the sizes of the inputs.
+    bool IsSizeChangeShuffle =
+      isa<ShuffleVectorInst>(L) &&
+        (LOp->getType() != L->getType() || HOp->getType() != H->getType());
+
+    if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
+      // We can have at most two unique vector inputs.
+      bool CanUseInputs = true;
+      Value *I1, *I2 = 0;
+      if (LEE) {
+        I1 = LEE->getOperand(0);
+      } else {
+        I1 = LSV->getOperand(0);
+        I2 = LSV->getOperand(1);
+        if (I2 == I1 || isa<UndefValue>(I2))
+          I2 = 0;
+      }
+  
+      if (HEE) {
+        Value *I3 = HEE->getOperand(0);
+        if (!I2 && I3 != I1)
+          I2 = I3;
+        else if (I3 != I1 && I3 != I2)
+          CanUseInputs = false;
+      } else {
+        Value *I3 = HSV->getOperand(0);
+        if (!I2 && I3 != I1)
+          I2 = I3;
+        else if (I3 != I1 && I3 != I2)
+          CanUseInputs = false;
+
+        if (CanUseInputs) {
+          Value *I4 = HSV->getOperand(1);
+          if (!isa<UndefValue>(I4)) {
+            if (!I2 && I4 != I1)
+              I2 = I4;
+            else if (I4 != I1 && I4 != I2)
+              CanUseInputs = false;
+          }
+        }
+      }
+
+      if (CanUseInputs) {
+        unsigned LOpElem =
+          cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType())
+            ->getNumElements();
+        unsigned HOpElem =
+          cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType())
+            ->getNumElements();
+
+        // We have one or two input vectors. We need to map each index of the
+        // operands to the index of the original vector.
+        SmallVector<std::pair<int, int>, 8>  II(numElem);
+        for (unsigned i = 0; i < numElemL; ++i) {
+          int Idx, INum;
+          if (LEE) {
+            Idx =
+              cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
+            INum = LEE->getOperand(0) == I1 ? 0 : 1;
+          } else {
+            Idx = LSV->getMaskValue(i);
+            if (Idx < (int) LOpElem) {
+              INum = LSV->getOperand(0) == I1 ? 0 : 1;
+            } else {
+              Idx -= LOpElem;
+              INum = LSV->getOperand(1) == I1 ? 0 : 1;
+            }
+          }
+
+          II[i] = std::pair<int, int>(Idx, INum);
+        }
+        for (unsigned i = 0; i < numElemH; ++i) {
+          int Idx, INum;
+          if (HEE) {
+            Idx =
+              cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
+            INum = HEE->getOperand(0) == I1 ? 0 : 1;
+          } else {
+            Idx = HSV->getMaskValue(i);
+            if (Idx < (int) HOpElem) {
+              INum = HSV->getOperand(0) == I1 ? 0 : 1;
+            } else {
+              Idx -= HOpElem;
+              INum = HSV->getOperand(1) == I1 ? 0 : 1;
+            }
+          }
+
+          II[i + numElemL] = std::pair<int, int>(Idx, INum);
+        }
+
+        // We now have an array which tells us from which index of which
+        // input vector each element of the operand comes.
+        VectorType *I1T = cast<VectorType>(I1->getType());
+        unsigned I1Elem = I1T->getNumElements();
+
+        if (!I2) {
+          // In this case there is only one underlying vector input. Check for
+          // the trivial case where we can use the input directly.
+          if (I1Elem == numElem) {
+            bool ElemInOrder = true;
+            for (unsigned i = 0; i < numElem; ++i) {
+              if (II[i].first != (int) i && II[i].first != -1) {
+                ElemInOrder = false;
+                break;
+              }
+            }
+
+            if (ElemInOrder)
+              return I1;
+          }
+
+          // A shuffle is needed.
+          std::vector<Constant *> Mask(numElem);
+          for (unsigned i = 0; i < numElem; ++i) {
+            int Idx = II[i].first;
+            if (Idx == -1)
+              Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
+            else
+              Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+          }
+
+          Instruction *S =
+            new ShuffleVectorInst(I1, UndefValue::get(I1T),
+                                  ConstantVector::get(Mask),
+                                  getReplacementName(I, true, o));
+          S->insertBefore(J);
+          return S;
+        }
+
+        VectorType *I2T = cast<VectorType>(I2->getType());
+        unsigned I2Elem = I2T->getNumElements();
+
+        // This input comes from two distinct vectors. The first step is to
+        // make sure that both vectors are the same length. If not, the
+        // smaller one will need to grow before they can be shuffled together.
+        if (I1Elem < I2Elem) {
+          std::vector<Constant *> Mask(I2Elem);
+          unsigned v = 0;
+          for (; v < I1Elem; ++v)
+            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          for (; v < I2Elem; ++v)
+            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+          Instruction *NewI1 =
+            new ShuffleVectorInst(I1, UndefValue::get(I1T),
+                                  ConstantVector::get(Mask),
+                                  getReplacementName(I, true, o, 1));
+          NewI1->insertBefore(J);
+          I1 = NewI1;
+          I1T = I2T;
+          I1Elem = I2Elem;
+        } else if (I1Elem > I2Elem) {
+          std::vector<Constant *> Mask(I1Elem);
+          unsigned v = 0;
+          for (; v < I2Elem; ++v)
+            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          for (; v < I1Elem; ++v)
+            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+          Instruction *NewI2 =
+            new ShuffleVectorInst(I2, UndefValue::get(I2T),
+                                  ConstantVector::get(Mask),
+                                  getReplacementName(I, true, o, 1));
+          NewI2->insertBefore(J);
+          I2 = NewI2;
+          I2T = I1T;
+          I2Elem = I1Elem;
+        }
+
+        // Now that both I1 and I2 are the same length we can shuffle them
+        // together (and use the result).
+        std::vector<Constant *> Mask(numElem);
+        for (unsigned v = 0; v < numElem; ++v) {
+          if (II[v].first == -1) {
+            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+          } else {
+            int Idx = II[v].first + II[v].second * I1Elem;
+            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+          }
+        }
+
+        Instruction *NewOp =
+          new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
+                                getReplacementName(I, true, o));
+        NewOp->insertBefore(J);
+        return NewOp;
+      }
+    }
+
+    Type *ArgType = ArgTypeL;
+    if (numElemL < numElemH) {
+      if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
+                                         ArgTypeL, VArgType, 1)) {
+        // This is another short-circuit case: we're combining a scalar into
+        // a vector that is formed by an IE chain. We've just expanded the IE
+        // chain, now insert the scalar and we're done.
+
+        Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
+                                               getReplacementName(I, true, o));
+        S->insertBefore(J);
+        return S;
+      } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
+                                ArgTypeH)) {
+        // The two vector inputs to the shuffle must be the same length,
+        // so extend the smaller vector to be the same length as the larger one.
+        Instruction *NLOp;
+        if (numElemL > 1) {
+  
+          std::vector<Constant *> Mask(numElemH);
+          unsigned v = 0;
+          for (; v < numElemL; ++v)
+            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          for (; v < numElemH; ++v)
+            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+    
+          NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
+                                       ConstantVector::get(Mask),
+                                       getReplacementName(I, true, o, 1));
+        } else {
+          NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
+                                           getReplacementName(I, true, o, 1));
+        }
+  
+        NLOp->insertBefore(J);
+        LOp = NLOp;
+      }
+
+      ArgType = ArgTypeH;
+    } else if (numElemL > numElemH) {
+      if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
+                                         ArgTypeH, VArgType)) {
+        Instruction *S =
+          InsertElementInst::Create(LOp, HOp, 
+                                    ConstantInt::get(Type::getInt32Ty(Context),
+                                                     numElemL),
+                                    getReplacementName(I, true, o));
+        S->insertBefore(J);
+        return S;
+      } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
+                                ArgTypeL)) {
+        Instruction *NHOp;
+        if (numElemH > 1) {
+          std::vector<Constant *> Mask(numElemL);
+          unsigned v = 0;
+          for (; v < numElemH; ++v)
+            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          for (; v < numElemL; ++v)
+            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+    
+          NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
+                                       ConstantVector::get(Mask),
+                                       getReplacementName(I, true, o, 1));
+        } else {
+          NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
+                                           getReplacementName(I, true, o, 1));
+        }
+  
+        NHOp->insertBefore(J);
+        HOp = NHOp;
+      }
    }

    if (ArgType->isVectorTy()) {
      unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
      std::vector<Constant*> Mask(numElem);
-      for (unsigned v = 0; v < numElem; ++v)
-        Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+      for (unsigned v = 0; v < numElem; ++v) {
+        unsigned Idx = v;
+        // If the low vector was expanded, we need to skip the extra
+        // undefined entries.
+        if (v >= numElemL && numElemH > numElemL)
+          Idx += (numElemH - numElemL);
+        Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+      }

-      Instruction *BV = new ShuffleVectorInst(L->getOperand(o),
-                                              H->getOperand(o),
+      Instruction *BV = new ShuffleVectorInst(LOp, HOp,
                                              ConstantVector::get(Mask),
                                              getReplacementName(I, true, o));
      BV->insertBefore(J);
      return BV;
    }

-    // If these two inputs are the output of another vector instruction,
-    // then we should use that output directly. It might be necessary to
-    // permute it first. [When pairings are fused recursively, you can
-    // end up with cases where a large vector is decomposed into scalars
-    // using extractelement instructions, then built into size-2
-    // vectors using insertelement and the into larger vectors using
-    // shuffles. InstCombine does not simplify all of these cases well,
-    // and so we make sure that shuffles are generated here when possible.
-    ExtractElementInst *LEE
-      = dyn_cast<ExtractElementInst>(L->getOperand(o));
-    ExtractElementInst *HEE
-      = dyn_cast<ExtractElementInst>(H->getOperand(o));
-
-    if (LEE && HEE &&
-        LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) {
-      VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType());
-      unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue();
-      unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue();
-      if (LEE->getOperand(0) == HEE->getOperand(0)) {
-        if (LowIndx == 0 && HighIndx == 1)
-          return LEE->getOperand(0);
-
-        std::vector<Constant*> Mask(2);
-        Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
-        Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
-
-        Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
-                                          UndefValue::get(EEType),
-                                          ConstantVector::get(Mask),
-                                          getReplacementName(I, true, o));
-        BV->insertBefore(J);
-        return BV;
-      }
-
-      std::vector<Constant*> Mask(2);
-      HighIndx += EEType->getNumElements();
-      Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
-      Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
-
-      Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
-                                          HEE->getOperand(0),
-                                          ConstantVector::get(Mask),
-                                          getReplacementName(I, true, o));
-      BV->insertBefore(J);
-      return BV;
-    }
-
    Instruction *BV1 = InsertElementInst::Create(
-                                          UndefValue::get(VArgType),
-                                          L->getOperand(o), CV0,
+                                          UndefValue::get(VArgType), LOp, CV0,
                                          getReplacementName(I, true, o, 1));
    BV1->insertBefore(I);
-    Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o),
-                                          CV1,
+    Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
                                          getReplacementName(I, true, o, 2));
    BV2->insertBefore(J);
    return BV2;
@ -1620,10 +1995,10 @@ namespace {
          BasicBlock &BB = *I->getParent();

          Module *M = BB.getParent()->getParent();
-          Type *ArgType = I->getType();
-          Type *VArgType = getVecTypeForPair(ArgType);
+          Type *ArgTypeI = I->getType();
+          Type *ArgTypeJ = J->getType();
+          Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);

-          // FIXME: is it safe to do this here?
          ReplacedOperands[o] = Intrinsic::getDeclaration(M,
            (Intrinsic::ID) IID, VArgType);
          continue;
@ -1653,35 +2028,59 @@ namespace {
                     Instruction *&InsertionPt,
                     Instruction *&K1, Instruction *&K2,
                     bool &FlipMemInputs) {
-    Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
-    Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
-
    if (isa<StoreInst>(I)) {
      AA->replaceWithNewValue(I, K);
      AA->replaceWithNewValue(J, K);
    } else {
      Type *IType = I->getType();
-      Type *VType = getVecTypeForPair(IType);
+      Type *JType = J->getType();
+
+      VectorType *VType = getVecTypeForPair(IType, JType);
+      unsigned numElem = VType->getNumElements();
+
+      unsigned numElemI, numElemJ;
+      if (IType->isVectorTy())
+        numElemI = cast<VectorType>(IType)->getNumElements();
+      else
+        numElemI = 1;
+
+      if (JType->isVectorTy())
+        numElemJ = cast<VectorType>(JType)->getNumElements();
+      else
+        numElemJ = 1;

      if (IType->isVectorTy()) {
-          unsigned numElem = cast<VectorType>(IType)->getNumElements();
-          std::vector<Constant*> Mask1(numElem), Mask2(numElem);
-          for (unsigned v = 0; v < numElem; ++v) {
-            Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-            Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v);
-          }
+        std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
+        for (unsigned v = 0; v < numElemI; ++v) {
+          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
+        }

-          K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
-                                       ConstantVector::get(
-                                         FlipMemInputs ? Mask2 : Mask1),
-                                       getReplacementName(K, false, 1));
-          K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
-                                       ConstantVector::get(
-                                         FlipMemInputs ? Mask1 : Mask2),
-                                       getReplacementName(K, false, 2));
+        K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                   ConstantVector::get(
+                                     FlipMemInputs ? Mask2 : Mask1),
+                                   getReplacementName(K, false, 1));
      } else {
+        Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+        Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
        K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
                                          getReplacementName(K, false, 1));
+      }
+
+      if (JType->isVectorTy()) {
+        std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
+        for (unsigned v = 0; v < numElemJ; ++v) {
+          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
+        }
+
+        K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                   ConstantVector::get(
+                                     FlipMemInputs ? Mask1 : Mask2),
+                                   getReplacementName(K, false, 2));
+      } else {
+        Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+        Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
        K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
                                          getReplacementName(K, false, 2));
      }
@ -1884,7 +2283,7 @@ namespace {
      if (I->hasName()) K->takeName(I);

      if (!isa<StoreInst>(K))
-        K->mutateType(getVecTypeForPair(I->getType()));
+        K->mutateType(getVecTypeForPair(I->getType(), J->getType()));

      combineMetadata(K, J);

@ -1996,6 +2395,7 @@ VectorizeConfig::VectorizeConfig() {
  SplatBreaksChain = ::SplatBreaksChain;
  MaxInsts = ::MaxInsts;
  MaxIter = ::MaxIter;
+  Pow2LenOnly = ::Pow2LenOnly;
  NoMemOpBoost = ::NoMemOpBoost;
  FastDep = ::FastDep;
 }
--- a/test/Transforms/BBVectorize/simple.ll
+++ b/test/Transforms/BBVectorize/simple.ll
@ -138,8 +138,7 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
 ; CHECK: %Z1 = add <16 x i8> %Y1, %X1.v.i1
        %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
        %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
-; CHECK: %Z1.v.r2 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <8 x i32> <i32 8, i32 undef, i32 10, i32 undef, i32 undef, i32 13, i32 undef, i32 15>
-; CHECK: %Q1.v.i1 = shufflevector <8 x i8> %Z1.v.r2, <8 x i8> undef, <16 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 5, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK: %Q1.v.i1 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 10, i32 undef, i32 undef, i32 13, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK: %Q1 = shufflevector <16 x i8> %Z1, <16 x i8> %Q1.v.i1, <16 x i32> <i32 23, i32 16, i32 6, i32 1, i32 21, i32 18, i32 4, i32 3, i32 14, i32 15, i32 8, i32 9, i32 10, i32 12, i32 12, i32 9>
 	%R  = mul <8 x i8> %Q1, %Q2
 ; CHECK: %Q1.v.r1 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
--- a/test/Transforms/BBVectorize/simple3.ll
+++ b/test/Transforms/BBVectorize/simple3.ll
@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-vector-bits=192 -instcombine -gvn -S | FileCheck %s
+
+; Basic depth-3 chain
+define double @test1(double %A1, double %A2, double %A3, double %B1, double %B2, double %B3) {
+; CHECK: @test1
+; CHECK: %X1.v.i1.11 = insertelement <3 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i1.22 = insertelement <3 x double> %X1.v.i1.11, double %B2, i32 1
+; CHECK: %X1.v.i1 = insertelement <3 x double> %X1.v.i1.22, double %B3, i32 2
+; CHECK: %X1.v.i0.13 = insertelement <3 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i0.24 = insertelement <3 x double> %X1.v.i0.13, double %A2, i32 1
+; CHECK: %X1.v.i0 = insertelement <3 x double> %X1.v.i0.24, double %A3, i32 2
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%X3 = fsub double %A3, %B3
+; CHECK: %X1 = fsub <3 x double> %X1.v.i0, %X1.v.i1
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+	%Y3 = fmul double %X3, %A3
+; CHECK: %Y1 = fmul <3 x double> %X1, %X1.v.i0
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%Z3 = fadd double %Y3, %B3
+; CHECK: %Z1 = fadd <3 x double> %Y1, %X1.v.i1
+        %R1 = fmul double %Z1, %Z2
+	%R  = fmul double %R1, %Z3
+; CHECK: %Z1.v.r210 = extractelement <3 x double> %Z1, i32 2
+; CHECK: %Z1.v.r1 = extractelement <3 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <3 x double> %Z1, i32 1
+; CHECK: %R1 = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: %R = fmul double %R1, %Z1.v.r210
+	ret double %R
+; CHECK: ret double %R
+}
+