mirror of https://github.com/c64scene-ar/llvm-6502.git
synced 2025-06-26 23:24:34 +00:00

Improved widening loads by adding support for wider loads if the alignment
allows.  Fixed a bug where we didn't use a vector load/store for PR5626.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@94338 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
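In outline: when the legalizer widens a load of an awkward vector type (say <3 x i32>, 96 bits) to a legal one (<4 x i32>, 128 bits), it may now use a load wider than the bits that remain, as long as the alignment guarantees the extra bytes are readable. A standalone sketch of that chunking decision (plain C++, not LLVM code; the legal-width list and the inputs are illustrative assumptions):

#include <cstdio>
#include <vector>

// Chop a load of `width` bits into legal chunk widths, widest first.
// If `align` (in bits) covers a wider legal type, over-reading up to
// `extra` bits past the value is allowed -- those bits belong to the
// wider vector type the value is being widened to anyway.
std::vector<int> chop(int width, int align, int extra) {
  const int Legal[] = {128, 64, 32, 16, 8};   // assumed legal widths
  std::vector<int> chunks;
  while (width > 0)
    for (int w : Legal)
      if (w <= width || (w <= align && w <= width + extra)) {
        chunks.push_back(w);
        width -= w;
        break;
      }
  return chunks;
}

int main() {
  // <3 x i32> = 96 bits, widened to <4 x i32> = 128 bits (extra = 32).
  for (int w : chop(96, 128, 32)) printf("aligned:   load %d bits\n", w);
  for (int w : chop(96, 32, 32))  printf("unaligned: load %d bits\n", w);
  return 0;
}

With 16-byte alignment the whole value comes in as one 128-bit load (a single movaps in the tests below); otherwise it is chopped into 64- and 32-bit pieces.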
@@ -1533,10 +1533,10 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
     Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx);

     // If EltVT smaller than OpVT, only store the bits necessary.
-    if (EltVT.bitsLT(OpVT))
+    if (!OpVT.isVector() && EltVT.bitsLT(OpVT)) {
       Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
                                          Node->getOperand(i), Idx, SV, Offset,
                                          EltVT));
-    else
+    } else
       Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl,
                                     Node->getOperand(i), Idx, SV, Offset));
   }
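The hunk above guards the truncating-store path in ExpandVectorBuildThroughStack: a BUILD_VECTOR operand is a scalar that may be live in a wider register, so only its low EltVT bits are stored, but a CONCAT_VECTORS operand is itself a vector and must be stored whole. A minimal stand-in for that decision (illustrative type model, not LLVM's EVT):

#include <cstdio>

struct Ty { bool isVec; int bits; };

// Decide how ExpandVectorBuildThroughStack writes one operand to its slot.
const char *pickStore(Ty op, Ty elt) {
  if (!op.isVec && elt.bits < op.bits)
    return "truncstore";  // scalar operand promoted in registers
  return "store";         // sub-vector operand from CONCAT_VECTORS
}

int main() {
  Ty i32{false, 32}, i8{false, 8}, v4i32{true, 128};
  printf("BUILD_VECTOR i8 elt from i32 op: %s\n", pickStore(i32, i8));
  printf("CONCAT_VECTORS v4i32 op:         %s\n", pickStore(v4i32, v4i32));
  return 0;
}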
@@ -633,43 +633,33 @@ private:
   // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
   //===--------------------------------------------------------------------===//

-  /// Helper genWidenVectorLoads - Helper function to generate a set of
+  /// Helper GenWidenVectorLoads - Helper function to generate a set of
   /// loads to load a vector with a resulting wider type. It takes
-  /// ExtType:    Extension type
-  /// LdChain:    list of chains for the load we have generated.
-  /// Chain:      incoming chain for the ld vector.
-  /// BasePtr:    base pointer to load from.
-  /// SV:         memory disambiguation source value.
-  /// SVOffset:   memory disambiguation offset.
-  /// Alignment:  alignment of the memory.
-  /// isVolatile: volatile load.
-  /// LdWidth:    width of memory that we want to load.
-  /// ResType:    the wider result type for the resulting vector.
-  /// dl:         DebugLoc to be applied to new nodes
-  SDValue GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain, SDValue Chain,
-                              SDValue BasePtr, const Value *SV,
-                              int SVOffset, unsigned Alignment,
-                              bool isVolatile, unsigned LdWidth,
-                              EVT ResType, DebugLoc dl);
+  /// LdChain: list of chains for the load to be generated.
+  /// Ld:      load to widen
+  SDValue GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain,
+                              LoadSDNode *LD);
+
+  /// GenWidenVectorExtLoads - Helper function to generate a set of extension
+  /// loads to load a vector with a resulting wider type. It takes
+  /// LdChain: list of chains for the load to be generated.
+  /// Ld:      load to widen
+  /// ExtType: extension element type
+  SDValue GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
+                                 LoadSDNode *LD, ISD::LoadExtType ExtType);

   /// Helper genWidenVectorStores - Helper function to generate a set of
   /// stores to store a widen vector into non widen memory
-  /// It takes
   /// StChain: list of chains for the stores we have generated
-  /// Chain:      incoming chain for the ld vector
-  /// BasePtr:    base pointer to load from
-  /// SV:         memory disambiguation source value
-  /// SVOffset:   memory disambiguation offset
-  /// Alignment:  alignment of the memory
-  /// isVolatile: volatile store
-  /// ValOp:      value to store
-  /// StWidth:    width of memory that we want to store
-  /// dl:         DebugLoc to be applied to new nodes
-  void GenWidenVectorStores(SmallVector<SDValue, 16>& StChain, SDValue Chain,
-                            SDValue BasePtr, const Value *SV,
-                            int SVOffset, unsigned Alignment,
-                            bool isVolatile, SDValue ValOp,
-                            unsigned StWidth, DebugLoc dl);
+  /// ST:      store of a widen value
+  void GenWidenVectorStores(SmallVector<SDValue, 16>& StChain, StoreSDNode *ST);
+
+  /// Helper genWidenVectorTruncStores - Helper function to generate a set of
+  /// stores to store a truncate widen vector into non widen memory
+  /// StChain: list of chains for the stores we have generated
+  /// ST:      store of a widen value
+  void GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
+                                 StoreSDNode *ST);

   /// Modifies a vector input (widen or narrows) to a vector of NVT.  The
   /// input vector must have the same element type as NVT.
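The reworked interface above moves all per-operand bookkeeping (chain, base pointer, offset, alignment, volatility) behind the LoadSDNode/StoreSDNode arguments and splits the work four ways. A rough stand-in for the dispatch, under the assumption stated in the comments that extending loads and truncating stores are unrolled per element while ordinary ones are chopped into wide pieces (plain C++, names only mirror the declarations above):

#include <cstdio>

enum Kind { PLAIN, EXTENDING };   // load/store flavour (illustrative)

const char *widenLoadStrategy(Kind k) {
  // Extending loads change the element width in flight, so the
  // chop-and-bitcast trick cannot be used; they are unrolled instead.
  return k == EXTENDING ? "GenWidenVectorExtLoads: one ext-load per element"
                        : "GenWidenVectorLoads: widest legal chunks";
}

const char *widenStoreStrategy(Kind k) {
  return k == EXTENDING ? "GenWidenVectorTruncStores: one trunc-store per elt"
                        : "GenWidenVectorStores: widest legal chunks";
}

int main() {
  printf("%s\n%s\n", widenLoadStrategy(PLAIN), widenLoadStrategy(EXTENDING));
  printf("%s\n%s\n", widenStoreStrategy(PLAIN), widenStoreStrategy(EXTENDING));
  return 0;
}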
@@ -1655,68 +1655,24 @@ SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {

 SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
-  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
-  EVT LdVT    = LD->getMemoryVT();
-  DebugLoc dl = N->getDebugLoc();
-  assert(LdVT.isVector() && WidenVT.isVector());
-
-  // Load information
-  SDValue   Chain = LD->getChain();
-  SDValue   BasePtr = LD->getBasePtr();
-  int       SVOffset = LD->getSrcValueOffset();
-  unsigned  Align = LD->getAlignment();
-  bool      isVolatile = LD->isVolatile();
-  const Value *SV = LD->getSrcValue();
   ISD::LoadExtType ExtType = LD->getExtensionType();

   SDValue Result;
   SmallVector<SDValue, 16> LdChain;  // Chain for the series of load
-  if (ExtType != ISD::NON_EXTLOAD) {
-    // For extension loads, we can not play the tricks of chopping legal
-    // vector types and bit cast it to the right type. Instead, we unroll
-    // the load and build a vector.
-    EVT EltVT = WidenVT.getVectorElementType();
-    EVT LdEltVT = LdVT.getVectorElementType();
-    unsigned NumElts = LdVT.getVectorNumElements();
-
-    // Load each element and widen
-    unsigned WidenNumElts = WidenVT.getVectorNumElements();
-    SmallVector<SDValue, 16> Ops(WidenNumElts);
-    unsigned Increment = LdEltVT.getSizeInBits() / 8;
-    Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, SV, SVOffset,
-                            LdEltVT, isVolatile, Align);
-    LdChain.push_back(Ops[0].getValue(1));
-    unsigned i = 0, Offset = Increment;
-    for (i=1; i < NumElts; ++i, Offset += Increment) {
-      SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
-                                       BasePtr, DAG.getIntPtrConstant(Offset));
-      Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, SV,
-                              SVOffset + Offset, LdEltVT, isVolatile, Align);
-      LdChain.push_back(Ops[i].getValue(1));
-    }
-
-    // Fill the rest with undefs
-    SDValue UndefVal = DAG.getUNDEF(EltVT);
-    for (; i != WidenNumElts; ++i)
-      Ops[i] = UndefVal;
-
-    Result = DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], Ops.size());
-  } else {
-    assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
-    unsigned int LdWidth = LdVT.getSizeInBits();
-    Result = GenWidenVectorLoads(LdChain, Chain, BasePtr, SV, SVOffset,
-                                 Align, isVolatile, LdWidth, WidenVT, dl);
-  }
-
-  // If we generate a single load, we can use that for the chain.  Otherwise,
-  // build a factor node to remember the multiple loads are independent and
-  // chain to that.
-  SDValue NewChain;
-  if (LdChain.size() == 1)
-    NewChain = LdChain[0];
-  else
-    NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &LdChain[0],
-                           LdChain.size());
+  if (ExtType != ISD::NON_EXTLOAD)
+    Result = GenWidenVectorExtLoads(LdChain, LD, ExtType);
+  else
+    Result = GenWidenVectorLoads(LdChain, LD);
+
+  // If we generate a single load, we can use that for the chain.  Otherwise,
+  // build a factor node to remember the multiple loads are independent and
+  // chain to that.
+  SDValue NewChain;
+  if (LdChain.size() == 1)
+    NewChain = LdChain[0];
+  else
+    NewChain = DAG.getNode(ISD::TokenFactor, LD->getDebugLoc(), MVT::Other,
+                           &LdChain[0], LdChain.size());

   // Modified the chain - switch anything that used the old chain to use
   // the new one.
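Both WidenVecRes_LOAD above and WidenVecOp_STORE below end the same way: each partial load or store carries its own output chain, and when there is more than one, a TokenFactor node joins them so later users of the original chain are ordered after all the pieces. A standalone model of that merge (plain C++, not LLVM's SDNode machinery; chains are just strings here):

#include <cstdio>
#include <string>
#include <vector>

std::string mergeChains(const std::vector<std::string> &ldChain) {
  if (ldChain.size() == 1)
    return ldChain[0];                 // single load: reuse its chain
  std::string tf = "TokenFactor(";     // else record their independence
  for (size_t i = 0; i < ldChain.size(); ++i)
    tf += (i ? ", " : "") + ldChain[i];
  return tf + ")";
}

int main() {
  printf("%s\n", mergeChains({"ch0"}).c_str());
  printf("%s\n", mergeChains({"ch0", "ch1", "ch2"}).c_str());
  return 0;
}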
@@ -1954,57 +1910,17 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
   // We have to widen the value but we want only to store the original
   // vector type.
   StoreSDNode *ST = cast<StoreSDNode>(N);
-  SDValue  Chain = ST->getChain();
-  SDValue  BasePtr = ST->getBasePtr();
-  const Value *SV = ST->getSrcValue();
-  int      SVOffset = ST->getSrcValueOffset();
-  unsigned Align = ST->getAlignment();
-  bool     isVolatile = ST->isVolatile();
-  SDValue  ValOp = GetWidenedVector(ST->getValue());
-  DebugLoc dl = N->getDebugLoc();
-
-  EVT StVT = ST->getMemoryVT();
-  EVT ValVT = ValOp.getValueType();
-  // It must be true that the widen vector type is bigger than where
-  // we need to store.
-  assert(StVT.isVector() && ValOp.getValueType().isVector());
-  assert(StVT.bitsLT(ValOp.getValueType()));
-
   SmallVector<SDValue, 16> StChain;
-  if (ST->isTruncatingStore()) {
-    // For truncating stores, we can not play the tricks of chopping legal
-    // vector types and bit cast it to the right type. Instead, we unroll
-    // the store.
-    EVT StEltVT  = StVT.getVectorElementType();
-    EVT ValEltVT = ValVT.getVectorElementType();
-    unsigned Increment = ValEltVT.getSizeInBits() / 8;
-    unsigned NumElts = StVT.getVectorNumElements();
-    SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
-                              DAG.getIntPtrConstant(0));
-    StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr, SV,
-                                        SVOffset, StEltVT,
-                                        isVolatile, Align));
-    unsigned Offset = Increment;
-    for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
-      SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
-                                       BasePtr, DAG.getIntPtrConstant(Offset));
-      SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
-                                DAG.getIntPtrConstant(0));
-      StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr, SV,
-                                          SVOffset + Offset, StEltVT,
-                                          isVolatile, MinAlign(Align, Offset)));
-    }
-  }
-  else {
-    assert(StVT.getVectorElementType() == ValVT.getVectorElementType());
-    // Store value
-    GenWidenVectorStores(StChain, Chain, BasePtr, SV, SVOffset,
-                         Align, isVolatile, ValOp, StVT.getSizeInBits(), dl);
-  }
+  if (ST->isTruncatingStore())
+    GenWidenVectorTruncStores(StChain, ST);
+  else
+    GenWidenVectorStores(StChain, ST);
+
   if (StChain.size() == 1)
     return StChain[0];
   else
-    return DAG.getNode(ISD::TokenFactor, dl,
+    return DAG.getNode(ISD::TokenFactor, ST->getDebugLoc(),
                        MVT::Other,&StChain[0],StChain.size());
 }
@@ -2012,179 +1928,383 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
 // Vector Widening Utilities
 //===----------------------------------------------------------------------===//

-// Utility function to find a vector type and its associated element
-// type from a preferred width and whose vector type must be the same size
-// as the VecVT.
-//  TLI:   Target lowering used to determine legal types.
-//  Width: Preferred width to store.
-//  VecVT: Vector value type whose size we must match.
-// Returns NewVecVT and NewEltVT - the vector type and its associated
-// element type.
-static void FindAssocWidenVecType(SelectionDAG& DAG,
-                                  const TargetLowering &TLI, unsigned Width,
-                                  EVT VecVT,
-                                  EVT& NewEltVT, EVT& NewVecVT) {
-  unsigned EltWidth = Width + 1;
-  if (TLI.isTypeLegal(VecVT)) {
-    // We start with the preferred width, making it a power of 2 and find a
-    // legal vector type of that width. If not, we reduce it by another power
-    // of 2.  If the incoming type is legal, this process will end, as a
-    // vector of the smallest loadable type should always be legal.
-    do {
-      assert(EltWidth > 0);
-      EltWidth = 1 << Log2_32(EltWidth - 1);
-      NewEltVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth);
-      unsigned NumElts = VecVT.getSizeInBits() / EltWidth;
-      NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewEltVT, NumElts);
-    } while (!TLI.isTypeLegal(NewVecVT) ||
-             VecVT.getSizeInBits() != NewVecVT.getSizeInBits());
-  } else {
-    // The incoming vector type is illegal and is the result of widening
-    // a vector to a power of 2.  In this case, we will use the preferred
-    // width as long as it is a multiple of the incoming vector length.
-    // The legalization process will eventually make this into a legal type
-    // and remove the illegal bit converts (which would turn into stack
-    // converts if they were allowed to exist).
-    do {
-      assert(EltWidth > 0);
-      EltWidth = 1 << Log2_32(EltWidth - 1);
-      NewEltVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth);
-      unsigned NumElts = VecVT.getSizeInBits() / EltWidth;
-      NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewEltVT, NumElts);
-    } while (!TLI.isTypeLegal(NewEltVT) ||
-             VecVT.getSizeInBits() != NewVecVT.getSizeInBits());
-  }
-}
+// Utility function to find the type to chop up a widen vector for load/store
+//  TLI:     Target lowering used to determine legal types.
+//  Width:   Width left need to load/store.
+//  WidenVT: The widen vector type to load to/store from
+//  Align:   If 0, don't allow use of a wider type
+//  WidenEx: If Align is not 0, the amount additional we can load/store from.
+static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
+                       unsigned Width, EVT WidenVT,
+                       unsigned Align = 0, unsigned WidenEx = 0) {
+  EVT WidenEltVT = WidenVT.getVectorElementType();
+  unsigned WidenWidth = WidenVT.getSizeInBits();
+  unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
+  unsigned AlignInBits = Align*8;
+
+  // If we have one element to load/store, return it.
+  EVT RetVT = WidenEltVT;
+  if (Width == WidenEltWidth)
+    return RetVT;
+
+  // See if there is larger legal integer than the element type to load/store
+  unsigned VT;
+  for (VT = (unsigned)MVT::LAST_INTEGER_VALUETYPE;
+       VT >= (unsigned)MVT::FIRST_INTEGER_VALUETYPE; --VT) {
+    EVT MemVT((MVT::SimpleValueType) VT);
+    unsigned MemVTWidth = MemVT.getSizeInBits();
+    if (MemVT.getSizeInBits() <= WidenEltWidth)
+      break;
+    if (TLI.isTypeLegal(MemVT) && (WidenWidth % MemVTWidth) == 0 &&
+        (MemVTWidth <= Width ||
+         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+      RetVT = MemVT;
+      break;
+    }
+  }
+
+  // See if there is a larger vector type to load/store that has the same
+  // vector element type and is evenly divisible with the WidenVT.
+  for (VT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
+       VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) {
+    EVT MemVT = (MVT::SimpleValueType) VT;
+    unsigned MemVTWidth = MemVT.getSizeInBits();
+    if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() &&
+        (WidenWidth % MemVTWidth) == 0 &&
+        (MemVTWidth <= Width ||
+         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+      if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
+        return MemVT;
+    }
+  }
+
+  return RetVT;
+}
+
+// Builds a vector type from scalar loads
+//  VecTy: Resulting Vector type
+//  LdOps: Load operators to build a vector type
+//  [Start,End) the list of loads to use.
+static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
+                                     SmallVector<SDValue, 16>& LdOps,
+                                     unsigned Start, unsigned End) {
+  DebugLoc dl = LdOps[Start].getDebugLoc();
+  EVT LdTy = LdOps[Start].getValueType();
+  unsigned Width = VecTy.getSizeInBits();
+  unsigned NumElts = Width / LdTy.getSizeInBits();
+  EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts);
+
+  unsigned Idx = 1;
+  SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]);
+
+  for (unsigned i = Start + 1; i != End; ++i) {
+    EVT NewLdTy = LdOps[i].getValueType();
+    if (NewLdTy != LdTy) {
+      NumElts = Width / NewLdTy.getSizeInBits();
+      NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts);
+      VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, VecOp);
+      // Readjust position and vector position based on new load type
+      Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
+      LdTy = NewLdTy;
+    }
+    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
+                        DAG.getIntPtrConstant(Idx++));
+  }
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VecTy, VecOp);
+}
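FindMemType is the heart of the change: it scans legal integer types widest-first, then legal vector types with the matching element type, and accepts a type either because it fits in the bits still to transfer or, when Align is nonzero, because the alignment proves the over-read stays within bytes we may touch (at most WidenEx bits past the value, which the widened type owns anyway). A standalone model with a worked <3 x i32> example (plain C++; the legal-type set is an assumption, where the real code asks TLI.isTypeLegal):

#include <cstdio>

struct MemTy { const char *name; int bits; bool isVec; };

// Candidates ordered widest-first, integers scanned before vectors as in
// the two loops above; all widths are in bits.
MemTy findMemType(int width, int widenWidth, int align, int widenEx) {
  const MemTy Ints[] = {{"i64", 64, false}, {"i32", 32, false},
                        {"i16", 16, false}, {"i8", 8, false}};
  const MemTy Vecs[] = {{"v4i32", 128, true}, {"v2i32", 64, true}};
  MemTy ret = {"i8", 8, false};
  for (MemTy t : Ints)
    if (widenWidth % t.bits == 0 &&
        (t.bits <= width ||
         (align && t.bits <= align && t.bits <= width + widenEx))) {
      ret = t;
      break;
    }
  for (MemTy t : Vecs)
    if (widenWidth % t.bits == 0 &&
        (t.bits <= width ||
         (align && t.bits <= align && t.bits <= width + widenEx)) &&
        (ret.bits < t.bits || t.bits == widenWidth))
      return t;
  return ret;
}

int main() {
  // <3 x i32>: 96 bits left, widened to 128. With 16-byte alignment the
  // answer is v4i32 (over-reads 32 bits the widened value owns anyway).
  printf("aligned:   %s\n", findMemType(96, 128, 128, 32).name);
  // With no usable alignment (as for volatile loads, where LdAlign is
  // forced to 0 below), the widest type that fits is i64.
  printf("unaligned: %s\n", findMemType(96, 128, 0, 32).name);
  return 0;
}

Note the volatile case passes Align == 0 (see LdAlign in GenWidenVectorLoads below), so a volatile load is never widened past its declared size.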
 SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain,
-                                              SDValue      Chain,
-                                              SDValue      BasePtr,
-                                              const Value *SV,
-                                              int          SVOffset,
-                                              unsigned     Alignment,
-                                              bool         isVolatile,
-                                              unsigned     LdWidth,
-                                              EVT          ResType,
-                                              DebugLoc     dl) {
+                                              LoadSDNode *LD) {
   // The strategy assumes that we can efficiently load powers of two widths.
-  // The routine chops the vector into the largest power of 2 load and
-  // can be inserted into a legal vector and then cast the result into the
-  // vector type we want.  This avoids unnecessary stack converts.
+  // The routine chops the vector into the largest vector loads with the same
+  // element type or scalar loads and then recombines it to the widen vector
+  // type.
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+  unsigned WidenWidth = WidenVT.getSizeInBits();
+  EVT LdVT    = LD->getMemoryVT();
+  DebugLoc dl = LD->getDebugLoc();
+  assert(LdVT.isVector() && WidenVT.isVector());
+  assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());

-  // TODO: If the LdWidth is legal, alignment is the same as the LdWidth, and
-  // the load is nonvolatile, we can use a wider load for the value.
+  // Load information
+  SDValue   Chain = LD->getChain();
+  SDValue   BasePtr = LD->getBasePtr();
+  int       SVOffset = LD->getSrcValueOffset();
+  unsigned  Align = LD->getAlignment();
+  bool      isVolatile = LD->isVolatile();
+  const Value *SV = LD->getSrcValue();
+
+  int LdWidth = LdVT.getSizeInBits();
+  int WidthDiff = WidenWidth - LdWidth;          // Difference
+  unsigned LdAlign = (isVolatile) ? 0 : Align;   // Allow wider loads

   // Find the vector type that can load from.
-  EVT NewEltVT, NewVecVT;
-  unsigned NewEltVTWidth;
-  FindAssocWidenVecType(DAG, TLI, LdWidth, ResType, NewEltVT, NewVecVT);
-  NewEltVTWidth = NewEltVT.getSizeInBits();
-
-  SDValue LdOp = DAG.getLoad(NewEltVT, dl, Chain, BasePtr, SV, SVOffset,
-                             isVolatile, Alignment);
-  SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
+  EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+  int NewVTWidth = NewVT.getSizeInBits();
+  SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, SV, SVOffset,
+                             isVolatile, Align);
   LdChain.push_back(LdOp.getValue(1));

   // Check if we can load the element with one instruction
-  if (LdWidth == NewEltVTWidth) {
-    return DAG.getNode(ISD::BIT_CONVERT, dl, ResType, VecOp);
+  if (LdWidth <= NewVTWidth) {
+    if (NewVT.isVector()) {
+      if (NewVT != WidenVT) {
+        assert(WidenWidth % NewVTWidth == 0);
+        unsigned NumConcat = WidenWidth / NewVTWidth;
+        SmallVector<SDValue, 16> ConcatOps(NumConcat);
+        SDValue UndefVal = DAG.getUNDEF(NewVT);
+        ConcatOps[0] = LdOp;
+        for (unsigned i = 1; i != NumConcat; ++i)
+          ConcatOps[i] = UndefVal;
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0],
+                           NumConcat);
+      } else
+        return LdOp;
+    } else {
+      unsigned NumElts = WidenWidth / LdWidth;
+      EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+      SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
+      return DAG.getNode(ISD::BIT_CONVERT, dl, WidenVT, VecOp);
+    }
   }

-  unsigned Idx = 1;
-  LdWidth -= NewEltVTWidth;
+  // Load vector by using multiple loads from largest vector to scalar
+  SmallVector<SDValue, 16> LdOps;
+  LdOps.push_back(LdOp);
+
+  LdWidth -= NewVTWidth;
   unsigned Offset = 0;

   while (LdWidth > 0) {
-    unsigned Increment = NewEltVTWidth / 8;
+    unsigned Increment = NewVTWidth / 8;
     Offset += Increment;
     BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                           DAG.getIntPtrConstant(Increment));

-    if (LdWidth < NewEltVTWidth) {
-      // Our current type we are using is too large, use a smaller size by
-      // using a smaller power of 2
-      unsigned oNewEltVTWidth = NewEltVTWidth;
-      FindAssocWidenVecType(DAG, TLI, LdWidth, ResType, NewEltVT, NewVecVT);
-      NewEltVTWidth = NewEltVT.getSizeInBits();
-      // Readjust position and vector position based on new load type
-      Idx = Idx * (oNewEltVTWidth/NewEltVTWidth);
-      VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, VecOp);
+    if (LdWidth < NewVTWidth) {
+      // Our current type we are using is too large, find a better size
+      NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+      NewVTWidth = NewVT.getSizeInBits();
     }

-    SDValue LdOp = DAG.getLoad(NewEltVT, dl, Chain, BasePtr, SV,
+    SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, SV,
                                SVOffset+Offset, isVolatile,
-                               MinAlign(Alignment, Offset));
+                               MinAlign(Align, Increment));
     LdChain.push_back(LdOp.getValue(1));
-    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOp,
-                        DAG.getIntPtrConstant(Idx++));
+    LdOps.push_back(LdOp);

-    LdWidth -= NewEltVTWidth;
+    LdWidth -= NewVTWidth;
   }

-  return DAG.getNode(ISD::BIT_CONVERT, dl, ResType, VecOp);
+  // Build the vector from the loads operations
+  unsigned End = LdOps.size();
+  if (LdOps[0].getValueType().isVector()) {
+    // If the load contains vectors, build the vector using concat vector.
+    // All of the vectors used to loads are power of 2 and the scalar loads
+    // can be combined to make a power of 2 vector.
+    SmallVector<SDValue, 16> ConcatOps(End);
+    int i = End - 1;
+    int Idx = End;
+    EVT LdTy = LdOps[i].getValueType();
+    // First combine the scalar loads to a vector
+    if (!LdTy.isVector()) {
+      for (--i; i >= 0; --i) {
+        LdTy = LdOps[i].getValueType();
+        if (LdTy.isVector())
+          break;
+      }
+      ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i+1, End);
+    }
+    ConcatOps[--Idx] = LdOps[i];
+    for (--i; i >= 0; --i) {
+      EVT NewLdTy = LdOps[i].getValueType();
+      if (NewLdTy != LdTy) {
+        // Create a larger vector
+        ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
+                                       &ConcatOps[Idx], End - Idx);
+        Idx = End - 1;
+        LdTy = NewLdTy;
+      }
+      ConcatOps[--Idx] = LdOps[i];
+    }
+
+    if (WidenWidth != LdTy.getSizeInBits()*(End - Idx)) {
+      // We need to fill the rest with undefs to build the vector
+      unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
+      SmallVector<SDValue, 16> WidenOps(NumOps);
+      SDValue UndefVal = DAG.getUNDEF(LdTy);
+      unsigned i = 0;
+      for (; i != End-Idx; ++i)
+        WidenOps[i] = ConcatOps[Idx+i];
+      for (; i != NumOps; ++i)
+        WidenOps[i] = UndefVal;
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &WidenOps[0],NumOps);
+    } else
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
+                         &ConcatOps[Idx], End - Idx);
+  } else // All the loads are scalar loads.
+    return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End);
 }
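Putting the pieces together for the add7i32 test below: a 16-byte-aligned load of <7 x i32> widened to <8 x i32> has 224 bits to fetch and 32 bits of slack, so the loop above issues two v4i32 loads (the second over-reads into the slack) and concatenates them. A standalone trace (plain C++; the widths are hard-coded from the FindMemType sketch above):

#include <cstdio>

int main() {
  int ldWidth = 224, offset = 0;        // bits left; offset kept in bits
  const int align = 128, widenEx = 32;  // 16-byte alignment, 32 bits slack
  while (ldWidth > 0) {
    int w = 128;                        // try the widest candidate first
    if (!(w <= ldWidth || (w <= align && w <= ldWidth + widenEx)))
      w = 64;                           // fall back (not hit in this trace)
    printf("load %d bits at byte offset %d\n", w, offset / 8);
    ldWidth -= w;
    offset += w;
  }
  puts("CONCAT_VECTORS -> <8 x i32>, low 7 lanes hold the value");
  return 0;
}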
-void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
-                                            SDValue   Chain,
-                                            SDValue   BasePtr,
-                                            const Value *SV,
-                                            int       SVOffset,
-                                            unsigned  Alignment,
-                                            bool      isVolatile,
-                                            SDValue   ValOp,
-                                            unsigned  StWidth,
-                                            DebugLoc  dl) {
-  // Breaks the stores into a series of power of 2 width stores.  For any
-  // width, we convert the vector to the vector of element size that we
-  // want to store.  This avoids requiring a stack convert.
-
-  // Find a width of the element type we can store with
-  EVT WidenVT = ValOp.getValueType();
-  EVT NewEltVT, NewVecVT;
-
-  FindAssocWidenVecType(DAG, TLI, StWidth, WidenVT, NewEltVT, NewVecVT);
-  unsigned NewEltVTWidth = NewEltVT.getSizeInBits();
-
-  SDValue VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, ValOp);
-  SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewEltVT, VecOp,
-                            DAG.getIntPtrConstant(0));
-  SDValue StOp = DAG.getStore(Chain, dl, EOp, BasePtr, SV, SVOffset,
-                              isVolatile, Alignment);
-  StChain.push_back(StOp);
-
-  // Check if we are done
-  if (StWidth == NewEltVTWidth) {
-    return;
-  }
-
-  unsigned Idx = 1;
-  StWidth -= NewEltVTWidth;
-  unsigned Offset = 0;
-
-  while (StWidth > 0) {
-    unsigned Increment = NewEltVTWidth / 8;
-    Offset += Increment;
-    BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                          DAG.getIntPtrConstant(Increment));
-
-    if (StWidth < NewEltVTWidth) {
-      // Our current type we are using is too large, use a smaller size by
-      // using a smaller power of 2
-      unsigned oNewEltVTWidth = NewEltVTWidth;
-      FindAssocWidenVecType(DAG, TLI, StWidth, WidenVT, NewEltVT, NewVecVT);
-      NewEltVTWidth = NewEltVT.getSizeInBits();
-      // Readjust position and vector position based on new load type
-      Idx = Idx * (oNewEltVTWidth/NewEltVTWidth);
-      VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, VecOp);
-    }
-
-    EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewEltVT, VecOp,
-                      DAG.getIntPtrConstant(Idx++));
-    StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, SV,
-                                   SVOffset + Offset, isVolatile,
-                                   MinAlign(Alignment, Offset)));
-    StWidth -= NewEltVTWidth;
-  }
-}
+SDValue
+DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
+                                         LoadSDNode *LD,
+                                         ISD::LoadExtType ExtType) {
+  // For extension loads, it may not be more efficient to chop up the vector
+  // and then extend it.  Instead, we unroll the load and build a new vector.
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+  EVT LdVT    = LD->getMemoryVT();
+  DebugLoc dl = LD->getDebugLoc();
+  assert(LdVT.isVector() && WidenVT.isVector());
+
+  // Load information
+  SDValue   Chain = LD->getChain();
+  SDValue   BasePtr = LD->getBasePtr();
+  int       SVOffset = LD->getSrcValueOffset();
+  unsigned  Align = LD->getAlignment();
+  bool      isVolatile = LD->isVolatile();
+  const Value *SV = LD->getSrcValue();
+
+  EVT EltVT = WidenVT.getVectorElementType();
+  EVT LdEltVT = LdVT.getVectorElementType();
+  unsigned NumElts = LdVT.getVectorNumElements();
+
+  // Load each element and widen
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  unsigned Increment = LdEltVT.getSizeInBits() / 8;
+  Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, SV, SVOffset,
+                          LdEltVT, isVolatile, Align);
+  LdChain.push_back(Ops[0].getValue(1));
+  unsigned i = 0, Offset = Increment;
+  for (i=1; i < NumElts; ++i, Offset += Increment) {
+    SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                                     BasePtr, DAG.getIntPtrConstant(Offset));
+    Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, SV,
+                            SVOffset + Offset, LdEltVT, isVolatile, Align);
+    LdChain.push_back(Ops[i].getValue(1));
+  }
+
+  // Fill the rest with undefs
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; i != WidenNumElts; ++i)
+    Ops[i] = UndefVal;
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], Ops.size());
+}
+
+void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
+                                            StoreSDNode *ST) {
+  // The strategy assumes that we can efficiently store powers of two widths.
+  // The routine chops the vector into the largest vector stores with the same
+  // element type or scalar stores.
+  SDValue  Chain = ST->getChain();
+  SDValue  BasePtr = ST->getBasePtr();
+  const Value *SV = ST->getSrcValue();
+  int      SVOffset = ST->getSrcValueOffset();
+  unsigned Align = ST->getAlignment();
+  bool     isVolatile = ST->isVolatile();
+  SDValue  ValOp = GetWidenedVector(ST->getValue());
+  DebugLoc dl = ST->getDebugLoc();
+
+  EVT StVT = ST->getMemoryVT();
+  unsigned StWidth = StVT.getSizeInBits();
+  EVT ValVT = ValOp.getValueType();
+  unsigned ValWidth = ValVT.getSizeInBits();
+  EVT ValEltVT = ValVT.getVectorElementType();
+  unsigned ValEltWidth = ValEltVT.getSizeInBits();
+  assert(StVT.getVectorElementType() == ValEltVT);
+
+  int Idx = 0;          // current index to store
+  unsigned Offset = 0;  // offset from base to store
+  while (StWidth != 0) {
+    // Find the largest vector type we can store with
+    EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT);
+    unsigned NewVTWidth = NewVT.getSizeInBits();
+    unsigned Increment = NewVTWidth / 8;
+    if (NewVT.isVector()) {
+      unsigned NumVTElts = NewVT.getVectorNumElements();
+      do {
+        SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
+                                  DAG.getIntPtrConstant(Idx));
+        StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, SV,
+                                       SVOffset + Offset, isVolatile,
+                                       MinAlign(Align, Offset)));
+        StWidth -= NewVTWidth;
+        Offset += Increment;
+        Idx += NumVTElts;
+        BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                              DAG.getIntPtrConstant(Increment));
+      } while (StWidth != 0 && StWidth >= NewVTWidth);
+    } else {
+      // Cast the vector to the scalar type we can store
+      unsigned NumElts = ValWidth / NewVTWidth;
+      EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+      SDValue VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, ValOp);
+      // Readjust index position based on new vector type
+      Idx = Idx * ValEltWidth / NewVTWidth;
+      do {
+        SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
+                                  DAG.getIntPtrConstant(Idx++));
+        StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, SV,
+                                       SVOffset + Offset, isVolatile,
+                                       MinAlign(Align, Offset)));
+        StWidth -= NewVTWidth;
+        Offset += Increment;
+        BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                              DAG.getIntPtrConstant(Increment));
+      } while (StWidth != 0 && StWidth >= NewVTWidth);
+      // Restore index back to be relative to the original widen element type
+      Idx = Idx * NewVTWidth / ValEltWidth;
+    }
+  }
+}
+
+void
+DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
+                                            StoreSDNode *ST) {
+  // For truncating stores, it may not be more efficient to truncate the
+  // vector and then store it.  Instead, we extract each element and then
+  // store it.
+  SDValue  Chain = ST->getChain();
+  SDValue  BasePtr = ST->getBasePtr();
+  const Value *SV = ST->getSrcValue();
+  int      SVOffset = ST->getSrcValueOffset();
+  unsigned Align = ST->getAlignment();
+  bool     isVolatile = ST->isVolatile();
+  SDValue  ValOp = GetWidenedVector(ST->getValue());
+  DebugLoc dl = ST->getDebugLoc();
+
+  EVT StVT = ST->getMemoryVT();
+  EVT ValVT = ValOp.getValueType();
+
+  // It must be true that the widen vector type is bigger than where
+  // we need to store.
+  assert(StVT.isVector() && ValOp.getValueType().isVector());
+  assert(StVT.bitsLT(ValOp.getValueType()));
+
+  // For truncating stores, we can not play the tricks of chopping legal
+  // vector types and bit cast it to the right type.  Instead, we unroll
+  // the store.
+  EVT StEltVT  = StVT.getVectorElementType();
+  EVT ValEltVT = ValVT.getVectorElementType();
+  unsigned Increment = ValEltVT.getSizeInBits() / 8;
+  unsigned NumElts = StVT.getVectorNumElements();
+  SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+                            DAG.getIntPtrConstant(0));
+  StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr, SV,
+                                      SVOffset, StEltVT,
+                                      isVolatile, Align));
+  unsigned Offset = Increment;
+  for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
+    SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                                     BasePtr, DAG.getIntPtrConstant(Offset));
+    SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+                              DAG.getIntPtrConstant(0));
+    StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr, SV,
+                                        SVOffset + Offset, StEltVT,
+                                        isVolatile, MinAlign(Align, Offset)));
+  }
+}
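Stores get no alignment slack: GenWidenVectorStores calls FindMemType without Align/WidenEx, since writing past the value would clobber memory, and it re-scales Idx whenever it switches between vector-element and scalar-element indexing. For the <7 x i32> store in the add7i32 test below this yields a v4i32 store, then an i64, then an i32 (movaps + movq + pextrd in the test). A standalone trace of the width sequence (plain C++; assumes the remaining width is a multiple of 32 bits):

#include <cstdio>

int main() {
  int stWidth = 224, offset = 0;   // bits left to store; offset in bits
  while (stWidth != 0) {
    // Widest assumed-legal width that does NOT exceed what is left.
    int w = stWidth >= 128 ? 128 : stWidth >= 64 ? 64 : 32;
    printf("store %3d bits at byte offset %2d\n", w, offset / 8);
    stWidth -= w;
    offset += w;
  }
  return 0;
}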
@@ -747,6 +747,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
+
     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
     for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
       EVT VT = (MVT::SimpleValueType)i;
@@ -3686,6 +3692,33 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   return SDValue();
 }

+SDValue
+X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  // We support concatenating two 64-bit (MMX) operands and placing them
+  // in an XMM register.  This is better than doing a stack convert.
+  DebugLoc dl = Op.getDebugLoc();
+  EVT ResVT = Op.getValueType();
+  assert(Op.getNumOperands() == 2);
+  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
+         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
+  int Mask[2];
+  SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0));
+  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
+  InVec = Op.getOperand(1);
+  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    unsigned NumElts = ResVT.getVectorNumElements();
+    VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
+    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
+                        InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
+  } else {
+    InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
+    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
+    Mask[0] = 0; Mask[1] = 2;
+    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
+  }
+  return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
+}
+
 // v8i16 shuffles - Prefer shuffles in the following order:
 // 1. [all]   pshuflw, pshufhw, optional move
 // 2. [ssse3] 1 x pshufb
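In LowerCONCAT_VECTORS above, each 64-bit operand is first moved into lane 0 of its own v2i64 via MOVQ2DQ; the shuffle mask {0, 2} then selects lane 0 of the first operand and lane 0 of the second, since shuffle indices count across the concatenation of both inputs. A standalone model of that mask arithmetic (plain C++; the lane values are made up):

#include <cstdio>

int main() {
  long long vecOp[2]  = {0xAAAA, 0xDEAD};  // MOVQ2DQ(lhs): half in lane 0
  long long vecOp2[2] = {0xBBBB, 0xBEEF};  // MOVQ2DQ(rhs): half in lane 0
  int mask[2] = {0, 2};

  long long both[4] = {vecOp[0], vecOp[1], vecOp2[0], vecOp2[1]};
  long long res[2];
  for (int i = 0; i < 2; ++i)
    res[i] = both[mask[i]];                // res = {lhs half, rhs half}
  printf("%llx %llx\n", res[0], res[1]);   // aaaa bbbb
  return 0;
}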
@@ -7238,6 +7271,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
   case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
@@ -156,6 +156,11 @@ namespace llvm {
       /// relative displacements.
       WrapperRIP,

+      /// MOVQ2DQ - Copies a 64-bit value from a vector to another vector.
+      /// Can be used to move a vector value from an MMX register to an XMM
+      /// register.
+      MOVQ2DQ,
+
       /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
       /// i32, corresponds to X86::PEXTRB.
       PEXTRB,
@@ -634,6 +639,7 @@ namespace llvm {
     SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                                    SelectionDAG &DAG);
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG);
    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
    SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
    SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG);
@@ -501,6 +501,20 @@ let Constraints = "$src1 = $dst" in {
                        (iPTR imm:$src3))))]>;
 }

+// MMX to XMM for vector types
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, v2i64>, SDTCisVT<1, v1i64>]>>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
+          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq (load_mmx addr:$src))),
+          (v2i64 (MOVQI2PQIrm addr:$src))>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq (v1i64 (bitconvert
+                            (v2i32 (scalar_to_vector (loadi32 addr:$src))))))),
+          (v2i64 (MOVDI2PDIrm addr:$src))>;
+
 // Mask creation
 def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
                           "pmovmskb\t{$src, $dst|$dst, $src}",
@@ -2,10 +2,8 @@
 ; CHECK: pextrd
 ; CHECK: pextrd
 ; CHECK: movd
-; CHECK: pextrd
-; CHECK: pextrd
-; CHECK: pextrd
-; CHECK: movd
+; CHECK: movaps

 ; bitcast v14i16 to v7i32
@@ -3,7 +3,7 @@

 ; This load should be before the call, not after.

-; CHECK: movq   compl+128(%rip), %xmm0
+; CHECK: movaps compl+128(%rip), %xmm0
 ; CHECK: movaps %xmm0, (%rsp)
 ; CHECK: callq  killcommon
test/CodeGen/X86/widen_load-2.ll (new file, 155 lines)
@@ -0,0 +1,155 @@
+; RUN: llc < %s -o - -march=x86-64 -mattr=+sse42 -disable-mmx | FileCheck %s
+
+; Test based on pr5626 to load/store
+;
+
+%i32vec3 = type <3 x i32>
+define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
+; CHECK: movaps
+; CHECK: paddd
+; CHECK: pextrd
+; CHECK: movq
+  %a = load %i32vec3* %ap, align 16
+  %b = load %i32vec3* %bp, align 16
+  %x = add %i32vec3 %a, %b
+  store %i32vec3 %x, %i32vec3* %ret, align 16
+  ret void
+}
+
+define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
+; CHECK: movq
+; CHECK: pinsrd
+; CHECK: movq
+; CHECK: pinsrd
+; CHECK: paddd
+; CHECK: pextrd
+; CHECK: movq
+  %a = load %i32vec3* %ap
+  %b = load %i32vec3* %bp
+  %x = add %i32vec3 %a, %b
+  store %i32vec3 %x, %i32vec3* %ret
+  ret void
+}
+
+%i32vec7 = type <7 x i32>
+define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: paddd
+; CHECK: paddd
+; CHECK: pextrd
+; CHECK: movq
+; CHECK: movaps
+  %a = load %i32vec7* %ap, align 16
+  %b = load %i32vec7* %bp, align 16
+  %x = add %i32vec7 %a, %b
+  store %i32vec7 %x, %i32vec7* %ret, align 16
+  ret void
+}
+
+%i32vec12 = type <12 x i32>
+define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: paddd
+; CHECK: paddd
+; CHECK: paddd
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: movaps
+  %a = load %i32vec12* %ap, align 16
+  %b = load %i32vec12* %bp, align 16
+  %x = add %i32vec12 %a, %b
+  store %i32vec12 %x, %i32vec12* %ret, align 16
+  ret void
+}
+
+
+%i16vec3 = type <3 x i16>
+define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
+; CHECK: movaps
+; CHECK: paddw
+; CHECK: movd
+; CHECK: pextrw
+  %a = load %i16vec3* %ap, align 16
+  %b = load %i16vec3* %bp, align 16
+  %x = add %i16vec3 %a, %b
+  store %i16vec3 %x, %i16vec3* %ret, align 16
+  ret void
+}
+
+%i16vec4 = type <4 x i16>
+define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
+; CHECK: movaps
+; CHECK: paddw
+; CHECK: movq
+  %a = load %i16vec4* %ap, align 16
+  %b = load %i16vec4* %bp, align 16
+  %x = add %i16vec4 %a, %b
+  store %i16vec4 %x, %i16vec4* %ret, align 16
+  ret void
+}
+
+%i16vec12 = type <12 x i16>
+define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: paddw
+; CHECK: paddw
+; CHECK: movq
+; CHECK: movaps
+  %a = load %i16vec12* %ap, align 16
+  %b = load %i16vec12* %bp, align 16
+  %x = add %i16vec12 %a, %b
+  store %i16vec12 %x, %i16vec12* %ret, align 16
+  ret void
+}
+
+%i16vec18 = type <18 x i16>
+define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: paddw
+; CHECK: paddw
+; CHECK: paddw
+; CHECK: movd
+; CHECK: movaps
+; CHECK: movaps
+  %a = load %i16vec18* %ap, align 16
+  %b = load %i16vec18* %bp, align 16
+  %x = add %i16vec18 %a, %b
+  store %i16vec18 %x, %i16vec18* %ret, align 16
+  ret void
+}
+
+
+%i8vec3 = type <3 x i8>
+define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
+; CHECK: movaps
+; CHECK: paddb
+; CHECK: pextrb
+; CHECK: movb
+  %a = load %i8vec3* %ap, align 16
+  %b = load %i8vec3* %bp, align 16
+  %x = add %i8vec3 %a, %b
+  store %i8vec3 %x, %i8vec3* %ret, align 16
+  ret void
+}
+
+%i8vec31 = type <31 x i8>
+define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: paddb
+; CHECK: paddb
+; CHECK: movq
+; CHECK: pextrb
+; CHECK: pextrw
+  %a = load %i8vec31* %ap, align 16
+  %b = load %i8vec31* %bp, align 16
+  %x = add %i8vec31 %a, %b
+  store %i8vec31 %x, %i8vec31* %ret, align 16
+  ret void
+}