diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6099ba93260..cde20c3cb37 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2850,6 +2850,7 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::VPERMILPS:
   case X86ISD::VPERMILPD:
   case X86ISD::VPERM2F128:
+  case X86ISD::VPERM2I128:
     return true;
   }
   return false;
@@ -2891,6 +2892,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::SHUFPD:
   case X86ISD::SHUFPS:
   case X86ISD::VPERM2F128:
+  case X86ISD::VPERM2I128:
     return DAG.getNode(Opc, dl, VT, V1, V2,
                        DAG.getConstant(TargetMask, MVT::i8));
   }
@@ -3283,8 +3285,8 @@ static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
     return false;

   // For VSHUFPSY, the mask of the second half must be the same as the first
-  // but with // the appropriate offsets. This works in the same way as
-  // VPERMILPS // works with masks.
+  // but with the appropriate offsets. This works in the same way as
+  // VPERMILPS works with masks.
   for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
     if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
       return false;
@@ -3358,8 +3360,8 @@ static bool isCommutedVSHUFPY(ShuffleVectorSDNode *N, bool HasAVX) {
     return false;

   // For VSHUFPSY, the mask of the second half must be the same as the first
-  // but with // the appropriate offsets. This works in the same way as
-  // VPERMILPS // works with masks.
+  // but with the appropriate offsets. This works in the same way as
+  // VPERMILPS works with masks.
   for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
     if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
       return false;
@@ -3753,15 +3755,15 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
   return ::isMOVLMask(M, N->getValueType(0));
 }

-/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered
+/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
 /// as permutations between 128-bit chunks or halves. As an example: this
 /// shuffle below:
 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
 /// The first half comes from the second half of V1 and the second half from
 /// the second half of V2.
-static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
-                             const X86Subtarget *Subtarget) {
-  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+                             bool HasAVX) {
+  if (!HasAVX || VT.getSizeInBits() != 256)
     return false;

   // The shuffle result is divided into half A and half B. In total the two
@@ -3789,9 +3791,9 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
   return MatchA && MatchB;
 }

-/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
-static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
+/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
+static unsigned getShuffleVPERM2X128Immediate(SDNode *N) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   EVT VT = SVOp->getValueType(0);

@@ -3814,80 +3816,47 @@ static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
   return (FstHalf | (SndHalf << 4));
 }

-/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
 /// Note that VPERMIL mask matching is different depending on whether the
 /// underlying type is 32 or 64. In the VPERMILPS the high half of the mask
 /// should point to the same elements of the low, but to the higher half of
 /// the source. In VPERMILPD the two lanes could be shuffled independently of
 /// each other with the same restriction that lanes can't be crossed.
-static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                            const X86Subtarget *Subtarget) {
+static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                           bool HasAVX) {
   int NumElts = VT.getVectorNumElements();
   int NumLanes = VT.getSizeInBits()/128;

-  if (!Subtarget->hasAVX())
+  if (!HasAVX)
     return false;

-  // Only match 256-bit with 64-bit types
-  if (VT.getSizeInBits() != 256 || NumElts != 4)
+  // Only match 256-bit with 32/64-bit types
+  if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
     return false;

-  // The mask on the high lane is independent of the low. Both can match
-  // any element in inside its own lane, but can't cross.
   int LaneSize = NumElts/NumLanes;
-  for (int l = 0; l < NumLanes; ++l)
-    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
-      int LaneStart = l*LaneSize;
-      if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
+  for (int l = 0; l != NumLanes; ++l) {
+    int LaneStart = l*LaneSize;
+    for (int i = 0; i != LaneSize; ++i) {
+      if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize))
+        return false;
+      if (NumElts == 4 || l == 0)
+        continue;
+      // VPERMILPS handling
+      if (Mask[i] < 0)
+        continue;
+      if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneSize))
         return false;
     }
-
-  return true;
-}
-
-/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
-/// Note that VPERMIL mask matching is different depending whether theunderlying
-/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
-/// to the same elements of the low, but to the higher half of the source.
-/// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed.
-static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                            const X86Subtarget *Subtarget) {
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
-
-  if (!Subtarget->hasAVX())
-    return false;
-
-  // Only match 256-bit with 32-bit types
-  if (VT.getSizeInBits() != 256 || NumElts != 8)
-    return false;
-
-  // The mask on the high lane should be the same as the low. Actually,
-  // they can differ if any of the corresponding index in a lane is undef
-  // and the other stays in range.
-  int LaneSize = NumElts/NumLanes;
-  for (int i = 0; i < LaneSize; ++i) {
-    int HighElt = i+LaneSize;
-    bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts);
-    bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize);
-
-    if (!HighValid || !LowValid)
-      return false;
-    if (Mask[i] < 0 || Mask[HighElt] < 0)
-      continue;
-    if (Mask[HighElt]-Mask[i] != LaneSize)
-      return false;
   }

   return true;
 }

-/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPS* instructions.
-static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
+/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPS/D* instructions.
+static unsigned getShuffleVPERMILPImmediate(SDNode *N) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   EVT VT = SVOp->getValueType(0);

@@ -3899,43 +3868,22 @@ static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
   // where a mask will match because the same mask element is undef on the
   // first half but valid on the second. This would get pathological cases
   // such as: shuffle , which is completely valid.
+  unsigned Shift = (LaneSize == 4) ? 2 : 1;
   unsigned Mask = 0;
-  for (int l = 0; l < NumLanes; ++l) {
-    for (int i = 0; i < LaneSize; ++i) {
-      int MaskElt = SVOp->getMaskElt(i+(l*LaneSize));
-      if (MaskElt < 0)
-        continue;
-      if (MaskElt >= LaneSize)
-        MaskElt -= LaneSize;
-      Mask |= MaskElt << (i*2);
-    }
+  for (int i = 0; i != NumElts; ++i) {
+    int MaskElt = SVOp->getMaskElt(i);
+    if (MaskElt < 0)
+      continue;
+    MaskElt %= LaneSize;
+    unsigned Shamt = i;
+    // For VPERMILPSY, the mask of the first half must be equal to the second
+    if (NumElts == 8) Shamt %= LaneSize;
+    Mask |= MaskElt << (Shamt*Shift);
   }

   return Mask;
 }

-/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPD* instructions.
-static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  EVT VT = SVOp->getValueType(0);
-
-  int NumElts = VT.getVectorNumElements();
-  int NumLanes = VT.getSizeInBits()/128;
-
-  unsigned Mask = 0;
-  int LaneSize = NumElts/NumLanes;
-  for (int l = 0; l < NumLanes; ++l)
-    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
-      int MaskElt = SVOp->getMaskElt(i);
-      if (MaskElt < 0)
-        continue;
-      Mask |= (MaskElt-l*LaneSize) << i;
-    }
-
-  return Mask;
-}
-
 /// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
 /// x86 movss wants. X86 movss requires the lowest element to be the lowest
 /// element of vector 2 and the other elements to come from vector 1 in order.
@@ -4677,6 +4625,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
                          ShuffleMask);
     break;
   case X86ISD::VPERM2F128:
+  case X86ISD::VPERM2I128:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                          ShuffleMask);
@@ -6596,6 +6545,22 @@ static inline unsigned getVPERMILOpcode(EVT VT) {
   return 0;
 }

+static inline unsigned getVPERM2X128Opcode(EVT VT, bool HasAVX2) {
+  switch(VT.getSimpleVT().SimpleTy) {
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v8i32:
+  case MVT::v4i64:
+    if (HasAVX2) return X86ISD::VPERM2I128;
+    // else use fp unit for int vperm
+  case MVT::v8f32:
+  case MVT::v4f64: return X86ISD::VPERM2F128;
+  default:
+    llvm_unreachable("Unknown type for vperm2x128");
+  }
+  return 0;
+}
+
 static SDValue
 NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
                        const TargetLowering &TLI,
@@ -6910,22 +6875,17 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (isMOVDDUPYMask(SVOp, Subtarget))
     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);

-  // Handle VPERMILPS* permutations
-  if (isVPERMILPSMask(M, VT, Subtarget))
+  // Handle VPERMILPS/D* permutations
+  if (isVPERMILPMask(M, VT, Subtarget->hasAVX()))
     return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
-                                getShuffleVPERMILPSImmediate(SVOp), DAG);
+                                getShuffleVPERMILPImmediate(SVOp), DAG);

-  // Handle VPERMILPD* permutations
-  if (isVPERMILPDMask(M, VT, Subtarget))
-    return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
-                                getShuffleVPERMILPDImmediate(SVOp), DAG);
+  // Handle VPERM2F128/VPERM2I128 permutations
+  if (isVPERM2X128Mask(M, VT, Subtarget->hasAVX()))
+    return getTargetShuffleNode(getVPERM2X128Opcode(VT, HasAVX2), dl, VT, V1,
+                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);

-  // Handle VPERM2F128 permutations
-  if (isVPERM2F128Mask(M, VT, Subtarget))
-    return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
-                                getShuffleVPERM2F128Immediate(SVOp), DAG);
-
-  // Handle VSHUFPSY permutations
+  // Handle VSHUFPS/DY permutations
   if (isVSHUFPYMask(M, VT, Subtarget->hasAVX()))
     return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
                                 getShuffleVSHUFPYImmediate(SVOp), DAG);
@@ -11223,6 +11183,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS";
   case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD";
   case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128";
+  case X86ISD::VPERM2I128: return "X86ISD::VPERM2I128";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
@@ -14810,6 +14771,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERMILPS:
   case X86ISD::VPERMILPD:
   case X86ISD::VPERM2F128:
+  case X86ISD::VPERM2I128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
   }
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 638f80c6e00..e0a0e295783 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -280,6 +280,7 @@ namespace llvm {
       VPERMILPS,
       VPERMILPD,
       VPERM2F128,
+      VPERM2I128,
       VBROADCAST,

       // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index db2b652c290..ff9c1433fad 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -140,6 +140,7 @@ def X86VPermilps : SDNode<"X86ISD::VPERMILPS", SDTShuff2OpI>;
 def X86VPermilpd : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>;

 def X86VPerm2f128 : SDNode<"X86ISD::VPERM2F128", SDTShuff3OpI>;
+def X86VPerm2i128 : SDNode<"X86ISD::VPERM2I128", SDTShuff3OpI>;

 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 2b240fc03a4..28515ba1d6b 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7359,7 +7359,7 @@ def : Pat<(int_x86_avx_vperm2f128_pd_256
                   VR256:$src1, (memopv4f64 addr:$src2), imm:$src3),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
 def : Pat<(int_x86_avx_vperm2f128_si_256
-                  VR256:$src1, (memopv8i32 addr:$src2), imm:$src3),
+                  VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), imm:$src3),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;

 def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
@@ -7375,6 +7375,25 @@ def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
 def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;

+def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1,
+                  (memopv8f32 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2f128 VR256:$src1,
+                  (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2f128 VR256:$src1,
+                  (memopv4i64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2f128 VR256:$src1,
+                  (memopv4f64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1,
+                  (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1,
+                  (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+
 //===----------------------------------------------------------------------===//
 // VZERO - Zero YMM registers
 //
@@ -7571,7 +7590,7 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64,
                              int_x86_avx2_permpd>, VEX_W;

 //===----------------------------------------------------------------------===//
-// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+// VPERM2I128 - Permute Integer Values in 128-bit chunks
 //
 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, i8imm:$src3),
@@ -7587,6 +7606,30 @@ def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
                              imm:$src3))]>, VEX_4V;

+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86VPerm2i128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2i128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2i128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2i128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v32i8 (X86VPerm2i128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2i128 VR256:$src1,
+                   (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2i128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2i128 VR256:$src1, (memopv4i64 addr:$src2),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
 //===----------------------------------------------------------------------===//
 // VINSERTI128 - Insert packed integer values
 //

diff --git a/test/CodeGen/X86/avx2-vperm2i128.ll b/test/CodeGen/X86/avx2-vperm2i128.ll
new file mode 100644
index 00000000000..0f24ac33316
--- /dev/null
+++ b/test/CodeGen/X86/avx2-vperm2i128.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+; CHECK: vperm2i128 $17
+define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
+  ret <32 x i8> %shuffle
+}
+
+; CHECK: vperm2i128 $33
+define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32>
+  ret <4 x i64> %shuffle
+}
+
+; CHECK: vperm2i128 $49
+define <8 x i32> @E3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>
+  ret <8 x i32> %shuffle
+}
+
+; CHECK: vperm2i128 $2
+define <16 x i16> @E4(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
+  ret <16 x i16> %shuffle
+}
+
+; CHECK: vperm2i128 $2, (%
+define <16 x i16> @E5(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
+entry:
+  %c = load <16 x i16>* %a
+  %d = load <16 x i16>* %b
+  %shuffle = shufflevector <16 x i16> %c, <16 x i16> %d, <16 x i32>
+  ret <16 x i16> %shuffle
+}
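
For reference, a minimal standalone C++ sketch (not part of the patch) of how the VPERM2F128/VPERM2I128 selection immediate relates to a 256-bit shuffle mask, mirroring the isVPERM2X128Mask/getShuffleVPERM2X128Immediate logic above. The helper name vperm2x128Imm and the simplified first-defined-element scan are illustrative assumptions, not code from the tree.

// Each 128-bit half of the result selects one of the four 128-bit chunks of
// the two concatenated sources; bits [1:0] of the immediate pick the chunk
// for the low half and bits [5:4] pick the chunk for the high half.
#include <cstdio>
#include <vector>

// Mask has one entry per element of the 256-bit result; entries index the
// concatenation of both sources (0 .. 2*NumElems-1), -1 meaning "undef".
// Assumes the mask already satisfies the isVPERM2X128Mask constraints, i.e.
// every defined entry in a half points into the same 128-bit chunk.
static unsigned vperm2x128Imm(const std::vector<int> &Mask) {
  unsigned NumElems = Mask.size();   // elements in one 256-bit vector
  unsigned HalfSize = NumElems / 2;  // elements in one 128-bit chunk
  unsigned FstHalf = 0, SndHalf = 0;
  for (unsigned i = 0; i != HalfSize; ++i)
    if (Mask[i] >= 0) { FstHalf = Mask[i] / HalfSize; break; }
  for (unsigned i = HalfSize; i != NumElems; ++i)
    if (Mask[i] >= 0) { SndHalf = Mask[i] / HalfSize; break; }
  return FstHalf | (SndHalf << 4);
}

int main() {
  // The example from the isVPERM2X128Mask comment: <4, 5, 6, 7, 12, 13, 14, 15>
  // takes the high half of V1 (chunk 1) and the high half of V2 (chunk 3),
  // so the immediate is 1 | (3 << 4) = 0x31.
  std::vector<int> Mask = {4, 5, 6, 7, 12, 13, 14, 15};
  std::printf("imm = 0x%02x\n", vperm2x128Imm(Mask));
  return 0;
}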