[X86][SSE] psrl(w/d/q) and psll(w/d/q) bit shifts for SSE2

Patch to match cases where shuffle masks can be reduced to bit shifts. Similar to byte shift shuffle matching from D5699. Differential Revision: http://reviews.llvm.org/D6649 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@228047 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-05 01:31:05 +00:00 · 2015-02-03 21:58:29 +00:00 · 2015-02-03 21:58:29 +00:00 · 3d04e48cb6
commit 3d04e48cb6
parent 8c775a4e7b
9 changed files with 670 additions and 15 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -7808,6 +7808,79 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
  return SDValue();
 }

+/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
+/// SSE2 and AVX2 logical bit-shift instructions. The function matches
+/// elements from one of the input vectors shuffled to the left or right
+/// with zeroable elements 'shifted in'.
+static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  int Size = Mask.size();
+  assert(Size == VT.getVectorNumElements() && "Unexpected mask size");
+
+  // PSRL : (little-endian) right bit shift.
+  // [  1, zz,  3, zz]
+  // [ -1, -1,  7, zz]
+  // PSHL : (little-endian) left bit shift.
+  // [ zz, 0, zz,  2 ]
+  // [ -1, 4, zz, -1 ]
+  auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
+    MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
+    MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+    assert(TLI.isTypeLegal(ShiftVT) && "Illegal integer vector type");
+
+    bool MatchLeft = true, MatchRight = true;
+    for (int i = 0; i != Size; i += Scale) {
+      for (int j = 0; j != Shift; j++) {
+        MatchLeft &= Zeroable[i + j];
+      }
+      for (int j = Scale - Shift; j != Scale; j++) {
+        MatchRight &= Zeroable[i + j];
+      }
+    }
+    if (!(MatchLeft || MatchRight))
+      return SDValue();
+
+    bool MatchV1 = true, MatchV2 = true;
+    for (int i = 0; i != Size; i += Scale) {
+      unsigned Pos = MatchLeft ? i + Shift : i;
+      unsigned Low = MatchLeft ? i : i + Shift;
+      unsigned Len = Scale - Shift;
+      MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
+      MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
+    }
+    if (!(MatchV1 || MatchV2))
+      return SDValue();
+
+    // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
+    unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
+    int ShiftAmt = Shift * VT.getScalarSizeInBits();
+    SDValue V = MatchV1 ? V1 : V2;
+    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+    V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
+    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  };
+
+  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+  // keep doubling the size of the integer elements up to that. We can
+  // then shift the elements of the integer vector by whole multiples of
+  // their width within the elements of the larger integer vector. Test each
+  // multiple to see if we can find a match with the moved element indices
+  // and that the shifted in elements are all zeroable.
+  for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
+    for (int Shift = 1; Shift != Scale; Shift++)
+      if (SDValue BitShift = MatchBitShift(Shift, Scale))
+        return BitShift;
+
+  // no match
+  return SDValue();
+}
+
 /// \brief Lower a vector shuffle as a zero or any extension.
 ///
 /// Given a specific number of elements, element bit width, and extension
@ -8654,6 +8727,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                       getV4X86ShuffleImm8ForMask(Mask, DAG));
  }

+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Shift;
+
  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v4i32, V1, V2, Mask, DAG))
@ -8739,6 +8817,11 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
                                                        Mask, Subtarget, DAG))
    return Broadcast;

+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v8i16, V, V, Mask, DAG))
+    return Shift;
+
  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v8i16, V, V, Mask, DAG))
@ -9356,6 +9439,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                            "to be V1-input shuffles.");

+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Shift;
+
  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v8i16, V1, V2, Mask, DAG))
@ -9512,6 +9600,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  ArrayRef<int> OrigMask = SVOp->getMask();
  assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");

+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+    return Shift;
+
  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v16i8, V1, V2, OrigMask, DAG))
@ -10602,6 +10695,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
  }

+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v8i32, V1, V2, Mask, DAG))
+    return Shift;
+
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@ -10685,6 +10783,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
            DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
  }

+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v16i16, V1, V2, Mask, DAG))
+    return Shift;
+
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@ -10763,6 +10866,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
  }

+  // Try to use bit shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsBitShift(
+          DL, MVT::v32i8, V1, V2, Mask, DAG))
+    return Shift;
+
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@ -204,16 +204,14 @@ define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
 ; shuffle instruction when the shuffle indexes are not compatible.

 define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test17:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    xorps %xmm2, %xmm2
-; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2]
-; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; CHECK-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
-; CHECK-NEXT:    orps %xmm2, %xmm0
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+; CHECK-LABEL: test17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psllq $32, %xmm0
+; CHECK-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
 }
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@ -63,7 +63,7 @@ define <4 x float> @t4(<4 x float>* %P) nounwind {
 define <16 x i8> @t5(<16 x i8> %x) nounwind {
 ; CHECK-LABEL: t5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    psrlw $8, %xmm0
 ; CHECK-NEXT:    retl
  %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
  ret <16 x i8> %s
@ -72,7 +72,7 @@ define <16 x i8> @t5(<16 x i8> %x) nounwind {
 define <16 x i8> @t6(<16 x i8> %x) nounwind {
 ; CHECK-LABEL: t6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    psrlw $8, %xmm0
 ; CHECK-NEXT:    retl
  %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %s
@ -86,3 +86,21 @@ define <16 x i8> @t7(<16 x i8> %x) nounwind {
  %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
  ret <16 x i8> %s
 }
+
+define <16 x i8> @t8(<16 x i8> %x) nounwind {
+; CHECK-LABEL: t8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @t9(<16 x i8> %x) nounwind {
+; CHECK-LABEL: t9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 undef, i32 undef>
+  ret <16 x i8> %s
+}
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@ -705,21 +705,21 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSSE3-NEXT:    pslld $24, %xmm0
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSE41-NEXT:    pslld $24, %xmm0
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; AVX-NEXT:    vpslld $24, %xmm0
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    retq
  %a = insertelement <16 x i8> undef, i8 %i, i32 3
@ -1185,6 +1185,108 @@ entry:
  ret void
 }

+;
+; Shuffle to logical bit shifts
+;
+
+define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllw $8, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllw $8, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; SSE:       # BB#0:
+; SSE-NEXT:    pslld $24, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpslld $24, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $56, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $56, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $8, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $8, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlw $8, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrld $16, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 15, i32 16, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlq $56, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlq $56, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
+  ret <16 x i8> %shuffle
+}
+
 define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
 ; SSE2-LABEL: PR12412:
 ; SSE2:       # BB#0: # %entry
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@ -1373,3 +1373,35 @@ define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
  %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %shuffle
 }
+
+;
+; Shuffle to logical bit shifts
+;
+
+define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
+; SSE-LABEL: shuffle_v4i32_z0zX:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $32, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_z0zX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $32, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
+; SSE-LABEL: shuffle_v4i32_1z3z:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlq $32, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_1z3z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlq $32, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
+  ret <4 x i32> %shuffle
+}
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@ -1918,6 +1918,121 @@ define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) {
  ret <8 x i16> %shuffle
 }

+;
+; Shuffle to logical bit shifts
+;
+define <8 x i16> @shuffle_v8i16_z0z2z4z6(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_z0z2z4z6:
+; SSE:       # BB#0:
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_z0z2z4z6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpslld $16, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzz0zzz4(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_zzz0zzz4:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $48, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_zzz0zzz4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $48, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 8, i32 0, i32 8, i32 8, i32 8, i32 4>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zz01zX4X(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_zz01zX4X:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $32, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_zz01zX4X:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $32, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 0, i32 1, i32 8, i32 undef, i32 4, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_z0X2z456(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_z0X2z456:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $16, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_z0X2z456:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $16, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 undef, i32 2, i32 8, i32 4, i32 5, i32 6>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_1z3zXz7z(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_1z3zXz7z:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_1z3zXz7z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrld $16, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 8, i32 3, i32 8, i32 undef, i32 8, i32 7, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_1X3z567z(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_1X3z567z:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlq $16, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_1X3z567z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlq $16, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 undef, i32 3, i32 8, i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_23zz67zz(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_23zz67zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlq $32, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_23zz67zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlq $32, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 2, i32 3, i32 8, i32 8, i32 6, i32 7, i32 8, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_3zXXXzzz(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_3zXXXzzz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlq $48, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_3zXXXzzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlq $48, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 3, i32 8, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8>
+  ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_01u3zzuz(<8 x i16> %a) {
 ; SSE-LABEL: shuffle_v8i16_01u3zzuz:
 ; SSE:       # BB#0:
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@ -1363,6 +1363,110 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_2
  ret <16 x i16> %shuffle
 }

+;
+; Shuffle to logical bit shifts
+;
+
+define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslld $16, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsllq $48, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrld $16, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,9,12,13,2,3,2,3,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrlq $32, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 16, i32 16, i32 10, i32 11, i32 16, i32 16, i32 14, i32 15, i32 16, i32 16>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz(<16 x i16> %a) {
 ; AVX1-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
 ; AVX1:       # BB#0:
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@ -1655,6 +1655,148 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
  ret <32 x i8> %shuffle
 }

+;
+; Shuffle to logical bit shifts
+;
+
+define <32 x i8> @shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsllw $8, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 0, i32 32, i32 2, i32 32, i32 4, i32 32, i32 6, i32 32, i32 8, i32 32, i32 10, i32 32, i32 12, i32 32, i32 14, i32 32, i32 16, i32 32, i32 18, i32 32, i32 20, i32 32, i32 22, i32 32, i32 24, i32 32, i32 26, i32 32, i32 28, i32 32, i32 30>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,0,1,128,128,4,5,128,128,8,9,128,128,12,13]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslld $16, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 32, i32 32, i32 8, i32 9, i32 32, i32 32, i32 12, i32 13, i32 32, i32 32, i32 16, i32 17, i32 32, i32 32, i32 20, i32 21, i32 32, i32 32, i32 24, i32 25, i32 32, i32 32, i32 28, i32 29>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,0,1,128,128,128,128,128,128,8,9]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,0,0,0,0,0],zero,zero,xmm3[0,0,0,0,0,0],zero,zero
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsllq $48, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrlw $8, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 3, i32 32, i32 5, i32 32, i32 7, i32 32, i32 9, i32 32, i32 11, i32 32, i32 13, i32 32, i32 15, i32 32, i32 17, i32 32, i32 19, i32 32, i32 21, i32 32, i32 23, i32 32, i32 25, i32 32, i32 27, i32 32, i32 29, i32 32, i32 31, i32 32>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,128,128,6,7,128,128,10,11,128,128,14,15,128,128]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0]
+; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrld $16, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 2, i32 3, i32 32, i32 32, i32 6, i32 7, i32 32, i32 32, i32 10, i32 11, i32 32, i32 32, i32 14, i32 15, i32 32, i32 32, i32 18, i32 19, i32 32, i32 32, i32 22, i32 23, i32 32, i32 32, i32 26, i32 27, i32 32, i32 32, i32 30, i32 31, i32 32, i32 32>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,128,128,128,15,128,128,128,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm3[0,0,0],zero,xmm3[0,0,0,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrlq $56, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 23, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+  ret <32 x i8> %shuffle
+}
+
 define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
 ; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz:
 ; AVX1:       # BB#0:
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@ -1848,6 +1848,42 @@ define <8 x float> @splat_v8f32(<4 x float> %r) {
  ret <8 x float> %1
 }

+;
+; Shuffle to logical bit shifts
+;
+
+define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_z0U2zUz6:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_z0U2zUz6:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsllq $32, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 8, i32 0, i32 undef, i32 2, i32 8, i32 undef, i32 8, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_1U3z5zUU:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_1U3z5zUU:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrlq $32, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 undef, i32 3, i32 8, i32 5, i32 8, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
 define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
 ; ALL-LABEL: concat_v2f32_1:
 ; ALL:       # BB#0: # %entry