From 330aa6fd6b5f2468e9eafca0586fd8ca4dc555eb Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Fri, 19 Sep 2014 06:07:49 +0000
Subject: [PATCH] [x86] Add a dedicated lowering path for zext-compatible
 vector shuffles to the new vector shuffle lowering code.

This allows us to emit PMOVZX variants consistently for patterns where
it is a viable lowering. This instruction is both fast and allows us to
fold loads into it. This only hooks the new lowering up for i16 and i8
element widths, mostly so I could manage the change to the tests. I'll
add the i32 one next, although it is significantly less interesting.

One thing to note is that we already had some tests for these patterns
but those tests had far less horrible instructions. The problem is that
those tests weren't checking the strict start and end of the instruction
sequence. =[ As a consequence something changed in the lowering making
us generate *TERRIBLE* code for these patterns in SSE2 through SSSE3.
I've consolidated all of the tests and spelled out the madness that we
currently emit for these shuffles. I'm going to try to figure out what
has gone wrong here.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218102 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 134 +++++++++++++++++++++
 test/CodeGen/X86/vector-shuffle-128-v16.ll | 130 ++++++++++++++------
 test/CodeGen/X86/vector-shuffle-128-v8.ll  |  92 ++++++++++++++
 3 files changed, 321 insertions(+), 35 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8343a856eda..ccbe0af1547 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19,6 +19,7 @@
 #include "X86MachineFunctionInfo.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
@@ -7353,6 +7354,125 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
                        DAG.getConstant(Rotation * Scale, MVT::i8)));
 }
 
+/// \brief Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the
+/// input referenced is undef, or the element of the input referenced is
+/// known to be zero. Many x86 shuffles can zero lanes cheaply and we often
+/// want to handle as many lanes with this technique as possible to simplify
+/// the remaining shuffle.
+static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
+                                                     SDValue V1, SDValue V2) {
+  SmallBitVector Zeroable(Mask.size(), false);
+
+  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    int M = Mask[i];
+    // Handle the easy cases.
+    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+      Zeroable[i] = true;
+      continue;
+    }
+
+    // If this is an index into a build_vector node, dig out the input value
+    // and use it.
+    SDValue V = M < Size ? V1 : V2;
+    if (V.getOpcode() != ISD::BUILD_VECTOR)
+      continue;
+
+    SDValue Input = V.getOperand(M % Size);
+    // The UNDEF opcode check really should be dead code here, but not quite
+    // worth asserting on (it isn't invalid, just unexpected).
+    if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
+      Zeroable[i] = true;
+  }
+
+  return Zeroable;
+}
+
+/// \brief Try to lower a vector shuffle as a zero extension.
+///
+/// This tries to use the SSE4.1 PMOVZX instruction family to lower a vector
+/// shuffle through a zero extension. It doesn't check for the availability
+/// or profitability of this lowering, though; it just tries to aggressively
+/// match this pattern. It handles both blends with all-zero inputs (to
+/// explicitly zero-extend) and undef lanes (sometimes undef due to masking
+/// out later).
+static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
+                                              SDValue V2, ArrayRef<int> Mask,
+                                              SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+  int Bits = VT.getSizeInBits();
+  int EltBits = VT.getScalarSizeInBits();
+  int NumElements = Mask.size();
+
+  // Define a helper function to check a particular zext-stride and lower to
+  // it if valid.
+  auto LowerWithStride = [&](int Stride) -> SDValue {
+    SDValue InputV;
+    for (int i = 0; i < NumElements; ++i) {
+      if (Mask[i] == -1)
+        continue; // Valid anywhere but doesn't tell us anything.
+      if (i % Stride != 0) {
+        // Each of the extend elements needs to be zeroable.
+        if (!Zeroable[i])
+          return SDValue();
+        else
+          continue;
+      }
+
+      // Each of the base elements needs to be consecutive indices into the
+      // same input vector.
+      SDValue V = Mask[i] < NumElements ? V1 : V2;
+      if (!InputV)
+        InputV = V;
+      else if (InputV != V)
+        return SDValue(); // Flip-flopping inputs.
+
+      if (Mask[i] % NumElements != i / Stride)
+        return SDValue(); // Non-consecutive strided elements.
+    }
+
+    // If we fail to find an input, we have a zero-shuffle which should
+    // always have already been handled.
+    // FIXME: Maybe handle this here in case during blending we end up with
+    // one?
+    if (!InputV)
+      return SDValue();
+
+    // Found a valid lowering! Compute all the types and the operation. We
+    // force everything to integer types here as that's the only way zext
+    // makes sense.
+    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Stride),
+                                 NumElements / Stride);
+
+    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+  };
+
+  // The widest stride possible for zero extending is to a 64-bit integer.
+  assert(Bits % 64 == 0 &&
+         "The number of bits in a vector must be divisible by 64 on x86!");
+  int NumExtElements = Bits / 64;
+
+  // Each iteration, try extending the elements half as much, but into twice
+  // as many elements.
+  for (; NumExtElements < NumElements; NumExtElements *= 2) {
+    assert(NumElements % NumExtElements == 0 &&
+           "The input vector size must be divisible by the extended size.");
+    int Stride = NumElements / NumExtElements;
+    if (SDValue V = LowerWithStride(Stride))
+      return V;
+  }
+
+  // No viable zext lowering found.
+  return SDValue();
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -8390,6 +8510,14 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
+  // Whenever we can lower this as a zext, that instruction is strictly
+  // faster than any alternative.
+  if (Subtarget->hasSSE41())
+    if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v8i16, V1, V2,
+                                                      OrigMask, DAG))
+      return ZExt;
+
   auto isV1 = [](int M) { return M >= 0 && M < 8; };
   auto isV2 = [](int M) { return M >= 8; };
 
@@ -8553,6 +8681,12 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                         OrigMask, DAG))
     return Rotate;
 
+  // Try to use a zext lowering.
+  if (Subtarget->hasSSE41())
+    if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v16i8, V1, V2,
+                                                      OrigMask, DAG))
+      return ZExt;
+
   int MaskStorage[16] = {
       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 9fe3e0b530e..2285e21ba90 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -291,41 +291,6 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(
   ret <16 x i8> %shuffle
 }
 
-define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) {
-; SSE2-LABEL: @zext_to_v8i16_shuffle
-; SSE2:         pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-;
-; SSSE3-LABEL: @zext_to_v8i16_shuffle
-; SSSE3:         pxor %xmm1, %xmm1
-; SSSE3-NEXT:    punpcklbw %xmm1, %xmm0
-;
-; SSE41-LABEL: @zext_to_v8i16_shuffle
-; SSE41:         pxor %xmm1, %xmm1
-; SSE41-NEXT:    punpcklbw %xmm1, %xmm0
-  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
-  ret <16 x i8> %shuffle
-}
-
-define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) {
-; SSE2-LABEL: @zext_to_v4i32_shuffle
-; SSE2:         pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-; SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-;
-; SSSE3-LABEL: @zext_to_v4i32_shuffle
-; SSSE3:         pxor %xmm1, %xmm1
-; SSSE3-NEXT:    punpcklbw %xmm1, %xmm0
-; SSSE3-NEXT:    punpcklbw %xmm1, %xmm0
-;
-; SSE41-LABEL: @zext_to_v4i32_shuffle
-; SSE41:         pxor %xmm1, %xmm1
-; SSE41-NEXT:    punpcklbw %xmm1, %xmm0
-; SSE41-NEXT:    punpcklbw %xmm1, %xmm0
-  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
-  ret <16 x i8> %shuffle
-}
-
 define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
 ; SSE2-LABEL: @trunc_v4i32_shuffle
 ; SSE2:       # BB#0:
@@ -545,3 +510,98 @@ define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
   ret <16 x i8> %shuffle
 }
+
+define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
+; SSSE3-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
+; SSSE3-NEXT:    pshufb {{.*}} # [[X2]] = zero,[[X2]][2,4,6],zero,[[X2]][10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm0, %[[X2]]
+; SSSE3-NEXT:    punpcklbw {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3],[[X2]][4],[[X1]][4],[[X2]][5],[[X1]][5],[[X2]][6],[[X1]][6],[[X2]][7],[[X1]][7]
+; SSSE3-NEXT:    movdqa %[[X2]], %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
+; SSSE3-LABEL: @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
+; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
+  ret <16 x i8> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index b61e282404c..89231586d06 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -981,3 +981,95 @@ define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) {
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
   ret <8 x i16> %shuffle
 }
+
+define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) {
+; SSE2-LABEL: @shuffle_v8i16_0uuu1uuu
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_0uuu1uuu
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; SSSE3-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v8i16_0uuu1uuu
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) {
+; SSE2-LABEL: @shuffle_v8i16_0zzz1zzz
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
+; SSE2-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
+; SSE2-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
+; SSE2-NEXT:    pshufd {{.*}} # xmm0 = [[X2]][0,3,2,1]
+; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_0zzz1zzz
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
+; SSSE3-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
+; SSSE3-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
+; SSSE3-NEXT:    pshufb {{.*}} # [[X2]] = [[X2]][2,3,8,9,6,7,0,1,8,9,6,7,4,5,6,7]
+; SSSE3-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3]
+; SSSE3-NEXT:    movdqa %[[X2]], %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v8i16_0zzz1zzz
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) {
+; SSE2-LABEL: @shuffle_v8i16_0u1u2u3u
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_0u1u2u3u
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v8i16_0u1u2u3u
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) {
+; SSE2-LABEL: @shuffle_v8i16_0z1z2z3z
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_0z1z2z3z
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v8i16_0z1z2z3z
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x i16> %shuffle
+}
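
The stride search in lowerVectorShuffleAsZeroExtend can be illustrated
outside of SelectionDAG. Below is a minimal standalone C++ sketch, not code
from this patch: matchZExtStride is a hypothetical helper, and instead of
running the full computeZeroableShuffleElements analysis it simply assumes
the second shuffle operand is all zeros, as in the zeroinitializer tests
above.

#include <cstdio>
#include <vector>

// Returns the matched stride (for a 16 x i8 shuffle: 2 -> pmovzxbw,
// 4 -> pmovzxbd, 8 -> pmovzxbq), or 0 if the mask is not a zext pattern.
// Mask entries: -1 is undef; entries >= NumElements index the second
// operand, which this sketch assumes is an all-zero vector.
static int matchZExtStride(const std::vector<int> &Mask, int EltBits) {
  int NumElements = (int)Mask.size();
  int Bits = EltBits * NumElements;
  // Try the widest extension (64-bit result elements) first, then halve the
  // stride each iteration, mirroring the NumExtElements loop in the patch.
  for (int NumExtElements = Bits / 64; NumExtElements < NumElements;
       NumExtElements *= 2) {
    int Stride = NumElements / NumExtElements;
    int BaseInput = -1; // Which operand the base lanes come from.
    bool OK = true;
    for (int i = 0; i < NumElements && OK; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Undef lanes are valid anywhere.
      if (i % Stride != 0) {
        // Extended lanes must be zeroable; by assumption that means they
        // must come from the all-zero second operand.
        OK = (M >= NumElements);
      } else {
        // Base lanes must be consecutive indices into one and the same input.
        int Input = M / NumElements;
        if (BaseInput < 0)
          BaseInput = Input;
        OK = (Input == BaseInput) && (M % NumElements == i / Stride);
      }
    }
    if (OK)
      return Stride;
  }
  return 0;
}

int main() {
  // The mask of shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz.
  std::vector<int> Mask = {0, 17, 18, 19, 1, 21, 22, 23,
                           2, 25, 26, 27, 3, 29, 30, 31};
  std::printf("stride = %d\n", matchZExtStride(Mask, 8));
  return 0;
}

Run on this mask, the widest-first search rejects stride 8 (lane 4 holds
element 1 of the first input, which is neither undef nor zeroable) and then
accepts stride 4, corresponding to the pmovzxbd the SSE41 checks expect.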