diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index fc2932b181f..432cf930b4a 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -102,6 +102,17 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     Src1Name = getRegName(MI->getOperand(1).getReg());
     DestName = getRegName(MI->getOperand(0).getReg());
     break;
+  case X86::VPBLENDWYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPBLENDWYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v16i16,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
   case X86::VPBLENDDrri:
     Src2Name = getRegName(MI->getOperand(2).getReg());
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 713e147fbf5..a3f45233454 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -301,11 +301,18 @@ void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
   }
 }
 
-void DecodeBLENDMask(MVT VT, unsigned Imm,
-                     SmallVectorImpl<int> &ShuffleMask) {
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  int ElementBits = VT.getScalarSizeInBits();
   int NumElements = VT.getVectorNumElements();
-  for (int i = 0; i < NumElements; ++i)
-    ShuffleMask.push_back(((Imm >> i) & 1) ? NumElements + i : i);
+  for (int i = 0; i < NumElements; ++i) {
+    // If there are more than 8 elements in the vector, then any immediate blend
+    // mask applies to each 128-bit lane. There can never be more than
+    // 8 elements in a 128-bit lane with an immediate blend.
+    int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
+    assert(Bit < 8 &&
+           "Immediate blends only operate over 8 elements at a time!");
+    ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
+  }
 }
 
 /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
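To make the new decode rule concrete: for vectors with more than 8 elements, the 8-bit blend immediate is simply reused per 128-bit lane. Below is a minimal standalone sketch (plain C++ with my own names, not the LLVM helper itself; the 0xAA immediate is just an illustrative value) of what DecodeBLENDMask now computes for v16i16:

#include <cassert>
#include <vector>

// Expand an 8-bit VPBLENDW immediate into a 16-element shuffle mask for
// v16i16. Indices 0..15 select the first source, 16..31 the second.
std::vector<int> decodeBlendMaskV16I16(unsigned Imm) {
  const int NumElements = 16;  // v16i16
  const int ElementBits = 16;  // bits per element
  std::vector<int> ShuffleMask;
  for (int i = 0; i < NumElements; ++i) {
    // More than 8 elements: take the immediate bit modulo the number of
    // elements per 128-bit lane (128 / 16 = 8 here), mirroring the lanes.
    int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
    assert(Bit < 8 && "Immediate blends only cover 8 elements");
    // A set bit selects the second source for that word, a clear bit the first.
    ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
  }
  // e.g. Imm = 0xAA -> 0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31
  return ShuffleMask;
}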
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f93f1490f18..9be0f23597a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7183,6 +7183,56 @@ static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
   return true;
 }
 
+/// \brief Test whether there are elements crossing 128-bit lanes in this
+/// shuffle mask.
+///
+/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
+/// and we routinely test for these.
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+      return true;
+  return false;
+}
+
+/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
+///
+/// This checks a shuffle mask to see if it is performing the same
+/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// *not* suitable for use with existing 128-bit shuffles as it will contain
+/// entries from both V1 and V2 inputs to the wider mask.
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+                                SmallVectorImpl<int> &RepeatedMask) {
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  RepeatedMask.resize(LaneSize, -1);
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // This entry crosses lanes, so there is no way to model this shuffle.
+      return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    if (RepeatedMask[i % LaneSize] == -1)
+      // This is the first non-undef entry in this slot of a 128-bit lane.
+      RepeatedMask[i % LaneSize] =
+          Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
+    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+      // Found a mismatch with the repeated mask.
+      return false;
+  }
+  return true;
+}
+
 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
 // 2013 will allow us to use it as a non-type template parameter.
 namespace {
@@ -7312,6 +7362,38 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                        DAG.getConstant(BlendMask, MVT::i8)));
   }
 
+  case MVT::v16i16: {
+    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+    SmallVector<int, 8> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+      BlendMask = 0;
+      for (int i = 0; i < 8; ++i)
+        if (RepeatedMask[i] >= 16)
+          BlendMask |= 1u << i;
+      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+                         DAG.getConstant(BlendMask, MVT::i8));
+    }
+
+    // Fall back to a fully general variable byte blend.
+    SDValue PBLENDVMask[32];
+    // Scale the blend by the number of bytes per element.
+    int Scale = VT.getScalarSizeInBits() / 8;
+    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+    for (int i = 0, Size = Mask.size(); i < Size; ++i)
+      for (int j = 0; j < Scale; ++j)
+        PBLENDVMask[Scale * i + j] =
+            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+                        : DAG.getConstant(Mask[i] < Size ? 0 : 0x80, MVT::i8);
+
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
+    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+    return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(
+        X86ISD::BLENDV, DL, MVT::v32i8, V1, V2,
+        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PBLENDVMask)));
+  }
+
   default:
     llvm_unreachable("Not a supported integer vector type!");
   }
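As a concrete illustration of the two steps in the v16i16 blend case above — first verify that the mask repeats per 128-bit lane, then fold the 8-entry repeated mask into the PBLENDW immediate — here is a small self-contained C++ sketch (simplified types of my own, not the LLVM code; the interleaving example mask at the end is hypothetical):

#include <array>
#include <cstdint>

// Does Mask perform the same lane-relative shuffle in both 128-bit lanes
// (8 x i16 per lane)? Entries 0..15 select V1, 16..31 select V2, -1 is undef.
static bool repeatsPerLane(const std::array<int, 16> &Mask,
                           std::array<int, 8> &Repeated) {
  Repeated.fill(-1);
  for (int i = 0; i < 16; ++i) {
    if (Mask[i] < 0)
      continue;                      // undef is compatible with anything
    if ((Mask[i] % 16) / 8 != i / 8)
      return false;                  // element crosses a 128-bit lane
    // Lane-relative form: V1 entries map to 0..7, V2 entries to 16..23.
    int LaneRel = Mask[i] < 16 ? Mask[i] % 8 : Mask[i] % 8 + 16;
    if (Repeated[i % 8] == -1)
      Repeated[i % 8] = LaneRel;     // first non-undef entry for this slot
    else if (Repeated[i % 8] != LaneRel)
      return false;                  // the two lanes disagree
  }
  return true;
}

// Fold the repeated mask into the PBLENDW immediate: bit i set picks word i
// of V2 in each lane, clear picks word i of V1.
static uint8_t blendImmediate(const std::array<int, 8> &Repeated) {
  uint8_t Imm = 0;
  for (int i = 0; i < 8; ++i)
    if (Repeated[i] >= 16)
      Imm |= uint8_t(1u << i);
  return Imm;
}

// For the interleaving mask <0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31> the
// repeated mask is <0,17,2,19,4,21,6,23> and the immediate is 0xAA, matching
// the DecodeBLENDMask example earlier.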
@@ -9215,56 +9297,6 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 }
 
-/// \brief Test whether there are elements crossing 128-bit lanes in this
-/// shuffle mask.
-///
-/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
-/// and we routinely test for these.
-static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
-  int LaneSize = 128 / VT.getScalarSizeInBits();
-  int Size = Mask.size();
-  for (int i = 0; i < Size; ++i)
-    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
-      return true;
-  return false;
-}
-
-/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
-///
-/// This checks a shuffle mask to see if it is performing the same
-/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
-/// that it is also not lane-crossing. It may however involve a blend from the
-/// same lane of a second vector.
-///
-/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
-/// non-trivial to compute in the face of undef lanes. The representation is
-/// *not* suitable for use with existing 128-bit shuffles as it will contain
-/// entries from both V1 and V2 inputs to the wider mask.
-static bool
-is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
-                                SmallVectorImpl<int> &RepeatedMask) {
-  int LaneSize = 128 / VT.getScalarSizeInBits();
-  RepeatedMask.resize(LaneSize, -1);
-  int Size = Mask.size();
-  for (int i = 0; i < Size; ++i) {
-    if (Mask[i] < 0)
-      continue;
-    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
-      // This entry crosses lanes, so there is no way to model this shuffle.
-      return false;
-
-    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
-    if (RepeatedMask[i % LaneSize] == -1)
-      // This is the first non-undef entry in this slot of a 128-bit lane.
-      RepeatedMask[i % LaneSize] =
-          Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
-    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
-      // Found a mismatch with the repeated mask.
-      return false;
-  }
-  return true;
-}
-
 /// \brief Generic routine to split a 256-bit vector shuffle into 128-bit
 /// shuffles.
 ///
@@ -9581,9 +9613,74 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
 
-  // FIXME: Actually implement this using AVX2!!!
-  (void)Mask;
-  return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+                                                Subtarget, DAG))
+    return Blend;
+
+  // If the shuffle mask is repeated in each 128-bit lane we can use more
+  // efficient instructions that mirror the shuffles across the two 128-bit
+  // lanes.
+  SmallVector<int, 8> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+    assert(RepeatedMask.size() == 8 && "Unexpected repeated mask size!");
+    // FIXME: It might be worth it to call into the (terribly complex) v8i16
+    // lowering here.
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (isShuffleEquivalent(Mask,
+                            // First 128-bit lane:
+                            0, 16, 1, 17, 2, 18, 3, 19,
+                            // Second 128-bit lane:
+                            8, 24, 9, 25, 10, 26, 11, 27))
+      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
+    if (isShuffleEquivalent(Mask,
+                            // First 128-bit lane:
+                            4, 20, 5, 21, 6, 22, 7, 23,
+                            // Second 128-bit lane:
+                            12, 28, 13, 29, 14, 30, 15, 31))
+      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
+  }
+
+  // There are no generalized cross-lane shuffle operations available on i16
+  // element types.
+  // FIXME: We should teach the "split and lower" path to do something more
+  // clever, or do it ourselves here. The optimal lowering of cross-lane
+  // shuffles I am aware of is to swap the lanes into a copy, shuffle both the
+  // original and the copy, and then blend to pick up the cross-lane elements.
+  // This is four instructions with a tree height of three, which is better
+  // than the worst case for a gather-cross-scatter approach such as the one
+  // used in SSE2 v8i16 lowering (where we don't have blends). While the
+  // cross-lane blend case results in a blend tree, blends are very cheap on
+  // AVX2 and newer chips. We might also want to special case situations where
+  // we can always do a single VPERMD to produce a non-lane-crossing shuffle.
+  if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+    return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+  if (isSingleInputShuffleMask(Mask)) {
+    SDValue PSHUFBMask[32];
+    for (int i = 0; i < 16; ++i) {
+      if (Mask[i] == -1) {
+        PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
+        continue;
+      }
+
+      int M = i < 8 ? Mask[i] : Mask[i] - 8;
+      assert(M >= 0 && M < 8 && "Invalid single-input mask!");
+      PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
+      PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
+    }
+    return DAG.getNode(
+        ISD::BITCAST, DL, MVT::v16i16,
+        DAG.getNode(
+            X86ISD::PSHUFB, DL, MVT::v32i8,
+            DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
+            DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
+  }
+
+  // Otherwise fall back on generic blend lowering.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i16, V1, V2,
+                                                    Mask, DAG);
 }
 
 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 004d2aca941..39281c8915d 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1098,7 +1098,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   // a constant shuffle mask. We won't be able to do this at the MC layer
   // because the mask isn't an immediate.
   case X86::PSHUFBrm:
-  case X86::VPSHUFBrm: {
+  case X86::VPSHUFBrm:
+  case X86::VPSHUFBYrm: {
     if (!OutStreamer.isVerboseAsm())
       break;
     assert(MI->getNumOperands() > 5 &&
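The single-input path above widens each word index M into the byte pair (2M, 2M+1); because VPSHUFB addresses bytes per 128-bit lane, high-lane indices are first made lane-relative (the `Mask[i] - 8` adjustment). A standalone C++ sketch of that widening, under my own simplified representation (-1 standing in for an undef byte, hypothetical helper name):

#include <array>

// Widen a 16-entry v16i16 single-input, non-lane-crossing shuffle mask into
// the 32-entry byte mask that VPSHUFB consumes.
std::array<int, 32> buildPshufbMask(const std::array<int, 16> &Mask) {
  std::array<int, 32> Bytes{};
  for (int i = 0; i < 16; ++i) {
    if (Mask[i] < 0) {                      // undef word -> two undef bytes
      Bytes[2 * i] = Bytes[2 * i + 1] = -1;
      continue;
    }
    int M = i < 8 ? Mask[i] : Mask[i] - 8;  // make the index lane-relative
    Bytes[2 * i] = 2 * M;                   // low byte of the selected word
    Bytes[2 * i + 1] = 2 * M + 1;           // high byte of the selected word
  }
  return Bytes;
}

Note that the decoded asm comments in the tests below print the second lane's byte indices as 16..31; that is the whole-ymm view of the same lane-relative hardware mask bytes this sketch produces.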
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 9145e0c7878..0d3a6220343 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -385,11 +385,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -407,11 +403,7 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
   ret <16 x i16> %shuffle
@@ -430,12 +422,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
   ret <16 x i16> %shuffle
@@ -454,12 +441,7 @@ define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15>
   ret <16 x i16> %shuffle
@@ -478,12 +460,7 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   ret <16 x i16> %shuffle
@@ -502,12 +479,7 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[1,1,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,5,5,7,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[1,1,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,7,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
   ret <16 x i16> %shuffle
@@ -637,11 +609,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
   ret <16 x i16> %shuffle
@@ -684,16 +652,9 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*}} # xmm3 = [0,1,0,1,4,5,0,1,0,1,0,1,12,13,0,1]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpshufd {{.*}} # xmm4 = xmm4[0,0,0,0]
-; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,0,0,0]
-; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[u,u,0,1,u,u,0,1,u,u,0,1,u,u,0,1,u,u,16,17,u,u,16,17,u,u,16,17,u,u,16,17]
+; AVX2-NEXT: vpshufd {{.*}} # ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
   ret <16 x i16> %shuffle
@@ -713,13 +674,8 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpshuflw {{.*}} # xmm3 = xmm3[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 24, i32 24, i32 24, i32 24, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %shuffle
@@ -741,15 +697,9 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpshuflw {{.*}} # xmm3 = xmm3[3,2,1,0,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[u,u,u,u,u,u,u,u,14,15,12,13,10,11,8,9,u,u,u,u,u,u,u,u,30,31,28,29,26,27,24,25]
+; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12>
   ret <16 x i16> %shuffle
@@ -773,17 +723,9 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*}} # xmm3 = xmm3[0,1,0,1]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; AVX2-NEXT: vpblendd {{.*}} # xmm2 = xmm2[0,1],xmm3[2,3]
-; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; AVX2-NEXT: vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17]
+; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 3, i32 2, i32 1, i32 0, i32 27, i32 26, i32 25, i32 24, i32 11, i32 10, i32 9, i32 8>
   ret <16 x i16> %shuffle
@@ -801,11 +743,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
   ret <16 x i16> %shuffle
@@ -823,11 +761,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -845,11 +779,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -867,11 +797,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -889,11 +815,7 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 13, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -911,11 +833,7 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 14, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -933,11 +851,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -959,15 +873,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_2
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpunpcklwd {{.*}} # xmm2 = xmm2[0,0,1,1,2,2,3,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpmovzxwd %xmm3, %xmm3
-; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-NEXT: vpunpcklwd {{.*}} # xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; AVX2-NEXT: vpmovzxwd %xmm0, %xmm0
-; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*}} # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
   ret <16 x i16> %shuffle
@@ -989,15 +895,7 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_3
 ;
 ; AVX2-LABEL: @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpunpckhwd {{.*}} # xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpunpckhwd {{.*}} # xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-NEXT: vpunpckhwd {{.*}} # xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpunpckhwd {{.*}} # xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhwd {{.*}} # ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   ret <16 x i16> %shuffle
@@ -1019,15 +917,9 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_3
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpunpckhwd {{.*}} # xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpunpckhwd {{.*}} # xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-NEXT: vpunpcklwd {{.*}} # xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; AVX2-NEXT: vpmovzxwd %xmm0, %xmm0
-; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
+; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   ret <16 x i16> %shuffle
@@ -1049,15 +941,9 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2
 ;
 ; AVX2-LABEL: @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27
 ; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpunpcklwd {{.*}} # xmm2 = xmm2[0,0,1,1,2,2,3,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpmovzxwd %xmm3, %xmm3
-; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-NEXT: vpunpckhwd {{.*}} # xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpunpckhwd {{.*}} # xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
+; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
   ret <16 x i16> %shuffle
@@ -1074,10 +960,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 9, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1094,10 +977,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 10, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1114,10 +994,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1134,10 +1011,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1154,10 +1028,7 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 13, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1174,10 +1045,7 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 14, i32 8>
   ret <16 x i16> %shuffle
@@ -1194,10 +1062,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 15>
   ret <16 x i16> %shuffle
@@ -1215,11 +1080,7 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 14, i32 14, i32 12, i32 12, i32 10, i32 10, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1237,11 +1098,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
   ret <16 x i16> %shuffle
@@ -1258,10 +1115,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 14, i32 8>
   ret <16 x i16> %shuffle
@@ -1278,10 +1132,7 @@ define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_1
 ;
 ; AVX2-LABEL: @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 15>
   ret <16 x i16> %shuffle
@@ -1299,11 +1150,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_0
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,1,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 2, i32 4, i32 4, i32 undef, i32 6, i32 14, i32 14, i32 undef, i32 12, i32 10, i32 10, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1322,12 +1169,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12
 ;
 ; AVX2-LABEL: @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*}} # xmm1 = xmm0[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12>
   ret <16 x i16> %shuffle