diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c09c38a31a1..1bfacfe65a2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9470,6 +9470,61 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 }
 
+/// \brief Helper function to test whether a shuffle mask could be
+/// simplified by widening the elements being shuffled.
+///
+/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
+/// leaves it in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zero-ed lane of a vector.
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+                                    SmallVectorImpl<int> &WidenedMask) {
+  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+    // If both elements are undef, its trivial.
+    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+      WidenedMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    // Check for an undef mask and a mask value properly aligned to fit with
+    // a pair of values. If we find such a case, use the non-undef mask's value.
+    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
+      WidenedMask.push_back(Mask[i + 1] / 2);
+      continue;
+    }
+    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
+      WidenedMask.push_back(Mask[i] / 2);
+      continue;
+    }
+
+    // When zeroing, we need to spread the zeroing across both lanes to widen.
+    if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
+      if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
+          (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+        WidenedMask.push_back(SM_SentinelZero);
+        continue;
+      }
+      return false;
+    }
+
+    // Finally check if the two mask values are adjacent and aligned with
+    // a pair.
+    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
+      WidenedMask.push_back(Mask[i] / 2);
+      continue;
+    }
+
+    // Otherwise we can't safely widen the elements used in this shuffle.
+    return false;
+  }
+  assert(WidenedMask.size() == Mask.size() / 2 &&
+         "Incorrect size of mask after widening the elements!");
+
+  return true;
+}
+
 /// \brief Generic routine to split a vector shuffle into half-sized shuffles.
 ///
 /// This routine just extracts two subvectors, shuffles them independently, and
@@ -9583,6 +9638,43 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
 }
 
+/// \brief Handle lowering 2-lane 128-bit shuffles.
+static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+                                        SDValue V2, ArrayRef<int> Mask,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  // Blends are faster and handle all the non-lane-crossing cases.
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
+                                                Subtarget, DAG))
+    return Blend;
+
+  MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+                               VT.getVectorNumElements() / 2);
+  // Check for patterns which can be matched with a single insert of a 128-bit
+  // subvector.
+  if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
+      isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
+    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                              DAG.getIntPtrConstant(0));
+    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+                              Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+  }
+  if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
+    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                              DAG.getIntPtrConstant(0));
+    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
+                              DAG.getIntPtrConstant(2));
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+  }
+
+  // Otherwise form a 128-bit permutation.
+  // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
+  unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
+  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
+                     DAG.getConstant(PermMask, MVT::i8));
+}
+
 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
 ///
 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -9597,6 +9689,11 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
 
+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
+                                    DAG);
+
   if (isSingleInputShuffleMask(Mask)) {
     // Check for being able to broadcast a single element.
     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
@@ -9682,6 +9779,11 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
 
+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
+                                    DAG);
+
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
@@ -10191,61 +10293,6 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
 }
 
-/// \brief Helper function to test whether a shuffle mask could be
-/// simplified by widening the elements being shuffled.
-///
-/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
-/// leaves it in an unspecified state.
-///
-/// NOTE: This must handle normal vector shuffle masks and *target* vector
-/// shuffle masks. The latter have the special property of a '-2' representing
-/// a zero-ed lane of a vector.
-static bool canWidenShuffleElements(ArrayRef<int> Mask,
-                                    SmallVectorImpl<int> &WidenedMask) {
-  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
-    // If both elements are undef, its trivial.
-    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
-      WidenedMask.push_back(SM_SentinelUndef);
-      continue;
-    }
-
-    // Check for an undef mask and a mask value properly aligned to fit with
-    // a pair of values. If we find such a case, use the non-undef mask's value.
-    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
-      WidenedMask.push_back(Mask[i + 1] / 2);
-      continue;
-    }
-    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
-      WidenedMask.push_back(Mask[i] / 2);
-      continue;
-    }
-
-    // When zeroing, we need to spread the zeroing across both lanes to widen.
- if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { - if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && - (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { - WidenedMask.push_back(SM_SentinelZero); - continue; - } - return false; - } - - // Finally check if the two mask values are adjacent and aligned with - // a pair. - if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { - WidenedMask.push_back(Mask[i] / 2); - continue; - } - - // Otherwise we can't safely widen the elements used in this shuffle. - return false; - } - assert(WidenedMask.size() == Mask.size() / 2 && - "Incorrect size of mask after widening the elements!"); - - return true; -} - /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll index a8e5d885443..fa554e97856 100644 --- a/test/CodeGen/X86/avx-vperm2x128.ll +++ b/test/CodeGen/X86/avx-vperm2x128.ll @@ -2,15 +2,10 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; AVX1-LABEL: A: -; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: A: -; AVX2: ## BB#0: ## %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: retq +; ALL-LABEL: A: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; ALL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -27,66 +22,40 @@ entry: } define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; AVX1-LABEL: C: -; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: C: -; AVX2: ## BB#0: ## %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; ALL-LABEL: C: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; AVX1-LABEL: D: -; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: D: -; AVX2: ## BB#0: ## %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: retq +; ALL-LABEL: D: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; ALL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { -; AVX1-LABEL: E: -; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: E: -; AVX2: ## BB#0: ## %entry -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: retq +; ALL-LABEL: E: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; ALL-NEXT: retq entry: %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable 
readnone ssp { -; AVX1-LABEL: E2: -; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: E2: -; AVX2: ## BB#0: ## %entry -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: retq +; ALL-LABEL: E2: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; ALL-NEXT: retq entry: %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -95,15 +64,18 @@ entry: define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { ; AVX1-LABEL: Ei: ; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: Ei: ; AVX2: ## BB#0: ## %entry ; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: retq entry: ; add forces execution domain @@ -115,19 +87,19 @@ entry: define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { ; AVX1-LABEL: E2i: ; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: E2i: ; AVX2: ## BB#0: ## %entry ; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] ; AVX2-NEXT: retq entry: ; add forces execution domain @@ -139,17 +111,19 @@ entry: define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { ; AVX1-LABEL: E3i: ; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1] +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: E3i: ; AVX2: ## BB#0: ## %entry ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: retq entry: ; add forces execution domain @@ -162,15 +136,13 @@ define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone s ; AVX1-LABEL: E4i: ; AVX1: ## BB#0: ## %entry ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: E4i: ; AVX2: ## BB#0: ## %entry ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq entry: ; add forces execution domain @@ -184,9 +156,8 @@ define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ; AVX1: ## BB#0: ## %entry ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovapd (%rsi), %ymm1 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-NEXT: vmovaps (%rsi), %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: E5i: @@ -194,8 +165,7 @@ define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq entry: %c = load <16 x i16>* %a @@ -208,19 +178,10 @@ entry: ;;;; Cases with undef indicies mixed in the mask define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; AVX1-LABEL: F: -; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: F: -; AVX2: ## BB#0: ## %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] -; AVX2-NEXT: retq +; ALL-LABEL: F: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1,0,1] +; ALL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -229,19 +190,12 @@ entry: ;;;; Cases we must not select vperm2f128 define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; AVX1-LABEL: G: -; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: G: -; AVX2: ## BB#0: ## %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] -; AVX2-NEXT: retq +; ALL-LABEL: G: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1] +; ALL-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; ALL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle diff --git a/test/CodeGen/X86/avx512-build-vector.ll 
b/test/CodeGen/X86/avx512-build-vector.ll index 38f0cbcc8d1..9e9ad31c916 100644 --- a/test/CodeGen/X86/avx512-build-vector.ll +++ b/test/CodeGen/X86/avx512-build-vector.ll @@ -4,7 +4,7 @@ define <16 x i32> @test1(i32* %x) { ; CHECK-LABEL: test1: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovd (%rdi), %xmm0 -; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7] ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 7899a52a741..64cdee75db8 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -274,33 +274,19 @@ define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) { } define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) { -; AVX1-LABEL: shuffle_v4f64_0145: -; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v4f64_0145: -; AVX2: # BB#0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX2-NEXT: retq +; ALL-LABEL: shuffle_v4f64_0145: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) { -; AVX1-LABEL: shuffle_v4f64_4501: -; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v4f64_4501: -; AVX2: # BB#0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX2-NEXT: retq +; ALL-LABEL: shuffle_v4f64_4501: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -476,15 +462,15 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) { define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_0142: ; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2] ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_0142: ; AVX2: # BB#0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] -; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -531,17 +517,10 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) { } define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) { -; AVX1-LABEL: shuffle_v4i64_0145: -; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v4i64_0145: -; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: retq +; ALL-LABEL: shuffle_v4i64_0145: +; ALL: 
# BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -558,8 +537,8 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) { ; ; AVX2-LABEL: shuffle_v4i64_0451: ; AVX2: # BB#0: +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] ; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -567,17 +546,10 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) { } define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) { -; AVX1-LABEL: shuffle_v4i64_4501: -; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v4i64_4501: -; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: retq +; ALL-LABEL: shuffle_v4i64_4501: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -594,7 +566,7 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) { ; ; AVX2-LABEL: shuffle_v4i64_4015: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: retq @@ -605,7 +577,7 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) { define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_2u35: ; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -614,7 +586,7 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) { ; ; AVX2-LABEL: shuffle_v4i64_2u35: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index 2f02f2fc08f..d9a69c73893 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -101,10 +101,10 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01014545: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,1,0,1] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -148,8 +148,8 @@ define <8 x double> 
@shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08080808: ; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 -; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: retq @@ -160,13 +160,13 @@ define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08084c4c: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vbroadcastsd %xmm2, %ymm2 -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 ; ALL-NEXT: vbroadcastsd %xmm3, %ymm3 -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 -; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] ; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq @@ -691,10 +691,8 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_c348cda0: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,3,2,3] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 -; ALL-NEXT: vbroadcastsd %xmm3, %ymm3 -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[0,1],ymm2[0,1] ; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 ; ALL-NEXT: vbroadcastsd %xmm1, %ymm4 ; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] @@ -711,18 +709,18 @@ define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_f511235a: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[0,1,1,3] -; ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,3,2,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3] +; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] ; ALL-NEXT: vpermilpd {{.*#+}} ymm4 = ymm1[0,0,2,2] -; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] ; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1] -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] ; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 ; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3] ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm0, %zmm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -826,10 +824,10 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> 
@shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01014545: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -873,8 +871,8 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08080808: ; ALL: # BB#0: +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 -; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 ; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: retq @@ -885,13 +883,13 @@ define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08084c4c: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpbroadcastq %xmm2, %ymm2 -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 -; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 ; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq @@ -1416,17 +1414,17 @@ define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_6caa87e5: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,3,2,1] -; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; ALL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] -; ALL-NEXT: vpbroadcastq %xmm2, %ymm2 -; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle
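
Editor's note, not part of the patch: a standalone C++ sketch of the two ideas the new hunks implement, namely widening a v4 shuffle mask into a 2-lane mask (the simple, sentinel-free case of canWidenShuffleElements) and deriving the VPERM2X128 immediate from the widened mask (mirroring the PermMask computation in lowerV2X128VectorShuffle). The helpers widenMask and perm2x128Imm are illustrative names, not LLVM APIs.

// Standalone sketch (not LLVM code). widenMask() handles only plain,
// non-negative masks; the in-tree canWidenShuffleElements() additionally
// copes with SM_SentinelUndef/SM_SentinelZero entries.
#include <cassert>
#include <cstdio>
#include <vector>

// Widen a 4-element shuffle mask to a 2-lane (128-bit) mask: each adjacent
// pair must select one whole, aligned 128-bit lane.
static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    if (Mask[i] % 2 != 0 || Mask[i] + 1 != Mask[i + 1])
      return false; // this pair does not cover an aligned 128-bit lane
    Widened.push_back(Mask[i] / 2);
  }
  return true;
}

// Build the VPERM2X128 immediate. Bits 1:0 pick the source lane for the low
// half of the result and bits 5:4 the lane for the high half; lane values
// 0-1 name the first source and 2-3 the second.
static unsigned perm2x128Imm(const std::vector<int> &Widened) {
  assert(Widened.size() == 2 && "expected a 2-lane widened mask");
  return (unsigned)Widened[0] | ((unsigned)Widened[1] << 4);
}

int main() {
  // <4 x i64> mask <2, 3, 0, 1>: swap the two 128-bit halves of one input,
  // i.e. the ymm0[2,3,0,1] pattern checked in @A above.
  std::vector<int> Mask = {2, 3, 0, 1}, Widened;
  if (widenMask(Mask, Widened))
    std::printf("vperm2x128 imm = 0x%02x\n", perm2x128Imm(Widened)); // 0x01
  return 0;
}

Because the widened-mask values 0-1 name the lanes of the first source and 2-3 the lanes of the second, they can be dropped directly into the immediate's bit fields, which is why the patch can compute PermMask as Mask[0] / 2 | (Mask[2] / 2) << 4 without a separate source-selection step.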
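Also not part of the patch: a small sketch that decodes a VPERM2F128/VPERM2I128 immediate into the lane annotation used by the CHECK lines above (for example 0x03 becomes ymm1[2,3],ymm0[0,1], the pattern expected in @E2). decodePerm2x128 is a hypothetical helper; unlike LLVM's shuffle-comment printer it neither merges the two halves when they come from the same register nor handles the zeroing bits.

// Standalone sketch (not LLVM code): render a VPERM2F128/VPERM2I128
// immediate in the "ymmA[lanes],ymmB[lanes]" style of the CHECK lines.
#include <cstdio>
#include <string>

static std::string decodePerm2x128(unsigned Imm) {
  auto Half = [](unsigned Sel) {
    // Bit 1 of the selector picks the source register, bit 0 its 128-bit lane.
    std::string Src = (Sel & 2) ? "ymm1" : "ymm0";
    return Src + ((Sel & 1) ? "[2,3]" : "[0,1]");
  };
  // Bits 3 and 7 would zero a half; ignored here for brevity.
  return Half(Imm & 3) + "," + Half((Imm >> 4) & 3);
}

int main() {
  // 0x03: low half from ymm1's high lane, high half from ymm0's low lane.
  std::printf("ymm0 = %s\n", decodePerm2x128(0x03).c_str());
  return 0;
}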