From 42ceb12123be2852f905696d155e96b76a3a2c6a Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Mon, 22 Jun 2015 09:01:15 +0000 Subject: [PATCH] Reverted AVX-512 vector shuffle git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@240258 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 250 ++------ test/CodeGen/X86/avx512-build-vector.ll | 10 - test/CodeGen/X86/avx512-shuffle.ll | 392 ------------ test/CodeGen/X86/vector-shuffle-512-v8.ll | 713 +++++++++++++++------- 4 files changed, 573 insertions(+), 792 deletions(-) delete mode 100644 test/CodeGen/X86/avx512-shuffle.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d41f7f64411..67e733384ab 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -6259,42 +6259,6 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, return true; } -/// \brief Test whether a shuffle mask is equivalent within each 256-bit lane. -/// -/// This checks a shuffle mask to see if it is performing the same -/// 256-bit lane-relative shuffle in each 256-bit lane. This trivially implies -/// that it is also not lane-crossing. It may however involve a blend from the -/// same lane of a second vector. -/// -/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is -/// non-trivial to compute in the face of undef lanes. The representation is -/// *not* suitable for use with existing 256-bit shuffles as it will contain -/// entries from both V1 and V2 inputs to the wider mask. -static bool -is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, - SmallVectorImpl &RepeatedMask) { - int LaneSize = 256 / VT.getScalarSizeInBits(); - RepeatedMask.resize(LaneSize, -1); - int Size = Mask.size(); - for (int i = 0; i < Size; ++i) { - if (Mask[i] < 0) - continue; - if ((Mask[i] % Size) / LaneSize != i / LaneSize) - // This entry crosses lanes, so there is no way to model this shuffle. - return false; - - // Ok, handle the in-lane shuffles by detecting if and when they repeat. - if (RepeatedMask[i % LaneSize] == -1) - // This is the first non-undef entry in this slot of a 256-bit lane. - RepeatedMask[i % LaneSize] = - Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; - else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) - // Found a mismatch with the repeated mask. - return false; - } - return true; -} - /// \brief Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// @@ -6354,22 +6318,6 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, SDLoc DL, return DAG.getConstant(Imm, DL, MVT::i8); } -/// \brief Get a 8-bit shuffle, 1 bit per lane, immediate for a mask. -/// -/// This helper function produces an 8-bit shuffle immediate corresponding to -/// the ubiquitous shuffle encoding scheme used in x86 instructions for -/// shuffling 8 lanes. -static SDValue get1bitLaneShuffleImm8ForMask(ArrayRef Mask, SDLoc DL, - SelectionDAG &DAG) { - assert(Mask.size() <= 8 && - "Up to 8 elts may be in Imm8 1-bit lane shuffle mask"); - unsigned Imm = 0; - for (unsigned i = 0; i < Mask.size(); ++i) - if (Mask[i] >= 0) - Imm |= (Mask[i] % 2) << i; - return DAG.getConstant(Imm, DL, MVT::i8); -} - /// \brief Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are @@ -9385,30 +9333,6 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, DAG.getConstant(PermMask, DL, MVT::i8)); } -/// \brief Handle lowering 4-lane 128-bit shuffles. -static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef WidenedMask, - SelectionDAG &DAG) { - - assert(WidenedMask.size() == 4 && "Unexpected mask size for 128bit shuffle!"); - // form a 128-bit permutation. - // convert the 64-bit shuffle mask selection values into 128-bit selection - // bits defined by a vshuf64x2 instruction's immediate control byte. - unsigned PermMask = 0, Imm = 0; - - for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { - if(WidenedMask[i] == SM_SentinelZero) - return SDValue(); - - // use first element in place of undef musk - Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; - PermMask |= (Imm % 4) << (i * 2); - } - - return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, - DAG.getConstant(PermMask, DL, MVT::i8)); -} - /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. /// @@ -10144,105 +10068,35 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } -static SDValue lowerVectorShuffleWithVALIGN(SDLoc DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { - - assert(VT.getScalarSizeInBits() >= 32 && "Unexpected data type for VALIGN"); - // VALIGN pattern 2, 3, 4, 5, .. (sequential, shifted right) - int AlignVal = -1; - for (int i = 0; i < (signed)VT.getVectorNumElements(); ++i) { - if (Mask[i] < 0) - continue; - if (Mask[i] < i) - return SDValue(); - if (AlignVal == -1) - AlignVal = Mask[i] - i; - else if (Mask[i] - i != AlignVal) - return SDValue(); - } - // Vector source operands should be swapped - return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1, - DAG.getConstant(AlignVal, DL, MVT::i8)); -} - -static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { - - assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); - - MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); - - SmallVector VPermMask; - for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) - VPermMask.push_back(Mask[i] < 0 ? DAG.getUNDEF(MaskEltVT) : - DAG.getConstant(Mask[i], DL,MaskEltVT)); - SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecVT, - VPermMask); - if (isSingleInputShuffleMask(Mask)) - return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); - - return DAG.getNode(X86ISD::VPERMV3, DL, VT, MaskNode, V1, V2); -} - - /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. -static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, +static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); - MVT VT = Op.getSimpleValueType(); - assert((V1.getSimpleValueType() == MVT::v8f64 || - V1.getSimpleValueType() == MVT::v8i64) && "Bad operand type!"); - assert((V2.getSimpleValueType() == MVT::v8f64 || - V2.getSimpleValueType() == MVT::v8i64) && "Bad operand type!"); + assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - SmallVector WidenedMask; - if (canWidenShuffleElements(Mask, WidenedMask)) - if(SDValue Op = lowerV4X128VectorShuffle(DL, VT, V1, V2, WidenedMask, DAG)) - return Op; // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); - if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG)) - return Op; - - if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, VT, Mask, V1, V2, DAG)) - return Op; - - // PERMILPD instruction - mask 0/1, 0/1, 2/3, 2/3, 4/5, 4/5, 6/7, 6/7 - if (isSingleInputShuffleMask(Mask)) { - if (!is128BitLaneCrossingShuffleMask(VT, Mask)) - return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1, - get1bitLaneShuffleImm8ForMask(Mask, DL, DAG)); - - SmallVector RepeatedMask; - if (is256BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) - return DAG.getNode(X86ISD::VPERMI, DL, VT, V1, - getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); - } - return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG); + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); } -/// \brief Handle lowering of 16-lane 32-bit integer shuffles. -static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, +/// \brief Handle lowering of 16-lane 32-bit floating point shuffles. +static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); - assert((V1.getSimpleValueType() == MVT::v16i32 || - V1.getSimpleValueType() == MVT::v16f32) && "Bad operand type!"); - assert((V2.getSimpleValueType() == MVT::v16i32 || - V2.getSimpleValueType() == MVT::v16f32) && "Bad operand type!"); + assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -10253,39 +10107,67 @@ static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 0, 16, 1, 17, 4, 20, 5, 21, // Second 128-bit lane. 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); if (isShuffleEquivalent(V1, V2, Mask, {// First 128-bit lane. 2, 18, 3, 19, 6, 22, 7, 23, // Second 128-bit lane. 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, - 12, 12, 14, 14})) - return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1); - if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, - 13, 13, 15, 15})) - return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1); + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); +} - SmallVector RepeatedMask; - if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) { - if (isSingleInputShuffleMask(Mask)) { - unsigned Opc = VT.isInteger() ? X86ISD::PSHUFD : X86ISD::VPERMILPI; - return DAG.getNode(Opc, DL, VT, V1, - getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); - } +/// \brief Handle lowering of 8-lane 64-bit integer shuffles. +static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - for (int i = 0; i < 4; ++i) - if (RepeatedMask[i] >= 16) - RepeatedMask[i] -= 12; - return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG); - } + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); - if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG)) - return Op; + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); +} - return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG); +/// \brief Handle lowering of 16-lane 32-bit integer shuffles. +static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 0, 16, 1, 17, 4, 20, 5, 21, + // Second 128-bit lane. + 8, 24, 9, 25, 12, 28, 13, 29})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 2, 18, 3, 19, 6, 22, 7, 23, + // Second 128-bit lane. + 10, 26, 11, 27, 14, 30, 15, 31})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. @@ -10345,11 +10227,13 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: - case MVT::v8i64: - return lowerV8X64VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16f32: + return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i64: + return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16i32: - return lowerV16X32VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v32i16: if (Subtarget->hasBWI()) return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG); diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll index e70d9f3ad52..e5373c575c1 100644 --- a/test/CodeGen/X86/avx512-build-vector.ll +++ b/test/CodeGen/X86/avx512-build-vector.ll @@ -1,15 +1,5 @@ ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -define <16 x i32> @test1(i32* %x) { -; CHECK-LABEL: test1: -; CHECK: vmovd (%rdi), %xmm -; CHECK: vmovdqa32 -; CHECK: vpermt2d %zmm - %y = load i32, i32* %x, align 4 - %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4 - ret <16 x i32>%res -} - define <16 x i32> @test2(<16 x i32> %x) { ; CHECK-LABEL: test2: ; CHECK: ## BB#0: diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll deleted file mode 100644 index 7e9eda58737..00000000000 --- a/test/CodeGen/X86/avx512-shuffle.ll +++ /dev/null @@ -1,392 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK-SKX - -; CHECK-LABEL: test1: -; CHECK: vpermps -; CHECK: ret -define <16 x float> @test1(<16 x float> %a) nounwind { - %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> - ret <16 x float> %c -} - -; CHECK-LABEL: test2: -; CHECK: vpermd -; CHECK: ret -define <16 x i32> @test2(<16 x i32> %a) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> - ret <16 x i32> %c -} - -; CHECK-LABEL: test3: -; CHECK: vpermq -; CHECK: ret -define <8 x i64> @test3(<8 x i64> %a) nounwind { - %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> - ret <8 x i64> %c -} - -; CHECK-LABEL: test4: -; CHECK: vpermpd -; CHECK: ret -define <8 x double> @test4(<8 x double> %a) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> - ret <8 x double> %c -} - -; CHECK-LABEL: test5: -; CHECK: vpermt2pd -; CHECK: ret -define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> - ret <8 x double> %c -} - -; CHECK-LABEL: test6: -; CHECK: vpermq $30 -; CHECK: ret -define <8 x i64> @test6(<8 x i64> %a) nounwind { - %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> - ret <8 x i64> %c -} - -; CHECK-LABEL: test7: -; CHECK: vpermt2q -; CHECK: ret -define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind { - %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> - ret <8 x i64> %c -} - -; CHECK-LABEL: test8: -; CHECK: vpermt2d -; CHECK: ret -define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - ret <16 x i32> %c -} - -; CHECK-LABEL: test9: -; CHECK: vpermt2ps -; CHECK: ret -define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind { - %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> - ret <16 x float> %c -} - -; CHECK-LABEL: test10: -; CHECK: vpermt2ps ( -; CHECK: ret -define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind { - %c = load <16 x float>, <16 x float>* %b - %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> - ret <16 x float> %d -} - -; CHECK-LABEL: test11: -; CHECK: vpermt2d -; CHECK: ret -define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind { - %c = load <16 x i32>, <16 x i32>* %b - %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> - ret <16 x i32> %d -} - -; CHECK-LABEL: test13 -; CHECK: vpermilps $177, %zmm -; CHECK: ret -define <16 x float> @test13(<16 x float> %a) { - %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> - ret <16 x float> %b -} - -; CHECK-LABEL: test14 -; CHECK: vpermilpd $203, %zmm -; CHECK: ret -define <8 x double> @test14(<8 x double> %a) { - %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> - ret <8 x double> %b -} - -; CHECK-LABEL: test15 -; CHECK: vpshufd $177, %zmm -; CHECK: ret -define <16 x i32> @test15(<16 x i32> %a) { -; mask 1-0-3-2 = 10110001 = 0xb1 = 177 - %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> - ret <16 x i32> %b -} -; CHECK-LABEL: test16 -; CHECK: valignq $3, %zmm0, %zmm1 -; CHECK: ret -define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> - ret <8 x double> %c -} - -; CHECK-LABEL: test17 -; CHECK: vshufpd $19, %zmm1, %zmm0 -; CHECK: ret -define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> - ret <8 x double> %c -} - -; CHECK-LABEL: test18 -; CHECK: vpunpckhdq %zmm -; CHECK: ret -define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) { - %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> - ret <16 x i32> %b -} - -; CHECK-LABEL: test19 -; CHECK: vpunpckldq %zmm -; CHECK: ret -define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) { - %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> - ret <16 x i32> %b -} - -; CHECK-LABEL: test20 -; CHECK: vpunpckhqdq %zmm -; CHECK: ret -define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) { - %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32> - ret <8 x i64> %b -} - -; CHECK-LABEL: test21 -; CHECK: vbroadcastsd %xmm0, %zmm -; CHECK: ret -define <8 x double> @test21(<8 x double> %a, <8 x double> %b) { - %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> - ret <8 x double> %shuffle -} - -; CHECK-LABEL: test22 -; CHECK: vpbroadcastq %xmm0, %zmm -; CHECK: ret -define <8 x i64> @test22(<8 x i64> %a, <8 x i64> %b) { - %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> - ret <8 x i64> %shuffle -} - -; CHECK-LABEL: @test23 -; CHECK: vshufps -; CHECK: vshufps -; CHECK: ret -define <16 x i32> @test23(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test24 -; CHECK: vpermt2d -; CHECK: ret -define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test25 -; CHECK: vshufps $52 -; CHECK: ret -define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind { -; mask - 0-1-3-0 00110100 = 0x34 = 52 - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test26 -; CHECK: vmovshdup -; CHECK: ret -define <16 x i32> @test26(<16 x i32> %a) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test27 -; CHECK: ret -define <16 x i32> @test27(<4 x i32>%a) { - %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> - ret <16 x i32> %res -} - -; CHECK-LABEL: test28 -; CHECK: vpshufhw $177, %ymm -; CHECK: ret -define <16 x i16> @test28(<16 x i16> %a) { - %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> - ret <16 x i16> %b -} - -; CHECK-LABEL: test29 -; CHECK: vunpcklps %zmm -; CHECK: ret -define <16 x float> @test29(<16 x float> %a, <16 x float> %c) { - %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> - ret <16 x float> %b -} - -; CHECK-LABEL: @test30 -; CHECK: vshufps $144, %zmm -; CHECK: ret -define <16 x float> @test30(<16 x float> %a, <16 x float> %c) { - %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> - ret <16 x float> %b -} - -; CHECK-LABEL: test31 -; CHECK: valignd $3, %zmm0, %zmm1 -; CHECK: ret -define <16 x i32> @test31(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - ret <16 x i32> %c -} - -; CHECK-LABEL: test32 -; CHECK: vshufpd $99, %zmm0, %zmm1 -; CHECK: ret -define <8 x double> @test32(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> - ret <8 x double> %c -} - -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind { -; CHECK-LABEL: test_vshuff64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: retq - %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> - ret <8 x double> %res -} - -define <8 x double> @test_vshuff64x2_512_mask(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind { -; CHECK-LABEL: test_vshuff64x2_512_mask: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1 -; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 -; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 -; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: retq - %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> - %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer - ret <8 x double> %res -} - -define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind { -; CHECK-LABEL: test_vshufi64x2_512_mask: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1 -; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 -; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 -; CHECK-NEXT: vshufi64x2 $168, %zmm0, %zmm0, %zmm0 {%k1} -; CHECK-NEXT: retq - %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> - %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x - ret <8 x i64> %res -} - -define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind { -; CHECK-LABEL: test_vshuff64x2_512_mem: -; CHECK: ## BB#0: -; CHECK-NEXT: vshuff64x2 $40, %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: retq - %x1 = load <8 x double>,<8 x double> *%ptr,align 1 - %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> - ret <8 x double> %res -} - -define <16 x float> @test_vshuff32x4_512_mem(<16 x float> %x, <16 x float> *%ptr) nounwind { -; CHECK-LABEL: test_vshuff32x4_512_mem: -; CHECK: ## BB#0: -; CHECK-NEXT: vshuff64x2 $20, %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: retq - %x1 = load <16 x float>,<16 x float> *%ptr,align 1 - %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> - ret <16 x float> %res -} - -define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind { -; CHECK-LABEL: test_align_v16i32_rr: -; CHECK: ## BB#0: -; CHECK-NEXT: valignd $3, %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - ret <16 x i32> %c -} - -define <16 x i32> @test_align_v16i32_rm(<16 x i32>* %a.ptr, <16 x i32> %b) nounwind { -; CHECK-LABEL: test_align_v16i32_rm: -; CHECK: ## BB#0: -; CHECK-NEXT: valignd $3, (%rdi), %zmm0, %zmm0 -; CHECK-NEXT: retq - %a = load <16 x i32>, <16 x i32>* %a.ptr - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - ret <16 x i32> %c -} - -define <16 x i32> @test_align_v16i32_rm_mask(<16 x i32>* %a.ptr, <16 x i32> %b, <16 x i1> %mask) nounwind { -; CHECK-LABEL: test_align_v16i32_rm_mask: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1 -; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1 -; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1 -; CHECK-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: retq -; -; CHECK-SKX-LABEL: test_align_v16i32_rm_mask: -; CHECK-SKX: ## BB#0: -; CHECK-SKX-NEXT: vpmovb2m %xmm1, %k1 -; CHECK-SKX-NEXT: vmovdqa32 (%rdi), %zmm1 -; CHECK-SKX-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1} -; CHECK-SKX-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-SKX-NEXT: retq - %a = load <16 x i32>, <16 x i32>* %a.ptr - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - %res = select <16 x i1> %mask,<16 x i32> %c, <16 x i32> %a - ret <16 x i32> %res -} - -define <8 x double> @test_align_v8f64_rr(<8 x double> %a, <8 x double> %b) nounwind { -; CHECK-LABEL: test_align_v8f64_rr: -; CHECK: ## BB#0: -; CHECK-NEXT: valignq $3, %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> - ret <8 x double> %c -} - -define <8 x double> @test_align_v18f64_rm(<8 x double>* %a.ptr, <8 x double> %b) nounwind { -; CHECK-LABEL: test_align_v18f64_rm: -; CHECK: ## BB#0: -; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 -; CHECK-NEXT: retq - %a = load <8 x double>, <8 x double>* %a.ptr - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> - ret <8 x double> %c -} - -define <8 x double> @test_align_v18f64_rm_mask(<8 x double>* %a.ptr, <8 x double> %b, <8 x i1> %mask) nounwind { -; CHECK-LABEL: test_align_v18f64_rm_mask: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwq %xmm1, %zmm1 -; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 -; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 -; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: retq -; -; CHECK-SKX-LABEL: test_align_v18f64_rm_mask: -; CHECK-SKX: ## BB#0: -; CHECK-SKX-NEXT: vpmovw2m %xmm1, %k1 -; CHECK-SKX-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z} -; CHECK-SKX-NEXT: retq - %a = load <8 x double>, <8 x double>* %a.ptr - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> - %res = select <8 x i1> %mask,<8 x double> %c, <8 x double> zeroinitializer - ret <8 x double> %res -} - diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index 2c6c8a3e7ad..62d4af7809b 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -15,8 +15,9 @@ define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00000010: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -25,8 +26,9 @@ define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00000200: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -35,8 +37,9 @@ define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00003000: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -45,8 +48,11 @@ define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00040000: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -55,8 +61,11 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00500000: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,0] +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -65,8 +74,11 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_06000000: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,0] +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -75,11 +87,11 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_70000000: ; ALL: # BB#0: -; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; ALL-NEXT: movl $7, %eax -; ALL-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2 -; ALL-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,0,0,0] +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -88,7 +100,10 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01014545: ; ALL: # BB#0: -; ALL-NEXT: vshuff64x2 $160, %zmm0, %zmm0, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -97,8 +112,9 @@ define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00112233: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -107,8 +123,9 @@ define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00001111: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -117,7 +134,11 @@ define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_81a3c5e7: ; ALL: # BB#0: -; ALL-NEXT: vshufpd $170, %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -126,9 +147,10 @@ define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08080808: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -137,9 +159,15 @@ define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08084c4c: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vbroadcastsd %xmm3, %ymm3 +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -148,9 +176,13 @@ define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_8823cc67: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vbroadcastsd %xmm3, %ymm3 +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -159,9 +191,13 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_9832dc76: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3] +; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -170,9 +206,13 @@ define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_9810dc54: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2] +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -181,9 +221,15 @@ define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08194c5d: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -192,9 +238,15 @@ define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_2a3b6e7f: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -203,9 +255,13 @@ define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08192a3b: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,2,2,3] +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -214,9 +270,11 @@ define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08991abb: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,1,1] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -225,9 +283,12 @@ define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_091b2d3f: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -236,9 +297,11 @@ define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_09ab1def: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -247,7 +310,10 @@ define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00014445: ; ALL: # BB#0: -; ALL-NEXT: vpermpd $64, %zmm0, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -256,7 +322,10 @@ define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00204464: ; ALL: # BB#0: -; ALL-NEXT: vpermpd $32, %zmm0, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -265,7 +334,10 @@ define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_03004744: ; ALL: # BB#0: -; ALL-NEXT: vpermpd $12, %zmm0, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -274,7 +346,10 @@ define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10005444: ; ALL: # BB#0: -; ALL-NEXT: vpermpd $1, %zmm0, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -283,7 +358,10 @@ define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_22006644: ; ALL: # BB#0: -; ALL-NEXT: vpermpd $10, %zmm0, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -292,7 +370,10 @@ define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_33307774: ; ALL: # BB#0: -; ALL-NEXT: vpermpd $63, %zmm0, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -301,7 +382,10 @@ define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_32107654: ; ALL: # BB#0: -; ALL-NEXT: vpermpd $27, %zmm0, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -310,7 +394,10 @@ define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00234467: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $136, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -319,7 +406,10 @@ define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00224466: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $0, %zmm0, %zmm0 +; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -328,7 +418,10 @@ define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10325476: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $85, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -337,7 +430,10 @@ define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_11335577: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $255, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -346,7 +442,10 @@ define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10235467: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $153, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -355,7 +454,10 @@ define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10225466: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $17, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -364,8 +466,10 @@ define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00015444: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -374,8 +478,10 @@ define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00204644: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -384,8 +490,10 @@ define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_03004474: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -394,8 +502,10 @@ define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10004444: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -404,8 +514,10 @@ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_22006446: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -414,8 +526,10 @@ define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_33307474: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,3,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -424,8 +538,9 @@ define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_32104567: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -434,8 +549,10 @@ define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00236744: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -444,8 +561,10 @@ define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00226644: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -454,7 +573,9 @@ define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10324567: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $165, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -463,7 +584,9 @@ define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_11334567: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $175, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -472,7 +595,9 @@ define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01235467: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $154, %zmm0, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -481,7 +606,9 @@ define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01235466: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $26, %zmm0, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -490,8 +617,10 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_002u6u44: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -500,8 +629,10 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00uu66uu: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -510,7 +641,9 @@ define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_103245uu: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $37, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -519,7 +652,9 @@ define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_1133uu67: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $143, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -528,7 +663,9 @@ define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_0uu354uu: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $24, %zmm0, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -537,7 +674,9 @@ define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_uuu3uu66: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $8, %zmm0, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -546,9 +685,16 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_c348cda0: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1] +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vbroadcastsd %xmm1, %ymm4 +; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3] +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3] +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -557,9 +703,17 @@ define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_f511235a: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3] +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,3,1,3] +; ALL-NEXT: vmovddup {{.*#+}} ymm4 = ymm1[0,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -577,8 +731,9 @@ define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00000010: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -587,8 +742,9 @@ define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00000200: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -597,8 +753,9 @@ define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00003000: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -607,8 +764,11 @@ define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00040000: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -617,8 +777,11 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00500000: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -627,8 +790,11 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_06000000: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,0] +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -637,11 +803,11 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_70000000: ; ALL: # BB#0: -; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; ALL-NEXT: movl $7, %eax -; ALL-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2 -; ALL-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,0,0,0] +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -650,7 +816,10 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01014545: ; ALL: # BB#0: -; ALL-NEXT: vshufi64x2 $160, %zmm0, %zmm0, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -659,8 +828,9 @@ define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00112233: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -669,8 +839,9 @@ define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00001111: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -679,7 +850,11 @@ define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_81a3c5e7: ; ALL: # BB#0: -; ALL-NEXT: vshufpd $170, %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -688,9 +863,10 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08080808: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -699,9 +875,15 @@ define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08084c4c: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -710,9 +892,13 @@ define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_8823cc67: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -721,9 +907,13 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_9832dc76: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -732,9 +922,13 @@ define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_9810dc54: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2 +; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -743,9 +937,15 @@ define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08194c5d: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -754,9 +954,15 @@ define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_2a3b6e7f: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -765,9 +971,13 @@ define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08192a3b: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,2,3] +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -776,9 +986,11 @@ define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08991abb: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,0,1,1] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -787,9 +999,12 @@ define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_091b2d3f: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -798,9 +1013,11 @@ define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_09ab1def: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -809,7 +1026,10 @@ define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00014445: ; ALL: # BB#0: -; ALL-NEXT: vpermq $64, %zmm0, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -818,7 +1038,10 @@ define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00204464: ; ALL: # BB#0: -; ALL-NEXT: vpermq $32, %zmm0, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -827,7 +1050,10 @@ define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_03004744: ; ALL: # BB#0: -; ALL-NEXT: vpermq $12, %zmm0, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -836,7 +1062,10 @@ define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10005444: ; ALL: # BB#0: -; ALL-NEXT: vpermq $1, %zmm0, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -845,7 +1074,10 @@ define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_22006644: ; ALL: # BB#0: -; ALL-NEXT: vpermq $10, %zmm0, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -854,7 +1086,10 @@ define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_33307774: ; ALL: # BB#0: -; ALL-NEXT: vpermq $63, %zmm0, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -863,7 +1098,10 @@ define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_32107654: ; ALL: # BB#0: -; ALL-NEXT: vpermq $27, %zmm0, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -872,7 +1110,10 @@ define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00234467: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $136, %zmm0, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -881,7 +1122,10 @@ define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00224466: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $0, %zmm0, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -890,7 +1134,10 @@ define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10325476: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $85, %zmm0, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -899,7 +1146,10 @@ define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_11335577: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $255, %zmm0, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -908,7 +1158,10 @@ define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10235467: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $153, %zmm0, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -917,7 +1170,10 @@ define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10225466: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $17, %zmm0, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,2] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,2] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -926,8 +1182,10 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00015444: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -936,8 +1194,10 @@ define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00204644: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -946,8 +1206,10 @@ define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_03004474: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,3,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -956,8 +1218,10 @@ define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10004444: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -966,8 +1230,10 @@ define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_22006446: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,0,2] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -976,8 +1242,10 @@ define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_33307474: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,3,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -986,8 +1254,9 @@ define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_32104567: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -996,8 +1265,10 @@ define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00236744: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1006,8 +1277,10 @@ define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00226644: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1016,7 +1289,9 @@ define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10324567: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $165, %zmm0, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1025,7 +1300,9 @@ define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_11334567: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $175, %zmm0, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1034,7 +1311,9 @@ define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01235467: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $154, %zmm0, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1043,7 +1322,9 @@ define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01235466: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $26, %zmm0, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,2] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1052,8 +1333,10 @@ define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_002u6u44: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1062,8 +1345,10 @@ define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00uu66uu: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1072,7 +1357,9 @@ define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_103245uu: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $37, %zmm0, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1081,7 +1368,9 @@ define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_1133uu67: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $143, %zmm0, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1090,7 +1379,9 @@ define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_0uu354uu: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $24, %zmm0, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1099,7 +1390,9 @@ define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_uuu3uu66: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $8, %zmm0, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1108,9 +1401,15 @@ define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_6caa87e5: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle