[x86] Teach the AVX1 path of the new vector shuffle lowering one more

trick that I missed.

VPERMILPS has a non-immediate memory operand mode that allows it to do
asymmetric shuffles in the two 128-bit lanes. Use this rather than two
shuffles and a blend.

However, it turns out the variable shuffle path to VPERMILPS (and
VPERMILPD, although that one offers no functional difference from the
immediate operand other than variability) wasn't even plumbed through
codegen. Do such plumbing so that we can reasonably emit
a variable-masked VPERMILP instruction. Also plumb basic comment parsing
and printing through so that the tests are reasonable.

There are still a few tests which don't show the shuffle pattern. These
are tests with undef lanes. I'll teach the shuffle decoding and printing
to handle undef mask entries in a follow-up. I've looked at the masks
and they seem reasonable.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218300 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Chandler Carruth 2014-09-23 10:08:29 +00:00
parent 5f843038fb
commit 8f637786d8
8 changed files with 116 additions and 121 deletions

View File

@ -287,4 +287,27 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
/// \brief Decode a VPERMILPS/VPERMILPD variable (non-immediate) shuffle mask
/// from a constant vector of integers.
///
/// One shuffle index is appended to \p ShuffleMask for every element of \p C.
/// Each index is the position of the first element of the 128-bit lane that
/// contains the mask element, plus the in-lane selector encoded in that mask
/// element.
///
/// \param C           Constant integer vector holding the per-element control
///                    values; must have 2, 4 or 8 elements.
/// \param ShuffleMask Output vector the decoded indices are appended to.
void DecodeVPERMILPMask(const ConstantDataSequential *C,
                        SmallVectorImpl<int> &ShuffleMask) {
  Type *MaskTy = C->getType();
  assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
  assert(MaskTy->getVectorElementType()->isIntegerTy() &&
         "Expected integer constant mask elements!");
  int ElementBits = MaskTy->getScalarSizeInBits();
  int NumElements = MaskTy->getVectorNumElements();
  assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
         "Unexpected number of vector elements.");
  assert((unsigned)NumElements == C->getNumElements() &&
         "Constant mask has a different number of elements!");

  ShuffleMask.reserve(NumElements);
  for (int i = 0; i < NumElements; ++i) {
    // Index of the first element in the 128-bit lane that holds element i.
    int Base = (i * ElementBits / 128) * (128 / ElementBits);
    uint64_t Element = C->getElementAsInteger(i);

    // The selector encoding depends on the element width:
    //  - 64-bit elements (VPERMILPD): a lane holds two elements and the
    //    selector is bit 1 of the control value; bit 0 is ignored.
    //  - otherwise (VPERMILPS): the selector is the low two bits.
    // Using the two-bit form for 64-bit elements would read the wrong bit and
    // could yield an index outside the element's own lane.
    int Selector = ElementBits == 64 ? (int)((Element >> 1) & 0x1)
                                     : (int)(Element & 0x3);
    ShuffleMask.push_back(Base + Selector);
  }
}
} // llvm namespace

View File

@ -84,6 +84,10 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
/// No VT provided since it only works on 256-bit, 4 element vectors.
void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
///
/// \p C is expected to be a constant vector of integers with 2, 4 or 8
/// elements. One index is appended to \p ShuffleMask per element of \p C;
/// each index is the start of that element's own 128-bit lane plus a selector
/// read from the low bits of the mask element.
void DecodeVPERMILPMask(const ConstantDataSequential *C,
SmallVectorImpl<int> &ShuffleMask);
} // llvm namespace
#endif

View File

@ -9395,26 +9395,15 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
// If we have a single input shuffle with different shuffle patterns in the
// two 128-bit lanes, just do two shuffles and blend them together. This will
// be faster than extracting the high 128-bit lane, shuffling it, and
// re-inserting it. Especially on newer processors where blending is *the*
// fastest operation.
// two 128-bit lanes use the variable mask to VPERMILPS.
if (isSingleInputShuffleMask(Mask)) {
int LoMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
int HiMask[4] = {Mask[4], Mask[5], Mask[6], Mask[7]};
for (int &M : HiMask)
if (M >= 0)
M -= 4;
SDValue Lo = V1, Hi = V1;
if (!isNoopShuffleMask(LoMask))
Lo = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Lo,
getV4X86ShuffleImm8ForMask(LoMask, DAG));
if (!isNoopShuffleMask(HiMask))
Hi = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Hi,
getV4X86ShuffleImm8ForMask(HiMask, DAG));
unsigned BlendMask = 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v8f32, Lo, Hi,
DAG.getConstant(BlendMask, MVT::i8));
SDValue VPermMask[8];
for (int i = 0; i < 8; ++i)
VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
: DAG.getConstant(Mask[i], MVT::i32);
return DAG.getNode(
X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
}
// Shuffle the input elements into the desired positions in V1 and V2 and

View File

@ -343,6 +343,7 @@ namespace llvm {
MOVSS,
UNPCKL,
UNPCKH,
VPERMILPV,
VPERMILPI,
VPERMV,
VPERMV3,

View File

@ -188,6 +188,8 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisVec<2>]>;
def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0,1>, SDTCisInt<2>]>;
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
@ -232,6 +234,7 @@ def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>;
def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;

View File

@ -8418,6 +8418,15 @@ let ExeDomain = SSEPackedDouble in {
}
let Predicates = [HasAVX] in {
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
(VPERMILPSYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
(VPERMILPSYrm VR256:$src1, addr:$src2)>;
def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
(VPERMILPDYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
(VPERMILPDYrm VR256:$src1, addr:$src2)>;
def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
(VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
@ -8428,6 +8437,15 @@ def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
(VPERMILPDYmi addr:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
(VPERMILPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
(VPERMILPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
(VPERMILPDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
(VPERMILPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
(VPERMILPDri VR128:$src1, imm:$imm)>;
def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),

View File

@ -1022,15 +1022,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::PSHUFBrm:
case X86::VPSHUFBrm:
// Lower PSHUFB normally but add a comment if we can find a constant
// shuffle mask. We won't be able to do this at the MC layer because the
// mask isn't an immediate.
case X86::VPERMILPSrm:
case X86::VPERMILPDrm:
case X86::VPERMILPSYrm:
case X86::VPERMILPDYrm:
// Lower PSHUFB and VPERMILP normally but add a comment if we can find
// a constant shuffle mask. We won't be able to do this at the MC layer
// because the mask isn't an immediate.
std::string Comment;
raw_string_ostream CS(Comment);
SmallVector<int, 16> Mask;
assert(MI->getNumOperands() >= 6 &&
"Wrong number of operands for PSHUFBrm or VPSHUFBrm");
// All of these instructions accept a constant pool operand as their fifth.
assert(MI->getNumOperands() > 5 && "We should always have at least 5 operands!");
const MachineOperand &DstOp = MI->getOperand(0);
const MachineOperand &SrcOp = MI->getOperand(1);
const MachineOperand &MaskOp = MI->getOperand(5);
@ -1061,7 +1065,18 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(MaskTy == C->getType() &&
"Expected a constant of the same type!");
DecodePSHUFBMask(C, Mask);
switch (MI->getOpcode()) {
case X86::PSHUFBrm:
case X86::VPSHUFBrm:
DecodePSHUFBMask(C, Mask);
break;
case X86::VPERMILPSrm:
case X86::VPERMILPDrm:
case X86::VPERMILPSYrm:
case X86::VPERMILPDYrm:
DecodeVPERMILPMask(C, Mask);
}
assert(Mask.size() == MaskTy->getVectorNumElements() &&
"Shuffle mask has a different size than its type!");
}

View File

@ -381,9 +381,7 @@ define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_00015444
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,0,0,5,4,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,4,4,4,5]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,5,4,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x float> %shuffle
@ -392,9 +390,7 @@ define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_00204644
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,2,0,0,4,6,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,4,6,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,6,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x float> %shuffle
@ -403,9 +399,7 @@ define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_03004474
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,0,3,0,4,4,7,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,7,4,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,4,7,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x float> %shuffle
@ -414,9 +408,7 @@ define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_10004444
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,0,0,0,4,4,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,5,4,4,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,4,4,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x float> %shuffle
@ -425,9 +417,7 @@ define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_22006446
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,0,0,2,6,4,4,6]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,6,4,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,4,4,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x float> %shuffle
@ -436,9 +426,7 @@ define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_33307474
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[3,0,3,0,7,4,7,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,7,7,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,4,7,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x float> %shuffle
@ -447,8 +435,7 @@ define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_32104567
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,2,1,0,4,5,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %shuffle
@ -457,9 +444,7 @@ define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_00236744
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,3,0,0,6,7,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,6,7,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x float> %shuffle
@ -468,9 +453,7 @@ define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_00226644
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,2,0,0,6,6,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,4,4,6,6]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,6,6,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x float> %shuffle
@ -479,8 +462,7 @@ define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_10324567
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,3,2,4,5,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %shuffle
@ -489,8 +471,7 @@ define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_11334567
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,1,3,3,4,5,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %shuffle
@ -499,8 +480,7 @@ define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_01235467
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x float> %shuffle
@ -509,8 +489,7 @@ define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_01235466
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,2,5,4,6,6]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x float> %shuffle
@ -519,9 +498,7 @@ define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_002u6u44
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,1,0,0,6,5,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
ret <8 x float> %shuffle
@ -530,9 +507,7 @@ define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_00uu66uu
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,2,2,3,6,6,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
ret <8 x float> %shuffle
@ -541,8 +516,7 @@ define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_103245uu
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x float> %shuffle
@ -551,8 +525,7 @@ define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_1133uu67
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x float> %shuffle
@ -561,8 +534,7 @@ define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_0uu354uu
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x float> %shuffle
@ -571,8 +543,7 @@ define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_uuu3uu66
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,1,2,2,4,5,6,6]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x float> %shuffle
@ -956,9 +927,7 @@ define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_00015444
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,0,0,5,4,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,4,4,4,5]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,5,4,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
@ -967,9 +936,7 @@ define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_00204644
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,2,0,0,4,6,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,4,6,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,6,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x i32> %shuffle
@ -978,9 +945,7 @@ define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_03004474
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,0,3,0,4,4,7,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,7,4,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,4,7,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x i32> %shuffle
@ -989,9 +954,7 @@ define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_10004444
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,0,0,0,4,4,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,5,4,4,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,4,4,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
@ -1000,9 +963,7 @@ define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_22006446
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,0,0,2,6,4,4,6]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,6,4,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,4,4,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x i32> %shuffle
@ -1011,9 +972,7 @@ define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_33307474
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[3,0,3,0,7,4,7,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,7,7,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,4,7,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x i32> %shuffle
@ -1022,8 +981,7 @@ define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_32104567
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[3,2,1,0,4,5,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@ -1032,9 +990,7 @@ define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_00236744
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,3,0,0,6,7,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,6,7,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x i32> %shuffle
@ -1043,9 +999,7 @@ define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_00226644
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,2,0,0,6,6,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,4,4,6,6]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,6,6,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x i32> %shuffle
@ -1054,8 +1008,7 @@ define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_10324567
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,0,3,2,4,5,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@ -1064,8 +1017,7 @@ define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_11334567
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[1,1,3,3,4,5,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@ -1074,8 +1026,7 @@ define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_01235467
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i32> %shuffle
@ -1084,8 +1035,7 @@ define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_01235466
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,2,5,4,6,6]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x i32> %shuffle
@ -1094,9 +1044,7 @@ define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_002u6u44
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,1,0,0,6,5,4,4]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
ret <8 x i32> %shuffle
@ -1105,9 +1053,7 @@ define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_00uu66uu
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[2,2,2,3,6,6,6,7]
; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
ret <8 x i32> %shuffle
@ -1116,8 +1062,7 @@ define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_103245uu
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x i32> %shuffle
@ -1126,8 +1071,7 @@ define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_1133uu67
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x i32> %shuffle
@ -1136,8 +1080,7 @@ define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_0uu354uu
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x i32> %shuffle
@ -1146,8 +1089,7 @@ define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: @shuffle_v8i32_uuu3uu66
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm0[0,1,2,2,4,5,6,6]
; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*}}, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x i32> %shuffle