[x86] Undo a flawed transform I added to form UNPCK instructions when
AVX is available, and generally tidy up things surrounding UNPCK formation.

Originally, I was thinking that the only advantage of PSHUFD over UNPCK instruction variants was its free copy, and otherwise we should use the shorter-encoding UNPCK instructions. This isn't right, though: there is a larger advantage of being able to fold a load into the operand of a PSHUFD. For UNPCK, the operand *must* be in a register so it can be the second input.

This removes the UNPCK formation in the target-specific DAG combine for v4i32 shuffles. It also lifts the v8 and v16 cases out of the AVX-specific check, as they are potentially replacing multiple instructions with a single instruction and so should always be valuable. The floating point checks are simplified accordingly.

This also adjusts the formation of PSHUFD instructions to attempt to match the shuffle mask to one which would fit an UNPCK instruction variant. This was originally motivated to allow it to match the UNPCK instructions in the combiner, but clearly won't now.

Eventually, we should add a MachineCombiner pass that can form UNPCK instructions post-RA, when the operand is known to be in a register and thus there is no loss.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217755 91177308-0d34-0410-b5e6-96231b3b80d8
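To make the load-folding point above concrete, here is a minimal sketch, not taken from the commit or its tests, of the two ways a single-input v4i32 shuffle with mask <0,0,1,1> can be emitted when the vector is loaded from memory. It assumes an x86-64 SysV target where the pointer argument arrives in %rdi; the registers and operands are illustrative only.

  # PSHUFD form: the memory operand folds straight into the shuffle; the
  # immediate 0x50 encodes the <0,0,1,1> mask.
  pshufd    $0x50, (%rdi), %xmm0        # xmm0 = mem[0,0,1,1]

  # UNPCKLDQ form: both inputs are the same value, so it must first be
  # materialized in a register and the load can no longer be folded.
  movdqa    (%rdi), %xmm0
  punpckldq %xmm0, %xmm0                # xmm0 = xmm0[0,0,1,1]

Because both sequences realize the same <0,0,1,1> pattern, a later pass that knows the operand is already in a register (the MachineCombiner idea above) could still switch to the one-byte-shorter UNPCK encoding without giving anything up.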
This commit is contained in:
parent a6cc351c5b
commit 04402a6c13
@@ -7598,11 +7598,22 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   int NumV2Elements =
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
-  if (NumV2Elements == 0)
+  if (NumV2Elements == 0) {
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
+    // We coerce the shuffle pattern to be compatible with UNPCK instructions
+    // but we aren't actually going to use the UNPCK instruction because doing
+    // so prevents folding a load into this instruction or making a copy.
+    const int UnpackLoMask[] = {0, 0, 1, 1};
+    const int UnpackHiMask[] = {2, 2, 3, 3};
+    if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+      Mask = UnpackLoMask;
+    else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+      Mask = UnpackHiMask;
+
     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
+  }
 
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
@@ -19347,86 +19358,75 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
   bool FloatDomain = VT.isFloatingPoint();
 
   // For floating point shuffles, we don't have free copies in the shuffle
-  // instructions, so this always makes sense to canonicalize.
+  // instructions or the ability to load as part of the instruction, so
+  // canonicalize their shuffles to UNPCK or MOV variants.
   //
-  // For integer shuffles, if we don't have access to VEX encodings, the generic
-  // PSHUF instructions are preferable to some of the specialized forms despite
-  // requiring one more byte to encode because they can implicitly copy.
-  //
-  // IF we *do* have VEX encodings, then we can use shorter, more specific
-  // shuffle instructions freely as they can copy due to the extra register
-  // operand.
-  if (FloatDomain || Subtarget->hasAVX()) {
-    // We have both floating point and integer variants of shuffles that dup
-    // either the low or high half of the vector.
-    if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
-      bool Lo = Mask.equals(0, 0);
-      unsigned Shuffle;
-      MVT ShuffleVT;
-      // If the input is a floating point, check if we have SSE3 which will let
-      // us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the
-      // option to fold the input operand into even an unaligned memory load.
-      if (FloatDomain && Lo && Subtarget->hasSSE3()) {
-        Shuffle = X86ISD::MOVDDUP;
-        ShuffleVT = MVT::v2f64;
-      } else if (FloatDomain) {
-        // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
-        // than the UNPCK variants.
-        Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
-        ShuffleVT = MVT::v4f32;
-      } else if (Subtarget->hasSSE2()) {
-        // We model everything else using UNPCK instructions. While MOVLHPS and
-        // MOVHLPS are shorter encodings they cannot accept a memory operand
-        // which overly constrains subsequent lowering.
-        Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
-        ShuffleVT = MVT::v2i64;
-      } else {
-        // No available instructions here.
-        return false;
-      }
-      if (Depth == 1 && Root->getOpcode() == Shuffle)
-        return false; // Nothing to do!
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
-      DCI.AddToWorklist(Op.getNode());
-      if (Shuffle == X86ISD::MOVDDUP)
-        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
-      else
-        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
-      DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
-                    /*AddTo*/ true);
-      return true;
-    }
-
-    // FIXME: We should match UNPCKLPS and UNPCKHPS here.
-
-    // For the integer domain we have specialized instructions for duplicating
-    // any element size from the low or high half.
-    if (!FloatDomain &&
-        (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3) ||
-         Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
-         Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
-         Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
-         Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
-                     15))) {
-      bool Lo = Mask[0] == 0;
-      unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
-      if (Depth == 1 && Root->getOpcode() == Shuffle)
-        return false; // Nothing to do!
-      MVT ShuffleVT;
-      switch (Mask.size()) {
-      case 4: ShuffleVT = MVT::v4i32; break;
-      case 8: ShuffleVT = MVT::v8i16; break;
-      case 16: ShuffleVT = MVT::v16i8; break;
-      };
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
-      DCI.AddToWorklist(Op.getNode());
-      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
-      DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
-                    /*AddTo*/ true);
-      return true;
-    }
-  }
+  // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
+  // vectors because it can have a load folded into it that UNPCK cannot. This
+  // doesn't preclude something switching to the shorter encoding post-RA.
+  if (FloatDomain && (Mask.equals(0, 0) || Mask.equals(1, 1))) {
+    bool Lo = Mask.equals(0, 0);
+    unsigned Shuffle;
+    MVT ShuffleVT;
+    // Check if we have SSE3 which will let us use MOVDDUP. That instruction
+    // is no slower than UNPCKLPD but has the option to fold the input operand
+    // into even an unaligned memory load.
+    if (Lo && Subtarget->hasSSE3()) {
+      Shuffle = X86ISD::MOVDDUP;
+      ShuffleVT = MVT::v2f64;
+    } else {
+      // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
+      // than the UNPCK variants.
+      Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
+      ShuffleVT = MVT::v4f32;
+    }
+    if (Depth == 1 && Root->getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    DCI.AddToWorklist(Op.getNode());
+    if (Shuffle == X86ISD::MOVDDUP)
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+    else
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // FIXME: We should match UNPCKLPS and UNPCKHPS here.
+
+  // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
+  // variants as none of these have single-instruction variants that are
+  // superior to the UNPCK formulation.
+  if (!FloatDomain &&
+      (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
+       Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
+       Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
+       Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
+                   15))) {
+    bool Lo = Mask[0] == 0;
+    unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+    if (Depth == 1 && Root->getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    MVT ShuffleVT;
+    switch (Mask.size()) {
+    case 8:
+      ShuffleVT = MVT::v8i16;
+      break;
+    case 16:
+      ShuffleVT = MVT::v16i8;
+      break;
+    default:
+      llvm_unreachable("Impossible mask size!");
+    };
+    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    DCI.AddToWorklist(Op.getNode());
+    Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
+  }
 
   // Don't try to re-form single instruction chains under any circumstances now
@@ -72,9 +72,9 @@ entry:
   ret <4 x i64> %shuffle
 }
 
-; CHECK: vpunpcklqdq
+; CHECK: vmovlhps
 ; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpunpcklqdq
+; CHECK-NEXT: vmovlhps
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
 entry:
@@ -83,7 +83,7 @@ entry:
 }
 
 ; CHECK: vpshufd $-96
-; CHECK: vpunpckhdq
+; CHECK: vpshufd $-6
 ; CHECK: vinsertf128 $1
 define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
 entry:
@@ -156,7 +156,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 
 ; AVX-LABEL: sext_16i8_to_16i16
 ; AVX: vpmovsxbw
-; AVX: vpunpckhqdq
+; AVX: vmovhlps
 ; AVX: vpmovsxbw
 ; AVX: ret
 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
@@ -19,7 +19,7 @@ entry:
 }
 
 ; CHECK: vmovq
-; CHECK-NEXT: vpunpcklqdq %xmm
+; CHECK-NEXT: vmovlhps %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:
@@ -95,8 +95,8 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %
 ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
 ; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with
 ; vpand and there is nothing more you can do to match vmaxpd.
-; CHECK: vpunpcklqdq
-; CHECK: vpand
+; CHECK: vmovlhps
+; CHECK: vandps
 ; CHECK: vmaxpd
 ; CHECK: ret
 define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
@@ -639,7 +639,7 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; ALL-LABEL: @shuffle_v8i16_XXXdXXXX
 ; ALL: # BB#0:
-; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm1[0,2,2,3]
+; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,2,3,3]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-unknown"
 define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0001
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -18,7 +18,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -41,7 +41,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -52,7 +52,7 @@ define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_1000
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -63,8 +63,8 @@ define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_2200
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@@ -76,7 +76,7 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0]
-; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm1 = xmm1[1,1]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
@@ -281,7 +281,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0124
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -292,7 +292,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0142
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm2 = xmm2[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm2[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -304,7 +304,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -316,7 +316,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -336,7 +336,7 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -356,7 +356,7 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -368,8 +368,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @stress_test1
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm0 = xmm0[1,1]
-; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0