[x86] Teach the vector combiner that picks a canonical shuffle to support
transforming the forms from the new vector shuffle lowering to use
'movddup' when appropriate.

A bunch of the cases where we actually form 'movddup' don't show up in
the test results because something even later than DAG legalization maps
them back to 'unpcklpd'. If this shows back up as a performance problem,
I'll probably chase it down, but it is at least an encoding-size loss. =/

To make this work, we now always do this canonicalizing step for floating
point vectors, where the baseline shuffle instructions don't provide any
free copies of their inputs. This also causes us to canonicalize
unpck[hl]pd into mov{hl,lh}ps (respectively), which is a nice encoding
space win.
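
As a quick illustration of the unpck[hl]pd to mov{hl,lh}ps change, here is
a minimal IR sketch (a hypothetical function, not one of the tests touched
below; it mirrors the hd/test_v2sd updates): a high-half dup of
<2 x double> should now select 'movhlps', which drops the 0x66 prefix byte
that 'unpckhpd' carries.

; Hypothetical sketch, not part of this commit's tests.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
define <2 x double> @dup_hi_v2f64(<2 x double> %v) {
; CHECK: movhlps
  ; Duplicate the high element into both lanes (float domain, high half).
  %s = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %s
}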

There is one test which is "regressed" by this: extractelement-load.
There, in the test case where the optimization being tested *fails*, the
exact instruction pattern which results is slightly different. This should
probably be fixed by forming the appropriate extract earlier in the DAG,
but that would defeat the purpose of the test... If this test case is
critically important to anyone, please let me know and I'll try to work on
it. The prior behavior was actually contrary to the comment in the test
case and seems likely to have been an accident.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217738 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Chandler Carruth
Date: 2014-09-14 22:41:37 +00:00
Parent: e1c77ca37a
Commit: 33957173a7
9 changed files with 88 additions and 31 deletions


@@ -19301,26 +19301,52 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
   // Use the float domain if the operand type is a floating point type.
   bool FloatDomain = VT.isFloatingPoint();
 
-  // If we don't have access to VEX encodings, the generic PSHUF instructions
-  // are preferable to some of the specialized forms despite requiring one more
-  // byte to encode because they can implicitly copy.
+  // For floating point shuffles, we don't have free copies in the shuffle
+  // instructions, so this always makes sense to canonicalize.
   //
-  // IF we *do* have VEX encodings, than we can use shorter, more specific
+  // For integer shuffles, if we don't have access to VEX encodings, the generic
+  // PSHUF instructions are preferable to some of the specialized forms despite
+  // requiring one more byte to encode because they can implicitly copy.
+  //
+  // IF we *do* have VEX encodings, then we can use shorter, more specific
   // shuffle instructions freely as they can copy due to the extra register
   // operand.
-  if (Subtarget->hasAVX()) {
+  if (FloatDomain || Subtarget->hasAVX()) {
     // We have both floating point and integer variants of shuffles that dup
     // either the low or high half of the vector.
     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
       bool Lo = Mask.equals(0, 0);
-      unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS)
-                                     : (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH);
+      unsigned Shuffle;
+      MVT ShuffleVT;
+      // If the input is a floating point, check if we have SSE3 which will let
+      // us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the
+      // option to fold the input operand into even an unaligned memory load.
+      if (FloatDomain && Lo && Subtarget->hasSSE3()) {
+        Shuffle = X86ISD::MOVDDUP;
+        ShuffleVT = MVT::v2f64;
+      } else if (FloatDomain) {
+        // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
+        // than the UNPCK variants.
+        Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
+        ShuffleVT = MVT::v4f32;
+      } else if (Subtarget->hasSSE2()) {
+        // We model everything else using UNPCK instructions. While MOVLHPS and
+        // MOVHLPS are shorter encodings they cannot accept a memory operand
+        // which overly constrains subsequent lowering.
+        Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+        ShuffleVT = MVT::v2i64;
+      } else {
+        // No available instructions here.
+        return false;
+      }
       if (Depth == 1 && Root->getOpcode() == Shuffle)
         return false; // Nothing to do!
-      MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64;
       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
       DCI.AddToWorklist(Op.getNode());
-      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+      if (Shuffle == X86ISD::MOVDDUP)
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+      else
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
       DCI.AddToWorklist(Op.getNode());
       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                     /*AddTo*/ true);
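
For a sense of how the three paths above fire from IR, a couple of
hypothetical sketches (not from this commit's tests; the exact instruction
selected also depends on the subtarget and on later lowering, as noted in
the commit message):

; Hypothetical sketches, not part of this commit.
define <2 x double> @dup_lo_f64(<2 x double> %v) {
  ; Float domain, low half: MOVDDUP with SSE3, otherwise MOVLHPS.
  %s = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %s
}
define <2 x i64> @dup_lo_i64(<2 x i64> %v) {
  ; Integer domain, low half: modeled as UNPCKL (punpcklqdq) under SSE2.
  %s = shufflevector <2 x i64> %v, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %s
}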


@@ -30,7 +30,7 @@ entry:
   ret <4 x i64> %vecinit6.i
 }
 
-; CHECK: vmovlhps %xmm
+; CHECK: vunpcklpd %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
 entry:


@@ -77,7 +77,7 @@ define <4 x float> @test7(<4 x float> %a) {
 ; Mask: [0,1,0,1]
 ; CHECK-NOT: pshufd
 ; CHECK-NOT: shufps
-; CHECK: movlhps
+; CHECK: unpcklpd
 ; CHECK-NEXT: ret
 
 define <4 x float> @test8(<4 x float> %a) {
@@ -89,7 +89,7 @@ define <4 x float> @test8(<4 x float> %a) {
 ; Mask: [0,1,0,u]
 ; CHECK-NOT: pshufd
 ; CHECK-NOT: shufps
-; CHECK: movlhps
+; CHECK: unpcklpd
 ; CHECK-NEXT: ret
 
 define <4 x float> @test9(<4 x float> %a) {
@@ -196,7 +196,7 @@ define <4 x float> @test17(<4 x float> %a) {
 ; Mask: [0,1,0,1]
 ; CHECK-NOT: pshufd
 ; CHECK-NOT: shufps
-; CHECK: movlhps
+; CHECK: unpcklpd
 ; CHECK-NEXT: ret
 
 define <4 x float> @test18(<4 x float> %a) {
@@ -208,7 +208,7 @@ define <4 x float> @test18(<4 x float> %a) {
 ; Mask: [0,1,0,u]
 ; CHECK-NOT: pshufd
 ; CHECK-NOT: shufps
-; CHECK: movlhps
+; CHECK: unpcklpd
 ; CHECK-NEXT: ret
 
 define <4 x float> @test19(<4 x float> %a) {


@@ -36,7 +36,9 @@ define void @t3() {
 ;
 ; This movs the entire vector, shuffling the high double down. If we fixed the
 ; FIXME above it would just move the high double directly.
-; CHECK: movhpd %xmm
+; CHECK: movups
+; CHECK: movhlps
+; CHECK: movlps
 
 bb:
   %tmp13 = load <2 x double>* undef, align 1


@@ -41,7 +41,7 @@ entry:
 define double @test4(double %A) nounwind {
 ; CHECK-LABEL: test4:
 ; CHECK: calll {{.*}}foo
-; CHECK-NEXT: unpckhpd %[[X:xmm[0-9]+]], %[[X]]
+; CHECK-NEXT: movhlps %[[X:xmm[0-9]+]], %[[X]]
 ; CHECK-NEXT: addsd {{.*}}(%{{.*}}), %[[X2]]
 ; CHECK-NEXT: movsd %[[X2]], [[mem:.*\(%.*\)]]
 ; CHECK-NEXT: fldl [[mem]]


@@ -7,7 +7,7 @@ define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp {
 }
 
 define <2 x double> @hd(<2 x double> %p) nounwind optsize ssp {
-; CHECK: unpckhpd
+; CHECK: movhlps
   %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   ret <2 x double> %shuffle
 }


@@ -28,7 +28,7 @@ define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
   ret void
 }
 
 ; SSE2-LABEL: test_v2sd:
-; SSE2: shufpd $0
+; SSE2: movlhps
 ; SSE3-LABEL: test_v2sd:
 ; SSE3: movddup


@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE3
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
@@ -48,7 +49,7 @@ define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
 define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_00
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2: movlhps {{.*}} # xmm0 = xmm0[0,0]
 ; CHECK-SSE2-NEXT: retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
   ret <2 x double> %shuffle
 }
@@ -62,17 +63,15 @@ define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
 }
 define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_11
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,1]
+; CHECK-SSE2: movhlps {{.*}} # xmm0 = xmm0[1,1]
 ; CHECK-SSE2-NEXT: retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
-; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
-; of a mov?
-;
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_22
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
+; CHECK-SSE2: movlhps {{.*}} # xmm1 = xmm1[0,0]
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT: retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
   ret <2 x double> %shuffle
@@ -86,7 +85,8 @@ define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
 }
 define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_33
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
+; CHECK-SSE2: movhlps {{.*}} # xmm1 = xmm1[1,1]
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT: retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
   ret <2 x double> %shuffle
@@ -217,3 +217,32 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
   ret <2 x i64> %shuffle
 }
+
+define <2 x double> @insert_dup_reg_v2f64(double %a) {
+; CHECK-SSE2-LABEL: @insert_dup_reg_v2f64
+; CHECK-SSE2: movlhps {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT: retq
+;
+; FIXME: This should match movddup as well!
+; CHECK-SSE3-LABEL: @insert_dup_reg_v2f64
+; CHECK-SSE3: unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE3-NEXT: retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
+; CHECK-SSE2-LABEL: @insert_dup_mem_v2f64
+; CHECK-SSE2: movsd {{.*}}, %xmm0
+; CHECK-SSE2-NEXT: movlhps {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE3-LABEL: @insert_dup_mem_v2f64
+; CHECK-SSE3: movddup {{.*}}, %xmm0
+; CHECK-SSE3-NEXT: retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}


@@ -98,7 +98,7 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
 define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: @shuffle_v4f64_0001
 ; AVX1: # BB#0:
-; AVX1-NEXT: vmovlhps {{.*}} # xmm1 = xmm0[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm0[0,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -109,7 +109,7 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -120,7 +120,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
-; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -130,7 +130,7 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: @shuffle_v4f64_1000
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[1,0]
-; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -140,8 +140,8 @@ define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: @shuffle_v4f64_2200
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovlhps {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>