diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f04a7810a13..f3774321a07 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19301,26 +19301,52 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, // Use the float domain if the operand type is a floating point type. bool FloatDomain = VT.isFloatingPoint(); - // If we don't have access to VEX encodings, the generic PSHUF instructions - // are preferable to some of the specialized forms despite requiring one more - // byte to encode because they can implicitly copy. + // For floating point shuffles, we don't have free copies in the shuffle + // instructions, so this always makes sense to canonicalize. // - // IF we *do* have VEX encodings, than we can use shorter, more specific + // For integer shuffles, if we don't have access to VEX encodings, the generic + // PSHUF instructions are preferable to some of the specialized forms despite + // requiring one more byte to encode because they can implicitly copy. + // + // If we *do* have VEX encodings, then we can use shorter, more specific // shuffle instructions freely as they can copy due to the extra register // operand. - if (Subtarget->hasAVX()) { + if (FloatDomain || Subtarget->hasAVX()) { // We have both floating point and integer variants of shuffles that dup // either the low or high half of the vector. if (Mask.equals(0, 0) || Mask.equals(1, 1)) { bool Lo = Mask.equals(0, 0); - unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS) - : (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH); + unsigned Shuffle; + MVT ShuffleVT; + // If the input is floating point, check if we have SSE3 which will let + // us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the + // option to fold the input operand into even an unaligned memory load. 
+ if (FloatDomain && Lo && Subtarget->hasSSE3()) { + Shuffle = X86ISD::MOVDDUP; + ShuffleVT = MVT::v2f64; + } else if (FloatDomain) { + // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller + // than the UNPCK variants. + Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; + ShuffleVT = MVT::v4f32; + } else if (Subtarget->hasSSE2()) { + // We model everything else using UNPCK instructions. While MOVLHPS and + // MOVHLPS are shorter encodings they cannot accept a memory operand + // which overly constrains subsequent lowering. + Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + ShuffleVT = MVT::v2i64; + } else { + // No available instructions here. + return false; + } if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! - MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64; Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); - Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + if (Shuffle == X86ISD::MOVDDUP) + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + else + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), /*AddTo*/ true); diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll index 3856aeac3fd..a2537ce5c04 100644 --- a/test/CodeGen/X86/avx-splat.ll +++ b/test/CodeGen/X86/avx-splat.ll @@ -30,7 +30,7 @@ entry: ret <4 x i64> %vecinit6.i } -; CHECK: vmovlhps %xmm +; CHECK: vunpcklpd %xmm ; CHECK-NEXT: vinsertf128 $1 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp { entry: diff --git a/test/CodeGen/X86/combine-vec-shuffle-4.ll b/test/CodeGen/X86/combine-vec-shuffle-4.ll index 0ddec2c12fb..84820e4e3c3 100644 --- a/test/CodeGen/X86/combine-vec-shuffle-4.ll +++ b/test/CodeGen/X86/combine-vec-shuffle-4.ll @@ -77,7 +77,7 @@ define <4 x float> @test7(<4 x float> %a) { ; Mask: [0,1,0,1] ; CHECK-NOT: pshufd ; CHECK-NOT: shufps -; CHECK: movlhps +; 
CHECK: unpcklpd ; CHECK-NEXT: ret define <4 x float> @test8(<4 x float> %a) { @@ -89,7 +89,7 @@ define <4 x float> @test8(<4 x float> %a) { ; Mask: [0,1,0,u] ; CHECK-NOT: pshufd ; CHECK-NOT: shufps -; CHECK: movlhps +; CHECK: unpcklpd ; CHECK-NEXT: ret define <4 x float> @test9(<4 x float> %a) { @@ -196,7 +196,7 @@ define <4 x float> @test17(<4 x float> %a) { ; Mask: [0,1,0,1] ; CHECK-NOT: pshufd ; CHECK-NOT: shufps -; CHECK: movlhps +; CHECK: unpcklpd ; CHECK-NEXT: ret define <4 x float> @test18(<4 x float> %a) { @@ -208,7 +208,7 @@ define <4 x float> @test18(<4 x float> %a) { ; Mask: [0,1,0,u] ; CHECK-NOT: pshufd ; CHECK-NOT: shufps -; CHECK: movlhps +; CHECK: unpcklpd ; CHECK-NEXT: ret define <4 x float> @test19(<4 x float> %a) { diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll index 3e31b4b190b..0d5d299ed10 100644 --- a/test/CodeGen/X86/extractelement-load.ll +++ b/test/CodeGen/X86/extractelement-load.ll @@ -36,7 +36,9 @@ define void @t3() { ; ; This movs the entire vector, shuffling the high double down. If we fixed the ; FIXME above it would just move the high double directly. 
-; CHECK: movhpd %xmm +; CHECK: movups +; CHECK: movhlps +; CHECK: movlps bb: %tmp13 = load <2 x double>* undef, align 1 diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll index 93380427f49..6391ef61682 100644 --- a/test/CodeGen/X86/vec_extract.ll +++ b/test/CodeGen/X86/vec_extract.ll @@ -41,7 +41,7 @@ entry: define double @test4(double %A) nounwind { ; CHECK-LABEL: test4: ; CHECK: calll {{.*}}foo -; CHECK-NEXT: unpckhpd %[[X:xmm[0-9]+]], %[[X]] +; CHECK-NEXT: movhlps %[[X:xmm[0-9]+]], %[[X]] ; CHECK-NEXT: addsd {{.*}}(%{{.*}}), %[[X2]] ; CHECK-NEXT: movsd %[[X2]], [[mem:.*\(%.*\)]] ; CHECK-NEXT: fldl [[mem]] diff --git a/test/CodeGen/X86/vec_shuffle-38.ll b/test/CodeGen/X86/vec_shuffle-38.ll index ec196df7aef..7e4f747f735 100644 --- a/test/CodeGen/X86/vec_shuffle-38.ll +++ b/test/CodeGen/X86/vec_shuffle-38.ll @@ -7,7 +7,7 @@ define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp { } define <2 x double> @hd(<2 x double> %p) nounwind optsize ssp { -; CHECK: unpckhpd +; CHECK: movhlps %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> ret <2 x double> %shuffle } diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll index 28f2a9074cb..07eeb3575c7 100644 --- a/test/CodeGen/X86/vec_splat.ll +++ b/test/CodeGen/X86/vec_splat.ll @@ -28,7 +28,7 @@ define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind { ret void ; SSE2-LABEL: test_v2sd: -; SSE2: shufpd $0 +; SSE2: movlhps ; SSE3-LABEL: test_v2sd: ; SSE3: movddup diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll index d0e8dfd242a..49d2eeb482f 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 
-x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" @@ -48,7 +49,7 @@ define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) { define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2f64_00 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0,0] +; CHECK-SSE2: movlhps {{.*}} # xmm0 = xmm0[0,0] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle @@ -62,17 +63,15 @@ define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) { } define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2f64_11 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,1] +; CHECK-SSE2: movhlps {{.*}} # xmm0 = xmm0[1,1] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) { -; FIXME: Should these use movapd + shufpd to remove a domain change at the cost -; of a mov? 
-; ; CHECK-SSE2-LABEL: @shuffle_v2f64_22 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1] +; CHECK-SSE2: movlhps {{.*}} # xmm1 = xmm1[0,0] +; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle @@ -86,7 +85,8 @@ define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) { } define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2f64_33 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3] +; CHECK-SSE2: movhlps {{.*}} # xmm1 = xmm1[1,1] +; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle @@ -217,3 +217,32 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64 %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } + + +define <2 x double> @insert_dup_reg_v2f64(double %a) { +; CHECK-SSE2-LABEL: @insert_dup_reg_v2f64 +; CHECK-SSE2: movlhps {{.*}} # xmm0 = xmm0[0,0] +; CHECK-SSE2-NEXT: retq +; +; FIXME: This should match movddup as well! 
+; CHECK-SSE3-LABEL: @insert_dup_reg_v2f64 +; CHECK-SSE3: unpcklpd {{.*}} # xmm0 = xmm0[0,0] +; CHECK-SSE3-NEXT: retq + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> + ret <2 x double> %shuffle +} +define <2 x double> @insert_dup_mem_v2f64(double* %ptr) { +; CHECK-SSE2-LABEL: @insert_dup_mem_v2f64 +; CHECK-SSE2: movsd {{.*}}, %xmm0 +; CHECK-SSE2-NEXT: movlhps {{.*}} # xmm0 = xmm0[0,0] +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE3-LABEL: @insert_dup_mem_v2f64 +; CHECK-SSE3: movddup {{.*}}, %xmm0 +; CHECK-SSE3-NEXT: retq + %a = load double* %ptr + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> + ret <2 x double> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index cde96dbb30f..a21b78985d7 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -98,7 +98,7 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) { define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: @shuffle_v4f64_0001 ; AVX1: # BB#0: -; AVX1-NEXT: vmovlhps {{.*}} # xmm1 = xmm0[0,0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -109,7 +109,7 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -120,7 +120,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x 
double> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1] -; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -130,7 +130,7 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: @shuffle_v4f64_1000 ; AVX1: # BB#0: ; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[1,0] -; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -140,8 +140,8 @@ define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: @shuffle_v4f64_2200 ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovlhps {{.*}} # xmm1 = xmm1[0,0] -; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>