From a3805f1c73d7bdb8fd51e5add0f0fb58b71f1086 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Sat, 16 Aug 2014 09:42:15 +0000 Subject: [PATCH] [x86] Teach lots of the new vector shuffle lowering to use UNPCK instructions for blend operations at 128 bits. This was a serious hole in our prior blend lowering. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@215819 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 24 +++++++++++++++++++ test/CodeGen/X86/vector-shuffle-128-v2.ll | 28 +++++++++++------------ test/CodeGen/X86/vector-shuffle-128-v4.ll | 6 ++--- test/CodeGen/X86/vector-shuffle-256-v4.ll | 4 ++-- 4 files changed, 43 insertions(+), 19 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 5dd85f4e28f..efcc7ed2b15 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7142,6 +7142,12 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 2)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 3)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, MVT::i8)); @@ -7178,6 +7184,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); } + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 2)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 3)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); + // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't @@ -7216,6 +7228,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); + if (NumV2Elements == 1) { int V2Index = std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - @@ -7304,6 +7322,12 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); + // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would encur if we diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll index 78b4ee7e5dd..d0e8dfd242a 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -95,15 +95,15 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) { define <2 x i64> @shuffle_v2i64_02(<2 x i64> %a, <2 x i64> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2i64_02 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE2: punpcklqdq {{.*}} # xmm0 = xmm0[0],xmm1[0] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2i64_02_copy -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 +; CHECK-SSE2: punpcklqdq {{.*}} # xmm1 = xmm1[0],xmm2[0] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -140,31 +140,31 @@ define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64 } define <2 x i64> @shuffle_v2i64_13(<2 x i64> %a, <2 x i64> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2i64_13 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[1] +; CHECK-SSE2: punpckhqdq {{.*}} # xmm0 = xmm0[1],xmm1[1] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_13_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2i64_13_copy -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 +; CHECK-SSE2: punpckhqdq {{.*}} # xmm1 = xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_20(<2 x i64> %a, <2 x i64> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2i64_20 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 +; CHECK-SSE2: punpcklqdq {{.*}} # xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2i64_20_copy -; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[0] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0 +; CHECK-SSE2: punpcklqdq {{.*}} # xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -203,16 +203,16 @@ define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64 } define <2 x i64> @shuffle_v2i64_31(<2 x i64> %a, <2 x i64> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2i64_31 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[1] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 +; CHECK-SSE2: punpckhqdq {{.*}} # xmm1 = xmm1[1],xmm0[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { ; CHECK-SSE2-LABEL: @shuffle_v2i64_31_copy -; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0 +; CHECK-SSE2: punpckhqdq {{.*}} # xmm2 = xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index 3b7c146dd36..d409d235033 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -155,7 +155,7 @@ define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) { } define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) { ; CHECK-SSE2-LABEL: @shuffle_v4i32_0145 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE2: punpcklqdq {{.*}} # xmm0 = xmm0[0],xmm1[0] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -170,8 +170,8 @@ define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) { } define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) { ; CHECK-SSE2-LABEL: @shuffle_v4i32_4501 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 +; CHECK-SSE2: punpcklqdq {{.*}} # xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 7051888302e..306c85bd478 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -17,7 +17,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: @shuffle_v4i64_0020 ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0],xmm0[0] ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -108,7 +108,7 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: @shuffle_v4f64_0020 ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm1[0],xmm0[0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq