From 89436b4160f6a7d203fceec94db8cdca9b4290d3 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Fri, 19 Sep 2014 09:45:21 +0000 Subject: [PATCH] [x86] Recognize that we can use duplication to widen v16i8 shuffles due to undef lanes as well as defined widenable lanes. This dramatically improves the lowering we use for undef-shuffles in a zext-ish pattern for SSE2. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218115 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 6 +++--- test/CodeGen/X86/vector-shuffle-128-v16.ll | 17 ++++++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 91ca1d6cc6e..9fa6267b4d2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -8716,10 +8716,10 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef Mask) { - for (int i = 0; i < 16; i += 2) { - if (Mask[i] != Mask[i + 1]) + for (int i = 0; i < 16; i += 2) + if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1]) return false; - } + return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index b260cdeb53e..e3684bfef00 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -512,10 +512,16 @@ define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30( } define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) { +; SSE2-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-NEXT: retq +; ; SSSE3-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},1,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu @@ -549,17 +555,14 @@ define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz( define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) { ; SSE2-LABEL: @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu ; SSE2: # BB#0: -; SSE2-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]] -; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7] -; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu