From 95b14b00dae387a1a2035a1fcaae37a00fef8b2e Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Thu, 10 Jul 2014 04:34:06 +0000 Subject: [PATCH] [x86] Initial improvements to the new shuffle lowering for v16i8 shuffles specifically for cases where a small subset of the elements in the input vector are actually used. This is specifically targetted at improving the shuffles generated for trunc operations, but also helps out splat-like operations. There is still some really low-hanging fruit here that I want to address but this is a huge step in the right direction. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@212680 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 46 +++++++++++++++++----- test/CodeGen/X86/vector-shuffle-128-v16.ll | 38 ++++++++++-------- 2 files changed, 57 insertions(+), 27 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 140b2a830ce..57b21cce12c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7815,16 +7815,42 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); - auto buildLoAndHiV8s = - [&](SDValue V, ArrayRef LoBlendMask, ArrayRef HiBlendMask) { - SDValue LoV = - DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); - SDValue HiV = - DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); - SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, LoV, HiV, LoBlendMask); - SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, LoV, HiV, HiBlendMask); + auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef LoBlendMask, + MutableArrayRef HiBlendMask) { + SDValue V1, V2; + // Check if any of the odd lanes in the v16i8 are used. If not, we can mask + // them out and avoid using UNPCK{L,H} to extract the elements of V as + // i16s. + if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(), + [](int M) { return M >= 0 && M % 2 == 1; }) && + std::none_of(HiBlendMask.begin(), HiBlendMask.end(), + [](int M) { return M >= 0 && M % 2 == 1; })) { + // Use a mask to drop the high bytes. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1, + DAG.getConstant(0x00FF, MVT::v8i16)); + + // This will be a single vector shuffle instead of a blend so nuke V2. + V2 = DAG.getUNDEF(MVT::v8i16); + + // Squash the masks to point directly into V1. + for (int &M : LoBlendMask) + if (M >= 0) + M /= 2; + for (int &M : HiBlendMask) + if (M >= 0) + M /= 2; + } else { + // Otherwise just unpack the low half of V into V1 and the high half into + // V2 so that we can blend them as i16s. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); + } + + SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); + SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); return std::make_pair(BlendedLo, BlendedHi); }; SDValue V1Lo, V1Hi, V2Lo, V2Hi; diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index 0c317c3dd22..85d2bb56292 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -30,11 +30,9 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01( define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) { ; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08 ; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: punpckhbw %xmm1, %xmm2 -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: punpcklwd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pand +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0 ; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] @@ -60,18 +58,11 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03( define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) { ; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12 ; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: punpckhbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: punpcklwd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,7] +; CHECK-SSE2-NEXT: pand +; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,1] +; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,3,3,4,5,6,7] +; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,6,7] ; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0 ; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,0,1] @@ -189,3 +180,16 @@ define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) { %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } + +define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) { +; CHECK-SSE2-LABEL: @trunc_v4i32_shuffle +; CHECK-SSE2: # BB#0: +; CHECK-SSE2-NEXT: pand +; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] +; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0 +; CHECK-SSE2-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle +}