diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f4f41c1d399..596be841751 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9681,6 +9681,64 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
 }
 
+/// \brief Either split a vector in halves or decompose the shuffles and the
+/// blend.
+///
+/// This is provided as a good fallback for many lowerings of non-single-input
+/// shuffles with more than one 128-bit lane. In those cases, we want to select
+/// between splitting the shuffle into 128-bit components and stitching those
+/// back together vs. extracting the single-input shuffles and blending those
+/// results.
+static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
+                                                SDValue V2, ArrayRef<int> Mask,
+                                                SelectionDAG &DAG) {
+  assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
+                                            "lower single-input shuffles as it "
+                                            "could then recurse on itself.");
+  int Size = Mask.size();
+
+  // If this can be modeled as a broadcast of two elements followed by a blend,
+  // prefer that lowering. This is especially important because broadcasts can
+  // often fold with memory operands.
+  auto DoBothBroadcast = [&] {
+    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
+    for (int M : Mask)
+      if (M >= Size) {
+        if (V2BroadcastIdx == -1)
+          V2BroadcastIdx = M - Size;
+        else if (M - Size != V2BroadcastIdx)
+          return false;
+      } else if (M >= 0) {
+        if (V1BroadcastIdx == -1)
+          V1BroadcastIdx = M;
+        else if (M != V1BroadcastIdx)
+          return false;
+      }
+    return true;
+  };
+  if (DoBothBroadcast())
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+                                                      DAG);
+
+  // If the inputs all stem from a single 128-bit lane of each input, then we
+  // split them rather than blending because the split will decompose to
+  // unusually few instructions.
+  int LaneCount = VT.getSizeInBits() / 128;
+  int LaneSize = Size / LaneCount;
+  SmallBitVector LaneInputs[2];
+  LaneInputs[0].resize(LaneCount, false);
+  LaneInputs[1].resize(LaneCount, false);
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0)
+      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
+  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
+    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
+  // that the decomposed single-input shuffles don't end up here.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
 /// a permutation and blend of those lanes.
 ///
@@ -9855,9 +9913,14 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                        DAG.getConstant(SHUFPDMask, MVT::i8));
   }
 
-  // Otherwise fall back on generic blend lowering.
-  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
-                                                    Mask, DAG);
+  // If we have AVX2 then we always want to lower with a blend because at v4 we
+  // can fully permute the elements.
+  if (Subtarget->hasAVX2())
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
+                                                      Mask, DAG);
+
+  // Otherwise fall back on generic lowering.
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
 }
 
 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
@@ -9997,9 +10060,14 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 DAG);
   }
 
-  // Otherwise fall back on generic blend lowering.
-  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
-                                                    Mask, DAG);
+  // If we have AVX2 then we always want to lower with a blend because at v8 we
+  // can fully permute the elements.
+  if (Subtarget->hasAVX2())
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
+                                                      Mask, DAG);
+
+  // Otherwise fall back on generic lowering.
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
 }
 
 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
@@ -10125,9 +10193,8 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
   }
 
-  // Otherwise fall back on generic blend lowering.
-  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i16, V1, V2,
-                                                    Mask, DAG);
+  // Otherwise fall back on generic lowering.
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
 }
 
 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
@@ -10191,9 +10258,8 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
   }
 
-  // Otherwise fall back on generic blend lowering.
-  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v32i8, V1, V2,
-                                                    Mask, DAG);
+  // Otherwise fall back on generic lowering.
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
 }
 
 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 5afdcabe7f6..a103405a85d 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -182,12 +182,20 @@ entry:
 ;;;; Cases we must not select vperm2f128
 
 define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: G:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; ALL-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
-; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: G:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: G:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; AVX2-NEXT: retq
 entry:
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32>
   ret <8 x float> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 2fa7e4d81f9..d217e8773a0 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -528,11 +528,9 @@ define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0451:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0,0]
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_0451:
@@ -557,11 +555,9 @@ define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_4015:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0,0]
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_4015:
@@ -577,11 +573,9 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_2u35:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_2u35:
@@ -623,10 +617,9 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: stress_test1:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index e57a4b81a2b..84ef87a479e 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -275,13 +275,9 @@ define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_08192a3b:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,2,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_08192a3b:
@@ -299,12 +295,11 @@ define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_08991abb:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_08991abb:
@@ -1010,13 +1005,9 @@ define <8 x i32> @shuffle_v8i32_2a3b6e7f(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_08192a3b:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,2,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_08192a3b:
@@ -1034,12 +1025,11 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_08991abb:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_08991abb:
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 07250fe97b7..f61e5f6fd84 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2538,11 +2538,10 @@ define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: combine_unneeded_subvector2: