diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4d090529784..17f925bd3e8 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9863,14 +9863,48 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 }
 
-/// \brief Tiny helper function to test whether a shuffle mask could be
+/// \brief Helper function to test whether a shuffle mask could be
 /// simplified by widening the elements being shuffled.
-static bool canWidenShuffleElements(ArrayRef<int> Mask) {
-  for (int i = 0, Size = Mask.size(); i < Size; i += 2)
-    if ((Mask[i] != -1 && Mask[i] % 2 != 0) ||
-        (Mask[i + 1] != -1 && (Mask[i + 1] % 2 != 1 ||
-                               (Mask[i] != -1 && Mask[i] + 1 != Mask[i + 1]))))
-      return false;
+///
+/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
+/// leaves it in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zero-ed lane of a vector.
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+                                    SmallVectorImpl<int> &WidenedMask) {
+  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+    // Check for any of the sentinel values (negative) and if they are the same,
+    // we can widen to that.
+    if (Mask[i] < 0 && Mask[i] == Mask[i + 1]) {
+      WidenedMask.push_back(Mask[i]);
+      continue;
+    }
+
+    // Check for an undef mask and a mask value properly aligned to fit with
+    // a pair of values. If we find such a case, use the non-undef mask's value.
+    if (Mask[i] == -1 && Mask[i + 1] % 2 == 1) {
+      WidenedMask.push_back(Mask[i + 1] / 2);
+      continue;
+    }
+    if (Mask[i + 1] == -1 && Mask[i] % 2 == 0) {
+      WidenedMask.push_back(Mask[i] / 2);
+      continue;
+    }
+
+    // Finally check if the two mask values are adjacent and aligned with
+    // a pair.
+    if (Mask[i] != -1 && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
+      WidenedMask.push_back(Mask[i] / 2);
+      continue;
+    }
+
+    // Otherwise we can't safely widen the elements used in this shuffle.
+    return false;
+  }
+  assert(WidenedMask.size() == Mask.size() / 2 &&
+         "Incorrect size of mask after widening the elements!");
 
   return true;
 }
@@ -9922,20 +9956,16 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
   // lanes but wider integers. We cap this to not form integers larger than i64
   // but it might be interesting to form i128 integers to handle flipping the
   // low and high halves of AVX 256-bit vectors.
+  SmallVector<int, 16> WidenedMask;
   if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
-      canWidenShuffleElements(Mask)) {
-    SmallVector<int, 16> NewMask;
-    for (int i = 0, Size = Mask.size(); i < Size; i += 2)
-      NewMask.push_back(Mask[i] != -1
-                            ? Mask[i] / 2
-                            : (Mask[i + 1] != -1 ? Mask[i + 1] / 2 : -1));
+      canWidenShuffleElements(Mask, WidenedMask)) {
     MVT NewVT =
         MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
                          VT.getVectorNumElements() / 2);
     V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
     V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
     return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+                       DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
   }
 
   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
@@ -20697,10 +20727,10 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
   // elements, and shrink them to the half-width mask. It does this in a loop
   // so it will reduce the size of the mask to the minimal width mask which
   // performs an equivalent shuffle.
-  while (Mask.size() > 1 && canWidenShuffleElements(Mask)) {
-    for (int i = 0, e = Mask.size() / 2; i < e; ++i)
-      Mask[i] = Mask[2 * i] / 2;
-    Mask.resize(Mask.size() / 2);
+  SmallVector<int, 16> WidenedMask;
+  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+    Mask = std::move(WidenedMask);
+    WidenedMask.clear();
   }
 
   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
@@ -20971,12 +21001,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
       return SDValue(); // We combined away this shuffle, so we're done.
 
     // See if this reduces to a PSHUFD which is no more expensive and can
-    // combine with more operations.
-    if (canWidenShuffleElements(Mask)) {
-      int DMask[] = {-1, -1, -1, -1};
+    // combine with more operations. Note that it has to at least flip the
+    // dwords as otherwise it would have been removed as a no-op.
+    if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
+      int DMask[] = {0, 1, 2, 3};
       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
-      DMask[DOffset + 0] = DOffset + Mask[0] / 2;
-      DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+      DMask[DOffset + 0] = DOffset + 1;
+      DMask[DOffset + 1] = DOffset + 0;
       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
       DCI.AddToWorklist(V.getNode());
       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 0fcd0cca7a2..afe04589f15 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1262,17 +1262,14 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpmovzxbw %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpmovzxbw %xmm4, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*}} # xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; AVX1-NEXT:    vpmovzxbw %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -1289,7 +1286,7 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_
 ; AVX1-LABEL: @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = [8,9,10,11,12,13,14,15,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = <8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
@@ -1312,12 +1309,11 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
 ; AVX1-LABEL: @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = [8,9,10,11,12,13,14,15,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = <8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX1-NEXT:    vpshufb {{.*}} # xmm1 = xmm1[0,1,2,3,4,5,6,7,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vpmovzxbw %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -1338,12 +1334,11 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
 define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovdqa {{.*}} # xmm2 = [8,9,10,11,12,13,14,15,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vmovdqa {{.*}} # xmm2 = <8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpshufb {{.*}} # xmm1 = xmm1[0,1,2,3,4,5,6,7,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpmovzxbw %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -1567,3 +1562,47 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
   ret <32 x i8> %shuffle
 }
+
+define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*}} # xmm2 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpshufb {{.*}} # xmm4 = xmm3[4,3,u,3,u,u,u,u,u,u,u],zero,xmm3[u,u,u,u]
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb {{.*}} # xmm2 = xmm2[0,1],zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpshufb {{.*}} # xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*}} # xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
+; AVX1-NEXT:    vpor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpshufb {{.*}} # xmm5 = zero,zero,xmm5[2],zero,xmm5[4,5,6,7,8,9,10],zero,xmm5[12,13,14,15]
+; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpshufb {{.*}} # xmm3 = xmm3[u,u],zero,zero,xmm3[u,u,u,u,1,6,13,u,u],zero,xmm3[u,u]
+; AVX1-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*}} # xmm0 = zero,zero,xmm0[2,3],zero,zero,zero,zero,xmm0[8,9,10],zero,zero,xmm0[13],zero,zero
+; AVX1-NEXT:    vpshufb {{.*}} # xmm3 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3]
+; AVX1-NEXT:    vpshufb {{.*}} # xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
+; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb {{.*}} # xmm1 = xmm1[0,1],zero,zero,xmm1[4,5,6,7],zero,zero,zero,xmm1[11,12],zero,xmm1[14,15]
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*}} # ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm3 = <0,0,u,u,0,0,128,0,u,u,u,0,0,u,128,128,u,u,0,u,0,0,128,128,0,128,0,u,128,128,128,128>
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*}} # ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*}} # ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,128,128,0,0,0,0,128,128,128,0,0,128,0,0,128,128,0,128,0,0,0,0,0,0,0,128,0,0,0,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39>
+  ret <32 x i8> %shuffle
+}
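
For reference, below is a minimal standalone sketch of the element-widening check this patch adds to canWidenShuffleElements. It follows the same pair-by-pair logic as the code above, but substitutes std::vector for LLVM's ArrayRef/SmallVectorImpl and adds a small driver; those substitutions and the example mask in main are illustrative assumptions, not part of the patch.

// Sentinels follow the patch: -1 is an undef lane, -2 is a zeroed lane in a
// *target* shuffle mask.
#include <cassert>
#include <cstdio>
#include <vector>

static bool canWidenShuffleElements(const std::vector<int> &Mask,
                                    std::vector<int> &WidenedMask) {
  for (int i = 0, Size = (int)Mask.size(); i < Size; i += 2) {
    // Matching sentinel values widen to that same sentinel.
    if (Mask[i] < 0 && Mask[i] == Mask[i + 1]) {
      WidenedMask.push_back(Mask[i]);
      continue;
    }
    // One undef lane paired with a defined lane that is aligned for the pair.
    if (Mask[i] == -1 && Mask[i + 1] % 2 == 1) {
      WidenedMask.push_back(Mask[i + 1] / 2);
      continue;
    }
    if (Mask[i + 1] == -1 && Mask[i] % 2 == 0) {
      WidenedMask.push_back(Mask[i] / 2);
      continue;
    }
    // Two adjacent lanes starting on an even boundary form one wide lane.
    if (Mask[i] != -1 && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
      WidenedMask.push_back(Mask[i] / 2);
      continue;
    }
    // This pair cannot be expressed with wider elements.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");
  return true;
}

int main() {
  // A v8i16 mask <0,1,2,3,-1,-1,6,7> widens to the v4i32 mask <0,1,-1,3>.
  std::vector<int> Widened;
  if (canWidenShuffleElements({0, 1, 2, 3, -1, -1, 6, 7}, Widened))
    std::printf("widened mask: %d %d %d %d\n", Widened[0], Widened[1],
                Widened[2], Widened[3]);
  return 0;
}

Returning the widened mask through an output parameter, rather than recomputing it at each call site, is what lets combineX86ShufflesRecursively simply move-assign the result and retry in a loop until the mask reaches its minimal width.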