diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f85b00a84f8..5b0d315a6d0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7488,6 +7488,81 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
 }
 
+static SDValue lowerIntegerElementInsertionVectorShuffle(
+    MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  int V2Index = std::find_if(Mask.begin(), Mask.end(),
+                             [&Mask](int M) { return M >= (int)Mask.size(); }) -
+                Mask.begin();
+
+  // Check for a single input from a SCALAR_TO_VECTOR node.
+  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+  // all the smarts here sunk into that routine. However, the current
+  // lowering of BUILD_VECTOR makes that nearly impossible until the old
+  // vector shuffle lowering is dead.
+  if ((Mask[V2Index] == (int)Mask.size() &&
+       V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
+      V2.getOpcode() == ISD::BUILD_VECTOR) {
+    SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size());
+
+    bool V1IsAllZero = false;
+    if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+      V1IsAllZero = true;
+    } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+      V1IsAllZero = true;
+      for (int M : Mask) {
+        if (M < 0 || M >= (int)Mask.size())
+          continue;
+        SDValue Input = V1.getOperand(M);
+        if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
+          // A non-zero input!
+          V1IsAllZero = false;
+          break;
+        }
+      }
+    }
+    if (V1IsAllZero) {
+      // First, we need to zext the scalar if it is smaller than an i32.
+      MVT EltVT = VT.getVectorElementType();
+      assert(EltVT == V2S.getSimpleValueType() &&
+             "Different scalar and element types!");
+      MVT ExtVT = VT;
+      if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+        // Zero-extend directly to i32.
+        ExtVT = MVT::v4i32;
+        V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+      }
+
+      V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT,
+                       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S));
+      if (ExtVT != VT)
+        V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+
+      if (V2Index != 0) {
+        // If we have 4 or fewer lanes we can cheaply shuffle the element into
+        // the desired position. Otherwise it is more efficient to do a vector
+        // shift left. We know that we can do a vector shift left because all
+        // the inputs are zero.
+        if (VT.getVectorNumElements() <= 4) {
+          SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
+          V2Shuffle[V2Index] = 0;
+          V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
+        } else {
+          V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
+          V2 = DAG.getNode(
+              X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
+              DAG.getConstant(
+                  V2Index * EltVT.getSizeInBits(),
+                  DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
+          V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+        }
+      }
+      return V2;
+    }
+  }
+  return SDValue();
+}
+
 /// \brief Lower 4-lane i32 vector shuffles.
 ///
 /// We try to handle these with integer-domain shuffles where we can, but for
@@ -7519,50 +7594,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
 
   // There are special ways we can lower some single-element blends.
-  if (NumV2Elements == 1) {
-    int V2Index =
-        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
-        Mask.begin();
-
-    // Check for a single input from a SCALAR_TO_VECTOR node.
-    // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
-    // all the smarts here sunk into that routine. However, the current
-    // lowering of BUILD_VECTOR makes that nearly impossible until the old
-    // vector shuffle lowering is dead.
-    if ((Mask[V2Index] == 4 && V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
-        V2.getOpcode() == ISD::BUILD_VECTOR) {
-      SDValue V2S = V2.getOperand(Mask[V2Index] - 4);
-
-      bool V1IsAllZero = false;
-      if (ISD::isBuildVectorAllZeros(V1.getNode())) {
-        V1IsAllZero = true;
-      } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
-        V1IsAllZero = true;
-        for (int M : Mask) {
-          if (M < 0 || M >= 4)
-            continue;
-          SDValue Input = V1.getOperand(M);
-          if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
-            // A non-zero input!
-            V1IsAllZero = false;
-            break;
-          }
-        }
-      }
-      if (V1IsAllZero) {
-        V2 = DAG.getNode(
-            X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
-            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V2S));
-        if (V2Index != 0) {
-          int V2Shuffle[] = {1, 1, 1, 1};
-          V2Shuffle[V2Index] = 0;
-          V2 = DAG.getVectorShuffle(MVT::v4i32, DL, V2,
-                                    DAG.getUNDEF(MVT::v4i32), V2Shuffle);
-        }
-        return V2;
-      }
-    }
-  }
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+            MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG))
+      return V;
 
   // We implement this with SHUFPS because it can blend from two vectors.
   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
@@ -8210,6 +8245,12 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                             "to be V1-input shuffles.");
 
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Inputs == 1)
+    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+            MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG))
+      return V;
+
   if (NumV1Inputs + NumV2Inputs <= 4)
     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
 
@@ -8347,8 +8388,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
 
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
+
   // For single-input shuffles, there are some nicer lowering tricks we can use.
-  if (isSingleInputShuffleMask(Mask)) {
+  if (NumV2Elements == 0) {
     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
     // Notably, this handles splat and partial-splat shuffles more efficiently.
     // However, it only makes sense if the pre-duplication shuffle simplifies
@@ -8495,6 +8539,12 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
   }
 
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+            MVT::v16i8, DL, V1, V2, Mask, Subtarget, DAG))
+      return V;
+
   // Check whether a compaction lowering can be done. This handles shuffles
   // which take every Nth element for some even N. See the helper function for
   // details.
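As a reviewer aid, not part of the patch itself: the standalone C++ sketch below reproduces the index arithmetic the new helper relies on. It finds the single lane sourced from V2 (the std::find_if above) and derives the PSLLDQ byte immediate from V2Index * EltSizeInBits; the X86ISD::VSHLDQ node carries the amount in bits and the instruction pattern converts to bytes, so the sketch divides by 8 to show the final immediate. The name computePslldqImm and the plain std::vector mask are illustrative inventions, not LLVM API.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

// Given a zero-or-insert shuffle mask over NumElts lanes of EltSizeInBits
// bits each, return the byte immediate PSLLDQ needs to move the single V2
// element from lane 0 into its destination lane.
static int computePslldqImm(const std::vector<int> &Mask, int EltSizeInBits) {
  int NumElts = (int)Mask.size();
  // Find the one lane sourced from V2, i.e. with an index >= NumElts,
  // mirroring the std::find_if in the lowering helper.
  int V2Index = (int)(std::find_if(Mask.begin(), Mask.end(),
                                   [&](int M) { return M >= NumElts; }) -
                      Mask.begin());
  assert(V2Index < NumElts && "expected a lane sourced from V2");
  // The DAG node carries the shift amount in bits; the PSLLDQ immediate
  // is in bytes, hence the division by 8.
  return V2Index * EltSizeInBits / 8;
}

int main() {
  // v8i16 with the V2 element in lane 5 -- the shuffle_v8i16_zzzzz8zz
  // test below: prints "pslldq $10" (5 lanes * 16 bits / 8).
  std::vector<int> Mask = {0, 1, 2, 3, 4, 8, 6, 7};
  std::printf("pslldq $%d\n", computePslldqImm(Mask, 16));
}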
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 6f49a03cb8b..38734eda941 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -325,3 +325,50 @@ define <16 x i8> @PR20540(<8 x i8> %a) {
   %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i8> %shuffle
 }
+
+define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl {{.*}}, %[[R:.*]]
+; SSE2-NEXT:    movd %[[R]], %xmm0
+; SSE2-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl {{.*}}, %[[R:.*]]
+; SSE2-NEXT:    movd %[[R]], %xmm0
+; SSE2-NEXT:    pslldq $5, %xmm0
+; SSE2-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
+; SSE2-LABEL: @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl {{.*}}, %[[R:.*]]
+; SSE2-NEXT:    movd %[[R]], %xmm0
+; SSE2-NEXT:    pslldq $15, %xmm0
+; SSE2-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl {{.*}}, %[[R:.*]]
+; SSE2-NEXT:    movd %[[R]], %xmm0
+; SSE2-NEXT:    pslldq $2, %xmm0
+; SSE2-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 3
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index f1e17377c13..33993aae682 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -771,3 +771,62 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
   ret <8 x i16> %shuffle
 }
+
+define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
+; ALL-LABEL: @shuffle_v8i16_8zzzzzzz
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl {{.*}}, %[[R:.*]]
+; ALL-NEXT:    movd %[[R]], %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
+; ALL-LABEL: @shuffle_v8i16_z8zzzzzz
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl {{.*}}, %[[R:.*]]
+; ALL-NEXT:    movd %[[R]], %xmm0
+; ALL-NEXT:    pslldq $2, %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
+; ALL-LABEL: @shuffle_v8i16_zzzzz8zz
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl {{.*}}, %[[R:.*]]
+; ALL-NEXT:    movd %[[R]], %xmm0
+; ALL-NEXT:    pslldq $10, %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
+; ALL-LABEL: @shuffle_v8i16_zuuzuuz8
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl {{.*}}, %[[R:.*]]
+; ALL-NEXT:    movd %[[R]], %xmm0
+; ALL-NEXT:    pslldq $14, %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
+; ALL-LABEL: @shuffle_v8i16_zzBzzzzz
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl {{.*}}, %[[R:.*]]
+; ALL-NEXT:    movd %[[R]], %xmm0
+; ALL-NEXT:    pslldq $4, %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 3
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
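Also not part of the patch: a scalar model of the full lowered sequence the FileCheck lines above expect, for sanity-checking the immediates. The sequence is movzbl/movzwl to zero-extend the scalar to 32 bits (the "zext directly to i32" step in the helper), movd to place it in lane 0 of an otherwise-zero register, and pslldq to shift it left to its destination byte offset. The function insertIntoZeroV16i8 is a hypothetical name used only for this illustration.

#include <cstdint>
#include <cstdio>

// Compute a zero vector with one inserted byte the way the lowered code
// sequence does: widen to 32 bits, place in lane 0, byte-shift left.
static void insertIntoZeroV16i8(uint8_t Scalar, int Lane, uint8_t Out[16]) {
  uint32_t Widened = Scalar;        // movzbl: zero-extend i8 -> i32
  uint8_t Tmp[16] = {0};
  for (int i = 0; i < 4; ++i)       // movd: low 32 bits of xmm, rest zero
    Tmp[i] = (uint8_t)(Widened >> (8 * i));
  for (int i = 0; i < 16; ++i)      // pslldq $Lane: shift left by Lane bytes
    Out[i] = (i >= Lane) ? Tmp[i - Lane] : 0;
}

int main() {
  uint8_t V[16];
  // Lane 5 of a v16i8, as in shuffle_v16i8_zz_zz_zz_zz_zz_16_...:
  // leaves 0xab at byte 5 and zeros elsewhere, matching pslldq $5.
  insertIntoZeroV16i8(0xab, 5, V);
  for (int i = 0; i < 16; ++i)
    std::printf("%02x ", V[i]);
  std::printf("\n");
}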