diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8b102e4fbb9..c6f73baeb2a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7182,21 +7182,6 @@ static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
   return true;
 }
 
-/// \brief Check wether all of one set of inputs to a shuffle mask are in place.
-///
-/// Mask entries pointing at the other input or undef will be skipped.
-static bool isShuffleMaskInputInPlace(ArrayRef<int> Mask, bool LoInput = true) {
-  int Size = Mask.size();
-  for (int i = 0; i < Size; ++i) {
-    int M = Mask[i];
-    if (M == -1 || (LoInput && M >= 4) || (!LoInput && M < 4))
-      continue;
-    if (M - (LoInput ? 0 : Size) != i)
-      return false;
-  }
-  return true;
-}
-
 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
 // 2013 will allow us to use it as a non-type template parameter.
 namespace {
@@ -7385,13 +7370,48 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     // INSERTPS when the V1 elements are already in the correct locations
     // because otherwise we can just always use two SHUFPS instructions which
     // are much smaller to encode than a SHUFPS and an INSERTPS.
-    if (Subtarget->hasSSE41() &&
-        isShuffleMaskInputInPlace(Mask, /*LoInput*/ true)) {
-      // Insert the V2 element into the desired position.
-      SDValue InsertPSMask =
-          DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4);
-      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
-                         InsertPSMask);
+    if (Subtarget->hasSSE41()) {
+      // When using INSERTPS we can zero any lane of the destination. Collect
+      // the zero inputs into a mask and drop them from the lanes of V1 which
+      // actually need to be present as inputs to the INSERTPS.
+      unsigned ZMask = 0;
+      if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+        ZMask = 0xF ^ (1 << V2Index);
+      } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+        for (int i = 0; i < 4; ++i) {
+          int M = Mask[i];
+          if (M >= 4)
+            continue;
+          if (M > -1) {
+            SDValue Input = V1.getOperand(M);
+            if (Input.getOpcode() != ISD::UNDEF &&
+                !X86::isZeroNode(Input)) {
+              // A non-zero input!
+              ZMask = 0;
+              break;
+            }
+          }
+          ZMask |= 1 << i;
+        }
+      }
+
+      // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
+      int InsertShuffleMask[4] = {-1, -1, -1, -1};
+      for (int i = 0; i < 4; ++i)
+        if (i != V2Index && (ZMask & (1 << i)) == 0)
+          InsertShuffleMask[i] = Mask[i];
+
+      if (isNoopShuffleMask(InsertShuffleMask)) {
+        // Replace V1 with undef if nothing from V1 survives the INSERTPS.
+        if ((ZMask | 1 << V2Index) == 0xF)
+          V1 = DAG.getUNDEF(MVT::v4f32);
+
+        // Insert the V2 element into the desired position.
+        SDValue InsertPSMask =
+            DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4 | ZMask);
+        return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                           InsertPSMask);
+      }
     }
 
     // Compute the index adjacent to V2Index and in the same half by toggling
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 0c43e0e9d27..7f448835b5d 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -207,3 +207,113 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
   ret <4 x i32> %shuffle
 }
+
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_4zzz
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][1,0]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X]][2,3]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_4zzz
+; SSE41: insertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_4zzz
+; AVX1: vinsertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_z4zz
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][2,0]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][3,0]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_z4zz
+; SSE41: insertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_z4zz
+; AVX1: vinsertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zz4z
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][0,0]
+; SSE2-NEXT: shufps {{.*}} # [[X]] = [[X]][0,0],xmm0[0,2]
+; SSE2-NEXT: movaps %[[X]], %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zz4z
+; SSE41: insertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zz4z
+; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zuu4
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %[[X]], %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zuu4
+; SSE41: insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zuu4
+; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zzz7
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[3,0],[[X]][2,0]
+; SSE2-NEXT: shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %[[X]], %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zzz7
+; SSE41: insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zzz7
+; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_z6zz
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][0,0]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][2,3]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_z6zz
+; SSE41: insertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_z6zz
+; AVX1: vinsertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}