diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index da3ec8b35eb..b5744e093d2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7258,8 +7258,27 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
     if (Mask[i] >= 0 && Mask[i] != i)
       return SDValue(); // Shuffled V1 input!
   }
-  return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
-                     DAG.getConstant(BlendMask, MVT::i8));
+  if (VT == MVT::v4f32 || VT == MVT::v2f64)
+    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+                       DAG.getConstant(BlendMask, MVT::i8));
+  assert(!VT.isFloatingPoint() && "Only v4f32 and v2f64 are supported!");
+
+  // For integer shuffles we need to expand the mask and cast the inputs to
+  // v8i16s prior to blending.
+  assert((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) &&
+         "Not a supported integer vector type!");
+  int Scale = 8 / VT.getVectorNumElements();
+  BlendMask = 0;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i)
+    if (Mask[i] >= Size)
+      for (int j = 0; j < Scale; ++j)
+        BlendMask |= 1u << (i * Scale + j);
+
+  V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+  V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+                                 DAG.getConstant(BlendMask, MVT::i8)));
 }
 
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
@@ -7343,6 +7362,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isShuffleEquivalent(Mask, 1, 3))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend =
+            lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, DAG))
+      return Blend;
+
   // We implement this with SHUFPD which is pretty lame because it will likely
   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
   // However, all the alternatives are still more cycles and newer chips don't
@@ -7631,6 +7655,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG))
     return V;
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend =
+            lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, DAG))
+      return Blend;
+
   // We implement this with SHUFPS because it can blend from two vectors.
   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
   // up the inputs, bypassing domain shift penalties that we would incur if we
@@ -8289,6 +8318,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG))
     return V;
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend =
+            lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+      return Blend;
+
   if (NumV1Inputs + NumV2Inputs <= 4)
     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
 
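The core of the change above is the rescaling of the blend immediate from native elements to v8i16 lanes. Here is a minimal standalone sketch, in plain C++ rather than the LLVM API, that mirrors the Scale loop added above; the helper name scaleBlendMask and the sample masks are made up for illustration.

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Compute the 8-bit PBLENDW immediate for a two-input shuffle mask over
    // Size elements (2, 4, or 8); entries >= Size select from the second
    // input. Each native element covers Scale consecutive 16-bit lanes, so
    // one bit per element becomes Scale consecutive immediate bits.
    static unsigned scaleBlendMask(const std::vector<int> &Mask) {
      int Size = static_cast<int>(Mask.size());
      int Scale = 8 / Size;
      unsigned BlendMask = 0;
      for (int i = 0; i < Size; ++i)
        if (Mask[i] >= Size)
          for (int j = 0; j < Scale; ++j)
            BlendMask |= 1u << (i * Scale + j);
      return BlendMask;
    }

    int main() {
      // v2i64 mask <0, 3>: element 1 from V2 -> 16-bit lanes 4..7 -> 0xF0.
      assert(scaleBlendMask({0, 3}) == 0xF0);
      // v4i32 mask <0, 5, 2, 7>: elements 1 and 3 from V2 -> 0xCC.
      assert(scaleBlendMask({0, 5, 2, 7}) == 0xCC);
      std::printf("ok\n");
    }

Widening everything to v8i16 lets a single PBLENDW pattern serve v2i64, v4i32, and v8i16 blends, at the cost of bitcasts around the node; the test updates below follow directly from this.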
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index f6382a98559..d23b0794305 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -170,7 +170,7 @@ define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
 ; SSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v2i64_03
-; SSE41:         blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE41:         pblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %shuffle
@@ -187,8 +187,8 @@ define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v2i64_03_copy
-; SSE41:         blendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
-; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41:         pblendw {{.*}} # xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %shuffle
@@ -251,8 +251,8 @@ define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
 ; SSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v2i64_21
-; SSE41:         blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41:         pblendw {{.*}} # xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x i64> %shuffle
@@ -269,8 +269,8 @@ define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v2i64_21_copy
-; SSE41:         blendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41:         pblendw {{.*}} # xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x i64> %shuffle
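The check updates above swap blendpd for pblendw, but the bytes selected are identical; only the encoding granularity changes. A hedged standalone sketch (plain C++, not part of the test suite) that emulates both immediates and verifies the equivalence:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint16_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
      uint16_t b[8] = {8, 9, 10, 11, 12, 13, 14, 15};
      uint16_t pd[8], pw[8];
      // blendpd imm 0x2: 64-bit lane 1 from b (i.e. xmm0[0],xmm1[1]).
      for (int q = 0; q < 2; ++q)
        for (int w = 0; w < 4; ++w)
          pd[q * 4 + w] = ((0x2 >> q) & 1) ? b[q * 4 + w] : a[q * 4 + w];
      // pblendw imm 0xF0: 16-bit lanes 4..7 from b
      // (i.e. xmm0[0,1,2,3],xmm1[4,5,6,7]).
      for (int w = 0; w < 8; ++w)
        pw[w] = ((0xF0 >> w) & 1) ? b[w] : a[w];
      for (int w = 0; w < 8; ++w)
        assert(pd[w] == pw[w]); // same data, different lane granularity
      return 0;
    }

The movapd -> movdqa changes fall out the same way: once the blend is an integer-domain instruction, the following register copy stays in the integer domain too.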
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index affa933768b..757ef8bf176 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -40,7 +40,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0300
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    vpblendw {{.*}} # xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
@@ -282,7 +282,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
+; AVX1-NEXT:    vpblendw {{.*}} # xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
@@ -293,7 +293,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT:    vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; AVX1-NEXT:    vpblendw {{.*}} # xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
@@ -305,7 +305,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    vpblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
@@ -317,7 +317,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
@@ -335,9 +335,9 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0451
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
+; AVX1-NEXT:    vpblendw {{.*}} # xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    vpblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
@@ -355,9 +355,9 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_4015
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; AVX1-NEXT:    vpblendw {{.*}} # xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
@@ -369,7 +369,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    vpblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
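In the AVX1 checks above, the 256-bit integer shuffles are decomposed into 128-bit halves, so the new pblendw appears on xmm registers between vextractf128 and vinsertf128. A hedged intrinsics rendering of the @shuffle_v4i64_0300 sequence (illustrative only, not code from the patch; compile with -mavx, and the unused %b operand is dropped since the 0,3,0,0 mask reads only %a):

    #include <immintrin.h>

    // Produces {a0, a3, a0, a0}, matching the AVX1 asm checked above.
    __m256i shuffle_v4i64_0300(__m256i a) {
      __m128i lo = _mm256_castsi256_si128(a);        // {a0, a1}
      __m128i hi = _mm256_extractf128_si256(a, 1);   // vextractf128: {a2, a3}
      // vpblendw imm 0xF0: words 0..3 from lo, 4..7 from hi -> {a0, a3}.
      __m128i blended = _mm_blend_epi16(lo, hi, 0xF0);
      // vpshufd 0x44 = dwords [0,1,0,1] -> {a0, a0}.
      __m128i dup = _mm_shuffle_epi32(lo, 0x44);
      // vinsertf128: blended in the low half, dup in the high half.
      return _mm256_insertf128_si256(_mm256_castsi128_si256(blended), dup, 1);
    }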