[x86] Teach the new vector shuffle lowering to use BLENDPS and BLENDPD.

These are super simple. They even take precedence over crazy
instructions like INSERTPS because they have very high throughput on
modern x86 chips.

I still have to teach the integer shuffle variants about this to avoid
so many domain crossings. However, due to the particular instructions
available, that's a touch more complex, so it will be a separate patch.

Also, the backend doesn't seem to realize it can commute the operands of
a blend instruction by inverting the mask (flipping each lane-select
bit). That would help remove a number of copies here. Suggestions on how
to do this are welcome; it's an area I'm less familiar with.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217744 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Chandler Carruth 2014-09-14 23:43:33 +00:00
parent 0309c5d4bc
commit e610c324e1
4 changed files with 134 additions and 37 deletions

View File

@ -7233,6 +7233,31 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
return DAG.getConstant(Imm, MVT::i8);
}
/// \brief Attempt to lower a two-input shuffle as a single blend node.
///
/// No check is made here that a blend instruction is actually available for
/// \p VT; the X86ISD::BLENDI node emitted here is expected to be matched by a
/// backend pattern for the type given. The only thing verified is that
/// \p Mask really describes a blend, i.e. every constrained lane either stays
/// in place from \p V1 or stays in place from \p V2.
///
/// \returns the BLENDI node on success, or an empty SDValue when some element
/// would have to change lanes and the mask is therefore not a blend.
static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  const int NumLanes = Mask.size();
  // Bit N of the immediate is set when lane N is taken from V2.
  unsigned Imm = 0;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    const int Src = Mask[Lane];
    if (Src < 0)
      continue; // Negative entries place no constraint on this lane.

    if (Src < NumLanes) {
      if (Src != Lane)
        return SDValue(); // A V1 element would have to change lanes.
      continue;
    }

    if (Src != Lane + NumLanes)
      return SDValue(); // A V2 element would have to change lanes.
    Imm |= 1u << Lane;
  }

  return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                     DAG.getConstant(Imm, MVT::i8));
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
@ -7267,6 +7292,11 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (isShuffleEquivalent(Mask, 1, 3))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
if (Subtarget->hasSSE41())
if (SDValue Blend =
lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, DAG))
return Blend;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
DAG.getConstant(SHUFPDMask, MVT::i8));
@ -7353,6 +7383,11 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
if (Subtarget->hasSSE41())
if (SDValue Blend =
lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
return Blend;
if (NumV2Elements == 1) {
int V2Index =
std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -

View File

@ -111,17 +111,35 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %shuffle
}
; <2 x double> shuffle with mask <0, 3>: lane 0 comes from %a and lane 1 from
; %b, so neither element changes lanes. The checks below expect shufpd under
; the ALL, SSE2 and SSE3 prefixes and blendpd under the SSE41 prefix.
; NOTE(review): the ALL-prefixed checks look like the pre-change expectations
; shown next to the new per-prefix ones -- confirm against the real test file.
define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
; ALL-LABEL: @shuffle_v2f64_03
; ALL: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; ALL-NEXT: retq
; SSE2-LABEL: @shuffle_v2f64_03
; SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: @shuffle_v2f64_03
; SSE3: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2f64_03
; SSE41: blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
ret <2 x double> %shuffle
}
; <2 x double> shuffle with mask <2, 1>: lane 0 comes from %b and lane 1 from
; %a, so neither element changes lanes. The checks below expect the result to
; be formed in xmm1 and then copied into xmm0 with movapd; SSE2 and SSE3
; expect shufpd while the SSE41 prefix expects blendpd.
; NOTE(review): the ALL-prefixed checks look like the pre-change expectations
; shown next to the new per-prefix ones -- confirm against the real test file.
define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
; ALL-LABEL: @shuffle_v2f64_21
; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
; ALL-NEXT: movapd %xmm1, %xmm0
; ALL-NEXT: retq
; SSE2-LABEL: @shuffle_v2f64_21
; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: @shuffle_v2f64_21
; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2f64_21
; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
ret <2 x double> %shuffle
}
@ -143,17 +161,35 @@ define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
ret <2 x i64> %shuffle
}
; <2 x i64> shuffle with mask <0, 3>: lane 0 comes from %a and lane 1 from %b,
; so neither element changes lanes. The checks below expect shufpd under the
; ALL, SSE2 and SSE3 prefixes and blendpd under the SSE41 prefix.
; NOTE(review): the ALL-prefixed checks look like the pre-change expectations
; shown next to the new per-prefix ones -- confirm against the real test file.
define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
; ALL-LABEL: @shuffle_v2i64_03
; ALL: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; ALL-NEXT: retq
; SSE2-LABEL: @shuffle_v2i64_03
; SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: @shuffle_v2i64_03
; SSE3: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_03
; SSE41: blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
}
; Variant of the <0, 3> <2 x i64> shuffle with an extra leading %nonce
; argument that the shuffle itself never reads. The checks below expect the
; operands in xmm1 and xmm2 and a movapd of the result into xmm0; SSE2 and
; SSE3 expect shufpd while the SSE41 prefix expects blendpd.
; NOTE(review): the ALL-prefixed checks look like the pre-change expectations
; shown next to the new per-prefix ones -- confirm against the real test file.
define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
; ALL-LABEL: @shuffle_v2i64_03_copy
; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
; ALL-NEXT: movapd %xmm1, %xmm0
; ALL-NEXT: retq
; SSE2-LABEL: @shuffle_v2i64_03_copy
; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: @shuffle_v2i64_03_copy
; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_03_copy
; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
}
@ -204,18 +240,38 @@ define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
ret <2 x i64> %shuffle
}
; <2 x i64> shuffle with mask <2, 1>: lane 0 comes from %b and lane 1 from %a,
; so neither element changes lanes. The checks below expect the result to be
; formed in xmm1 and then copied into xmm0 with movapd; SSE2 and SSE3 expect
; shufpd while the SSE41 prefix expects blendpd.
; NOTE(review): the ALL-prefixed checks look like the pre-change expectations
; shown next to the new per-prefix ones -- confirm against the real test file.
define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
; ALL-LABEL: @shuffle_v2i64_21
; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
; ALL-NEXT: movapd %xmm1, %xmm0
; ALL-NEXT: retq
; SSE2-LABEL: @shuffle_v2i64_21
; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: @shuffle_v2i64_21
; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_21
; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
}
; Variant of the <2, 1> <2 x i64> shuffle with an extra leading %nonce
; argument that the shuffle itself never reads. The checks below expect the
; result to be formed in xmm2 from xmm2 and xmm1 and then copied into xmm0
; with movapd; SSE2 and SSE3 expect shufpd while the SSE41 prefix expects
; blendpd.
; NOTE(review): the ALL-prefixed checks look like the pre-change expectations
; shown next to the new per-prefix ones -- confirm against the real test file.
define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
; ALL-LABEL: @shuffle_v2i64_21_copy
; ALL: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
; ALL-NEXT: movapd %xmm2, %xmm0
; ALL-NEXT: retq
; SSE2-LABEL: @shuffle_v2i64_21_copy
; SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: @shuffle_v2i64_21_copy
; SSE3: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
; SSE3-NEXT: movapd %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_21_copy
; SSE41: blendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
}

View File

@ -216,11 +216,14 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: @shuffle_v4f32_4zzz
; SSE41: insertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
; SSE41: xorps %[[X:xmm[0-9]+]], %[[X]]
; SSE41-NEXT: blendps {{.*}} # [[X]] = xmm0[0],[[X]][1,2,3]
; SSE41-NEXT: movaps %[[X]], %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: @shuffle_v4f32_4zzz
; AVX1: vinsertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
; AVX1: vxorps %[[X:xmm[0-9]+]], %[[X]]
; AVX1-NEXT: vblendps {{.*}} # xmm0 = xmm0[0],[[X]][1,2,3]
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %shuffle
@ -290,11 +293,14 @@ define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: @shuffle_v4f32_zzz7
; SSE41: insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
; SSE41: xorps %[[X:xmm[0-9]+]], %[[X]]
; SSE41-NEXT: blendps {{.*}} # [[X]] = [[X]][0,1,2],xmm0[3]
; SSE41-NEXT: movaps %[[X]], %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: @shuffle_v4f32_zzz7
; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
; AVX1: vxorps %[[X:xmm[0-9]+]], %[[X]]
; AVX1-NEXT: vblendps {{.*}} # xmm0 = [[X]][0,1,2],xmm0[3]
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
ret <4 x float> %shuffle

View File

@ -40,7 +40,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: @shuffle_v4i64_0300
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@ -119,7 +119,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: @shuffle_v4f64_0300
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@ -282,7 +282,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
@ -293,7 +293,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm2 = xmm2[0,0]
; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
@ -305,7 +305,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
@ -317,7 +317,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
@ -335,9 +335,9 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: @shuffle_v4i64_0451
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
@ -355,9 +355,9 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: @shuffle_v4i64_4015
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
@ -370,7 +370,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm0 = xmm0[1,1]
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq