[DAG] Improved target independent vector shuffle folding logic.

This patch teaches the DAGCombiner how to combine shuffles according to rules:
   shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(B, A, M2)
   shuffle(shuffle(A, B, M0), B, M1) -> shuffle(B, A, M2)
   shuffle(shuffle(A, B, M0), A, M1) -> shuffle(B, A, M2)


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222090 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Andrea Di Biagio 2014-11-15 22:56:25 +00:00
parent 01e39346f3
commit 37f645cb34
2 changed files with 40 additions and 44 deletions

View File

@ -11239,6 +11239,26 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return DAG.getVectorShuffle(VT, SDLoc(N), SV0, N1, &Mask[0]);
return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]);
}
// Compute the commuted shuffle mask.
for (unsigned i = 0; i != NumElts; ++i) {
int idx = Mask[i];
if (idx < 0)
continue;
else if (idx < (int)NumElts)
Mask[i] = idx + NumElts;
else
Mask[i] = idx - NumElts;
}
if (TLI.isShuffleMaskLegal(Mask, VT)) {
if (IsSV1Undef)
// shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(B, A, M2)
return DAG.getVectorShuffle(VT, SDLoc(N), N1, SV0, &Mask[0]);
// shuffle(shuffle(A, B, M0), B, M1) -> shuffle(B, A, M2)
// shuffle(shuffle(A, B, M0), A, M1) -> shuffle(B, A, M2)
return DAG.getVectorShuffle(VT, SDLoc(N), SV1, SV0, &Mask[0]);
}
}
return SDValue();

View File

@ -1146,18 +1146,14 @@ define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2:
; SSE2: # BB#0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: movss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test2:
; SSSE3: # BB#0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: movss %xmm0, %xmm1
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test2:
@ -1268,18 +1264,14 @@ define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test7:
; SSE2: # BB#0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: movss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test7:
; SSSE3: # BB#0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: movss %xmm0, %xmm1
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test7:
@ -1385,14 +1377,12 @@ define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test12:
; SSE2: # BB#0:
; SSE2-NEXT: movss %xmm0, %xmm1
; SSE2-NEXT: movss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test12:
; SSSE3: # BB#0:
; SSSE3-NEXT: movss %xmm0, %xmm1
; SSSE3-NEXT: movss %xmm0, %xmm1
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
@ -1486,14 +1476,12 @@ define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test17:
; SSE2: # BB#0:
; SSE2-NEXT: movss %xmm0, %xmm1
; SSE2-NEXT: movss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test17:
; SSSE3: # BB#0:
; SSSE3-NEXT: movss %xmm0, %xmm1
; SSSE3-NEXT: movss %xmm0, %xmm1
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
@ -1700,30 +1688,24 @@ define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test1c:
; SSE2: # BB#0:
; SSE2-NEXT: movd (%rdi), %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movd (%rsi), %xmm1
; SSE2-NEXT: movd (%rdi), %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: movd (%rsi), %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd (%rdi), %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movd (%rsi), %xmm1
; SSSE3-NEXT: movd (%rdi), %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: movd (%rsi), %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movss %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test1c:
@ -1984,19 +1966,13 @@ define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_123:
; SSE2: # BB#0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSE2-NEXT: movsd %xmm2, %xmm1
; SSE2-NEXT: movss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_blend_123:
; SSSE3: # BB#0:
; SSSE3-NEXT: movaps %xmm1, %xmm2
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSSE3-NEXT: movsd %xmm2, %xmm1
; SSSE3-NEXT: movss %xmm0, %xmm1
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;