[X86] Teach the DAGCombiner how to fold a OR of two shufflevector nodes.

This patch teaches the DAGCombiner how to fold a binary OR between two
shufflevector into a single shuffle vector when possible.

The rules are:
  1. fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask1)
  2. fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf B, A, Mask2)

The DAGCombiner can take advantage of the fact that OR is commutative and
compute two possible shuffle masks (Mask1 and Mask2) for the resulting
shuffle node.

Before folding a dag according to either rule 1 or 2, DAGCombiner verifies
that the resulting shuffle mask is legal for the target.
DAGCombiner would firstly try to fold according to 1.; If not possible
then it will try to fold according to 2.
If both Mask1 and Mask2 are illegal then we conservatively don't fold
the OR instruction.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@203156 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Andrea Di Biagio 2014-03-06 20:19:52 +00:00
parent d0d94d6afb
commit e54158504f
2 changed files with 321 additions and 0 deletions

View File

@ -3200,6 +3200,60 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
return N0;
if (ISD::isBuildVectorAllOnes(N1.getNode()))
return N1;
// fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask1)
// fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf B, A, Mask2)
// Do this only if the resulting shuffle is legal.
if (isa<ShuffleVectorSDNode>(N0) &&
isa<ShuffleVectorSDNode>(N1) &&
N0->getOperand(1) == N1->getOperand(1) &&
ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode())) {
bool CanFold = true;
unsigned NumElts = VT.getVectorNumElements();
const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
// We construct two shuffle masks:
// - Mask1 is a shuffle mask for a shuffle with N0 as the first operand
// and N1 as the second operand.
// - Mask2 is a shuffle mask for a shuffle with N1 as the first operand
// and N0 as the second operand.
// We do this because OR is commutable and therefore there might be
// two ways to fold this node into a shuffle.
SmallVector<int,4> Mask1;
SmallVector<int,4> Mask2;
for (unsigned i = 0; i != NumElts && CanFold; ++i) {
int M0 = SV0->getMaskElt(i);
int M1 = SV1->getMaskElt(i);
// Both shuffle indexes are undef. Propagate Undef.
if (M0 < 0 && M1 < 0) {
Mask1.push_back(M0);
Mask2.push_back(M0);
continue;
}
if (M0 < 0 || M1 < 0 ||
(M0 < (int)NumElts && M1 < (int)NumElts) ||
(M0 >= (int)NumElts && M1 >= (int)NumElts)) {
CanFold = false;
break;
}
Mask1.push_back(M0 < (int)NumElts ? M0 : M1 + NumElts);
Mask2.push_back(M1 < (int)NumElts ? M1 : M0 + NumElts);
}
if (CanFold) {
// Fold this sequence only if the resulting shuffle is 'legal'.
if (TLI.isShuffleMaskLegal(Mask1, VT))
return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0),
N1->getOperand(0), &Mask1[0]);
if (TLI.isShuffleMaskLegal(Mask2, VT))
return DAG.getVectorShuffle(VT, SDLoc(N), N1->getOperand(0),
N0->getOperand(0), &Mask2[0]);
}
}
}
// fold (or x, undef) -> -1

View File

@ -0,0 +1,267 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
; Verify that each of the following test cases is folded into a single
; instruction which performs a blend operation.
define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
; CHECK-LABEL: test1
; CHECK-NOT: xorps
; CHECK: movsd
; CHECK-NOT: orps
; CHECK: ret
define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
; CHECK-LABEL: test2
; CHECK-NOT: xorps
; CHECK: shufps
; CHECK: ret
define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
; CHECK-LABEL: test3
; CHECK-NOT: xorps
; CHECK: movsd
; CHECK-NEXT: ret
define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
; CHECK-LABEL: test4
; CHECK-NOT: xorps
; CHECK: movss
; CHECK-NOT: orps
; CHECK: ret
define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
; CHECK-LABEL: test5
; CHECK-NOT: xorps
; CHECK: movss
; CHECK-NEXT: ret
define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
; CHECK-LABEL: test6
; CHECK-NOT: xorps
; CHECK: shufps
; CHECK-NEXT: ret
define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
%and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
%and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
%or = or <4 x i32> %and1, %and2
ret <4 x i32> %or
}
; CHECK-LABEL: test7
; CHECK-NOT: xorps
; CHECK: shufps
; CHECK-NEXT: ret
define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
%and1 = and <2 x i64> %a, <i64 -1, i64 0>
%and2 = and <2 x i64> %b, <i64 0, i64 -1>
%or = or <2 x i64> %and1, %and2
ret <2 x i64> %or
}
; CHECK-LABEL: test8
; CHECK-NOT: xorps
; CHECK: movsd
; CHECK-NOT: orps
; CHECK: ret
define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
%and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
%and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
%or = or <4 x i32> %and1, %and2
ret <4 x i32> %or
}
; CHECK-LABEL: test9
; CHECK-NOT: xorps
; CHECK: shufps
; CHECK: ret
define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
%and1 = and <2 x i64> %a, <i64 0, i64 -1>
%and2 = and <2 x i64> %b, <i64 -1, i64 0>
%or = or <2 x i64> %and1, %and2
ret <2 x i64> %or
}
; CHECK-LABEL: test10
; CHECK-NOT: xorps
; CHECK: movsd
; CHECK-NEXT: ret
define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
%and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
%and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
%or = or <4 x i32> %and1, %and2
ret <4 x i32> %or
}
; CHECK-LABEL: test11
; CHECK-NOT: xorps
; CHECK: movss
; CHECK-NOT: orps
; CHECK: ret
define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
%and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
%and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
%or = or <4 x i32> %and1, %and2
ret <4 x i32> %or
}
; CHECK-LABEL: test12
; CHECK-NOT: xorps
; CHECK: movss
; CHECK-NEXT: ret
; Verify that the following test cases are folded into single shuffles.
define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
; CHECK-LABEL: test13
; CHECK-NOT: xorps
; CHECK: shufps
; CHECK-NEXT: ret
define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
; CHECK-LABEL: test14
; CHECK-NOT: pslldq
; CHECK-NOT: por
; CHECK: punpcklqdq
; CHECK-NEXT: ret
define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
; CHECK-LABEL: test15
; CHECK-NOT: xorps
; CHECK: shufps
; CHECK-NOT: shufps
; CHECK-NOT: orps
; CHECK: ret
define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
; CHECK-LABEL: test16
; CHECK-NOT: pslldq
; CHECK-NOT: por
; CHECK: punpcklqdq
; CHECK: ret
; Verify that the dag-combiner does not fold a OR of two shuffles into a single
; shuffle instruction when the shuffle indexes are not compatible.
define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
; CHECK-LABEL: test17
; CHECK: por
; CHECK-NEXT: ret
define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
; CHECK-LABEL: test18
; CHECK: orps
; CHECK: ret
define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
; CHECK-LABEL: test19
; CHECK: por
; CHECK-NEXT: ret
define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
; CHECK-LABEL: test20
; CHECK-NOT: xorps
; CHECK: orps
; CHECK-NEXT: ret
define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
; CHECK-LABEL: test21
; CHECK: por
; CHECK-NEXT: ret