Fix PR12312

- Add a target-specific DAG optimization to recognize a PTEST-able pattern.
  Such a pattern is an OR'd tree with X86ISD::OR as the root node. When the
  X86ISD::OR node has only its flag result used as a boolean value and all of
  its leaves are extracted from the same vector, it can be folded into an
  X86ISD::PTEST node.
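
  As a rough illustration (not taken from this commit), the shape being
  matched corresponds to IR like the hypothetical @or_tree function below,
  in which every lane of a single 128-bit vector is extracted, OR'd
  together, and used only as a zero test. Whether a particular input
  actually reaches the DAG in this exact form is an assumption of the
  sketch, not something the patch guarantees.

    define i1 @or_tree(<4 x i32> %v) {
    entry:
      ; Extract every lane of the one source vector.
      %e0 = extractelement <4 x i32> %v, i32 0
      %e1 = extractelement <4 x i32> %v, i32 1
      %e2 = extractelement <4 x i32> %v, i32 2
      %e3 = extractelement <4 x i32> %v, i32 3
      ; OR the lanes into a single value...
      %or01 = or i32 %e0, %e1
      %or23 = or i32 %e2, %e3
      %or = or i32 %or01, %or23
      ; ...that is consumed only as a boolean; with this patch the backend
      ; may fold the zero test into a single PTEST of %v against itself.
      %cmp = icmp ne i32 %or, 0
      ret i1 %cmp
    }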



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@162735 91177308-0d34-0410-b5e6-96231b3b80d8
Michael Liao 2012-08-28 03:34:40 +00:00
parent 325907d086
commit dbf8b5be97
2 changed files with 160 additions and 10 deletions

lib/Target/X86/X86ISelLowering.cpp

@@ -14036,7 +14036,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
//
// where Op could be BRCOND or CMOV.
//
static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// Quit if not CMP and SUB with its value result used.
if (Cmp.getOpcode() != X86ISD::CMP &&
(Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
@@ -14087,7 +14087,85 @@ static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
return SetCC.getOperand(1);
}
static bool IsValidFCMOVCondition(X86::CondCode CC) {

/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
/// updated. If only flag result is used and the result is evaluated from a
/// series of element extraction, try to combine it into a PTEST.
static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
                                     SelectionDAG &DAG,
                                     const X86Subtarget *Subtarget) {
  SDNode *N = Or.getNode();
  DebugLoc DL = N->getDebugLoc();

  // Only SSE4.1 and beyond supports PTEST or like.
  if (!Subtarget->hasSSE41())
    return SDValue();

  if (N->getOpcode() != X86ISD::OR)
    return SDValue();

  // Quit if the value result of OR is used.
  if (N->hasAnyUseOfValue(0))
    return SDValue();

  // Quit if not used as a boolean value.
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SmallVector<SDValue, 8> Opnds;
  SDValue VecIn;
  EVT VT = MVT::Other;
  unsigned Mask = 0;

  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e = Opnds.size();
      continue;
    }

    // Quit if a non-EXTRACT_VECTOR_ELT
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if without a constant index.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    // Check if all elements are extracted from the same vector.
    SDValue ExtractedFromVec = I->getOperand(0);
    if (VecIn.getNode() == 0) {
      VT = ExtractedFromVec.getValueType();
      // FIXME: only 128-bit vector is supported so far.
      if (!VT.is128BitVector())
        return SDValue();
      VecIn = ExtractedFromVec;
    } else if (VecIn != ExtractedFromVec)
      return SDValue();

    // Record the constant index.
    Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far.");

  // Quit if not all elements are used.
  if (Mask != (1U << VT.getVectorNumElements()) - 1U)
    return SDValue();

  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn);
}

static bool isValidFCMOVCondition(X86::CondCode CC) {
  switch (CC) {
  default:
    return false;
@@ -14105,7 +14183,8 @@ static bool IsValidFCMOVCondition(X86::CondCode CC) {
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
// If the flag operand isn't dead, don't touch this CMOV.
@@ -14130,10 +14209,18 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
SDValue Flags;
Flags = BoolTestSetCCCombine(Cond, CC);
Flags = checkBoolTestSetCCCombine(Cond, CC);
if (Flags.getNode() &&
// Extra check as FCMOV only supports a subset of X86 cond.
(FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) {
(FalseOp.getValueType() != MVT::f80 || isValidFCMOVCondition(CC))) {
SDValue Ops[] = { FalseOp, TrueOp,
DAG.getConstant(CC, MVT::i8), Flags };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
Ops, array_lengthof(Ops));
}
Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Ops[] = { FalseOp, TrueOp,
DAG.getConstant(CC, MVT::i8), Flags };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
@@ -15641,7 +15728,9 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
@@ -15657,7 +15746,13 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
SDValue Flags;
Flags = BoolTestSetCCCombine(EFLAGS, CC);
Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
}
Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
@@ -15679,7 +15774,14 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
SDValue Flags;
Flags = BoolTestSetCCCombine(EFLAGS, CC);
Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
Flags);
}
Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
@@ -15874,7 +15976,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
case ISD::VSELECT:
case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
@@ -15904,7 +16006,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI);
case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::PALIGN:


@@ -0,0 +1,48 @@
; RUN: llc -march=x86-64 -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
; RUN: llc -march=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefix AVX

define i32 @veccond(<4 x i32> %input) {
entry:
  %0 = bitcast <4 x i32> %input to i128
  %1 = icmp ne i128 %0, 0
  br i1 %1, label %if-true-block, label %endif-block

if-true-block:                    ; preds = %entry
  ret i32 0

endif-block:                      ; preds = %entry,
  ret i32 1
; SSE41: veccond
; SSE41: ptest
; SSE41: ret
; AVX: veccond
; AVX: vptest
; AVX: ret
}

define i32 @vectest(<4 x i32> %input) {
entry:
  %0 = bitcast <4 x i32> %input to i128
  %1 = icmp ne i128 %0, 0
  %2 = zext i1 %1 to i32
  ret i32 %2
; SSE41: vectest
; SSE41: ptest
; SSE41: ret
; AVX: vectest
; AVX: vptest
; AVX: ret
}

define i32 @vecsel(<4 x i32> %input, i32 %a, i32 %b) {
entry:
  %0 = bitcast <4 x i32> %input to i128
  %1 = icmp ne i128 %0, 0
  %2 = select i1 %1, i32 %a, i32 %b
  ret i32 %2
; SSE41: vecsel
; SSE41: ptest
; SSE41: ret
; AVX: vecsel
; AVX: vptest
; AVX: ret
}