diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 311971070e6..d6eac674693 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -8347,6 +8347,98 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
 }
 
+// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
+//
+SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  if (!Op->hasOneUse())
+    return SDValue();
+
+  SDNode *N = Op.getNode();
+  DebugLoc DL = N->getDebugLoc();
+
+  SmallVector<SDValue, 8> Opnds;
+  DenseMap<SDValue, unsigned> VecInMap;
+  EVT VT = MVT::Other;
+
+  // Recognize a special case where a vector is casted into wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+    // BFS traverse all OR'd operands.
+    if (I->getOpcode() == ISD::OR) {
+      Opnds.push_back(I->getOperand(0));
+      Opnds.push_back(I->getOperand(1));
+      // Re-evaluate the number of nodes to be traversed.
+      e += 2; // 2 more nodes (LHS and RHS) are pushed.
+      continue;
+    }
+
+    // Quit if a non-EXTRACT_VECTOR_ELT
+    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if without a constant index.
+    SDValue Idx = I->getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    SDValue ExtractedFromVec = I->getOperand(0);
+    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+    if (M == VecInMap.end()) {
+      VT = ExtractedFromVec.getValueType();
+      // Quit if not 128/256-bit vector.
+      if (!VT.is128BitVector() && !VT.is256BitVector())
+        return SDValue();
+      // Quit if not the same type.
+      if (VecInMap.begin() != VecInMap.end() &&
+          VT != VecInMap.begin()->first.getValueType())
+        return SDValue();
+      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+    }
+    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+  }
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Not extracted from 128-bit vector.");
+
+  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+  SmallVector<SDValue, 8> VecIns;
+
+  for (DenseMap<SDValue, unsigned>::const_iterator
+        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+    // Quit if not all elements are used.
+    if (I->second != FullMask)
+      return SDValue();
+    VecIns.push_back(I->first);
+  }
+
+  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+  // Cast all vectors into TestVT for PTEST.
+  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+
+  // If more than one full vectors are evaluated, OR them first before PTEST.
+  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+    // Each iteration will OR 2 nodes and append the result until there is only
+    // 1 node left, i.e. the final OR'd value of all vectors.
+    SDValue LHS = VecIns[Slot];
+    SDValue RHS = VecIns[Slot + 1];
+    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+  }
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+                     VecIns.back(), VecIns.back());
+}
+
 /// Emit nodes that will be selected as "test Op0,Op0", or something
 /// equivalent.
 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
@@ -8486,9 +8578,17 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
     switch (ArithOp.getOpcode()) {
     default: llvm_unreachable("unexpected operator!");
     case ISD::SUB: Opcode = X86ISD::SUB; break;
-    case ISD::OR:  Opcode = X86ISD::OR;  break;
     case ISD::XOR: Opcode = X86ISD::XOR; break;
     case ISD::AND: Opcode = X86ISD::AND; break;
+    case ISD::OR: {
+      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+        SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+        if (EFLAGS.getNode())
+          return EFLAGS;
+      }
+      Opcode = X86ISD::OR;
+      break;
+    }
     }
 
     NumOperands = 2;
@@ -14205,84 +14305,6 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   return SDValue();
 }
 
-/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
-/// updated. If only flag result is used and the result is evaluated from a
-/// series of element extraction, try to combine it into a PTEST.
-static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
-                                     SelectionDAG &DAG,
-                                     const X86Subtarget *Subtarget) {
-  SDNode *N = Or.getNode();
-  DebugLoc DL = N->getDebugLoc();
-
-  // Only SSE4.1 and beyond supports PTEST or like.
-  if (!Subtarget->hasSSE41())
-    return SDValue();
-
-  if (N->getOpcode() != X86ISD::OR)
-    return SDValue();
-
-  // Quit if the value result of OR is used.
-  if (N->hasAnyUseOfValue(0))
-    return SDValue();
-
-  // Quit if not used as a boolean value.
-  if (CC != X86::COND_E && CC != X86::COND_NE)
-    return SDValue();
-
-  SmallVector<SDValue, 8> Opnds;
-  SDValue VecIn;
-  EVT VT = MVT::Other;
-  unsigned Mask = 0;
-
-  // Recognize a special case where a vector is casted into wide integer to
-  // test all 0s.
-  Opnds.push_back(N->getOperand(0));
-  Opnds.push_back(N->getOperand(1));
-
-  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
-    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
-    // BFS traverse all OR'd operands.
-    if (I->getOpcode() == ISD::OR) {
-      Opnds.push_back(I->getOperand(0));
-      Opnds.push_back(I->getOperand(1));
-      // Re-evaluate the number of nodes to be traversed.
-      e += 2; // 2 more nodes (LHS and RHS) are pushed.
-      continue;
-    }
-
-    // Quit if a non-EXTRACT_VECTOR_ELT
-    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
-      return SDValue();
-
-    // Quit if without a constant index.
-    SDValue Idx = I->getOperand(1);
-    if (!isa<ConstantSDNode>(Idx))
-      return SDValue();
-
-    // Check if all elements are extracted from the same vector.
-    SDValue ExtractedFromVec = I->getOperand(0);
-    if (VecIn.getNode() == 0) {
-      VT = ExtractedFromVec.getValueType();
-      // FIXME: only 128-bit vector is supported so far.
-      if (!VT.is128BitVector())
-        return SDValue();
-      VecIn = ExtractedFromVec;
-    } else if (VecIn != ExtractedFromVec)
-      return SDValue();
-
-    // Record the constant index.
-    Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
-  }
-
-  assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far.");
-
-  // Quit if not all elements are used.
-  if (Mask != (1U << VT.getVectorNumElements()) - 1U)
-    return SDValue();
-
-  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn);
-}
-
 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -14321,14 +14343,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                        Ops, array_lengthof(Ops));
   }
 
-  Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget);
-  if (Flags.getNode()) {
-    SDValue Ops[] = { FalseOp, TrueOp,
-                      DAG.getConstant(CC, MVT::i8), Flags };
-    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
-                       Ops, array_lengthof(Ops));
-  }
-
   // If this is a select between two integer constants, try to do some
   // optimizations. Note that the operands are ordered the opposite of SELECT
   // operands.
@@ -15860,12 +15874,6 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
   }
 
-  Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
-  if (Flags.getNode()) {
-    SDValue Cond = DAG.getConstant(CC, MVT::i8);
-    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
-  }
-
   return SDValue();
 }
 
@@ -15889,13 +15897,6 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
                        Flags);
   }
 
-  Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
-  if (Flags.getNode()) {
-    SDValue Cond = DAG.getConstant(CC, MVT::i8);
-    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
-                       Flags);
-  }
-
   return SDValue();
 }
 
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 92277418835..9c737770220 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -811,6 +811,8 @@ namespace llvm {
    SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
    SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
 
+    SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
+
    SDValue LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const;
 
    virtual SDValue
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
index 84102f148b8..087b8d7539e 100644
--- a/test/CodeGen/X86/pr12312.ll
+++ b/test/CodeGen/X86/pr12312.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX
 
-define i32 @veccond(<4 x i32> %input) {
+define i32 @veccond128(<4 x i32> %input) {
 entry:
   %0 = bitcast <4 x i32> %input to i128
   %1 = icmp ne i128 %0, 0
   br i1 %1, label %if-true-block, label %endif-block
@@ -11,38 +11,145 @@ if-true-block: ; preds = %entry
   ret i32 0
 endif-block: ; preds = %entry,
   ret i32 1
-; SSE41: veccond
+; SSE41: veccond128
 ; SSE41: ptest
 ; SSE41: ret
-; AVX: veccond
-; AVX: vptest
+; AVX: veccond128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
 ; AVX: ret
 }
 
-define i32 @vectest(<4 x i32> %input) {
+define i32 @veccond256(<8 x i32> %input) {
+entry:
+  %0 = bitcast <8 x i32> %input to i256
+  %1 = icmp ne i256 %0, 0
+  br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+  ret i32 0
+endif-block: ; preds = %entry,
+  ret i32 1
+; SSE41: veccond256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @veccond512(<16 x i32> %input) {
+entry:
+  %0 = bitcast <16 x i32> %input to i512
+  %1 = icmp ne i512 %0, 0
+  br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+  ret i32 0
+endif-block: ; preds = %entry,
+  ret i32 1
+; SSE41: veccond512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest128(<4 x i32> %input) {
 entry:
   %0 = bitcast <4 x i32> %input to i128
   %1 = icmp ne i128 %0, 0
   %2 = zext i1 %1 to i32
   ret i32 %2
-; SSE41: vectest
+; SSE41: vectest128
 ; SSE41: ptest
 ; SSE41: ret
-; AVX: vectest
-; AVX: vptest
+; AVX: vectest128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
 ; AVX: ret
 }
 
-define i32 @vecsel(<4 x i32> %input, i32 %a, i32 %b) {
+define i32 @vectest256(<8 x i32> %input) {
+entry:
+  %0 = bitcast <8 x i32> %input to i256
+  %1 = icmp ne i256 %0, 0
+  %2 = zext i1 %1 to i32
+  ret i32 %2
+; SSE41: vectest256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest512(<16 x i32> %input) {
+entry:
+  %0 = bitcast <16 x i32> %input to i512
+  %1 = icmp ne i512 %0, 0
+  %2 = zext i1 %1 to i32
+  ret i32 %2
+; SSE41: vectest512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
 entry:
   %0 = bitcast <4 x i32> %input to i128
   %1 = icmp ne i128 %0, 0
   %2 = select i1 %1, i32 %a, i32 %b
   ret i32 %2
-; SSE41: vecsel
+; SSE41: vecsel128
 ; SSE41: ptest
 ; SSE41: ret
-; AVX: vecsel
-; AVX: vptest
+; AVX: vecsel128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
+entry:
+  %0 = bitcast <8 x i32> %input to i256
+  %1 = icmp ne i256 %0, 0
+  %2 = select i1 %1, i32 %a, i32 %b
+  ret i32 %2
+; SSE41: vecsel256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
+entry:
+  %0 = bitcast <16 x i32> %input to i512
+  %1 = icmp ne i512 %0, 0
+  %2 = select i1 %1, i32 %a, i32 %b
+  ret i32 %2
+; SSE41: vecsel512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
 ; AVX: ret
 }