From 39ad568c62f5120faec29f69d3d614303a1f992d Mon Sep 17 00:00:00 2001 From: Manman Ren Date: Wed, 8 Aug 2012 00:51:41 +0000 Subject: [PATCH] X86: enable CSE between CMP and SUB We perform the following: 1> Use SUB instead of CMP for i8,i16,i32 and i64 in ISel lowering. 2> Modify MachineCSE to correctly handle implicit defs. 3> Convert SUB back to CMP if possible at peephole. Removed pattern matching of (a>b) ? (a-b):0 and like, since they are handled by peephole now. rdar://11873276 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@161462 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineCSE.cpp | 20 ++++++- lib/Target/X86/X86ISelDAGToDAG.cpp | 7 ++- lib/Target/X86/X86ISelLowering.cpp | 56 ++++--------------- lib/Target/X86/X86InstrArithmetic.td | 2 + lib/Target/X86/X86InstrInfo.cpp | 80 ++++++++++++++++++++++++++++ test/CodeGen/X86/jump_sign.ll | 23 ++++++++ 6 files changed, 139 insertions(+), 49 deletions(-) diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index 0a76f8db0b3..5153abbc4e1 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -417,6 +417,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { bool Changed = false; SmallVector, 8> CSEPairs; + SmallVector ImplicitDefsToUpdate; for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) { MachineInstr *MI = &*I; ++I; @@ -486,15 +487,24 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // Check if it's profitable to perform this CSE. bool DoCSE = true; - unsigned NumDefs = MI->getDesc().getNumDefs(); + unsigned NumDefs = MI->getDesc().getNumDefs() + + MI->getDesc().getNumImplicitDefs(); + for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; unsigned OldReg = MO.getReg(); unsigned NewReg = CSMI->getOperand(i).getReg(); - if (OldReg == NewReg) + + // Go through implicit defs of CSMI and MI, if a def is not dead at MI, + // we should make sure it is not dead at CSMI. + if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead()) + ImplicitDefsToUpdate.push_back(i); + if (OldReg == NewReg) { + --NumDefs; continue; + } assert(TargetRegisterInfo::isVirtualRegister(OldReg) && TargetRegisterInfo::isVirtualRegister(NewReg) && @@ -526,6 +536,11 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { MRI->clearKillFlags(CSEPairs[i].second); } + // Go through implicit defs of CSMI and MI, if a def is not dead at MI, + // we should make sure it is not dead at CSMI. + for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i) + CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false); + if (CrossMBBPhysDef) { // Add physical register defs now coming in from a predecessor to MBB // livein list. @@ -549,6 +564,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { Exps.push_back(MI); } CSEPairs.clear(); + ImplicitDefsToUpdate.clear(); } return Changed; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 316075839c4..f6df4d3b4d4 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2438,7 +2438,12 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return NULL; } - case X86ISD::CMP: { + case X86ISD::CMP: + case X86ISD::SUB: { + // Sometimes a SUB is used to perform comparison. + if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0)) + // This node is not a CMP. + break; SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d70cdb8a762..33bc1c9db5d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -8333,11 +8333,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, switch (Op.getNode()->getOpcode()) { default: llvm_unreachable("unexpected operator!"); case ISD::SUB: - // If the only use of SUB is EFLAGS, use CMP instead. - if (Op.hasOneUse()) - Opcode = X86ISD::CMP; - else - Opcode = X86ISD::SUB; + Opcode = X86ISD::SUB; break; case ISD::OR: Opcode = X86ISD::OR; break; case ISD::XOR: Opcode = X86ISD::XOR; break; @@ -8391,6 +8387,14 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, return EmitTest(Op0, X86CC, DAG); DebugLoc dl = Op0.getDebugLoc(); + if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || + Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { + // Use SUB instead of CMP to enable CSE between SUB and CMP. + SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); + SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, + Op0, Op1); + return SDValue(Sub.getNode(), 1); + } return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } @@ -8765,46 +8769,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = NewCond; } - // Handle the following cases related to max and min: - // (a > b) ? (a-b) : 0 - // (a >= b) ? (a-b) : 0 - // (b < a) ? (a-b) : 0 - // (b <= a) ? (a-b) : 0 - // Comparison is removed to use EFLAGS from SUB. - if (ConstantSDNode *C = dyn_cast(Op2)) - if (Cond.getOpcode() == X86ISD::SETCC && - Cond.getOperand(1).getOpcode() == X86ISD::CMP && - (Op1.getOpcode() == ISD::SUB || Op1.getOpcode() == X86ISD::SUB) && - C->getAPIntValue() == 0) { - SDValue Cmp = Cond.getOperand(1); - unsigned CC = cast(Cond.getOperand(0))->getZExtValue(); - if ((DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(0)) && - DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(1)) && - (CC == X86::COND_G || CC == X86::COND_GE || - CC == X86::COND_A || CC == X86::COND_AE)) || - (DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(1)) && - DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(0)) && - (CC == X86::COND_L || CC == X86::COND_LE || - CC == X86::COND_B || CC == X86::COND_BE))) { - - if (Op1.getOpcode() == ISD::SUB) { - SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i32); - SDValue New = DAG.getNode(X86ISD::SUB, DL, VTs, - Op1.getOperand(0), Op1.getOperand(1)); - DAG.ReplaceAllUsesWith(Op1, New); - Op1 = New; - } - - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); - unsigned NewCC = (CC == X86::COND_G || CC == X86::COND_GE || - CC == X86::COND_L || - CC == X86::COND_LE) ? X86::COND_GE : X86::COND_AE; - SDValue Ops[] = { Op2, Op1, DAG.getConstant(NewCC, MVT::i8), - SDValue(Op1.getNode(), 1) }; - return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); - } - } - // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y @@ -8945,7 +8909,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // a < b ? 0 : -1 -> RES = setcc_carry // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 0 : -1 -> RES = ~setcc_carry - if (Cond.getOpcode() == X86ISD::CMP) { + if (Cond.getOpcode() == X86ISD::SUB) { Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast(CC)->getZExtValue(); diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index b6ba68fff0e..f790611b8f8 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1132,8 +1132,10 @@ defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m, X86xor_flag, xor, 1, 0>; defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m, X86add_flag, add, 1, 1>; +let isCompare = 1 in { defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, X86sub_flag, sub, 0, 0>; +} // Arithmetic. let Uses = [EFLAGS] in { diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index b26c08fc9c4..831caaaa5e0 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -3031,6 +3031,37 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, CmpMask = ~0; CmpValue = MI->getOperand(1).getImm(); return true; + // A SUB can be used to perform comparison. + case X86::SUB64rm: + case X86::SUB32rm: + case X86::SUB16rm: + case X86::SUB8rm: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = 0; + return true; + case X86::SUB64rr: + case X86::SUB32rr: + case X86::SUB16rr: + case X86::SUB8rr: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = MI->getOperand(2).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB8ri: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = MI->getOperand(2).getImm(); + return true; case X86::CMP64rr: case X86::CMP32rr: case X86::CMP16rr: @@ -3139,6 +3170,55 @@ bool X86InstrInfo:: optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { + // Check whether we can replace SUB with CMP. + unsigned NewOpcode = 0; + switch (CmpInstr->getOpcode()) { + default: break; + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB8ri: + case X86::SUB64rm: + case X86::SUB32rm: + case X86::SUB16rm: + case X86::SUB8rm: + case X86::SUB64rr: + case X86::SUB32rr: + case X86::SUB16rr: + case X86::SUB8rr: { + if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) + return false; + // There is no use of the destination register, we can replace SUB with CMP. + switch (CmpInstr->getOpcode()) { + default: llvm_unreachable(0); + case X86::SUB64rm: NewOpcode = X86::CMP64rm; break; + case X86::SUB32rm: NewOpcode = X86::CMP32rm; break; + case X86::SUB16rm: NewOpcode = X86::CMP16rm; break; + case X86::SUB8rm: NewOpcode = X86::CMP8rm; break; + case X86::SUB64rr: NewOpcode = X86::CMP64rr; break; + case X86::SUB32rr: NewOpcode = X86::CMP32rr; break; + case X86::SUB16rr: NewOpcode = X86::CMP16rr; break; + case X86::SUB8rr: NewOpcode = X86::CMP8rr; break; + case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break; + case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break; + case X86::SUB32ri: NewOpcode = X86::CMP32ri; break; + case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break; + case X86::SUB16ri: NewOpcode = X86::CMP16ri; break; + case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break; + case X86::SUB8ri: NewOpcode = X86::CMP8ri; break; + } + CmpInstr->setDesc(get(NewOpcode)); + CmpInstr->RemoveOperand(0); + // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. + if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm || + NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm) + return false; + } + } + // Get the unique definition of SrcReg. MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); if (!MI) return false; diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll index 61792d3253c..48e21061d20 100644 --- a/test/CodeGen/X86/jump_sign.ll +++ b/test/CodeGen/X86/jump_sign.ll @@ -230,3 +230,26 @@ define i32 @q(i32 %j.4, i32 %w, i32 %el) { %j.5 = select i1 %or.cond, i32 %tmp535, i32 %j.4 ret i32 %j.5 } +; rdar://11873276 +define i8* @r(i8* %base, i32* nocapture %offset, i32 %size) nounwind { +entry: +; CHECK: r: +; CHECK: sub +; CHECK-NOT: cmp +; CHECK: j +; CHECK-NOT: sub +; CHECK: ret + %0 = load i32* %offset, align 8 + %cmp = icmp slt i32 %0, %size + br i1 %cmp, label %return, label %if.end + +if.end: + %sub = sub nsw i32 %0, %size + store i32 %sub, i32* %offset, align 8 + %add.ptr = getelementptr inbounds i8* %base, i32 %sub + br label %return + +return: + %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ] + ret i8* %retval.0 +}