From cee56e7d33fecc74da6f16ebb48dd5b62d73ea8d Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Fri, 13 Mar 2009 05:53:31 +0000 Subject: [PATCH] generalize the previous code to use the full generality of LEA for i32/i64 expressions (we could also do i16 on cpus where i16 lea is fast, but I didn't add this). On the example, we now generate: _test: movl 4(%esp), %eax cmpl $42, (%eax) setl %al movzbl %al, %eax leal 4(%eax,%eax,8), %eax ret instead of: _test: movl 4(%esp), %eax cmpl $41, (%eax) movl $4, %ecx movl $13, %eax cmovg %ecx, %eax ret git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@66869 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 122 ++++++++++++++++++++++++++--- test/CodeGen/X86/select-no-cmov.ll | 12 +++ 2 files changed, 121 insertions(+), 13 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f54b7b6f80e..e8e7736021f 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -8188,25 +8188,26 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // If this is a select between two integer constants, try to do some // optimizations. - if (ConstantSDNode *LHSC = dyn_cast(LHS)) { - if (ConstantSDNode *RHSC = dyn_cast(RHS)) + if (ConstantSDNode *TrueC = dyn_cast(LHS)) { + if (ConstantSDNode *FalseC = dyn_cast(RHS)) // Don't do this for crazy integer types. if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { // If this is efficiently invertible, canonicalize the LHSC/RHSC values - // so that LHSC (the true value) is larger than RHSC (the false value). + // so that TrueC (the true value) is larger than FalseC. bool NeedsCondInvert = false; - if (LHSC->getAPIntValue().ult(RHSC->getAPIntValue()) && + if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && // Efficiently invertible. (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. isa(Cond.getOperand(1))))) { NeedsCondInvert = true; - std::swap(LHSC, RHSC); + std::swap(TrueC, FalseC); } // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. - if (RHSC->getAPIntValue() == 0 && LHSC->getAPIntValue().isPowerOf2()) { + if (FalseC->getAPIntValue() == 0 && + TrueC->getAPIntValue().isPowerOf2()) { if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, DAG.getConstant(1, Cond.getValueType())); @@ -8214,22 +8215,67 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); - unsigned ShAmt = LHSC->getAPIntValue().logBase2(); + unsigned ShAmt = TrueC->getAPIntValue().logBase2(); return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, DAG.getConstant(ShAmt, MVT::i8)); } // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. - if (RHSC->getAPIntValue()+1 == LHSC->getAPIntValue()) { + if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, DAG.getConstant(1, Cond.getValueType())); // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, RHSC->getValueType(0), Cond); + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, + FalseC->getValueType(0), Cond); return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, - SDValue(RHSC, 0)); + SDValue(FalseC, 0)); } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). + if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), + Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + return Cond; + } + } } } @@ -8260,6 +8306,8 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, } // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. + // This is efficient for any integer data type (including i8/i16) and + // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, @@ -8275,21 +8323,69 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, Cond, SDValue()); return Cond; } - - // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. + + // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient + // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, + FalseC->getValueType(0), Cond); Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); + if (N->getNumValues() == 2) // Dead flag value? return DCI.CombineTo(N, Cond, SDValue()); return Cond; } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). + if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); + SDValue Cond = N->getOperand(3); + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, MVT::i8), Cond); + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), + Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + } } } return SDValue(); diff --git a/test/CodeGen/X86/select-no-cmov.ll b/test/CodeGen/X86/select-no-cmov.ll index 87382fe3c83..71636f93327 100644 --- a/test/CodeGen/X86/select-no-cmov.ll +++ b/test/CodeGen/X86/select-no-cmov.ll @@ -12,3 +12,15 @@ entry: %iftmp.0.0 = select i1 %1, i32 -1, i32 -2 ; [#uses=1] ret i32 %iftmp.0.0 } + +; setl %al +; movzbl %al, %eax +; leal 4(%eax,%eax,8), %eax +define i32 @test2(i32* nocapture %P) nounwind readonly { +entry: + %0 = load i32* %P, align 4 ; [#uses=1] + %1 = icmp sgt i32 %0, 41 ; [#uses=1] + %iftmp.0.0 = select i1 %1, i32 4, i32 13 ; [#uses=1] + ret i32 %iftmp.0.0 +} +