From 740cd657f3d9d4e88614831c70a649f9257164da Mon Sep 17 00:00:00 2001 From: Jakob Stoklund Olesen Date: Thu, 5 Apr 2012 20:30:20 +0000 Subject: [PATCH] Don't break the IV update in TLI::SimplifySetCC(). LSR always tries to make the ICmp in the loop latch use the incremented induction variable. This allows the induction variable to be kept in a single register. When the induction variable limit is equal to the stride, SimplifySetCC() would break LSR's hard work by transforming: (icmp (add iv, stride), stride) --> (cmp iv, 0) This forced us to use lea for the IV update, preventing the simpler incl+cmp. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154119 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 42 +++++++++++++-------- test/CodeGen/Thumb2/lsr-deficiency.ll | 11 ++---- test/CodeGen/X86/lsr-loop-exit-cond.ll | 42 +++++++++++++++++++++ 3 files changed, 72 insertions(+), 23 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index eefb9e84b17..03aed3aeca6 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2471,6 +2471,10 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } + // If RHS is a legal immediate value for a compare instruction, we need + // to be careful about increasing register pressure needlessly. + bool LegalRHSImm = false; + if (ConstantSDNode *RHSC = dyn_cast(N1)) { if (ConstantSDNode *LHSR = dyn_cast(N0.getOperand(1))) { // Turn (X+C1) == C2 --> X == C2-C1 @@ -2505,25 +2509,33 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, Cond); } } + + // Could RHSC fold directly into a compare? 
+ if (RHSC->getValueType(0).getSizeInBits() <= 64) + LegalRHSImm = isLegalICmpImmediate(RHSC->getSExtValue()); } // Simplify (X+Z) == X --> Z == 0 - if (N0.getOperand(0) == N1) - return DAG.getSetCC(dl, VT, N0.getOperand(1), - DAG.getConstant(0, N0.getValueType()), Cond); - if (N0.getOperand(1) == N1) { - if (DAG.isCommutativeBinOp(N0.getOpcode())) - return DAG.getSetCC(dl, VT, N0.getOperand(0), - DAG.getConstant(0, N0.getValueType()), Cond); - else if (N0.getNode()->hasOneUse()) { - assert(N0.getOpcode() == ISD::SUB && "Unexpected operation!"); - // (Z-X) == X --> Z == X<<1 - SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(), - N1, + // Don't do this if X is an immediate that can fold into a cmp + // instruction and X+Z has other uses. It could be an induction variable + // chain, and the transform would increase register pressure. + if (!LegalRHSImm || N0.getNode()->hasOneUse()) { + if (N0.getOperand(0) == N1) + return DAG.getSetCC(dl, VT, N0.getOperand(1), + DAG.getConstant(0, N0.getValueType()), Cond); + if (N0.getOperand(1) == N1) { + if (DAG.isCommutativeBinOp(N0.getOpcode())) + return DAG.getSetCC(dl, VT, N0.getOperand(0), + DAG.getConstant(0, N0.getValueType()), Cond); + else if (N0.getNode()->hasOneUse()) { + assert(N0.getOpcode() == ISD::SUB && "Unexpected operation!"); + // (Z-X) == X --> Z == X<<1 + SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(), N1, DAG.getConstant(1, getShiftAmountTy(N1.getValueType()))); - if (!DCI.isCalledByLegalizer()) - DCI.AddToWorklist(SH.getNode()); - return DAG.getSetCC(dl, VT, N0.getOperand(0), SH, Cond); + if (!DCI.isCalledByLegalizer()) + DCI.AddToWorklist(SH.getNode()); + return DAG.getSetCC(dl, VT, N0.getOperand(0), SH, Cond); + } } } } diff --git a/test/CodeGen/Thumb2/lsr-deficiency.ll b/test/CodeGen/Thumb2/lsr-deficiency.ll index 9ff114e2b6f..9aaa821698c 100644 --- a/test/CodeGen/Thumb2/lsr-deficiency.ll +++ b/test/CodeGen/Thumb2/lsr-deficiency.ll @@ -3,11 +3,6 @@ ; This now reduces to a single 
induction variable. -; TODO: It still gets a GPR shuffle at the end of the loop -; This is because something in instruction selection has decided -; that comparing the pre-incremented value with zero is better -; than comparing the post-incremented value with -4. - @G = external global i32 ; [#uses=2] @array = external global i32* ; [#uses=1] @@ -20,9 +15,9 @@ entry: bb: ; preds = %bb, %entry ; CHECK: LBB0_1: -; CHECK: cmp [[R2:r[0-9]+]], #0 -; CHECK: sub{{(.w)?}} [[REGISTER:(r[0-9]+)|(lr)]], [[R2]], #1 -; CHECK: mov [[R2]], [[REGISTER]] +; CHECK: subs [[R2:r[0-9]+]], #1 +; CHECK: cmp.w [[R2]], #-1 +; CHECK: bne LBB0_1 %0 = phi i32 [ %.pre, %entry ], [ %3, %bb ] ; [#uses=1] %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; [#uses=2] diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll index bdf09dff0b0..ebda9f201df 100644 --- a/test/CodeGen/X86/lsr-loop-exit-cond.ll +++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=x86_64-darwin < %s | FileCheck %s +; CHECK: t: ; CHECK: decq ; CHECK-NEXT: movl ( ; CHECK-NEXT: jne @@ -136,3 +137,44 @@ bb2: ; preds = %bb store i8 %92, i8* %93, align 1 ret void } + +; Check that DAGCombiner doesn't mess up the IV update when the exiting value +; is equal to the stride. +; It must not fold (cmp (add iv, 1), 1) --> (cmp iv, 0). 
+ +; CHECK: f: +; CHECK: %for.body +; CHECK: incl [[IV:%e..]] +; CHECK: cmpl $1, [[IV]] +; CHECK: jne +; CHECK: ret + +define i32 @f(i32 %i, i32* nocapture %a) nounwind uwtable readonly ssp { +entry: + %cmp4 = icmp eq i32 %i, 1 + br i1 %cmp4, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = sext i32 %i to i64 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %bi.06 = phi i32 [ 0, %for.body.lr.ph ], [ %i.addr.0.bi.0, %for.body ] + %b.05 = phi i32 [ 0, %for.body.lr.ph ], [ %.b.0, %for.body ] + %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv + %1 = load i32* %arrayidx, align 4 + %cmp1 = icmp ugt i32 %1, %b.05 + %.b.0 = select i1 %cmp1, i32 %1, i32 %b.05 + %2 = trunc i64 %indvars.iv to i32 + %i.addr.0.bi.0 = select i1 %cmp1, i32 %2, i32 %bi.06 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %bi.0.lcssa = phi i32 [ 0, %entry ], [ %i.addr.0.bi.0, %for.body ] + ret i32 %bi.0.lcssa +} +