From c1acc3f764804d25f70d88f937ef9c460143e0f1 Mon Sep 17 00:00:00 2001
From: Dale Johannesen
Date: Mon, 11 May 2009 17:15:42 +0000
Subject: [PATCH] Reverse a loop that is counting up to a maximum to count down
 to 0 instead, under very restricted circumstances. Adjust 4 testcases in
 which this optimization fires.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@71439 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopStrengthReduce.cpp  | 124 +++++++++++++++++-
 .../CodeGen/X86/2007-11-30-LoadFolding-Bug.ll |   4 +-
 test/CodeGen/X86/iv-users-in-other-loops.ll   |   4 +-
 test/CodeGen/X86/masked-iv-safe.ll            |   5 +-
 test/CodeGen/X86/pr3495.ll                    |   3 +-
 5 files changed, 129 insertions(+), 11 deletions(-)

diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 8d3240e0620..95684499486 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -166,7 +166,7 @@ namespace {
                                      const SCEVHandle* &CondStride);
 
     void OptimizeIndvars(Loop *L);
-
+    void OptimizeLoopCountIV(Loop *L);
     void OptimizeLoopTermCond(Loop *L);
 
     /// OptimizeShadowIV - If IV is used in a int-to-float cast
@@ -1746,11 +1746,11 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
     CommonBaseV = PreheaderRewriter.expandCodeFor(CommonExprs, ReplacedTy,
                                                   PreInsertPt);
 
-  // If all uses are addresses, check if it is possible to reuse an IV with a
-  // stride that is a factor of this stride. And that the multiple is a number
-  // that can be encoded in the scale field of the target addressing mode. And
-  // that we will have a valid instruction after this substition, including
-  // the immediate field, if any.
+  // If all uses are addresses, check if it is possible to reuse an IV. The
+  // new IV must have a stride that is a multiple of the old stride; the
+  // multiple must be a number that can be encoded in the scale field of the
+  // target addressing mode; and we must have a valid instruction after this
+  // substitution, including the immediate field, if any.
   RewriteFactor = CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses,
                                   AllUsesAreOutsideLoop,
                                   Stride, ReuseIV, ReplacedTy,
@@ -2444,6 +2444,114 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
   Changed = true;
 }
 
+// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding
+// when to exit the loop is used only for that purpose, try to rearrange things
+// so it counts down to a test against zero.
+void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
+
+  // If the number of times the loop is executed isn't computable, give up.
+  SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return;
+
+  // Get the terminating condition for the loop if possible (this isn't
+  // necessarily in the latch, or a block that's a predecessor of the header).
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+  if (ExitBlocks.size() != 1) return;
+
+  // Okay, there is one exit block. Try to find the condition that causes the
+  // loop to be exited.
+  BasicBlock *ExitBlock = ExitBlocks[0];
+
+  BasicBlock *ExitingBlock = 0;
+  for (pred_iterator PI = pred_begin(ExitBlock), E = pred_end(ExitBlock);
+       PI != E; ++PI)
+    if (L->contains(*PI)) {
+      if (ExitingBlock == 0)
+        ExitingBlock = *PI;
+      else
+        return; // More than one block exiting!
+    }
+  assert(ExitingBlock && "No exits from loop, something is broken!");
+
+  // Okay, we've computed the exiting block. See what condition causes us to
+  // exit.
+  //
+  // FIXME: we should be able to handle switch instructions (with a single exit)
+  BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+  if (TermBr == 0) return;
+  assert(TermBr->isConditional() && "If unconditional, it can't be in loop!");
+  if (!isa<ICmpInst>(TermBr->getCondition()))
+    return;
+  ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+
+  // Handle only tests for equality for the moment, and only stride 1.
+  if (Cond->getPredicate() != CmpInst::ICMP_EQ)
+    return;
+  SCEVHandle IV = SE->getSCEV(Cond->getOperand(0));
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+  SCEVHandle One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
+  if (!AR || !AR->isAffine() || AR->getStepRecurrence(*SE) != One)
+    return;
+
+  // Make sure the IV is only used for counting. Value may be preinc or
+  // postinc; 2 uses in either case.
+  if (!Cond->getOperand(0)->hasNUses(2))
+    return;
+  PHINode *phi = dyn_cast<PHINode>(Cond->getOperand(0));
+  Instruction *incr;
+  if (phi && phi->getParent()==L->getHeader()) {
+    // value tested is preinc. Find the increment.
+    // A CmpInst is not a BinaryOperator; we depend on this.
+    Instruction::use_iterator UI = phi->use_begin();
+    incr = dyn_cast<BinaryOperator>(UI);
+    if (!incr)
+      incr = dyn_cast<BinaryOperator>(++UI);
+    // 1 use for postinc value, the phi. Unnecessarily conservative?
+    if (!incr || !incr->hasOneUse() || incr->getOpcode()!=Instruction::Add)
+      return;
+  } else {
+    // Value tested is postinc. Find the phi node.
+    incr = dyn_cast<BinaryOperator>(Cond->getOperand(0));
+    if (!incr || incr->getOpcode()!=Instruction::Add)
+      return;
+
+    Instruction::use_iterator UI = Cond->getOperand(0)->use_begin();
+    phi = dyn_cast<PHINode>(UI);
+    if (!phi)
+      phi = dyn_cast<PHINode>(++UI);
+    // 1 use for preinc value, the increment.
+    if (!phi || phi->getParent()!=L->getHeader() || !phi->hasOneUse())
+      return;
+  }
+
+  // Replace the increment with a decrement.
+  BinaryOperator *decr =
+    BinaryOperator::Create(Instruction::Sub, incr->getOperand(0),
+                           incr->getOperand(1), "tmp", incr);
+  incr->replaceAllUsesWith(decr);
+  incr->eraseFromParent();
+
+  // Substitute endval-startval for the original startval, and 0 for the
+  // original endval. Since we're only testing for equality this is OK even
+  // if the computation wraps around.
+  BasicBlock *Preheader = L->getLoopPreheader();
+  Instruction *PreInsertPt = Preheader->getTerminator();
+  int inBlock = L->contains(phi->getIncomingBlock(0)) ? 1 : 0;
+  Value *startVal = phi->getIncomingValue(inBlock);
+  Value *endVal = Cond->getOperand(1);
+  // FIXME check for case where both are constant
+  ConstantInt* Zero = ConstantInt::get(Cond->getOperand(1)->getType(), 0);
+  BinaryOperator *NewStartVal =
+    BinaryOperator::Create(Instruction::Sub, endVal, startVal,
+                           "tmp", PreInsertPt);
+  phi->setIncomingValue(inBlock, NewStartVal);
+  Cond->setOperand(1, Zero);
+
+  Changed = true;
+}
+
 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
 
   LI = &getAnalysis<LoopInfo>();
@@ -2500,6 +2608,10 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
     }
   }
 
+  // After all sharing is done, see if we can adjust the loop to test against
+  // zero instead of counting up to a maximum. This is usually faster.
+  OptimizeLoopCountIV(L);
+
   // We're done analyzing this loop; release all the state we built up for it.
   IVUsesByStride.clear();
   IVsByStride.clear();
diff --git a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
index df6d76a0930..1b36fcec67a 100644
--- a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
+++ b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
@@ -1,5 +1,7 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -stats |& \
 ; RUN:   grep {1 .*folded into instructions}
+; Increment in loop bb.128.i adjusted to 2, to prevent loop reversal from
+; kicking in.
 
 declare fastcc void @rdft(i32, i32, double*, i32*, double*)
 
@@ -41,7 +43,7 @@ bb.i28.i:		; preds = %bb.i28.i, %cond_next36.i
 	%tmp1213.i23.i = sitofp i32 %x.0.i21.i to double		; <double> [#uses=1]
 	%tmp15.i24.i = sub double 0.000000e+00, %tmp1213.i23.i		; <double> [#uses=1]
 	%tmp16.i25.i = mul double 0.000000e+00, %tmp15.i24.i		; <double> [#uses=1]
-	%indvar.next39.i = add i32 %j.0.reg2mem.0.i16.i, 1		; <i32> [#uses=2]
+	%indvar.next39.i = add i32 %j.0.reg2mem.0.i16.i, 2		; <i32> [#uses=2]
 	%exitcond40.i = icmp eq i32 %indvar.next39.i, %tmp8.i14.i		; <i1> [#uses=1]
 	br i1 %exitcond40.i, label %mp_unexp_d2mp.exit29.i, label %bb.i28.i
 
diff --git a/test/CodeGen/X86/iv-users-in-other-loops.ll b/test/CodeGen/X86/iv-users-in-other-loops.ll
index 67d9d49a44f..d11b025d589 100644
--- a/test/CodeGen/X86/iv-users-in-other-loops.ll
+++ b/test/CodeGen/X86/iv-users-in-other-loops.ll
@@ -1,5 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86-64 -f -o %t
-; RUN: grep inc %t | count 2
+; RUN: grep inc %t | count 1
+; RUN: grep dec %t | count 2
 ; RUN: grep addq %t | count 13
 ; RUN: grep leaq %t | count 8
 ; RUN: grep leal %t | count 4
@@ -8,6 +9,7 @@
 ; IV users in each of the loops from other loops shouldn't cause LSR
 ; to insert new induction variables. Previously it would create a
 ; flood of new induction variables.
+; Also, the loop reversal should kick in once.
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index 2ba3f830fdb..e9b80a4c42c 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -4,12 +4,13 @@
 ; RUN: not grep sar %t
 ; RUN: not grep shl %t
 ; RUN: grep add %t | count 6
-; RUN: grep inc %t | count 4
-; RUN: grep dec %t | count 2
+; RUN: grep inc %t | count 2
+; RUN: grep dec %t | count 4
 ; RUN: grep lea %t | count 2
 
 ; Optimize away zext-inreg and sext-inreg on the loop induction
 ; variable using trip-count information.
+; Also, the loop-reversal algorithm kicks in twice.
 
 define void @count_up(double* %d, i64 %n) nounwind {
 entry:
diff --git a/test/CodeGen/X86/pr3495.ll b/test/CodeGen/X86/pr3495.ll
index 726ad7413b0..62382c6d78c 100644
--- a/test/CodeGen/X86/pr3495.ll
+++ b/test/CodeGen/X86/pr3495.ll
@@ -1,7 +1,8 @@
 ; RUN: llvm-as < %s | llc -march=x86 -stats |& grep {Number of reloads omited} | grep 2
 ; RUN: llvm-as < %s | llc -march=x86 -stats |& not grep {Number of available reloads turned into copies}
-; RUN: llvm-as < %s | llc -march=x86 -stats |& grep {Number of machine instrs printed} | grep 39
+; RUN: llvm-as < %s | llc -march=x86 -stats |& grep {Number of machine instrs printed} | grep 38
 ; PR3495
+; The loop reversal kicks in once here, resulting in one fewer instruction.
 
 target triple = "i386-pc-linux-gnu"
 @x = external global [8 x i32], align 32		; <[8 x i32]*> [#uses=1]
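
Illustrative note (not part of the patch): a minimal before/after sketch of the
rewrite OptimizeLoopCountIV performs, assuming a loop whose counting IV has no
uses other than its own increment and the equality exit test. The value and
block names (%i, %start, %end, %preheader, %exit) are made up for illustration.

Before, counting up from %start and exiting when the postincremented value
equals %end:

    loop:
      %i = phi i32 [ %start, %preheader ], [ %i.next, %loop ]
      ; ... loop body that does not otherwise use %i ...
      %i.next = add i32 %i, 1
      %exitcond = icmp eq i32 %i.next, %end
      br i1 %exitcond, label %exit, label %loop

After: the add becomes a sub, the preheader-incoming value of the phi becomes
%end - %start (computed in the preheader), and the compare is against zero:

    preheader:
      %init = sub i32 %end, %start
      br label %loop

    loop:
      %i = phi i32 [ %init, %preheader ], [ %i.next, %loop ]
      ; ... loop body unchanged ...
      %i.next = sub i32 %i, 1
      %exitcond = icmp eq i32 %i.next, 0
      br i1 %exitcond, label %exit, label %loop

Because only equality against the end value is tested, the trip count is
preserved even if %end - %start wraps, which is why the transformation is
restricted to ICMP_EQ exit conditions with stride 1 and an IV that is dead
apart from the count.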