From 586f69a11881d828c056ce017b3fb432341d9657 Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Thu, 12 Nov 2009 07:35:05 +0000 Subject: [PATCH] - Teach LSR to avoid changing cmp iv stride if it will create an immediate that cannot be folded into target cmp instruction. - Avoid a phase ordering issue where early cmp optimization would prevent the later count-to-zero optimization. - Add missing checks which could cause LSR to reuse stride that does not have users. - Fix a bug in count-to-zero optimization code which failed to find the pre-inc iv's phi node. - Remove, tighten, loosen some incorrect checks disable valid transformations. - Quite a bit of code clean up. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@86969 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/IVUsers.h | 7 + lib/Analysis/IVUsers.cpp | 12 + lib/Transforms/Scalar/LoopStrengthReduce.cpp | 600 ++++++++++++------- test/CodeGen/Thumb2/lsr-deficiency.ll | 37 ++ 4 files changed, 429 insertions(+), 227 deletions(-) create mode 100644 test/CodeGen/Thumb2/lsr-deficiency.ll diff --git a/include/llvm/Analysis/IVUsers.h b/include/llvm/Analysis/IVUsers.h index 948c6759242..22fbb35cdb9 100644 --- a/include/llvm/Analysis/IVUsers.h +++ b/include/llvm/Analysis/IVUsers.h @@ -161,6 +161,10 @@ public: void addUser(const SCEV *Offset, Instruction *User, Value *Operand) { Users.push_back(new IVStrideUse(this, Offset, User, Operand)); } + + void removeUser(IVStrideUse *User) { + Users.erase(User); + } }; class IVUsers : public LoopPass { @@ -201,6 +205,9 @@ public: /// return true. Otherwise, return false. bool AddUsersIfInteresting(Instruction *I); + void AddUser(const SCEV *Stride, const SCEV *Offset, + Instruction *User, Value *Operand); + /// getReplacementExpr - Return a SCEV expression which computes the /// value of the OperandValToReplace of the given IVStrideUse. const SCEV *getReplacementExpr(const IVStrideUse &U) const; diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index 20b69f2a7ca..cf52320d1f2 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -267,6 +267,18 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) { return true; } +void IVUsers::AddUser(const SCEV *Stride, const SCEV *Offset, + Instruction *User, Value *Operand) { + IVUsersOfOneStride *StrideUses = IVUsesByStride[Stride]; + if (!StrideUses) { // First occurrence of this stride? + StrideOrder.push_back(Stride); + StrideUses = new IVUsersOfOneStride(Stride); + IVUses.push_back(StrideUses); + IVUsesByStride[Stride] = StrideUses; + } + IVUsesByStride[Stride]->addUser(Offset, User, Operand); +} + IVUsers::IVUsers() : LoopPass(&ID) { } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index f1366190218..613535f090a 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -132,13 +132,10 @@ namespace { } private: - ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond, - IVStrideUse* &CondUse, - const SCEV *const * &CondStride); - void OptimizeIndvars(Loop *L); - void OptimizeLoopCountIV(const SCEV *Stride, - IVUsersOfOneStride &Uses, Loop *L); + + /// OptimizeLoopTermCond - Change loop terminating condition to use the + /// postinc iv when possible. void OptimizeLoopTermCond(Loop *L); /// OptimizeShadowIV - If IV is used in a int-to-float cast @@ -150,8 +147,27 @@ namespace { ICmpInst *OptimizeMax(Loop *L, ICmpInst *Cond, IVStrideUse* &CondUse); + /// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for + /// deciding when to exit the loop is used only for that purpose, try to + /// rearrange things so it counts down to a test against zero. + bool OptimizeLoopCountIV(Loop *L); + bool OptimizeLoopCountIVOfStride(const SCEV* &Stride, + IVStrideUse* &CondUse, Loop *L); + + /// StrengthReduceIVUsersOfStride - Strength reduce all of the users of a + /// single stride of IV. All of the users may have different starting + /// values, and this may not be the only stride. + void StrengthReduceIVUsersOfStride(const SCEV *const &Stride, + IVUsersOfOneStride &Uses, + Loop *L); + void StrengthReduceIVUsers(Loop *L); + + ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond, + IVStrideUse* &CondUse, const SCEV* &CondStride, + bool PostPass = false); + bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, - const SCEV *const * &CondStride); + const SCEV* &CondStride); bool RequiresTypeConversion(const Type *Ty, const Type *NewTy); const SCEV *CheckForIVReuse(bool, bool, bool, const SCEV *const&, IVExpr&, const Type*, @@ -166,6 +182,7 @@ namespace { bool &AllUsesAreAddresses, bool &AllUsesAreOutsideLoop, std::vector &UsersToProcess); + bool StrideMightBeShared(const SCEV *Stride, Loop *L, bool CheckPreInc); bool ShouldUseFullStrengthReductionMode( const std::vector &UsersToProcess, const Loop *L, @@ -190,9 +207,7 @@ namespace { Instruction *IVIncInsertPt, const Loop *L, SCEVExpander &PreheaderRewriter); - void StrengthReduceStridedIVUsers(const SCEV *const &Stride, - IVUsersOfOneStride &Uses, - Loop *L); + void DeleteTriviallyDeadInstructions(); }; } @@ -1007,6 +1022,11 @@ const SCEV *LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, if (SI == IVsByStride.end() || !isa(SI->first) || StrideNoReuse.count(SI->first)) continue; + // The other stride has no uses, don't reuse it. + std::map::iterator UI = + IU->IVUsesByStride.find(IU->StrideOrder[NewStride]); + if (UI->second->Users.empty()) + continue; int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); if (SI->first != Stride && (unsigned(abs64(SInt)) < SSInt || (SInt % SSInt) != 0)) @@ -1494,12 +1514,12 @@ static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset, return true; } -/// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single +/// StrengthReduceIVUsersOfStride - Strength reduce all of the users of a single /// stride of IV. All of the users may have different starting values, and this /// may not be the only stride. -void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV *const &Stride, - IVUsersOfOneStride &Uses, - Loop *L) { +void LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *const &Stride, + IVUsersOfOneStride &Uses, + Loop *L) { // If all the users are moved to another stride, then there is nothing to do. if (Uses.Users.empty()) return; @@ -1778,11 +1798,28 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV *const &Stride, // different starting values, into different PHIs. } +void LoopStrengthReduce::StrengthReduceIVUsers(Loop *L) { + // Note: this processes each stride/type pair individually. All users + // passed into StrengthReduceIVUsersOfStride have the same type AND stride. + // Also, note that we iterate over IVUsesByStride indirectly by using + // StrideOrder. This extra layer of indirection makes the ordering of + // strides deterministic - not dependent on map order. + for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e; ++Stride) { + std::map::iterator SI = + IU->IVUsesByStride.find(IU->StrideOrder[Stride]); + assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); + // FIXME: Generalize to non-affine IV's. + if (!SI->first->isLoopInvariant(L)) + continue; + StrengthReduceIVUsersOfStride(SI->first, *SI->second, L); + } +} + /// FindIVUserForCond - If Cond has an operand that is an expression of an IV, /// set the IV user and stride information and return true, otherwise return /// false. bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, - const SCEV *const * &CondStride) { + const SCEV* &CondStride) { for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e && !CondUse; ++Stride) { std::map::iterator SI = @@ -1796,7 +1833,7 @@ bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse // InstCombine does it as well for simple uses, it's not clear that it // occurs enough in real life to handle. CondUse = UI; - CondStride = &SI->first; + CondStride = SI->first; return true; } } @@ -1854,8 +1891,9 @@ namespace { /// v1 = v1 + 3 /// if (v1 < 30) goto loop ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, - IVStrideUse* &CondUse, - const SCEV *const* &CondStride) { + IVStrideUse* &CondUse, + const SCEV* &CondStride, + bool PostPass) { // If there's only one stride in the loop, there's nothing to do here. if (IU->StrideOrder.size() < 2) return Cond; @@ -1863,23 +1901,31 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, // trying to change the condition because the stride will still // remain. std::map::iterator I = - IU->IVUsesByStride.find(*CondStride); - if (I == IU->IVUsesByStride.end() || - I->second->Users.size() != 1) + IU->IVUsesByStride.find(CondStride); + if (I == IU->IVUsesByStride.end()) return Cond; + if (I->second->Users.size() > 1) { + for (ilist::iterator II = I->second->Users.begin(), + EE = I->second->Users.end(); II != EE; ++II) { + if (II->getUser() == Cond) + continue; + if (!isInstructionTriviallyDead(II->getUser())) + return Cond; + } + } // Only handle constant strides for now. - const SCEVConstant *SC = dyn_cast(*CondStride); + const SCEVConstant *SC = dyn_cast(CondStride); if (!SC) return Cond; ICmpInst::Predicate Predicate = Cond->getPredicate(); int64_t CmpSSInt = SC->getValue()->getSExtValue(); - unsigned BitWidth = SE->getTypeSizeInBits((*CondStride)->getType()); + unsigned BitWidth = SE->getTypeSizeInBits(CondStride->getType()); uint64_t SignBit = 1ULL << (BitWidth-1); const Type *CmpTy = Cond->getOperand(0)->getType(); const Type *NewCmpTy = NULL; unsigned TyBits = SE->getTypeSizeInBits(CmpTy); unsigned NewTyBits = 0; - const SCEV **NewStride = NULL; + const SCEV *NewStride = NULL; Value *NewCmpLHS = NULL; Value *NewCmpRHS = NULL; int64_t Scale = 1; @@ -1888,16 +1934,31 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, if (ConstantInt *C = dyn_cast(Cond->getOperand(1))) { int64_t CmpVal = C->getValue().getSExtValue(); + // Check the relevant induction variable for conformance to + // the pattern. + const SCEV *IV = SE->getSCEV(Cond->getOperand(0)); + const SCEVAddRecExpr *AR = dyn_cast(IV); + if (!AR || !AR->isAffine()) + return Cond; + + const SCEVConstant *StartC = dyn_cast(AR->getStart()); // Check stride constant and the comparision constant signs to detect // overflow. - if ((CmpVal & SignBit) != (CmpSSInt & SignBit)) - return Cond; + if (StartC) { + if ((StartC->getValue()->getSExtValue() < CmpVal && CmpSSInt < 0) || + (StartC->getValue()->getSExtValue() > CmpVal && CmpSSInt > 0)) + return Cond; + } else { + // More restrictive check for the other cases. + if ((CmpVal & SignBit) != (CmpSSInt & SignBit)) + return Cond; + } // Look for a suitable stride / iv as replacement. for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) { std::map::iterator SI = IU->IVUsesByStride.find(IU->StrideOrder[i]); - if (!isa(SI->first)) + if (!isa(SI->first) || SI->second->Users.empty()) continue; int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); if (SSInt == CmpSSInt || @@ -1907,6 +1968,14 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, Scale = SSInt / CmpSSInt; int64_t NewCmpVal = CmpVal * Scale; + + // If old icmp value fits in icmp immediate field, but the new one doesn't + // try something else. + if (TLI && + TLI->isLegalICmpImmediate(CmpVal) && + !TLI->isLegalICmpImmediate(NewCmpVal)) + continue; + APInt Mul = APInt(BitWidth*2, CmpVal, true); Mul = Mul * APInt(BitWidth*2, Scale, true); // Check for overflow. @@ -1921,8 +1990,6 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, (CmpVal & SignBit) != (NewCmpVal & SignBit)) continue; - if (NewCmpVal == CmpVal) - continue; // Pick the best iv to use trying to avoid a cast. NewCmpLHS = NULL; for (ilist::iterator UI = SI->second->Users.begin(), @@ -1972,19 +2039,21 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, if (NewTyBits != TyBits && !isa(CondUse->getOffset())) continue; - bool AllUsesAreAddresses = true; - bool AllUsesAreOutsideLoop = true; - std::vector UsersToProcess; - const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L, - AllUsesAreAddresses, - AllUsesAreOutsideLoop, - UsersToProcess); - // Avoid rewriting the compare instruction with an iv of new stride - // if it's likely the new stride uses will be rewritten using the - // stride of the compare instruction. - if (AllUsesAreAddresses && - ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess)) - continue; + if (!PostPass) { + bool AllUsesAreAddresses = true; + bool AllUsesAreOutsideLoop = true; + std::vector UsersToProcess; + const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L, + AllUsesAreAddresses, + AllUsesAreOutsideLoop, + UsersToProcess); + // Avoid rewriting the compare instruction with an iv of new stride + // if it's likely the new stride uses will be rewritten using the + // stride of the compare instruction. + if (AllUsesAreAddresses && + ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess)) + continue; + } // Avoid rewriting the compare instruction with an iv which has // implicit extension or truncation built into it. @@ -1997,7 +2066,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, if (Scale < 0 && !Cond->isEquality()) Predicate = ICmpInst::getSwappedPredicate(Predicate); - NewStride = &IU->StrideOrder[i]; + NewStride = IU->StrideOrder[i]; if (!isa(NewCmpTy)) NewCmpRHS = ConstantInt::get(NewCmpTy, NewCmpVal); else { @@ -2034,13 +2103,16 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, Cond = new ICmpInst(OldCond, Predicate, NewCmpLHS, NewCmpRHS, L->getHeader()->getName() + ".termcond"); + DEBUG(errs() << " Change compare stride in Inst " << *OldCond); + DEBUG(errs() << " to " << *Cond << '\n'); + // Remove the old compare instruction. The old indvar is probably dead too. DeadInsts.push_back(CondUse->getOperandValToReplace()); OldCond->replaceAllUsesWith(Cond); OldCond->eraseFromParent(); - IU->IVUsesByStride[*NewStride]->addUser(NewOffset, Cond, NewCmpLHS); - CondUse = &IU->IVUsesByStride[*NewStride]->Users.back(); + IU->IVUsesByStride[NewStride]->addUser(NewOffset, Cond, NewCmpLHS); + CondUse = &IU->IVUsesByStride[NewStride]->Users.back(); CondStride = NewStride; ++NumEliminated; Changed = true; @@ -2300,124 +2372,42 @@ void LoopStrengthReduce::OptimizeIndvars(Loop *L) { OptimizeShadowIV(L); } -/// OptimizeLoopTermCond - Change loop terminating condition to use the -/// postinc iv when possible. -void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) { - // Finally, get the terminating condition for the loop if possible. If we - // can, we want to change it to use a post-incremented version of its - // induction variable, to allow coalescing the live ranges for the IV into - // one register value. - BasicBlock *LatchBlock = L->getLoopLatch(); - BasicBlock *ExitingBlock = L->getExitingBlock(); - - if (!ExitingBlock) - // Multiple exits, just look at the exit in the latch block if there is one. - ExitingBlock = LatchBlock; - BranchInst *TermBr = dyn_cast(ExitingBlock->getTerminator()); - if (!TermBr) - return; - if (TermBr->isUnconditional() || !isa(TermBr->getCondition())) - return; - - // Search IVUsesByStride to find Cond's IVUse if there is one. - IVStrideUse *CondUse = 0; - const SCEV *const *CondStride = 0; - ICmpInst *Cond = cast(TermBr->getCondition()); - if (!FindIVUserForCond(Cond, CondUse, CondStride)) - return; // setcc doesn't use the IV. - - if (ExitingBlock != LatchBlock) { - if (!Cond->hasOneUse()) - // See below, we don't want the condition to be cloned. - return; - - // If exiting block is the latch block, we know it's safe and profitable to - // transform the icmp to use post-inc iv. Otherwise do so only if it would - // not reuse another iv and its iv would be reused by other uses. We are - // optimizing for the case where the icmp is the only use of the iv. - IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[*CondStride]; - for (ilist::iterator I = StrideUses.Users.begin(), - E = StrideUses.Users.end(); I != E; ++I) { - if (I->getUser() == Cond) - continue; - if (!I->isUseOfPostIncrementedValue()) - return; - } - - // FIXME: This is expensive, and worse still ChangeCompareStride does a - // similar check. Can we perform all the icmp related transformations after - // StrengthReduceStridedIVUsers? - if (const SCEVConstant *SC = dyn_cast(*CondStride)) { - int64_t SInt = SC->getValue()->getSExtValue(); - for (unsigned NewStride = 0, ee = IU->StrideOrder.size(); NewStride != ee; - ++NewStride) { - std::map::iterator SI = - IU->IVUsesByStride.find(IU->StrideOrder[NewStride]); - if (!isa(SI->first) || SI->first == *CondStride) - continue; - int64_t SSInt = - cast(SI->first)->getValue()->getSExtValue(); - if (SSInt == SInt) - return; // This can definitely be reused. - if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0) - continue; - int64_t Scale = SSInt / SInt; - bool AllUsesAreAddresses = true; - bool AllUsesAreOutsideLoop = true; - std::vector UsersToProcess; - const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L, - AllUsesAreAddresses, - AllUsesAreOutsideLoop, - UsersToProcess); - // Avoid rewriting the compare instruction with an iv of new stride - // if it's likely the new stride uses will be rewritten using the - // stride of the compare instruction. - if (AllUsesAreAddresses && - ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess)) - return; +bool LoopStrengthReduce::StrideMightBeShared(const SCEV* Stride, Loop *L, + bool CheckPreInc) { + int64_t SInt = cast(Stride)->getValue()->getSExtValue(); + for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) { + std::map::iterator SI = + IU->IVUsesByStride.find(IU->StrideOrder[i]); + const SCEV *Share = SI->first; + if (!isa(SI->first) || Share == Stride) + continue; + int64_t SSInt = cast(Share)->getValue()->getSExtValue(); + if (SSInt == SInt) + return true; // This can definitely be reused. + if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0) + continue; + int64_t Scale = SSInt / SInt; + bool AllUsesAreAddresses = true; + bool AllUsesAreOutsideLoop = true; + std::vector UsersToProcess; + const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L, + AllUsesAreAddresses, + AllUsesAreOutsideLoop, + UsersToProcess); + if (AllUsesAreAddresses && + ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess)) { + if (!CheckPreInc) + return true; + // Any pre-inc iv use? + IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[Share]; + for (ilist::iterator I = StrideUses.Users.begin(), + E = StrideUses.Users.end(); I != E; ++I) { + if (!I->isUseOfPostIncrementedValue()) + return true; } } - - StrideNoReuse.insert(*CondStride); } - - // If the trip count is computed in terms of a max (due to ScalarEvolution - // being unable to find a sufficient guard, for example), change the loop - // comparison to use SLT or ULT instead of NE. - Cond = OptimizeMax(L, Cond, CondUse); - - // If possible, change stride and operands of the compare instruction to - // eliminate one stride. - if (ExitingBlock == LatchBlock) - Cond = ChangeCompareStride(L, Cond, CondUse, CondStride); - - // It's possible for the setcc instruction to be anywhere in the loop, and - // possible for it to have multiple users. If it is not immediately before - // the latch block branch, move it. - if (&*++BasicBlock::iterator(Cond) != (Instruction*)TermBr) { - if (Cond->hasOneUse()) { // Condition has a single use, just move it. - Cond->moveBefore(TermBr); - } else { - // Otherwise, clone the terminating condition and insert into the loopend. - Cond = cast(Cond->clone()); - Cond->setName(L->getHeader()->getName() + ".termcond"); - LatchBlock->getInstList().insert(TermBr, Cond); - - // Clone the IVUse, as the old use still exists! - IU->IVUsesByStride[*CondStride]->addUser(CondUse->getOffset(), Cond, - CondUse->getOperandValToReplace()); - CondUse = &IU->IVUsesByStride[*CondStride]->Users.back(); - } - } - - // If we get to here, we know that we can transform the setcc instruction to - // use the post-incremented version of the IV, allowing us to coalesce the - // live ranges for the IV correctly. - CondUse->setOffset(SE->getMinusSCEV(CondUse->getOffset(), *CondStride)); - CondUse->setIsUseOfPostIncrementedValue(true); - Changed = true; - - ++NumLoopCond; + return false; } /// isUsedByExitBranch - Return true if icmp is used by a loop terminating @@ -2444,68 +2434,185 @@ static bool isUsedByExitBranch(ICmpInst *Cond, Loop *L) { return User == TermBr; } -/// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding -/// when to exit the loop is used only for that purpose, try to rearrange things -/// so it counts down to a test against zero which tends to be cheaper. -void LoopStrengthReduce::OptimizeLoopCountIV(const SCEV *Stride, - IVUsersOfOneStride &Uses, - Loop *L) { - if (Uses.Users.size() != 1) - return; +static bool ShouldCountToZero(ICmpInst *Cond, IVStrideUse* &CondUse, + ScalarEvolution *SE, Loop *L, + const TargetLowering *TLI = 0) { + if (!L->contains(Cond->getParent())) + return false; - // If the only use is an icmp of an loop exiting conditional branch, then - // attempts the optimization. - BasedUser User = BasedUser(*Uses.Users.begin(), SE); - Instruction *Inst = User.Inst; - if (!L->contains(Inst->getParent())) - return; + if (!isa(CondUse->getOffset())) + return false; - ICmpInst *Cond = dyn_cast(Inst); - if (!Cond) - return; // Handle only tests for equality for the moment. - if (Cond->getPredicate() != CmpInst::ICMP_EQ || !Cond->hasOneUse()) - return; + if (!Cond->isEquality() || !Cond->hasOneUse()) + return false; if (!isUsedByExitBranch(Cond, L)) - return; + return false; Value *CondOp0 = Cond->getOperand(0); const SCEV *IV = SE->getSCEV(CondOp0); const SCEVAddRecExpr *AR = dyn_cast(IV); if (!AR || !AR->isAffine()) - return; + return false; const SCEVConstant *SC = dyn_cast(AR->getStepRecurrence(*SE)); if (!SC || SC->getValue()->getSExtValue() < 0) // If it's already counting down, don't do anything. - return; + return false; // If the RHS of the comparison is not an loop invariant, the rewrite // cannot be done. Also bail out if it's already comparing against a zero. + // If we are checking this before cmp stride optimization, check if it's + // comparing against a already legal immediate. Value *RHS = Cond->getOperand(1); + ConstantInt *RHSC = dyn_cast(RHS); if (!L->isLoopInvariant(RHS) || - (isa(RHS) && cast(RHS)->isZero())) - return; + (RHSC && RHSC->isZero()) || + (RHSC && TLI && TLI->isLegalICmpImmediate(RHSC->getSExtValue()))) + return false; // Make sure the IV is only used for counting. Value may be preinc or // postinc; 2 uses in either case. if (!CondOp0->hasNUses(2)) + return false; + + return true; +} + +/// OptimizeLoopTermCond - Change loop terminating condition to use the +/// postinc iv when possible. +void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) { + // Finally, get the terminating condition for the loop if possible. If we + // can, we want to change it to use a post-incremented version of its + // induction variable, to allow coalescing the live ranges for the IV into + // one register value. + BasicBlock *LatchBlock = L->getLoopLatch(); + BasicBlock *ExitingBlock = L->getExitingBlock(); + + if (!ExitingBlock) + // Multiple exits, just look at the exit in the latch block if there is one. + ExitingBlock = LatchBlock; + BranchInst *TermBr = dyn_cast(ExitingBlock->getTerminator()); + if (!TermBr) return; - PHINode *PHIExpr; + if (TermBr->isUnconditional() || !isa(TermBr->getCondition())) + return; + + // Search IVUsesByStride to find Cond's IVUse if there is one. + IVStrideUse *CondUse = 0; + const SCEV *CondStride = 0; + ICmpInst *Cond = cast(TermBr->getCondition()); + if (!FindIVUserForCond(Cond, CondUse, CondStride)) + return; + + bool UsePostInc = true; + if (ExitingBlock != LatchBlock) { + if (Cond->hasOneUse()) { + // See below, we don't want the condition to be cloned. + + // If exiting block is the latch block, we know it's safe and profitable + // to transform the icmp to use post-inc iv. Otherwise do so only if it + // would not reuse another iv and its iv would be reused by other uses. + // We are optimizing for the case where the icmp is the only use of the + // iv. + IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[CondStride]; + for (ilist::iterator I = StrideUses.Users.begin(), + E = StrideUses.Users.end(); I != E; ++I) { + if (I->getUser() == Cond) + continue; + if (!I->isUseOfPostIncrementedValue()) { + UsePostInc = false; + break; + } + } + } + + // If iv for the stride might be shared and any of the users use pre-inc iv + // might be used, then it's not safe to use post-inc iv. + if (UsePostInc && + isa(CondStride) && + StrideMightBeShared(CondStride, L, true)) + UsePostInc = false; + } + + // If the trip count is computed in terms of a max (due to ScalarEvolution + // being unable to find a sufficient guard, for example), change the loop + // comparison to use SLT or ULT instead of NE. + Cond = OptimizeMax(L, Cond, CondUse); + + // If possible, change stride and operands of the compare instruction to + // eliminate one stride. However, avoid rewriting the compare instruction with + // an iv of new stride if it's likely the new stride uses will be rewritten + // using the stride of the compare instruction. + if (ExitingBlock == LatchBlock && isa(CondStride)) { + // If the condition stride is a constant and it's the only use, we might + // want to optimize it first by turning it to count toward zero. + if (!StrideMightBeShared(CondStride, L, false) && + !ShouldCountToZero(Cond, CondUse, SE, L, TLI)) + Cond = ChangeCompareStride(L, Cond, CondUse, CondStride); + } + + if (!UsePostInc) + return; + + // It's possible for the setcc instruction to be anywhere in the loop, and + // possible for it to have multiple users. If it is not immediately before + // the latch block branch, move it. + if (&*++BasicBlock::iterator(Cond) != (Instruction*)TermBr) { + if (Cond->hasOneUse()) { // Condition has a single use, just move it. + Cond->moveBefore(TermBr); + } else { + // Otherwise, clone the terminating condition and insert into the loopend. + Cond = cast(Cond->clone()); + Cond->setName(L->getHeader()->getName() + ".termcond"); + LatchBlock->getInstList().insert(TermBr, Cond); + + // Clone the IVUse, as the old use still exists! + IU->IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond, + CondUse->getOperandValToReplace()); + CondUse = &IU->IVUsesByStride[CondStride]->Users.back(); + } + } + + // If we get to here, we know that we can transform the setcc instruction to + // use the post-incremented version of the IV, allowing us to coalesce the + // live ranges for the IV correctly. + CondUse->setOffset(SE->getMinusSCEV(CondUse->getOffset(), CondStride)); + CondUse->setIsUseOfPostIncrementedValue(true); + Changed = true; + + ++NumLoopCond; +} + +bool LoopStrengthReduce::OptimizeLoopCountIVOfStride(const SCEV* &Stride, + IVStrideUse* &CondUse, + Loop *L) { + // If the only use is an icmp of an loop exiting conditional branch, then + // attempts the optimization. + BasedUser User = BasedUser(*CondUse, SE); + assert(isa(User.Inst) && "Expecting an ICMPInst!"); + ICmpInst *Cond = cast(User.Inst); + + // Less strict check now that compare stride optimization is done. + if (!ShouldCountToZero(Cond, CondUse, SE, L)) + return false; + + Value *CondOp0 = Cond->getOperand(0); + PHINode *PHIExpr = dyn_cast(CondOp0); Instruction *Incr; - if (User.isUseOfPostIncrementedValue) { + if (!PHIExpr) { // Value tested is postinc. Find the phi node. Incr = dyn_cast(CondOp0); + // FIXME: Just use User.OperandValToReplace here? if (!Incr || Incr->getOpcode() != Instruction::Add) - return; + return false; - Instruction::use_iterator UI = CondOp0->use_begin(); - PHIExpr = dyn_cast(UI); + PHIExpr = dyn_cast(Incr->getOperand(0)); if (!PHIExpr) - PHIExpr = dyn_cast(++UI); + return false; // 1 use for preinc value, the increment. - if (!PHIExpr || !PHIExpr->hasOneUse()) - return; + if (!PHIExpr->hasOneUse()) + return false; } else { assert(isa(CondOp0) && "Unexpected loop exiting counting instruction sequence!"); @@ -2518,18 +2625,13 @@ void LoopStrengthReduce::OptimizeLoopCountIV(const SCEV *Stride, Incr = dyn_cast(++UI); // One use for postinc value, the phi. Unnecessarily conservative? if (!Incr || !Incr->hasOneUse() || Incr->getOpcode() != Instruction::Add) - return; + return false; } // Replace the increment with a decrement. - DEBUG(errs() << " Examining "); - if (User.isUseOfPostIncrementedValue) - DEBUG(errs() << "postinc"); - else - DEBUG(errs() << "preinc"); - DEBUG(errs() << " use "); + DEBUG(errs() << "LSR: Examining use "); DEBUG(WriteAsOperand(errs(), CondOp0, /*PrintType=*/false)); - DEBUG(errs() << " in Inst: " << *Inst << '\n'); + DEBUG(errs() << " in Inst: " << *Cond << '\n'); BinaryOperator *Decr = BinaryOperator::Create(Instruction::Sub, Incr->getOperand(0), Incr->getOperand(1), "tmp", Incr); Incr->replaceAllUsesWith(Decr); @@ -2554,8 +2656,75 @@ void LoopStrengthReduce::OptimizeLoopCountIV(const SCEV *Stride, Cond->setOperand(1, Zero); DEBUG(errs() << " New icmp: " << *Cond << "\n"); - Changed = true; + int64_t SInt = cast(Stride)->getValue()->getSExtValue(); + const SCEV *NewStride = 0; + bool Found = false; + for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) { + const SCEV *OldStride = IU->StrideOrder[i]; + if (const SCEVConstant *SC = dyn_cast(OldStride)) + if (SC->getValue()->getSExtValue() == -SInt) { + Found = true; + NewStride = OldStride; + break; + } + } + + if (!Found) + NewStride = SE->getIntegerSCEV(-SInt, Stride->getType()); + IU->AddUser(NewStride, CondUse->getOffset(), Cond, Cond->getOperand(0)); + IU->IVUsesByStride[Stride]->removeUser(CondUse); + + CondUse = &IU->IVUsesByStride[NewStride]->Users.back(); + Stride = NewStride; + ++NumCountZero; + + return true; +} + +/// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding +/// when to exit the loop is used only for that purpose, try to rearrange things +/// so it counts down to a test against zero. +bool LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) { + bool ThisChanged = false; + for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) { + const SCEV *Stride = IU->StrideOrder[i]; + std::map::iterator SI = + IU->IVUsesByStride.find(Stride); + assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); + // FIXME: Generalize to non-affine IV's. + if (!SI->first->isLoopInvariant(L)) + continue; + // If stride is a constant and it has an icmpinst use, check if we can + // optimize the loop to count down. + if (isa(Stride) && SI->second->Users.size() == 1) { + Instruction *User = SI->second->Users.begin()->getUser(); + if (!isa(User)) + continue; + const SCEV *CondStride = Stride; + IVStrideUse *Use = &*SI->second->Users.begin(); + if (!OptimizeLoopCountIVOfStride(CondStride, Use, L)) + continue; + ThisChanged = true; + + // Now check if it's possible to reuse this iv for other stride uses. + for (unsigned j = 0, ee = IU->StrideOrder.size(); j != ee; ++j) { + const SCEV *SStride = IU->StrideOrder[j]; + if (SStride == CondStride) + continue; + std::map::iterator SII = + IU->IVUsesByStride.find(SStride); + assert(SII != IU->IVUsesByStride.end() && "Stride doesn't exist!"); + // FIXME: Generalize to non-affine IV's. + if (!SII->first->isLoopInvariant(L)) + continue; + // FIXME: Rewrite other stride using CondStride. + } + } + } + + Changed |= ThisChanged; + return ThisChanged; } bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) { @@ -2587,7 +2756,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) { // Change loop terminating condition to use the postinc iv when possible // and optimize loop terminating compare. FIXME: Move this after - // StrengthReduceStridedIVUsers? + // StrengthReduceIVUsersOfStride? OptimizeLoopTermCond(L); // FIXME: We can shrink overlarge IV's here. e.g. if the code has @@ -2603,34 +2772,11 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) { // IVsByStride keeps IVs for one particular loop. assert(IVsByStride.empty() && "Stale entries in IVsByStride?"); - // Note: this processes each stride/type pair individually. All users - // passed into StrengthReduceStridedIVUsers have the same type AND stride. - // Also, note that we iterate over IVUsesByStride indirectly by using - // StrideOrder. This extra layer of indirection makes the ordering of - // strides deterministic - not dependent on map order. - for (unsigned Stride = 0, e = IU->StrideOrder.size(); - Stride != e; ++Stride) { - std::map::iterator SI = - IU->IVUsesByStride.find(IU->StrideOrder[Stride]); - assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); - // FIXME: Generalize to non-affine IV's. - if (!SI->first->isLoopInvariant(L)) - continue; - StrengthReduceStridedIVUsers(SI->first, *SI->second, L); - } + StrengthReduceIVUsers(L); // After all sharing is done, see if we can adjust the loop to test against // zero instead of counting up to a maximum. This is usually faster. - for (unsigned Stride = 0, e = IU->StrideOrder.size(); - Stride != e; ++Stride) { - std::map::iterator SI = - IU->IVUsesByStride.find(IU->StrideOrder[Stride]); - assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); - // FIXME: Generalize to non-affine IV's. - if (!SI->first->isLoopInvariant(L)) - continue; - OptimizeLoopCountIV(SI->first, *SI->second, L); - } + OptimizeLoopCountIV(L); } // We're done analyzing this loop; release all the state we built up for it. diff --git a/test/CodeGen/Thumb2/lsr-deficiency.ll b/test/CodeGen/Thumb2/lsr-deficiency.ll new file mode 100644 index 00000000000..7b1b57a786e --- /dev/null +++ b/test/CodeGen/Thumb2/lsr-deficiency.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -relocation-model=pic | FileCheck %s +; rdar://7387640 + +; FIXME: We still need to rewrite array reference iv of stride -4 with loop +; count iv of stride -1. + +@G = external global i32 ; [#uses=2] +@array = external global i32* ; [#uses=1] + +define arm_apcscc void @t() nounwind optsize { +; CHECK: t: +; CHECK: mov.w r2, #4000 +; CHECK: movw r3, #1001 +entry: + %.pre = load i32* @G, align 4 ; [#uses=1] + br label %bb + +bb: ; preds = %bb, %entry +; CHECK: LBB1_1: +; CHECK: subs r3, #1 +; CHECK: cmp r3, #0 +; CHECK: sub.w r2, r2, #4 + %0 = phi i32 [ %.pre, %entry ], [ %3, %bb ] ; [#uses=1] + %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; [#uses=2] + %tmp5 = sub i32 1000, %indvar ; [#uses=1] + %1 = load i32** @array, align 4 ; [#uses=1] + %scevgep = getelementptr i32* %1, i32 %tmp5 ; [#uses=1] + %2 = load i32* %scevgep, align 4 ; [#uses=1] + %3 = add nsw i32 %2, %0 ; [#uses=2] + store i32 %3, i32* @G, align 4 + %indvar.next = add i32 %indvar, 1 ; [#uses=2] + %exitcond = icmp eq i32 %indvar.next, 1001 ; [#uses=1] + br i1 %exitcond, label %return, label %bb + +return: ; preds = %bb + ret void +}