mirror of
				https://github.com/c64scene-ar/llvm-6502.git
				synced 2025-10-25 10:27:04 +00:00 
			
		
		
		
	Add -unroll-runtime for unrolling loops with run-time trip counts.
Patch by Brendon Cahoon! This extends the existing LoopUnroll and LoopUnrollPass. Brendon measured no regressions in the llvm test suite with -unroll-runtime enabled. This implementation works by using the existing loop unrolling code to unroll the loop by a power-of-two (default 8). It generates an if-then-else sequence of code prior to the loop to execute the extra iterations before entering the unrolled loop. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@146245 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
		| @@ -22,9 +22,12 @@ class Loop; | ||||
| class LoopInfo; | ||||
| class LPPassManager; | ||||
|  | ||||
| bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, | ||||
| bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime, | ||||
|                 unsigned TripMultiple, LoopInfo* LI, LPPassManager* LPM); | ||||
|  | ||||
| bool UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, | ||||
|                              LPPassManager* LPM); | ||||
|  | ||||
| } | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -40,6 +40,10 @@ UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, | ||||
|   cl::desc("Allows loops to be partially unrolled until " | ||||
|            "-unroll-threshold loop size is reached.")); | ||||
|  | ||||
| static cl::opt<bool> | ||||
| UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden, | ||||
|   cl::desc("Unroll loops with run-time trip counts")); | ||||
|  | ||||
| namespace { | ||||
|   class LoopUnroll : public LoopPass { | ||||
|   public: | ||||
| @@ -63,6 +67,10 @@ namespace { | ||||
|     // explicit -unroll-threshold). | ||||
|     static const unsigned OptSizeUnrollThreshold = 50; | ||||
|  | ||||
|     // Default unroll count for loops with run-time trip count if | ||||
|     // -unroll-count is not set | ||||
|     static const unsigned UnrollRuntimeCount = 8; | ||||
|  | ||||
|     unsigned CurrentCount; | ||||
|     unsigned CurrentThreshold; | ||||
|     bool     CurrentAllowPartial; | ||||
| @@ -151,8 +159,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { | ||||
|     TripCount = SE->getSmallConstantTripCount(L, LatchBlock); | ||||
|     TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); | ||||
|   } | ||||
|   // Automatically select an unroll count. | ||||
|   // Use a default unroll-count if the user doesn't specify a value | ||||
|   // and the trip count is a run-time value.  The default is different | ||||
|   // for run-time or compile-time trip count loops. | ||||
|   unsigned Count = CurrentCount; | ||||
|   if (UnrollRuntime && CurrentCount == 0 && TripCount == 0) | ||||
|     Count = UnrollRuntimeCount; | ||||
|  | ||||
|   if (Count == 0) { | ||||
|     // Conservative heuristic: if we know the trip count, see if we can | ||||
|     // completely unroll (subject to the threshold, checked below); otherwise | ||||
| @@ -177,15 +190,23 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { | ||||
|     if (TripCount != 1 && Size > Threshold) { | ||||
|       DEBUG(dbgs() << "  Too large to fully unroll with count: " << Count | ||||
|             << " because size: " << Size << ">" << Threshold << "\n"); | ||||
|       if (!CurrentAllowPartial) { | ||||
|       if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) { | ||||
|         DEBUG(dbgs() << "  will not try to unroll partially because " | ||||
|               << "-unroll-allow-partial not given\n"); | ||||
|         return false; | ||||
|       } | ||||
|       // Reduce unroll count to be modulo of TripCount for partial unrolling | ||||
|       Count = Threshold / LoopSize; | ||||
|       while (Count != 0 && TripCount%Count != 0) { | ||||
|         Count--; | ||||
|       if (TripCount) { | ||||
|         // Reduce unroll count to be modulo of TripCount for partial unrolling | ||||
|         Count = CurrentThreshold / LoopSize; | ||||
|         while (Count != 0 && TripCount%Count != 0) | ||||
|           Count--; | ||||
|       } | ||||
|       else if (UnrollRuntime) { | ||||
|         // Reduce unroll count to be a lower power-of-two value | ||||
|         while (Count != 0 && Size > CurrentThreshold) { | ||||
|           Count >>= 1; | ||||
|           Size = LoopSize*Count; | ||||
|         } | ||||
|       } | ||||
|       if (Count < 2) { | ||||
|         DEBUG(dbgs() << "  could not unroll partially\n"); | ||||
| @@ -196,7 +217,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { | ||||
|   } | ||||
|  | ||||
|   // Unroll the loop. | ||||
|   if (!UnrollLoop(L, Count, TripCount, TripMultiple, LI, &LPM)) | ||||
|   if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM)) | ||||
|     return false; | ||||
|  | ||||
|   return true; | ||||
|   | ||||
| @@ -14,6 +14,7 @@ add_llvm_library(LLVMTransformUtils | ||||
|   Local.cpp | ||||
|   LoopSimplify.cpp | ||||
|   LoopUnroll.cpp | ||||
|   LoopUnrollRuntime.cpp | ||||
|   LowerExpectIntrinsic.cpp | ||||
|   LowerInvoke.cpp | ||||
|   LowerSwitch.cpp | ||||
|   | ||||
| @@ -135,7 +135,8 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, | ||||
| /// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are | ||||
| /// available it must also preserve those analyses. | ||||
| bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, | ||||
|                       unsigned TripMultiple, LoopInfo *LI, LPPassManager *LPM) { | ||||
|                       bool AllowRuntime, unsigned TripMultiple, | ||||
|                       LoopInfo *LI, LPPassManager *LPM) { | ||||
|   BasicBlock *Preheader = L->getLoopPreheader(); | ||||
|   if (!Preheader) { | ||||
|     DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n"); | ||||
| @@ -165,12 +166,6 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, | ||||
|     return false; | ||||
|   } | ||||
|  | ||||
|   // Notify ScalarEvolution that the loop will be substantially changed, | ||||
|   // if not outright eliminated. | ||||
|   ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); | ||||
|   if (SE) | ||||
|     SE->forgetLoop(L); | ||||
|  | ||||
|   if (TripCount != 0) | ||||
|     DEBUG(dbgs() << "  Trip Count = " << TripCount << "\n"); | ||||
|   if (TripMultiple != 1) | ||||
| @@ -188,6 +183,20 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, | ||||
|   // Are we eliminating the loop control altogether? | ||||
|   bool CompletelyUnroll = Count == TripCount; | ||||
|  | ||||
|   // We assume a run-time trip count if the compiler cannot | ||||
|   // figure out the loop trip count and the unroll-runtime | ||||
|   // flag is specified. | ||||
|   bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime); | ||||
|  | ||||
|   if (RuntimeTripCount && !UnrollRuntimeLoopProlog(L, Count, LI, LPM)) | ||||
|     return false; | ||||
|  | ||||
|   // Notify ScalarEvolution that the loop will be substantially changed, | ||||
|   // if not outright eliminated. | ||||
|   ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); | ||||
|   if (SE) | ||||
|     SE->forgetLoop(L); | ||||
|  | ||||
|   // If we know the trip count, we know the multiple... | ||||
|   unsigned BreakoutTrip = 0; | ||||
|   if (TripCount != 0) { | ||||
| @@ -209,6 +218,8 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, | ||||
|       DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip); | ||||
|     } else if (TripMultiple != 1) { | ||||
|       DEBUG(dbgs() << " with " << TripMultiple << " trips per branch"); | ||||
|     } else if (RuntimeTripCount) { | ||||
|       DEBUG(dbgs() << " with run-time trip count"); | ||||
|     } | ||||
|     DEBUG(dbgs() << "!\n"); | ||||
|   } | ||||
| @@ -332,6 +343,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, | ||||
|     BasicBlock *Dest = Headers[j]; | ||||
|     bool NeedConditional = true; | ||||
|  | ||||
|     if (RuntimeTripCount && j != 0) { | ||||
|       NeedConditional = false; | ||||
|     } | ||||
|  | ||||
|     // For a complete unroll, make the last iteration end with a branch | ||||
|     // to the exit block. | ||||
|     if (CompletelyUnroll && j == 0) { | ||||
|   | ||||
							
								
								
									
										375
									
								
								lib/Transforms/Utils/LoopUnrollRuntime.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										375
									
								
								lib/Transforms/Utils/LoopUnrollRuntime.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,375 @@ | ||||
| //===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===// | ||||
| // | ||||
| //                     The LLVM Compiler Infrastructure | ||||
| // | ||||
| // This file is distributed under the University of Illinois Open Source | ||||
| // License. See LICENSE.TXT for details. | ||||
| // | ||||
| //===----------------------------------------------------------------------===// | ||||
| // | ||||
| // This file implements some loop unrolling utilities for loops with run-time | ||||
| // trip counts.  See LoopUnroll.cpp for unrolling loops with compile-time | ||||
| // trip counts. | ||||
| // | ||||
| // The functions in this file are used to generate extra code when the  | ||||
| // run-time trip count modulo the unroll factor is not 0.  When this is the | ||||
| // case, we need to generate code to execute these 'left over' iterations. | ||||
| // | ||||
| // The current strategy generates an if-then-else sequence prior to the  | ||||
| // unrolled loop to execute the 'left over' iterations.  Other strategies | ||||
| // include generate a loop before or after the unrolled loop. | ||||
| // | ||||
| //===----------------------------------------------------------------------===// | ||||
|  | ||||
| #define DEBUG_TYPE "loop-unroll" | ||||
| #include "llvm/Transforms/Utils/UnrollLoop.h" | ||||
| #include "llvm/BasicBlock.h" | ||||
| #include "llvm/ADT/Statistic.h" | ||||
| #include "llvm/Analysis/LoopIterator.h" | ||||
| #include "llvm/Analysis/LoopPass.h" | ||||
| #include "llvm/Analysis/ScalarEvolution.h" | ||||
| #include "llvm/Analysis/ScalarEvolutionExpander.h" | ||||
| #include "llvm/Support/Debug.h" | ||||
| #include "llvm/Support/raw_ostream.h" | ||||
| #include "llvm/Transforms/Utils/BasicBlockUtils.h" | ||||
| #include "llvm/Transforms/Utils/Cloning.h" | ||||
| #include <algorithm> | ||||
|  | ||||
| using namespace llvm; | ||||
|  | ||||
| STATISTIC(NumRuntimeUnrolled,  | ||||
|           "Number of loops unrolled with run-time trip counts"); | ||||
|  | ||||
| /// Connect the unrolling prolog code to the original loop. | ||||
| /// The unrolling prolog code contains code to execute the | ||||
| /// 'extra' iterations if the run-time trip count modulo the | ||||
| /// unroll count is non-zero. | ||||
| /// | ||||
| /// This function performs the following: | ||||
| /// - Create PHI nodes at prolog end block to combine values | ||||
| ///   that exit the prolog code and jump around the prolog. | ||||
| /// - Add a PHI operand to a PHI node at the loop exit block | ||||
| ///   for values that exit the prolog and go around the loop. | ||||
| /// - Branch around the original loop if the trip count is less | ||||
| ///   than the unroll factor. | ||||
| /// | ||||
| static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, | ||||
|                           BasicBlock *LastPrologBB, BasicBlock *PrologEnd, | ||||
|                           BasicBlock *OrigPH, BasicBlock *NewPH, | ||||
|                           ValueToValueMapTy &LVMap, Pass *P) { | ||||
|   BasicBlock *Latch = L->getLoopLatch(); | ||||
|   assert(Latch != 0 && "Loop must have a latch"); | ||||
|  | ||||
|   // Create a PHI node for each outgoing value from the original loop | ||||
|   // (which means it is an outgoing value from the prolog code too). | ||||
|   // The new PHI node is inserted in the prolog end basic block. | ||||
|   // The new PHI name is added as an operand of a PHI node in either | ||||
|   // the loop header or the loop exit block. | ||||
|   for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch); | ||||
|        SBI != SBE; ++SBI) { | ||||
|     for (BasicBlock::iterator BBI = (*SBI)->begin(); | ||||
|          PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) { | ||||
|  | ||||
|       // Add a new PHI node to the prolog end block and add the | ||||
|       // appropriate incoming values. | ||||
|       PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr", | ||||
|                                        PrologEnd->getTerminator()); | ||||
|       // Adding a value to the new PHI node from the original loop preheader. | ||||
|       // This is the value that skips all the prolog code. | ||||
|       if (L->contains(PN)) { | ||||
|         NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH); | ||||
|       } else { | ||||
|         NewPN->addIncoming(Constant::getNullValue(PN->getType()), OrigPH); | ||||
|       } | ||||
|       Value *OrigVal = PN->getIncomingValueForBlock(Latch); | ||||
|       Value *V = OrigVal; | ||||
|       if (Instruction *I = dyn_cast<Instruction>(V)) { | ||||
|         if (L->contains(I)) { | ||||
|           V = LVMap[I]; | ||||
|         } | ||||
|       } | ||||
|       // Adding a value to the new PHI node from the last prolog block | ||||
|       // that was created. | ||||
|       NewPN->addIncoming(V, LastPrologBB); | ||||
|  | ||||
|       // Update the existing PHI node operand with the value from the | ||||
|       // new PHI node.  How this is done depends on if the existing | ||||
|       // PHI node is in the original loop block, or the exit block. | ||||
|       if (L->contains(PN)) { | ||||
|         PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN); | ||||
|       } else { | ||||
|         PN->addIncoming(NewPN, PrologEnd); | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Create a branch around the orignal loop, which is taken if the | ||||
|   // trip count is less than the unroll factor. | ||||
|   Instruction *InsertPt = PrologEnd->getTerminator(); | ||||
|   Instruction *BrLoopExit = | ||||
|     new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount, | ||||
|                  ConstantInt::get(TripCount->getType(), Count)); | ||||
|   BasicBlock *Exit = L->getUniqueExitBlock(); | ||||
|   assert(Exit != 0 && "Loop must have a single exit block only"); | ||||
|   // Split the exit to maintain loop canonicalization guarantees | ||||
|   SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); | ||||
|   if (!Exit->isLandingPad()) { | ||||
|     SplitBlockPredecessors(Exit, Preds.data(), Preds.size(), | ||||
|                            ".unr-lcssa", P); | ||||
|   } else { | ||||
|     SmallVector<BasicBlock*, 2> NewBBs; | ||||
|     SplitLandingPadPredecessors(Exit, Preds, ".unr1-lcssa", ".unr2-lcssa", | ||||
|                                 P, NewBBs); | ||||
|   } | ||||
|   // Add the branch to the exit block (around the unrolled loop) | ||||
|   BranchInst::Create(Exit, NewPH, BrLoopExit, InsertPt); | ||||
|   InsertPt->eraseFromParent(); | ||||
| } | ||||
|  | ||||
| /// Create a clone of the blocks in a loop and connect them together. | ||||
| /// This function doesn't create a clone of the loop structure. | ||||
| /// | ||||
| /// There are two value maps that are defined and used.  VMap is | ||||
| /// for the values in the current loop instance.  LVMap contains | ||||
| /// the values from the last loop instance.  We need the LVMap values | ||||
| /// to update the inital values for the current loop instance. | ||||
| /// | ||||
| static void CloneLoopBlocks(Loop *L, | ||||
|                             bool FirstCopy, | ||||
|                             BasicBlock *InsertTop, | ||||
|                             BasicBlock *InsertBot, | ||||
|                             std::vector<BasicBlock *> &NewBlocks, | ||||
|                             LoopBlocksDFS &LoopBlocks, | ||||
|                             ValueToValueMapTy &VMap, | ||||
|                             ValueToValueMapTy &LVMap, | ||||
|                             LoopInfo *LI) { | ||||
|  | ||||
|   BasicBlock *Preheader = L->getLoopPreheader(); | ||||
|   BasicBlock *Header = L->getHeader(); | ||||
|   BasicBlock *Latch = L->getLoopLatch(); | ||||
|   Function *F = Header->getParent(); | ||||
|   LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); | ||||
|   LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); | ||||
|   // For each block in the original loop, create a new copy, | ||||
|   // and update the value map with the newly created values. | ||||
|   for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { | ||||
|     BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".unr", F); | ||||
|     NewBlocks.push_back(NewBB); | ||||
|  | ||||
|     if (Loop *ParentLoop = L->getParentLoop()) | ||||
|       ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); | ||||
|  | ||||
|     VMap[*BB] = NewBB; | ||||
|     if (Header == *BB) { | ||||
|       // For the first block, add a CFG connection to this newly | ||||
|       // created block | ||||
|       InsertTop->getTerminator()->setSuccessor(0, NewBB); | ||||
|  | ||||
|       // Change the incoming values to the ones defined in the | ||||
|       // previously cloned loop. | ||||
|       for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { | ||||
|         PHINode *NewPHI = cast<PHINode>(VMap[I]); | ||||
|         if (FirstCopy) { | ||||
|           // We replace the first phi node with the value from the preheader | ||||
|           VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); | ||||
|           NewBB->getInstList().erase(NewPHI); | ||||
|         } else { | ||||
|           // Update VMap with values from the previous block | ||||
|           unsigned idx = NewPHI->getBasicBlockIndex(Latch); | ||||
|           Value *InVal = NewPHI->getIncomingValue(idx); | ||||
|           if (Instruction *I = dyn_cast<Instruction>(InVal)) | ||||
|             if (L->contains(I)) | ||||
|               InVal = LVMap[InVal]; | ||||
|           NewPHI->setIncomingValue(idx, InVal); | ||||
|           NewPHI->setIncomingBlock(idx, InsertTop); | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     if (Latch == *BB) { | ||||
|       VMap.erase((*BB)->getTerminator()); | ||||
|       NewBB->getTerminator()->eraseFromParent(); | ||||
|       BranchInst::Create(InsertBot, NewBB); | ||||
|     } | ||||
|   } | ||||
|   // LastValueMap is updated with the values for the current loop | ||||
|   // which are used the next time this function is called. | ||||
|   for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); | ||||
|        VI != VE; ++VI) { | ||||
|     LVMap[VI->first] = VI->second; | ||||
|   } | ||||
| } | ||||
|  | ||||
| /// Insert code in the prolog code when unrolling a loop with a | ||||
| /// run-time trip-count. | ||||
| /// | ||||
| /// This method assumes that the loop unroll factor is total number | ||||
| /// of loop bodes in the loop after unrolling. (Some folks refer | ||||
| /// to the unroll factor as the number of *extra* copies added). | ||||
| /// We assume also that the loop unroll factor is a power-of-two. So, after | ||||
| /// unrolling the loop, the number of loop bodies executed is 2, | ||||
| /// 4, 8, etc.  Note - LLVM converts the if-then-sequence to a switch  | ||||
| /// instruction in SimplifyCFG.cpp.  Then, the backend decides how code for | ||||
| /// the switch instruction is generated. | ||||
| /// | ||||
| ///    extraiters = tripcount % loopfactor | ||||
| ///    if (extraiters == 0) jump Loop: | ||||
| ///    if (extraiters == loopfactor) jump L1 | ||||
| ///    if (extraiters == loopfactor-1) jump L2 | ||||
| ///    ... | ||||
| ///    L1:  LoopBody; | ||||
| ///    L2:  LoopBody; | ||||
| ///    ... | ||||
| ///    if tripcount < loopfactor jump End | ||||
| ///    Loop: | ||||
| ///    ... | ||||
| ///    End: | ||||
| /// | ||||
| bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, | ||||
|                                    LPPassManager *LPM) { | ||||
|   // for now, only unroll loops that contain a single exit | ||||
|   SmallVector<BasicBlock*, 4> ExitingBlocks; | ||||
|   L->getExitingBlocks(ExitingBlocks); | ||||
|   if (ExitingBlocks.size() > 1) | ||||
|     return false; | ||||
|  | ||||
|   // Make sure the loop is in canonical form, and there is a single | ||||
|   // exit block only. | ||||
|   if (!L->isLoopSimplifyForm() || L->getUniqueExitBlock() == 0) | ||||
|     return false; | ||||
|  | ||||
|   // Use Scalar Evolution to compute the trip count.  This allows more | ||||
|   // loops to be unrolled than relying on induction var simplification | ||||
|   ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); | ||||
|   if (SE == 0) | ||||
|     return false; | ||||
|  | ||||
|   // Only unroll loops with a computable trip count and the trip count needs | ||||
|   // to be an int value (allowing a pointer type is a TODO item) | ||||
|   const SCEV *BECount = SE->getBackedgeTakenCount(L); | ||||
|   if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy()) | ||||
|     return false; | ||||
|  | ||||
|   // Add 1 since the backedge count doesn't include the first loop iteration | ||||
|   const SCEV *TripCountSC =  | ||||
|     SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1)); | ||||
|   if (isa<SCEVCouldNotCompute>(TripCountSC)) | ||||
|     return false; | ||||
|  | ||||
|   // We only handle cases when the unroll factor is a power of 2. | ||||
|   // Count is the loop unroll factor, the number of extra copies added + 1. | ||||
|   if ((Count & (Count-1)) != 0) | ||||
|     return false; | ||||
|  | ||||
|   // If this loop is nested, then the loop unroller changes the code in | ||||
|   // parent loop, so the Scalar Evolution pass needs to be run again | ||||
|   if (Loop *ParentLoop = L->getParentLoop()) | ||||
|     SE->forgetLoop(ParentLoop); | ||||
|  | ||||
|   BasicBlock *PH = L->getLoopPreheader(); | ||||
|   BasicBlock *Header = L->getHeader(); | ||||
|   BasicBlock *Latch = L->getLoopLatch(); | ||||
|   // It helps to splits the original preheader twice, one for the end of the | ||||
|   // prolog code and one for a new loop preheader | ||||
|   BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass()); | ||||
|   BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass()); | ||||
|   BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); | ||||
|  | ||||
|   // Compute the number of extra iterations required, which is: | ||||
|   //  extra iterations = run-time trip count % (loop unroll factor + 1) | ||||
|   SCEVExpander Expander(*SE, "loop-unroll"); | ||||
|   Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), | ||||
|                                             PreHeaderBR); | ||||
|   Type *CountTy = TripCount->getType(); | ||||
|   BinaryOperator *ModVal = | ||||
|     BinaryOperator::CreateURem(TripCount, | ||||
|                                ConstantInt::get(CountTy, Count), | ||||
|                                "xtraiter"); | ||||
|   ModVal->insertBefore(PreHeaderBR); | ||||
|  | ||||
|   // Check if for no extra iterations, then jump to unrolled loop | ||||
|   Value *BranchVal = new ICmpInst(PreHeaderBR, | ||||
|                                   ICmpInst::ICMP_NE, ModVal, | ||||
|                                   ConstantInt::get(CountTy, 0), "lcmp"); | ||||
|   // Branch to either the extra iterations or the unrolled loop | ||||
|   // We will fix up the true branch label when adding loop body copies | ||||
|   BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR); | ||||
|   assert(PreHeaderBR->isUnconditional() &&  | ||||
|          PreHeaderBR->getSuccessor(0) == PEnd &&  | ||||
|          "CFG edges in Preheader are not correct"); | ||||
|   PreHeaderBR->eraseFromParent(); | ||||
|  | ||||
|   ValueToValueMapTy LVMap; | ||||
|   Function *F = Header->getParent(); | ||||
|   // These variables are used to update the CFG links in each iteration | ||||
|   BasicBlock *CompareBB = 0; | ||||
|   BasicBlock *LastLoopBB = PH; | ||||
|   // Get an ordered list of blocks in the loop to help with the ordering of the | ||||
|   // cloned blocks in the prolog code | ||||
|   LoopBlocksDFS LoopBlocks(L); | ||||
|   LoopBlocks.perform(LI); | ||||
|  | ||||
|   // | ||||
|   // For each extra loop iteration, create a copy of the loop's basic blocks | ||||
|   // and generate a condition that branches to the copy depending on the | ||||
|   // number of 'left over' iterations. | ||||
|   // | ||||
|   for (unsigned leftOverIters = Count-1; leftOverIters > 0; --leftOverIters) { | ||||
|     std::vector<BasicBlock*> NewBlocks; | ||||
|     ValueToValueMapTy VMap; | ||||
|  | ||||
|     // Clone all the basic blocks in the loop, but we don't clone the loop | ||||
|     // This function adds the appropriate CFG connections. | ||||
|     CloneLoopBlocks(L, (leftOverIters == Count-1), LastLoopBB, PEnd, NewBlocks, | ||||
|                     LoopBlocks, VMap, LVMap, LI); | ||||
|     LastLoopBB = cast<BasicBlock>(VMap[Latch]); | ||||
|  | ||||
|     // Insert the cloned blocks into function just before the original loop | ||||
|     F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), | ||||
|                                   NewBlocks[0], F->end()); | ||||
|  | ||||
|     // Generate the code for the comparison which determines if the loop | ||||
|     // prolog code needs to be executed. | ||||
|     if (leftOverIters == Count-1) { | ||||
|       // There is no compare block for the fall-thru case when for the last | ||||
|       // left over iteration | ||||
|       CompareBB = NewBlocks[0]; | ||||
|     } else { | ||||
|       // Create a new block for the comparison | ||||
|       BasicBlock *NewBB = BasicBlock::Create(CompareBB->getContext(), "unr.cmp", | ||||
|                                              F, CompareBB); | ||||
|       if (Loop *ParentLoop = L->getParentLoop()) { | ||||
|         // Add the new block to the parent loop, if needed | ||||
|         ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); | ||||
|       } | ||||
|  | ||||
|       // The comparison w/ the extra iteration value and branch | ||||
|       Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal, | ||||
|                                       ConstantInt::get(CountTy, leftOverIters), | ||||
|                                       "un.tmp"); | ||||
|       // Branch to either the extra iterations or the unrolled loop | ||||
|       BranchInst::Create(NewBlocks[0], CompareBB, | ||||
|                          BranchVal, NewBB); | ||||
|       CompareBB = NewBB; | ||||
|       PH->getTerminator()->setSuccessor(0, NewBB); | ||||
|       VMap[NewPH] = CompareBB; | ||||
|     } | ||||
|  | ||||
|     // Rewrite the cloned instruction operands to use the values | ||||
|     // created when the clone is created. | ||||
|     for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { | ||||
|       for (BasicBlock::iterator I = NewBlocks[i]->begin(), | ||||
|              E = NewBlocks[i]->end(); I != E; ++I) { | ||||
|         RemapInstruction(I, VMap,  | ||||
|                          RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Connect the prolog code to the original loop and update the | ||||
|   // PHI functions. | ||||
|   ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, LVMap, | ||||
|                 LPM->getAsPass()); | ||||
|   NumRuntimeUnrolled++; | ||||
|   return true; | ||||
| } | ||||
							
								
								
									
										109
									
								
								test/Transforms/LoopUnroll/runtime-loop.ll
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										109
									
								
								test/Transforms/LoopUnroll/runtime-loop.ll
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,109 @@ | ||||
| ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true | FileCheck %s | ||||
|  | ||||
| ; Tests for unrolling loops with run-time trip counts | ||||
|  | ||||
| ; CHECK: unr.cmp{{.*}}: | ||||
| ; CHECK: for.body.unr{{.*}}: | ||||
| ; CHECK: for.body: | ||||
| ; CHECK: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body | ||||
|  | ||||
| define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { | ||||
| entry: | ||||
|   %cmp1 = icmp eq i32 %n, 0 | ||||
|   br i1 %cmp1, label %for.end, label %for.body | ||||
|  | ||||
| for.body:                                         ; preds = %for.body, %entry | ||||
|   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] | ||||
|   %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ] | ||||
|   %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv | ||||
|   %0 = load i32* %arrayidx, align 4 | ||||
|   %add = add nsw i32 %0, %sum.02 | ||||
|   %indvars.iv.next = add i64 %indvars.iv, 1 | ||||
|   %lftr.wideiv = trunc i64 %indvars.iv.next to i32 | ||||
|   %exitcond = icmp eq i32 %lftr.wideiv, %n | ||||
|   br i1 %exitcond, label %for.end, label %for.body | ||||
|  | ||||
| for.end:                                          ; preds = %for.body, %entry | ||||
|   %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] | ||||
|   ret i32 %sum.0.lcssa | ||||
| } | ||||
|  | ||||
|  | ||||
| ; Still try to completely unroll loops with compile-time trip counts | ||||
| ; even if the -unroll-runtime is specified | ||||
|  | ||||
| ; CHECK: for.body: | ||||
| ; CHECK-NOT: for.body.unr: | ||||
|  | ||||
| define i32 @test1(i32* nocapture %a) nounwind uwtable readonly { | ||||
| entry: | ||||
|   br label %for.body | ||||
|  | ||||
| for.body:                                         ; preds = %for.body, %entry | ||||
|   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] | ||||
|   %sum.01 = phi i32 [ 0, %entry ], [ %add, %for.body ] | ||||
|   %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv | ||||
|   %0 = load i32* %arrayidx, align 4 | ||||
|   %add = add nsw i32 %0, %sum.01 | ||||
|   %indvars.iv.next = add i64 %indvars.iv, 1 | ||||
|   %lftr.wideiv = trunc i64 %indvars.iv.next to i32 | ||||
|   %exitcond = icmp eq i32 %lftr.wideiv, 5 | ||||
|   br i1 %exitcond, label %for.end, label %for.body | ||||
|  | ||||
| for.end:                                          ; preds = %for.body | ||||
|   ret i32 %add | ||||
| } | ||||
|  | ||||
| ; This is test 2007-05-09-UnknownTripCount.ll which can be unrolled now | ||||
| ; if the -unroll-runtime option is turned on | ||||
|  | ||||
| ; CHECK: bb72.2: | ||||
|  | ||||
| define void @foo(i32 %trips) { | ||||
| entry: | ||||
|         br label %cond_true.outer | ||||
|  | ||||
| cond_true.outer: | ||||
|         %indvar1.ph = phi i32 [ 0, %entry ], [ %indvar.next2, %bb72 ] | ||||
|         br label %bb72 | ||||
|  | ||||
| bb72: | ||||
|         %indvar.next2 = add i32 %indvar1.ph, 1 | ||||
|         %exitcond3 = icmp eq i32 %indvar.next2, %trips | ||||
|         br i1 %exitcond3, label %cond_true138, label %cond_true.outer | ||||
|  | ||||
| cond_true138: | ||||
|         ret void | ||||
| } | ||||
|  | ||||
|  | ||||
| ; Test run-time unrolling for a loop that counts down by -2. | ||||
|  | ||||
| ; CHECK: for.body.unr: | ||||
| ; CHECK: br i1 %cmp.7, label %for.cond.for.end_crit_edge{{.*}}, label %for.body | ||||
|  | ||||
| define zeroext i16 @down(i16* nocapture %p, i32 %len) nounwind uwtable readonly { | ||||
| entry: | ||||
|   %cmp2 = icmp eq i32 %len, 0 | ||||
|   br i1 %cmp2, label %for.end, label %for.body | ||||
|  | ||||
| for.body:                                         ; preds = %for.body, %entry | ||||
|   %p.addr.05 = phi i16* [ %incdec.ptr, %for.body ], [ %p, %entry ] | ||||
|   %len.addr.04 = phi i32 [ %sub, %for.body ], [ %len, %entry ] | ||||
|   %res.03 = phi i32 [ %add, %for.body ], [ 0, %entry ] | ||||
|   %incdec.ptr = getelementptr inbounds i16* %p.addr.05, i64 1 | ||||
|   %0 = load i16* %p.addr.05, align 2 | ||||
|   %conv = zext i16 %0 to i32 | ||||
|   %add = add i32 %conv, %res.03 | ||||
|   %sub = add nsw i32 %len.addr.04, -2 | ||||
|   %cmp = icmp eq i32 %sub, 0 | ||||
|   br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body | ||||
|  | ||||
| for.cond.for.end_crit_edge:                       ; preds = %for.body | ||||
|   %phitmp = trunc i32 %add to i16 | ||||
|   br label %for.end | ||||
|  | ||||
| for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry | ||||
|   %res.0.lcssa = phi i16 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] | ||||
|   ret i16 %res.0.lcssa | ||||
| } | ||||
							
								
								
									
										30
									
								
								test/Transforms/LoopUnroll/runtime-loop1.ll
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								test/Transforms/LoopUnroll/runtime-loop1.ll
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | ||||
| ; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=4 | FileCheck %s | ||||
|  | ||||
| ; This tests that setting the unroll count works | ||||
|  | ||||
| ; CHECK: unr.cmp: | ||||
| ; CHECK: for.body.unr: | ||||
| ; CHECK: for.body: | ||||
| ; CHECK: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body | ||||
| ; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body | ||||
|  | ||||
| define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { | ||||
| entry: | ||||
|   %cmp1 = icmp eq i32 %n, 0 | ||||
|   br i1 %cmp1, label %for.end, label %for.body | ||||
|  | ||||
| for.body:                                         ; preds = %for.body, %entry | ||||
|   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] | ||||
|   %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ] | ||||
|   %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv | ||||
|   %0 = load i32* %arrayidx, align 4 | ||||
|   %add = add nsw i32 %0, %sum.02 | ||||
|   %indvars.iv.next = add i64 %indvars.iv, 1 | ||||
|   %lftr.wideiv = trunc i64 %indvars.iv.next to i32 | ||||
|   %exitcond = icmp eq i32 %lftr.wideiv, %n | ||||
|   br i1 %exitcond, label %for.end, label %for.body | ||||
|  | ||||
| for.end:                                          ; preds = %for.body, %entry | ||||
|   %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] | ||||
|   ret i32 %sum.0.lcssa | ||||
| } | ||||
							
								
								
									
										31
									
								
								test/Transforms/LoopUnroll/runtime-loop2.ll
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								test/Transforms/LoopUnroll/runtime-loop2.ll
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| ; RUN: opt < %s -S -loop-unroll -unroll-threshold=50 -unroll-runtime -unroll-count=8 | FileCheck %s | ||||
|  | ||||
| ; Choose a smaller, power-of-two, unroll count if the loop is too large. | ||||
| ; This test makes sure we're not unrolling 'odd' counts | ||||
|  | ||||
| ; CHECK: unr.cmp: | ||||
| ; CHECK: for.body.unr: | ||||
| ; CHECK: for.body: | ||||
| ; CHECK: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body | ||||
| ; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body | ||||
|  | ||||
| define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { | ||||
| entry: | ||||
|   %cmp1 = icmp eq i32 %n, 0 | ||||
|   br i1 %cmp1, label %for.end, label %for.body | ||||
|  | ||||
| for.body:                                         ; preds = %for.body, %entry | ||||
|   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] | ||||
|   %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ] | ||||
|   %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv | ||||
|   %0 = load i32* %arrayidx, align 4 | ||||
|   %add = add nsw i32 %0, %sum.02 | ||||
|   %indvars.iv.next = add i64 %indvars.iv, 1 | ||||
|   %lftr.wideiv = trunc i64 %indvars.iv.next to i32 | ||||
|   %exitcond = icmp eq i32 %lftr.wideiv, %n | ||||
|   br i1 %exitcond, label %for.end, label %for.body | ||||
|  | ||||
| for.end:                                          ; preds = %for.body, %entry | ||||
|   %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] | ||||
|   ret i32 %sum.0.lcssa | ||||
| } | ||||
							
								
								
									
										44
									
								
								test/Transforms/LoopUnroll/runtime-loop3.ll
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								test/Transforms/LoopUnroll/runtime-loop3.ll
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,44 @@ | ||||
| ; RUN: opt < %s -disable-output -stats -loop-unroll -unroll-runtime -unroll-threshold=400 -info-output-file - | FileCheck %s --check-prefix=STATS | ||||
|  | ||||
| ; Test that nested loops can be unrolled.  We need to increase threshold to do it | ||||
|  | ||||
| ; STATS: 2 loop-unroll - Number of loops unrolled (completely or otherwise) | ||||
|  | ||||
| define i32 @nested(i32* nocapture %a, i32 %n, i32 %m) nounwind uwtable readonly { | ||||
| entry: | ||||
|   %cmp11 = icmp sgt i32 %n, 0 | ||||
|   br i1 %cmp11, label %for.cond1.preheader.lr.ph, label %for.end7 | ||||
|  | ||||
| for.cond1.preheader.lr.ph:                        ; preds = %entry | ||||
|   %cmp28 = icmp sgt i32 %m, 0 | ||||
|   br label %for.cond1.preheader | ||||
|  | ||||
| for.cond1.preheader:                              ; preds = %for.inc5, %for.cond1.preheader.lr.ph | ||||
|   %indvars.iv16 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next17, %for.inc5 ] | ||||
|   %sum.012 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %sum.1.lcssa, %for.inc5 ] | ||||
|   br i1 %cmp28, label %for.body3, label %for.inc5 | ||||
|  | ||||
| for.body3:                                        ; preds = %for.cond1.preheader, %for.body3 | ||||
|   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.cond1.preheader ] | ||||
|   %sum.19 = phi i32 [ %add4, %for.body3 ], [ %sum.012, %for.cond1.preheader ] | ||||
|   %0 = add nsw i64 %indvars.iv, %indvars.iv16 | ||||
|   %arrayidx = getelementptr inbounds i32* %a, i64 %0 | ||||
|   %1 = load i32* %arrayidx, align 4 | ||||
|   %add4 = add nsw i32 %1, %sum.19 | ||||
|   %indvars.iv.next = add i64 %indvars.iv, 1 | ||||
|   %lftr.wideiv = trunc i64 %indvars.iv.next to i32 | ||||
|   %exitcond = icmp eq i32 %lftr.wideiv, %m | ||||
|   br i1 %exitcond, label %for.inc5, label %for.body3 | ||||
|  | ||||
| for.inc5:                                         ; preds = %for.body3, %for.cond1.preheader | ||||
|   %sum.1.lcssa = phi i32 [ %sum.012, %for.cond1.preheader ], [ %add4, %for.body3 ] | ||||
|   %indvars.iv.next17 = add i64 %indvars.iv16, 1 | ||||
|   %lftr.wideiv18 = trunc i64 %indvars.iv.next17 to i32 | ||||
|   %exitcond19 = icmp eq i32 %lftr.wideiv18, %n | ||||
|   br i1 %exitcond19, label %for.end7, label %for.cond1.preheader | ||||
|  | ||||
| for.end7:                                         ; preds = %for.inc5, %entry | ||||
|   %sum.0.lcssa = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.inc5 ] | ||||
|   ret i32 %sum.0.lcssa | ||||
| } | ||||
|  | ||||
		Reference in New Issue
	
	Block a user