diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 7eeb1527ad4..2c76706bafa 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -24,6 +24,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/ADT/Statistic.h" using namespace llvm; @@ -256,6 +257,7 @@ bool LoopRotate::rotateLoop(Loop *L) { return false; BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); BranchInst *BI = dyn_cast(OrigHeader->getTerminator()); if (BI == 0 || BI->isUnconditional()) @@ -267,13 +269,9 @@ bool LoopRotate::rotateLoop(Loop *L) { if (!L->isLoopExiting(OrigHeader)) return false; - // Updating PHInodes in loops with multiple exits adds complexity. - // Keep it simple, and restrict loop rotation to loops with one exit only. - // In future, lift this restriction and support for multiple exits if - // required. - SmallVector ExitBlocks; - L->getExitBlocks(ExitBlocks); - if (ExitBlocks.size() > 1) + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (OrigLatch == 0 || L->isLoopExiting(OrigLatch)) return false; // Check size of original header and reject loop if it is very big. @@ -286,11 +284,10 @@ bool LoopRotate::rotateLoop(Loop *L) { // Now, this loop is suitable for rotation. BasicBlock *OrigPreheader = L->getLoopPreheader(); - BasicBlock *OrigLatch = L->getLoopLatch(); // If the loop could not be converted to canonical form, it must have an // indirectbr in it, just give up. - if (OrigPreheader == 0 || OrigLatch == 0) + if (OrigPreheader == 0) return false; // Anything ScalarEvolution may know about this loop or the PHI nodes @@ -298,6 +295,8 @@ bool LoopRotate::rotateLoop(Loop *L) { if (ScalarEvolution *SE = getAnalysisIfAvailable()) SE->forgetLoop(L); + DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + // Find new Loop header. NewHeader is a Header's one and only successor // that is inside loop. Header's other successor is outside the // loop. Otherwise loop is not suitable for rotation. @@ -408,10 +407,16 @@ bool LoopRotate::rotateLoop(Loop *L) { // Update DominatorTree to reflect the CFG change we just made. Then split // edges as necessary to preserve LoopSimplify form. if (DominatorTree *DT = getAnalysisIfAvailable()) { - // Since OrigPreheader now has the conditional branch to Exit block, it is - // the dominator of Exit. - DT->changeImmediateDominator(Exit, OrigPreheader); - DT->changeImmediateDominator(NewHeader, OrigPreheader); + // Everything that was dominated by the old loop header is now dominated + // by the original loop preheader. Conceptually the header was merged + // into the preheader, even though we reuse the actual block as a new + // loop latch. + DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); + SmallVector HeaderChildren(OrigHeaderNode->begin(), + OrigHeaderNode->end()); + DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); + for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) + DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); // Update OrigHeader to be dominated by the new header block. DT->changeImmediateDominator(OrigHeader, OrigLatch); @@ -440,6 +445,46 @@ bool LoopRotate::rotateLoop(Loop *L) { // Update OrigHeader to be dominated by the new header block. DT->changeImmediateDominator(NewHeader, OrigPreheader); DT->changeImmediateDominator(OrigHeader, OrigLatch); + + // Brute force incremental dominator tree update. Call + // findNearestCommonDominator on all CFG predecessors of each child of the + // original header. + DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); + SmallVector WorkList(OrigHeaderNode->begin(), + OrigHeaderNode->end()); + while (!WorkList.empty()) { + DomTreeNode *Node = WorkList.pop_back_val(); + BasicBlock *BB = Node->getBlock(); + BasicBlock *NearestDom = 0; + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; + ++PI) { + BasicBlock *Pred = *PI; + + // We have to process predecessors of a node before we touch the + // actual node. If one of the predecessors is in our worklist, put it + // and the currently processed node on the worklist and go processing + // the predecessor. + SmallVectorImpl::iterator I = + std::find(WorkList.begin(), WorkList.end(), DT->getNode(Pred)); + if (I != WorkList.end()) { + WorkList.push_back(Node); + std::swap(*I, WorkList.back()); + // The predecessor is now at the end of the worklist. + NearestDom = 0; + break; + } + + // On the first iteration start with Pred, on the other iterations we + // narrow it down to the nearest common dominator. + if (!NearestDom) + NearestDom = Pred; + else + NearestDom = DT->findNearestCommonDominator(NearestDom, Pred); + } + + if (NearestDom) + DT->changeImmediateDominator(BB, NearestDom); + } } } @@ -452,6 +497,8 @@ bool LoopRotate::rotateLoop(Loop *L) { // emitted code isn't too gross in this common case. MergeBlockIntoPredecessor(OrigHeader, this); + DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + ++NumRotated; return true; } diff --git a/test/Transforms/LoopRotate/multiple-exits.ll b/test/Transforms/LoopRotate/multiple-exits.ll new file mode 100644 index 00000000000..4568678c3d1 --- /dev/null +++ b/test/Transforms/LoopRotate/multiple-exits.ll @@ -0,0 +1,200 @@ +; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; PR7447 +define i32 @test1([100 x i32]* nocapture %a) nounwind readonly { +entry: + br label %for.cond + +for.cond: ; preds = %for.cond1, %entry + %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ] + %i.0 = phi i1 [ true, %entry ], [ false, %for.cond1 ] + br i1 %i.0, label %for.cond1, label %return + +for.cond1: ; preds = %for.cond, %land.rhs + %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.0, %for.cond ] + %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond ] + %cmp2 = icmp ult i32 %i.1, 100 + br i1 %cmp2, label %land.rhs, label %for.cond + +land.rhs: ; preds = %for.cond1 + %conv = zext i32 %i.1 to i64 + %arrayidx = getelementptr inbounds [100 x i32]* %a, i64 0, i64 %conv + %0 = load i32* %arrayidx, align 4 + %add = add i32 %0, %sum.1 + %cmp4 = icmp ugt i32 %add, 1000 + %inc = add i32 %i.1, 1 + br i1 %cmp4, label %return, label %for.cond1 + +return: ; preds = %for.cond, %land.rhs + %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ] + ret i32 %retval.0 + +; CHECK: @test1 +; CHECK: for.cond1.preheader: +; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ] +; CHECK: br label %for.cond1 + +; CHECK: for.cond1: +; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ] +; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ] +; CHECK: %cmp2 = icmp ult i32 %i.1, 100 +; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit +} + +define void @test2(i32 %x) nounwind { +entry: + br label %for.cond + +for.cond: ; preds = %if.end, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp eq i32 %i.0, %x + br i1 %cmp, label %return.loopexit, label %for.body + +for.body: ; preds = %for.cond + %call = tail call i32 @foo(i32 %i.0) nounwind + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %if.end, label %a + +if.end: ; preds = %for.body + %call1 = tail call i32 @foo(i32 42) nounwind + %inc = add i32 %i.0, 1 + br label %for.cond + +a: ; preds = %for.body + %call2 = tail call i32 @bar(i32 1) nounwind + br label %return + +return.loopexit: ; preds = %for.cond + br label %return + +return: ; preds = %return.loopexit, %a + ret void + +; CHECK: @test2 +; CHECK: if.end: +; CHECK: %inc = add i32 %i.02, 1 +; CHECK: %cmp = icmp eq i32 %inc, %x +; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body +} + +declare i32 @foo(i32) + +declare i32 @bar(i32) + +@_ZTIi = external constant i8* + +; Verify dominators. +define void @test3(i32 %x) { +entry: + %cmp2 = icmp eq i32 0, %x + br i1 %cmp2, label %try.cont.loopexit, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.inc + %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + invoke void @_Z3fooi(i32 %i.03) + to label %for.inc unwind label %lpad + +for.inc: ; preds = %for.body + %inc = add i32 %i.03, 1 + %cmp = icmp eq i32 %inc, %x + br i1 %cmp, label %for.cond.try.cont.loopexit_crit_edge, label %for.body + +lpad: ; preds = %for.body + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* bitcast (i8** @_ZTIi to i8*) + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = extractvalue { i8*, i32 } %0, 1 + %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind + %matches = icmp eq i32 %2, %3 + br i1 %matches, label %catch, label %eh.resume + +catch: ; preds = %lpad + %4 = tail call i8* @__cxa_begin_catch(i8* %1) nounwind + br i1 true, label %invoke.cont2.loopexit, label %for.body.i.lr.ph + +for.body.i.lr.ph: ; preds = %catch + br label %for.body.i + +for.body.i: ; preds = %for.body.i.lr.ph, %for.inc.i + %i.0.i1 = phi i32 [ 0, %for.body.i.lr.ph ], [ %inc.i, %for.inc.i ] + invoke void @_Z3fooi(i32 %i.0.i1) + to label %for.inc.i unwind label %lpad.i + +for.inc.i: ; preds = %for.body.i + %inc.i = add i32 %i.0.i1, 1 + %cmp.i = icmp eq i32 %inc.i, 0 + br i1 %cmp.i, label %for.cond.i.invoke.cont2.loopexit_crit_edge, label %for.body.i + +lpad.i: ; preds = %for.body.i + %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* bitcast (i8** @_ZTIi to i8*) + %6 = extractvalue { i8*, i32 } %5, 0 + %7 = extractvalue { i8*, i32 } %5, 1 + %matches.i = icmp eq i32 %7, %3 + br i1 %matches.i, label %catch.i, label %lpad1.body + +catch.i: ; preds = %lpad.i + %8 = tail call i8* @__cxa_begin_catch(i8* %6) nounwind + invoke void @_Z3barj(i32 0) + to label %invoke.cont2.i unwind label %lpad1.i + +invoke.cont2.i: ; preds = %catch.i + tail call void @__cxa_end_catch() nounwind + br label %invoke.cont2 + +lpad1.i: ; preds = %catch.i + %9 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + %10 = extractvalue { i8*, i32 } %9, 0 + %11 = extractvalue { i8*, i32 } %9, 1 + tail call void @__cxa_end_catch() nounwind + br label %lpad1.body + +for.cond.i.invoke.cont2.loopexit_crit_edge: ; preds = %for.inc.i + br label %invoke.cont2.loopexit + +invoke.cont2.loopexit: ; preds = %for.cond.i.invoke.cont2.loopexit_crit_edge, %catch + br label %invoke.cont2 + +invoke.cont2: ; preds = %invoke.cont2.loopexit, %invoke.cont2.i + tail call void @__cxa_end_catch() nounwind + br label %try.cont + +for.cond.try.cont.loopexit_crit_edge: ; preds = %for.inc + br label %try.cont.loopexit + +try.cont.loopexit: ; preds = %for.cond.try.cont.loopexit_crit_edge, %entry + br label %try.cont + +try.cont: ; preds = %try.cont.loopexit, %invoke.cont2 + ret void + +lpad1.body: ; preds = %lpad1.i, %lpad.i + %exn.slot.0.i = phi i8* [ %10, %lpad1.i ], [ %6, %lpad.i ] + %ehselector.slot.0.i = phi i32 [ %11, %lpad1.i ], [ %7, %lpad.i ] + tail call void @__cxa_end_catch() nounwind + br label %eh.resume + +eh.resume: ; preds = %lpad1.body, %lpad + %exn.slot.0 = phi i8* [ %exn.slot.0.i, %lpad1.body ], [ %1, %lpad ] + %ehselector.slot.0 = phi i32 [ %ehselector.slot.0.i, %lpad1.body ], [ %2, %lpad ] + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0 + %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1 + resume { i8*, i32 } %lpad.val5 +} + +declare void @_Z3fooi(i32) + +declare i32 @__gxx_personality_v0(...) + +declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch()