From 69e067fdd86d34cb81ccdffb82415b4f89144218 Mon Sep 17 00:00:00 2001
From: Chris Lattner <sabre@nondot.org>
Date: Thu, 27 Nov 2008 05:07:53 +0000
Subject: [PATCH] Make jump threading substantially more powerful, in the
 following ways:

1. Make it fold blocks separated by an unconditional branch.  This enables
   jump threading to see a broader scope.
2. Make jump threading able to eliminate locally redundant loads when they
   feed the branch condition of a block.  This frequently occurs due to
   reg2mem running.
3. Make jump threading able to eliminate *partially redundant* loads when
   they feed the branch condition of a block.  This is common in code with
   lots of loads and stores, like C++ code and 255.vortex.

This implements thread-loads.ll and rdar://6402033.

Per the fixme's, several pieces of this should be moved into Transforms/Utils.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@60148 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/JumpThreading.cpp       | 273 ++++++++++++++++++
 test/Transforms/JumpThreading/thread-loads.ll |  34 +++
 2 files changed, 307 insertions(+)
 create mode 100644 test/Transforms/JumpThreading/thread-loads.ll
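
The sketch below is added for exposition and is not part of the committed
change; block and value names are invented.  It illustrates the "partially
redundant load" case of item 3: a load in a join block is available along one
incoming edge, thanks to a dominating store, but not along the other.

  ; Before: %v in bb1 is redundant along the edge from bb, but not along
  ; the edge from entry.
  bb:
    store i32 42, i32* %P
    br label %bb1
  bb1:                                    ; preds = %entry, %bb
    %v = load i32* %P
    %cmp = icmp sgt i32 %v, 36
    br i1 %cmp, label %bb3, label %bb2

  ; After SimplifyPartiallyRedundantLoad: the load is reissued only in the
  ; one predecessor where the value is unavailable, and the original load
  ; becomes a PHI that ThreadBlock can then thread over.
  entry:
    %v.pr = load i32* %P
    br label %bb1
  bb1:                                    ; preds = %entry, %bb
    %v = phi i32 [ 42, %bb ], [ %v.pr, %entry ]
    %cmp = icmp sgt i32 %v, 36
    br i1 %cmp, label %bb3, label %bb2
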
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 2f91c07c94e..4d3a44310bd 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallPtrSet.h"
 using namespace llvm;
 
 STATISTIC(NumThreads, "Number of jumps threaded");
@@ -62,6 +63,8 @@
     bool ProcessJumpOnPHI(PHINode *PN);
     bool ProcessBranchOnLogical(Value *V, BasicBlock *BB, bool isAnd);
     bool ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB);
+
+    bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
   };
 }
 
@@ -153,10 +156,50 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
   return Size;
 }
 
+/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and
+/// its predecessor is known to have one successor (DestBB!).  Eliminate the
+/// edge between them, moving the instructions in the predecessor into DestBB
+/// and deleting the predecessor block.
+///
+/// FIXME: Move to TransformUtils to share with simplifycfg and codegenprepare.
+static void MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB) {
+  // If DestBB has single-entry PHI nodes, fold them.
+  while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+    Value *NewVal = PN->getIncomingValue(0);
+    // Replace a self-referencing PHI with undef; it must be dead.
+    if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
+    PN->replaceAllUsesWith(NewVal);
+    PN->eraseFromParent();
+  }
+
+  BasicBlock *PredBB = DestBB->getSinglePredecessor();
+  assert(PredBB && "Block doesn't have a single predecessor!");
+
+  // Splice all the instructions from PredBB to DestBB.
+  PredBB->getTerminator()->eraseFromParent();
+  DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
+
+  // Anything that branched to PredBB now branches to DestBB.
+  PredBB->replaceAllUsesWith(DestBB);
+
+  // Nuke PredBB.
+  PredBB->eraseFromParent();
+}
+
 /// ThreadBlock - If there are any predecessors whose control can be threaded
 /// through to a successor, transform them now.
 bool JumpThreading::ThreadBlock(BasicBlock *BB) {
+  // If this block has a single predecessor, and if that pred has a single
+  // successor, merge the blocks.  This encourages recursive jump threading
+  // because now the condition in this block can be threaded through
+  // predecessors of our predecessor block.
+  if (BasicBlock *SinglePred = BB->getSinglePredecessor())
+    if (SinglePred->getTerminator()->getNumSuccessors() == 1) {
+      MergeBasicBlockIntoOnlyPred(BB);
+      return true;
+    }
+
   // See if this block ends with a branch or switch.  If so, see if the
   // condition is a phi node.  If so, and if an entry of the phi node is a
   // constant, we can thread the block.
@@ -208,10 +251,240 @@ bool JumpThreading::ThreadBlock(BasicBlock *BB) {
       isa<Constant>(CondCmp->getOperand(1)) &&
       ProcessBranchOnCompare(CondCmp, BB))
     return true;
+
+  // Check for some cases that are worth simplifying.  Right now we want to
+  // look for loads that are used by a switch or by the condition for the
+  // branch.  If we see one, check to see if it's partially redundant.  If so,
+  // insert a PHI which can then be used to thread the values.
+  //
+  // This is particularly important because reg2mem inserts loads and stores
+  // all over the place, and this blocks jump threading if we don't zap them.
+  Value *SimplifyValue = Condition;
+  if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+    if (isa<Constant>(CondCmp->getOperand(1)))
+      SimplifyValue = CondCmp->getOperand(0);
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
+    if (SimplifyPartiallyRedundantLoad(LI))
+      return true;
+
+  // TODO: If we have "br (X > 0)" and we have a predecessor where we know
+  // that "(X == 4)", thread through this block.
 
   return false;
 }
 
+
+/// FindAvailableLoadedValue - Scan backwards from ScanFrom checking to see if
+/// we have the value at the memory address *Ptr locally available within a
+/// small number of instructions.  If the value is available, return it.
+///
+/// If not, return null and leave ScanFrom pointing at the last validated
+/// instruction that the value would be live through.  If we scanned the
+/// entire block, ScanFrom is left at begin().
+///
+/// FIXME: Move this to transform utils and use from
+/// InstCombiner::visitLoadInst.  It would also be nice to optionally take AA
+/// so that GVN could do this.
+static Value *FindAvailableLoadedValue(Value *Ptr,
+                                       BasicBlock *ScanBB,
+                                       BasicBlock::iterator &ScanFrom) {
+  unsigned NumToScan = 6;
+  while (ScanFrom != ScanBB->begin()) {
+    // Don't scan huge blocks.
+    if (--NumToScan == 0) return 0;
+
+    Instruction *Inst = --ScanFrom;
+
+    // If this is a load of Ptr, the loaded value is available.
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+      if (LI->getOperand(0) == Ptr)
+        return LI;
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      // If this is a store through Ptr, the value is available!
+      if (SI->getOperand(1) == Ptr)
+        return SI->getOperand(0);
+
+      // If Ptr is an alloca and this is a store to a different alloca, ignore
+      // the store.  This is a trivial form of alias analysis that is important
+      // for reg2mem'd code.
+      if ((isa<AllocaInst>(Ptr) || isa<GlobalVariable>(Ptr)) &&
+          (isa<AllocaInst>(SI->getOperand(1)) ||
+           isa<GlobalVariable>(SI->getOperand(1))))
+        continue;
+
+      // Otherwise this store may or may not alias the pointer; bail out.
+      ++ScanFrom;
+      return 0;
+    }
+
+    // If this is some other instruction that may clobber Ptr, bail out.
+    if (Inst->mayWriteToMemory()) {
+      // May modify the pointer, bail out.
+      ++ScanFrom;
+      return 0;
+    }
+  }
+
+  // Got to the start of the block; we didn't find the value, but we are done
+  // scanning this block.
+  return 0;
+}
+
+
+/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
+/// load instruction, eliminate it by replacing it with a PHI node.  This is an
+/// important optimization that encourages jump threading, and needs to be run
+/// interlaced with other jump threading tasks.
+bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+  // Don't hack volatile loads.
+  if (LI->isVolatile()) return false;
+
+  // If the load is defined in a block with exactly one predecessor, it can't
+  // be partially redundant.
+  BasicBlock *LoadBB = LI->getParent();
+  if (LoadBB->getSinglePredecessor())
+    return false;
+
+  Value *LoadedPtr = LI->getOperand(0);
+
+  // If the loaded operand is defined in the LoadBB, it can't be available.
+  // FIXME: Could do PHI translation, that would be fun :)
+  if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
+    if (PtrOp->getParent() == LoadBB)
+      return false;
+
+  // Scan a few instructions up from the load, to see if it is obviously live
+  // at the entry to its block.
+  BasicBlock::iterator BBIt = LI;
+
+  if (Value *AvailableVal = FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt)) {
+    // If the value of the load is locally available within the block, just use
+    // it.  This frequently occurs for reg2mem'd allocas.
+    //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
+    LI->replaceAllUsesWith(AvailableVal);
+    LI->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, if we scanned the whole block and got to the top of the block,
+  // we know the block is locally transparent to the load.  If not, something
+  // might clobber its value.
+  if (BBIt != LoadBB->begin())
+    return false;
+
+  SmallPtrSet<BasicBlock*, 8> PredsScanned;
+  typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
+  AvailablePredsTy AvailablePreds;
+  BasicBlock *OneUnavailablePred = 0;
+
+  // If we got here, the loaded value is transparent through to the start of
+  // the block.  Check to see if it is available in any of the predecessor
+  // blocks.
+  for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
+       PI != PE; ++PI) {
+    BasicBlock *PredBB = *PI;
+
+    // If we already scanned this predecessor, skip it.
+    if (!PredsScanned.insert(PredBB))
+      continue;
+
+    // Scan the predecessor to see if the value is available in the pred.
+    BBIt = PredBB->end();
+    Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt);
+    if (!PredAvailable) {
+      OneUnavailablePred = PredBB;
+      continue;
+    }
+
+    // If so, this load is partially redundant.  Remember this info so that we
+    // can create a PHI node.
+    AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+  }
+
+  // If the loaded value isn't available in any predecessor, it isn't partially
+  // redundant.
+  if (AvailablePreds.empty()) return false;
+
+  // Okay, the loaded value is available in at least one (and maybe all!)
+  // predecessors.  If the value is unavailable in more than one unique
+  // predecessor, we want to insert a merge block for those common
+  // predecessors.  This ensures that we only have to insert one reload, thus
+  // not increasing code size.
+  BasicBlock *UnavailablePred = 0;
+
+  // If there is exactly one predecessor where the value is unavailable, the
+  // already computed 'OneUnavailablePred' block is it.  If it ends in an
+  // unconditional branch, we know that it isn't a critical edge.
+  if (PredsScanned.size() == AvailablePreds.size()+1 &&
+      OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+    UnavailablePred = OneUnavailablePred;
+  } else if (PredsScanned.size() != AvailablePreds.size()) {
+    // Otherwise, we had multiple unavailable predecessors or we had a critical
+    // edge from the one.
+    SmallVector<BasicBlock*, 8> PredsToSplit;
+    SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+    for (unsigned i = 0, e = AvailablePreds.size(); i != e; ++i)
+      AvailablePredSet.insert(AvailablePreds[i].first);
+
+    // Add all the unavailable predecessors to the PredsToSplit list.
+    for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
+         PI != PE; ++PI)
+      if (!AvailablePredSet.count(*PI))
+        PredsToSplit.push_back(*PI);
+
+    // Split them out to their own block.
+    UnavailablePred =
+      SplitBlockPredecessors(LoadBB, &PredsToSplit[0], PredsToSplit.size(),
+                             "thread-split", this);
+  }
+
+  // If the value isn't available in all predecessors, then there will be
+  // exactly one where it isn't available.  Insert a load on that edge and add
+  // it to the AvailablePreds list.
+  if (UnavailablePred) {
+    assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+           "Can't handle critical edge here!");
+    Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr",
+                                 UnavailablePred->getTerminator());
+    AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+  }
+
+  // Now we know that each predecessor of this block has a value in
+  // AvailablePreds; sort them for efficient access as we're walking the preds.
+  std::sort(AvailablePreds.begin(), AvailablePreds.end());
+
+  // Create a PHI node at the start of the block for the PRE'd load value.
+  PHINode *PN = PHINode::Create(LI->getType(), "", LoadBB->begin());
+  PN->takeName(LI);
+
+  // Insert new entries into the PHI for each predecessor.  A single block may
+  // have multiple entries here.
+  for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E;
+       ++PI) {
+    AvailablePredsTy::iterator I =
+      std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
+                       std::make_pair(*PI, (Value*)0));
+
+    assert(I != AvailablePreds.end() && I->first == *PI &&
+           "Didn't find entry for predecessor!");
+
+    PN->addIncoming(I->second, I->first);
+  }
+
+  //cerr << "PRE: " << *LI << *PN << "\n";
+
+  LI->replaceAllUsesWith(PN);
+  LI->eraseFromParent();
+
+  return true;
+}
+
+
 /// ProcessJumpOnPHI - We have a conditional branch or switch on a PHI node in
 /// the current block.  See if there are any simplifications we can do based on
 /// inputs to the phi node.
diff --git a/test/Transforms/JumpThreading/thread-loads.ll b/test/Transforms/JumpThreading/thread-loads.ll
new file mode 100644
index 00000000000..5c0b256224a
--- /dev/null
+++ b/test/Transforms/JumpThreading/thread-loads.ll
@@ -0,0 +1,34 @@
+; RUN: llvm-as < %s | opt -jump-threading -mem2reg -simplifycfg | llvm-dis | grep {ret i32 1}
+; rdar://6402033
+
+; Test that we can thread through the block with the partially redundant load (%2).
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin7"
+
+define i32 @foo(i32* %P) nounwind {
+entry:
+	%0 = tail call i32 (...)* @f1() nounwind		; <i32> [#uses=1]
+	%1 = icmp eq i32 %0, 0		; <i1> [#uses=1]
+	br i1 %1, label %bb1, label %bb
+
+bb:		; preds = %entry
+	store i32 42, i32* %P, align 4
+	br label %bb1
+
+bb1:		; preds = %entry, %bb
+	%res.0 = phi i32 [ 1, %bb ], [ 0, %entry ]		; <i32> [#uses=2]
+	%2 = load i32* %P, align 4		; <i32> [#uses=1]
+	%3 = icmp sgt i32 %2, 36		; <i1> [#uses=1]
+	br i1 %3, label %bb3, label %bb2
+
+bb2:		; preds = %bb1
+	%4 = tail call i32 (...)* @f2() nounwind		; <i32> [#uses=0]
+	ret i32 %res.0
+
+bb3:		; preds = %bb1
+	ret i32 %res.0
+}
+
+declare i32 @f1(...)
+
+declare i32 @f2(...)
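
A companion sketch for item 2 of the commit message (again illustrative only,
with invented names; not part of the committed change): the locally redundant
case is handled by FindAvailableLoadedValue scanning backwards within the
load's own block.  When it first reaches a store to the same pointer, the
stored value is forwarded and the load is deleted outright.

  ; Before: %v reloads a value that was just stored, a pattern reg2mem
  ; commonly creates.
  bb:
    store i32 %x, i32* %a
    %v = load i32* %a
    %cmp = icmp eq i32 %v, 0
    br i1 %cmp, label %t, label %f

  ; After: the load is erased and its uses rewritten to %x, unblocking the
  ; phi-and-compare patterns that jump threading already understands.
  bb:
    store i32 %x, i32* %a
    %cmp = icmp eq i32 %x, 0
    br i1 %cmp, label %t, label %f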