mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-07-21 02:29:22 +00:00
Make jump threading substantially more powerful, in the following ways:
1. Make it fold blocks separated by an unconditional branch. This enables jump threading to see a broader scope. 2. Make jump threading able to eliminate locally redundant loads when they feed the branch condition of a block. This frequently occurs due to reg2mem running. 3. Make jump threading able to eliminate *partially redundant* loads when they feed the branch condition of a block. This is common in code with lots of loads and stores like C++ code and 255.vortex. This implements thread-loads.ll and rdar://6402033. Per the fixme's, several pieces of this should be moved into Transforms/Utils. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@60148 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
dc52cf48dc
commit
69e067fdd8
@ -22,6 +22,7 @@
|
|||||||
#include "llvm/Support/CommandLine.h"
|
#include "llvm/Support/CommandLine.h"
|
||||||
#include "llvm/Support/Compiler.h"
|
#include "llvm/Support/Compiler.h"
|
||||||
#include "llvm/Support/Debug.h"
|
#include "llvm/Support/Debug.h"
|
||||||
|
#include "llvm/ADT/SmallPtrSet.h"
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
|
||||||
STATISTIC(NumThreads, "Number of jumps threaded");
|
STATISTIC(NumThreads, "Number of jumps threaded");
|
||||||
@ -62,6 +63,8 @@ namespace {
|
|||||||
bool ProcessJumpOnPHI(PHINode *PN);
|
bool ProcessJumpOnPHI(PHINode *PN);
|
||||||
bool ProcessBranchOnLogical(Value *V, BasicBlock *BB, bool isAnd);
|
bool ProcessBranchOnLogical(Value *V, BasicBlock *BB, bool isAnd);
|
||||||
bool ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB);
|
bool ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB);
|
||||||
|
|
||||||
|
bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -153,10 +156,50 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
|
|||||||
return Size;
|
return Size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its
|
||||||
|
/// predecessor is known to have one successor (DestBB!). Eliminate the edge
|
||||||
|
/// between them, moving the instructions in the predecessor into DestBB and
|
||||||
|
/// deleting the predecessor block.
|
||||||
|
///
|
||||||
|
/// FIXME: Move to TransformUtils to share with simplifycfg and codegenprepare.
|
||||||
|
static void MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB) {
|
||||||
|
// If BB has single-entry PHI nodes, fold them.
|
||||||
|
while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
|
||||||
|
Value *NewVal = PN->getIncomingValue(0);
|
||||||
|
// Replace self referencing PHI with undef, it must be dead.
|
||||||
|
if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
|
||||||
|
PN->replaceAllUsesWith(NewVal);
|
||||||
|
PN->eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
BasicBlock *PredBB = DestBB->getSinglePredecessor();
|
||||||
|
assert(PredBB && "Block doesn't have a single predecessor!");
|
||||||
|
|
||||||
|
// Splice all the instructions from PredBB to DestBB.
|
||||||
|
PredBB->getTerminator()->eraseFromParent();
|
||||||
|
DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
|
||||||
|
|
||||||
|
// Anything that branched to PredBB now branches to DestBB.
|
||||||
|
PredBB->replaceAllUsesWith(DestBB);
|
||||||
|
|
||||||
|
// Nuke BB.
|
||||||
|
PredBB->eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/// ThreadBlock - If there are any predecessors whose control can be threaded
|
/// ThreadBlock - If there are any predecessors whose control can be threaded
|
||||||
/// through to a successor, transform them now.
|
/// through to a successor, transform them now.
|
||||||
bool JumpThreading::ThreadBlock(BasicBlock *BB) {
|
bool JumpThreading::ThreadBlock(BasicBlock *BB) {
|
||||||
|
// If this block has a single predecessor, and if that pred has a single
|
||||||
|
// successor, merge the blocks. This encourages recursive jump threading
|
||||||
|
// because now the condition in this block can be threaded through
|
||||||
|
// predecessors of our predecessor block.
|
||||||
|
if (BasicBlock *SinglePred = BB->getSinglePredecessor())
|
||||||
|
if (SinglePred->getTerminator()->getNumSuccessors() == 1) {
|
||||||
|
MergeBasicBlockIntoOnlyPred(BB);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
// See if this block ends with a branch or switch. If so, see if the
|
// See if this block ends with a branch or switch. If so, see if the
|
||||||
// condition is a phi node. If so, and if an entry of the phi node is a
|
// condition is a phi node. If so, and if an entry of the phi node is a
|
||||||
// constant, we can thread the block.
|
// constant, we can thread the block.
|
||||||
@ -208,10 +251,240 @@ bool JumpThreading::ThreadBlock(BasicBlock *BB) {
|
|||||||
isa<Constant>(CondCmp->getOperand(1)) &&
|
isa<Constant>(CondCmp->getOperand(1)) &&
|
||||||
ProcessBranchOnCompare(CondCmp, BB))
|
ProcessBranchOnCompare(CondCmp, BB))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
// Check for some cases that are worth simplifying. Right now we want to look
|
||||||
|
// for loads that are used by a switch or by the condition for the branch. If
|
||||||
|
// we see one, check to see if it's partially redundant. If so, insert a PHI
|
||||||
|
// which can then be used to thread the values.
|
||||||
|
//
|
||||||
|
// This is particularly important because reg2mem inserts loads and stores all
|
||||||
|
// over the place, and this blocks jump threading if we don't zap them.
|
||||||
|
Value *SimplifyValue = Condition;
|
||||||
|
if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
|
||||||
|
if (isa<Constant>(CondCmp->getOperand(1)))
|
||||||
|
SimplifyValue = CondCmp->getOperand(0);
|
||||||
|
|
||||||
|
if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
|
||||||
|
if (SimplifyPartiallyRedundantLoad(LI))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
// TODO: If we have: "br (X > 0)" and we have a predecessor where we know
|
||||||
|
// "(X == 4)" thread through this block.
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// FindAvailableLoadedValue - Scan backwards from ScanFrom checking to see if
|
||||||
|
/// we have the value at the memory address *Ptr locally available within a
|
||||||
|
/// small number of instructions. If the value is available, return it.
|
||||||
|
///
|
||||||
|
/// If not, return the iterator for the last validated instruction that the
|
||||||
|
/// value would be live through. If we scanned the entire block, ScanFrom would
|
||||||
|
/// be left at begin().
|
||||||
|
///
|
||||||
|
/// FIXME: Move this to transform utils and use from
|
||||||
|
/// InstCombiner::visitLoadInst. It would also be nice to optionally take AA so
|
||||||
|
/// that GVN could do this.
|
||||||
|
static Value *FindAvailableLoadedValue(Value *Ptr,
|
||||||
|
BasicBlock *ScanBB,
|
||||||
|
BasicBlock::iterator &ScanFrom) {
|
||||||
|
|
||||||
|
unsigned NumToScan = 6;
|
||||||
|
while (ScanFrom != ScanBB->begin()) {
|
||||||
|
// Don't scan huge blocks.
|
||||||
|
if (--NumToScan == 0) return 0;
|
||||||
|
|
||||||
|
Instruction *Inst = --ScanFrom;
|
||||||
|
|
||||||
|
// If this is a load of Ptr, the loaded value is available.
|
||||||
|
if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
|
||||||
|
if (LI->getOperand(0) == Ptr)
|
||||||
|
return LI;
|
||||||
|
|
||||||
|
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
|
||||||
|
// If this is a store through Ptr, the value is available!
|
||||||
|
if (SI->getOperand(1) == Ptr)
|
||||||
|
return SI->getOperand(0);
|
||||||
|
|
||||||
|
// If Ptr is an alloca and this is a store to a different alloca, ignore
|
||||||
|
// the store. This is a trivial form of alias analysis that is important
|
||||||
|
// for reg2mem'd code.
|
||||||
|
if ((isa<AllocaInst>(Ptr) || isa<GlobalVariable>(Ptr)) &&
|
||||||
|
(isa<AllocaInst>(SI->getOperand(1)) ||
|
||||||
|
isa<GlobalVariable>(SI->getOperand(1))))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Otherwise the store that may or may not alias the pointer, bail out.
|
||||||
|
++ScanFrom;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// If this is some other instruction that may clobber Ptr, bail out.
|
||||||
|
if (Inst->mayWriteToMemory()) {
|
||||||
|
// May modify the pointer, bail out.
|
||||||
|
++ScanFrom;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Got to the start of the block, we didn't find it, but are done for this
|
||||||
|
// block.
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
|
||||||
|
/// load instruction, eliminate it by replacing it with a PHI node. This is an
|
||||||
|
/// important optimization that encourages jump threading, and needs to be run
|
||||||
|
/// interlaced with other jump threading tasks.
|
||||||
|
bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
|
||||||
|
// Don't hack volatile loads.
|
||||||
|
if (LI->isVolatile()) return false;
|
||||||
|
|
||||||
|
// If the load is defined in a block with exactly one predecessor, it can't be
|
||||||
|
// partially redundant.
|
||||||
|
BasicBlock *LoadBB = LI->getParent();
|
||||||
|
if (LoadBB->getSinglePredecessor())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Value *LoadedPtr = LI->getOperand(0);
|
||||||
|
|
||||||
|
// If the loaded operand is defined in the LoadBB, it can't be available.
|
||||||
|
// FIXME: Could do PHI translation, that would be fun :)
|
||||||
|
if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
|
||||||
|
if (PtrOp->getParent() == LoadBB)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Scan a few instructions up from the load, to see if it is obviously live at
|
||||||
|
// the entry to its block.
|
||||||
|
BasicBlock::iterator BBIt = LI;
|
||||||
|
|
||||||
|
if (Value *AvailableVal = FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt)) {
|
||||||
|
// If the value if the load is locally available within the block, just use
|
||||||
|
// it. This frequently occurs for reg2mem'd allocas.
|
||||||
|
//cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
|
||||||
|
LI->replaceAllUsesWith(AvailableVal);
|
||||||
|
LI->eraseFromParent();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise, if we scanned the whole block and got to the top of the block,
|
||||||
|
// we know the block is locally transparent to the load. If not, something
|
||||||
|
// might clobber its value.
|
||||||
|
if (BBIt != LoadBB->begin())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
|
||||||
|
SmallPtrSet<BasicBlock*, 8> PredsScanned;
|
||||||
|
typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
|
||||||
|
AvailablePredsTy AvailablePreds;
|
||||||
|
BasicBlock *OneUnavailablePred = 0;
|
||||||
|
|
||||||
|
// If we got here, the loaded value is transparent through to the start of the
|
||||||
|
// block. Check to see if it is available in any of the predecessor blocks.
|
||||||
|
for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
|
||||||
|
PI != PE; ++PI) {
|
||||||
|
BasicBlock *PredBB = *PI;
|
||||||
|
|
||||||
|
// If we already scanned this predecessor, skip it.
|
||||||
|
if (!PredsScanned.insert(PredBB))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Scan the predecessor to see if the value is available in the pred.
|
||||||
|
BBIt = PredBB->end();
|
||||||
|
Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt);
|
||||||
|
if (!PredAvailable) {
|
||||||
|
OneUnavailablePred = PredBB;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If so, this load is partially redundant. Remember this info so that we
|
||||||
|
// can create a PHI node.
|
||||||
|
AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the loaded value isn't available in any predecessor, it isn't partially
|
||||||
|
// redundant.
|
||||||
|
if (AvailablePreds.empty()) return false;
|
||||||
|
|
||||||
|
// Okay, the loaded value is available in at least one (and maybe all!)
|
||||||
|
// predecessors. If the value is unavailable in more than one unique
|
||||||
|
// predecessor, we want to insert a merge block for those common predecessors.
|
||||||
|
// This ensures that we only have to insert one reload, thus not increasing
|
||||||
|
// code size.
|
||||||
|
BasicBlock *UnavailablePred = 0;
|
||||||
|
|
||||||
|
// If there is exactly one predecessor where the value is unavailable, the
|
||||||
|
// already computed 'OneUnavailablePred' block is it. If it ends in an
|
||||||
|
// unconditional branch, we know that it isn't a critical edge.
|
||||||
|
if (PredsScanned.size() == AvailablePreds.size()+1 &&
|
||||||
|
OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
|
||||||
|
UnavailablePred = OneUnavailablePred;
|
||||||
|
} else if (PredsScanned.size() != AvailablePreds.size()) {
|
||||||
|
// Otherwise, we had multiple unavailable predecessors or we had a critical
|
||||||
|
// edge from the one.
|
||||||
|
SmallVector<BasicBlock*, 8> PredsToSplit;
|
||||||
|
SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
|
||||||
|
|
||||||
|
for (unsigned i = 0, e = AvailablePreds.size(); i != e; ++i)
|
||||||
|
AvailablePredSet.insert(AvailablePreds[i].first);
|
||||||
|
|
||||||
|
// Add all the unavailable predecessors to the PredsToSplit list.
|
||||||
|
for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
|
||||||
|
PI != PE; ++PI)
|
||||||
|
if (!AvailablePredSet.count(*PI))
|
||||||
|
PredsToSplit.push_back(*PI);
|
||||||
|
|
||||||
|
// Split them out to their own block.
|
||||||
|
UnavailablePred =
|
||||||
|
SplitBlockPredecessors(LoadBB, &PredsToSplit[0], PredsToSplit.size(),
|
||||||
|
"thread-split", this);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the value isn't available in all predecessors, then there will be
|
||||||
|
// exactly one where it isn't available. Insert a load on that edge and add
|
||||||
|
// it to the AvailablePreds list.
|
||||||
|
if (UnavailablePred) {
|
||||||
|
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
|
||||||
|
"Can't handle critical edge here!");
|
||||||
|
Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr",
|
||||||
|
UnavailablePred->getTerminator());
|
||||||
|
AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now we know that each predecessor of this block has a value in
|
||||||
|
// AvailablePreds, sort them for efficient access as we're walking the preds.
|
||||||
|
std::sort(AvailablePreds.begin(), AvailablePreds.end());
|
||||||
|
|
||||||
|
// Create a PHI node at the start of the block for the PRE'd load value.
|
||||||
|
PHINode *PN = PHINode::Create(LI->getType(), "", LoadBB->begin());
|
||||||
|
PN->takeName(LI);
|
||||||
|
|
||||||
|
// Insert new entries into the PHI for each predecessor. A single block may
|
||||||
|
// have multiple entries here.
|
||||||
|
for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E;
|
||||||
|
++PI) {
|
||||||
|
AvailablePredsTy::iterator I =
|
||||||
|
std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
|
||||||
|
std::make_pair(*PI, (Value*)0));
|
||||||
|
|
||||||
|
assert(I != AvailablePreds.end() && I->first == *PI &&
|
||||||
|
"Didn't find entry for predecessor!");
|
||||||
|
|
||||||
|
PN->addIncoming(I->second, I->first);
|
||||||
|
}
|
||||||
|
|
||||||
|
//cerr << "PRE: " << *LI << *PN << "\n";
|
||||||
|
|
||||||
|
LI->replaceAllUsesWith(PN);
|
||||||
|
LI->eraseFromParent();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/// ProcessJumpOnPHI - We have a conditional branch of switch on a PHI node in
|
/// ProcessJumpOnPHI - We have a conditional branch of switch on a PHI node in
|
||||||
/// the current block. See if there are any simplifications we can do based on
|
/// the current block. See if there are any simplifications we can do based on
|
||||||
/// inputs to the phi node.
|
/// inputs to the phi node.
|
||||||
|
34
test/Transforms/JumpThreading/thread-loads.ll
Normal file
34
test/Transforms/JumpThreading/thread-loads.ll
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
; RUN: llvm-as < %s | opt -jump-threading -mem2reg -simplifycfg | llvm-dis | grep {ret i32 1}
|
||||||
|
; rdar://6402033
|
||||||
|
|
||||||
|
; Test that we can thread through the block with the partially redundant load (%2).
|
||||||
|
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
|
||||||
|
target triple = "i386-apple-darwin7"
|
||||||
|
|
||||||
|
define i32 @foo(i32* %P) nounwind {
|
||||||
|
entry:
|
||||||
|
%0 = tail call i32 (...)* @f1() nounwind ; <i32> [#uses=1]
|
||||||
|
%1 = icmp eq i32 %0, 0 ; <i1> [#uses=1]
|
||||||
|
br i1 %1, label %bb1, label %bb
|
||||||
|
|
||||||
|
bb: ; preds = %entry
|
||||||
|
store i32 42, i32* %P, align 4
|
||||||
|
br label %bb1
|
||||||
|
|
||||||
|
bb1: ; preds = %entry, %bb
|
||||||
|
%res.0 = phi i32 [ 1, %bb ], [ 0, %entry ] ; <i32> [#uses=2]
|
||||||
|
%2 = load i32* %P, align 4 ; <i32> [#uses=1]
|
||||||
|
%3 = icmp sgt i32 %2, 36 ; <i1> [#uses=1]
|
||||||
|
br i1 %3, label %bb3, label %bb2
|
||||||
|
|
||||||
|
bb2: ; preds = %bb1
|
||||||
|
%4 = tail call i32 (...)* @f2() nounwind ; <i32> [#uses=0]
|
||||||
|
ret i32 %res.0
|
||||||
|
|
||||||
|
bb3: ; preds = %bb1
|
||||||
|
ret i32 %res.0
|
||||||
|
}
|
||||||
|
|
||||||
|
declare i32 @f1(...)
|
||||||
|
|
||||||
|
declare i32 @f2(...)
|
Loading…
Reference in New Issue
Block a user