From 69e067fdd86d34cb81ccdffb82415b4f89144218 Mon Sep 17 00:00:00 2001
From: Chris Lattner <sabre@nondot.org>
Date: Thu, 27 Nov 2008 05:07:53 +0000
Subject: [PATCH] Make jump threading substantially more powerful, in the
 following ways:

1. Make it fold blocks separated by an unconditional branch.  This enables
   jump threading to see a broader scope.
2. Make jump threading able to eliminate locally redundant loads when they
   feed the branch condition of a block.  This frequently occurs due to
   reg2mem running.
3. Make jump threading able to eliminate *partially redundant* loads when
   they feed the branch condition of a block.  This is common in code with
   lots of loads and stores, like C++ code and 255.vortex.

This implements thread-loads.ll and rdar://6402033.

Per the fixme's, several pieces of this should be moved into Transforms/Utils.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@60148 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/JumpThreading.cpp       | 273 ++++++++++++++++++
 test/Transforms/JumpThreading/thread-loads.ll |  34 +++
 2 files changed, 307 insertions(+)
 create mode 100644 test/Transforms/JumpThreading/thread-loads.ll
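
The sketch below is added for exposition and is not part of the committed
change; block and value names are invented.  It illustrates the "partially
redundant load" case of item 3: a load in a join block is available along one
incoming edge, thanks to a dominating store, but not along the other.

  ; Before: %v in bb1 is redundant along the edge from bb, but not along
  ; the edge from entry.
  bb:
    store i32 42, i32* %P
    br label %bb1
  bb1:                                    ; preds = %entry, %bb
    %v = load i32* %P
    %cmp = icmp sgt i32 %v, 36
    br i1 %cmp, label %bb3, label %bb2

  ; After SimplifyPartiallyRedundantLoad: the load is reissued only in the
  ; one predecessor where the value is unavailable, and the original load
  ; becomes a PHI that ThreadBlock can then thread over.
  entry:
    %v.pr = load i32* %P
    br label %bb1
  bb1:                                    ; preds = %entry, %bb
    %v = phi i32 [ 42, %bb ], [ %v.pr, %entry ]
    %cmp = icmp sgt i32 %v, 36
    br i1 %cmp, label %bb3, label %bb2
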
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 2f91c07c94e..4d3a44310bd 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallPtrSet.h"
 using namespace llvm;
 
 STATISTIC(NumThreads, "Number of jumps threaded");
@@ -62,6 +63,8 @@
     bool ProcessJumpOnPHI(PHINode *PN);
     bool ProcessBranchOnLogical(Value *V, BasicBlock *BB, bool isAnd);
     bool ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB);
+
+    bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
   };
 }
 
@@ -153,10 +156,50 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
   return Size;
 }
 
+/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and
+/// its predecessor is known to have one successor (DestBB!).  Eliminate the
+/// edge between them, moving the instructions in the predecessor into DestBB
+/// and deleting the predecessor block.
+///
+/// FIXME: Move to TransformUtils to share with simplifycfg and codegenprepare.
+static void MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB) {
+  // If DestBB has single-entry PHI nodes, fold them.
+  while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+    Value *NewVal = PN->getIncomingValue(0);
+    // Replace a self-referencing PHI with undef; it must be dead.
+    if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
+    PN->replaceAllUsesWith(NewVal);
+    PN->eraseFromParent();
+  }
+
+  BasicBlock *PredBB = DestBB->getSinglePredecessor();
+  assert(PredBB && "Block doesn't have a single predecessor!");
+
+  // Splice all the instructions from PredBB to DestBB.
+  PredBB->getTerminator()->eraseFromParent();
+  DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
+
+  // Anything that branched to PredBB now branches to DestBB.
+  PredBB->replaceAllUsesWith(DestBB);
+
+  // Nuke PredBB.
+  PredBB->eraseFromParent();
+}
+
 /// ThreadBlock - If there are any predecessors whose control can be threaded
 /// through to a successor, transform them now.
 bool JumpThreading::ThreadBlock(BasicBlock *BB) {
+  // If this block has a single predecessor, and if that pred has a single
+  // successor, merge the blocks.  This encourages recursive jump threading
+  // because now the condition in this block can be threaded through
+  // predecessors of our predecessor block.
+  if (BasicBlock *SinglePred = BB->getSinglePredecessor())
+    if (SinglePred->getTerminator()->getNumSuccessors() == 1) {
+      MergeBasicBlockIntoOnlyPred(BB);
+      return true;
+    }
+
   // See if this block ends with a branch or switch.  If so, see if the
   // condition is a phi node.  If so, and if an entry of the phi node is a
   // constant, we can thread the block.
@@ -208,10 +251,240 @@ bool JumpThreading::ThreadBlock(BasicBlock *BB) {
       isa<Constant>(CondCmp->getOperand(1)) &&
       ProcessBranchOnCompare(CondCmp, BB))
     return true;
+
+  // Check for some cases that are worth simplifying.  Right now we want to
+  // look for loads that are used by a switch or by the condition for the
+  // branch.  If we see one, check to see if it's partially redundant.  If so,
+  // insert a PHI which can then be used to thread the values.
+  //
+  // This is particularly important because reg2mem inserts loads and stores
+  // all over the place, and this blocks jump threading if we don't zap them.
+  Value *SimplifyValue = Condition;
+  if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+    if (isa<Constant>(CondCmp->getOperand(1)))
+      SimplifyValue = CondCmp->getOperand(0);
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
+    if (SimplifyPartiallyRedundantLoad(LI))
+      return true;
+
+  // TODO: If we have "br (X > 0)" and we have a predecessor where we know
+  // that "(X == 4)", thread through this block.
 
   return false;
 }
 
+
+/// FindAvailableLoadedValue - Scan backwards from ScanFrom checking to see if
+/// we have the value at the memory address *Ptr locally available within a
+/// small number of instructions.  If the value is available, return it.
+///
+/// If not, return null and leave ScanFrom pointing at the last validated
+/// instruction that the value would be live through.  If we scanned the
+/// entire block, ScanFrom is left at begin().
+///
+/// FIXME: Move this to transform utils and use from
+/// InstCombiner::visitLoadInst.  It would also be nice to optionally take AA
+/// so that GVN could do this.
+static Value *FindAvailableLoadedValue(Value *Ptr,
+                                       BasicBlock *ScanBB,
+                                       BasicBlock::iterator &ScanFrom) {
+  unsigned NumToScan = 6;
+  while (ScanFrom != ScanBB->begin()) {
+    // Don't scan huge blocks.
+    if (--NumToScan == 0) return 0;
+
+    Instruction *Inst = --ScanFrom;
+
+    // If this is a load of Ptr, the loaded value is available.
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+      if (LI->getOperand(0) == Ptr)
+        return LI;
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      // If this is a store through Ptr, the value is available!
+      if (SI->getOperand(1) == Ptr)
+        return SI->getOperand(0);
+
+      // If Ptr is an alloca and this is a store to a different alloca, ignore
+      // the store.  This is a trivial form of alias analysis that is important
+      // for reg2mem'd code.
+      if ((isa<AllocaInst>(Ptr) || isa<GlobalVariable>(Ptr)) &&
+          (isa<AllocaInst>(SI->getOperand(1)) ||
+           isa<GlobalVariable>(SI->getOperand(1))))
+        continue;
+
+      // Otherwise this store may or may not alias the pointer; bail out.
+      ++ScanFrom;
+      return 0;
+    }
+
+    // If this is some other instruction that may clobber Ptr, bail out.
+    if (Inst->mayWriteToMemory()) {
+      // May modify the pointer, bail out.
+      ++ScanFrom;
+      return 0;
+    }
+  }
+
+  // Got to the start of the block; we didn't find the value, but we are done
+  // scanning this block.
+  return 0;
+}
+
+
+/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
+/// load instruction, eliminate it by replacing it with a PHI node.  This is an
+/// important optimization that encourages jump threading, and needs to be run
+/// interlaced with other jump threading tasks.
+bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+  // Don't hack volatile loads.
+  if (LI->isVolatile()) return false;
+
+  // If the load is defined in a block with exactly one predecessor, it can't
+  // be partially redundant.
+  BasicBlock *LoadBB = LI->getParent();
+  if (LoadBB->getSinglePredecessor())
+    return false;
+
+  Value *LoadedPtr = LI->getOperand(0);
+
+  // If the loaded operand is defined in the LoadBB, it can't be available.
+  // FIXME: Could do PHI translation, that would be fun :)
+  if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
+    if (PtrOp->getParent() == LoadBB)
+      return false;
+
+  // Scan a few instructions up from the load, to see if it is obviously live
+  // at the entry to its block.
+  BasicBlock::iterator BBIt = LI;
+
+  if (Value *AvailableVal = FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt)) {
+    // If the value of the load is locally available within the block, just use
+    // it.  This frequently occurs for reg2mem'd allocas.
+    //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
+    LI->replaceAllUsesWith(AvailableVal);
+    LI->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, if we scanned the whole block and got to the top of the block,
+  // we know the block is locally transparent to the load.  If not, something
+  // might clobber its value.
+  if (BBIt != LoadBB->begin())
+    return false;
+
+  SmallPtrSet<BasicBlock*, 8> PredsScanned;
+  typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
+  AvailablePredsTy AvailablePreds;
+  BasicBlock *OneUnavailablePred = 0;
+
+  // If we got here, the loaded value is transparent through to the start of
+  // the block.  Check to see if it is available in any of the predecessor
+  // blocks.
+  for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
+       PI != PE; ++PI) {
+    BasicBlock *PredBB = *PI;
+
+    // If we already scanned this predecessor, skip it.
+    if (!PredsScanned.insert(PredBB))
+      continue;
+
+    // Scan the predecessor to see if the value is available in the pred.
+    BBIt = PredBB->end();
+    Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt);
+    if (!PredAvailable) {
+      OneUnavailablePred = PredBB;
+      continue;
+    }
+
+    // If so, this load is partially redundant.  Remember this info so that we
+    // can create a PHI node.
+    AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+  }
+
+  // If the loaded value isn't available in any predecessor, it isn't partially
+  // redundant.
+  if (AvailablePreds.empty()) return false;
+
+  // Okay, the loaded value is available in at least one (and maybe all!)
+  // predecessors.  If the value is unavailable in more than one unique
+  // predecessor, we want to insert a merge block for those common
+  // predecessors.  This ensures that we only have to insert one reload, thus
+  // not increasing code size.
+  BasicBlock *UnavailablePred = 0;
+
+  // If there is exactly one predecessor where the value is unavailable, the
+  // already computed 'OneUnavailablePred' block is it.  If it ends in an
+  // unconditional branch, we know that it isn't a critical edge.
+  if (PredsScanned.size() == AvailablePreds.size()+1 &&
+      OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+    UnavailablePred = OneUnavailablePred;
+  } else if (PredsScanned.size() != AvailablePreds.size()) {
+    // Otherwise, we had multiple unavailable predecessors or we had a critical
+    // edge from the one.
+    SmallVector<BasicBlock*, 8> PredsToSplit;
+    SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+    for (unsigned i = 0, e = AvailablePreds.size(); i != e; ++i)
+      AvailablePredSet.insert(AvailablePreds[i].first);
+
+    // Add all the unavailable predecessors to the PredsToSplit list.
+    for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
+         PI != PE; ++PI)
+      if (!AvailablePredSet.count(*PI))
+        PredsToSplit.push_back(*PI);
+
+    // Split them out to their own block.
+    UnavailablePred =
+      SplitBlockPredecessors(LoadBB, &PredsToSplit[0], PredsToSplit.size(),
+                             "thread-split", this);
+  }
+
+  // If the value isn't available in all predecessors, then there will be
+  // exactly one where it isn't available.  Insert a load on that edge and add
+  // it to the AvailablePreds list.
+  if (UnavailablePred) {
+    assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+           "Can't handle critical edge here!");
+    Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr",
+                                 UnavailablePred->getTerminator());
+    AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+  }
+
+  // Now we know that each predecessor of this block has a value in
+  // AvailablePreds; sort them for efficient access as we're walking the preds.
+  std::sort(AvailablePreds.begin(), AvailablePreds.end());
+
+  // Create a PHI node at the start of the block for the PRE'd load value.
+  PHINode *PN = PHINode::Create(LI->getType(), "", LoadBB->begin());
+  PN->takeName(LI);
+
+  // Insert new entries into the PHI for each predecessor.  A single block may
+  // have multiple entries here.
+  for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E;
+       ++PI) {
+    AvailablePredsTy::iterator I =
+      std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
+                       std::make_pair(*PI, (Value*)0));
+
+    assert(I != AvailablePreds.end() && I->first == *PI &&
+           "Didn't find entry for predecessor!");
+
+    PN->addIncoming(I->second, I->first);
+  }
+
+  //cerr << "PRE: " << *LI << *PN << "\n";
+
+  LI->replaceAllUsesWith(PN);
+  LI->eraseFromParent();
+
+  return true;
+}
+
+
 /// ProcessJumpOnPHI - We have a conditional branch or switch on a PHI node in
 /// the current block.  See if there are any simplifications we can do based on
 /// inputs to the phi node.
diff --git a/test/Transforms/JumpThreading/thread-loads.ll b/test/Transforms/JumpThreading/thread-loads.ll
new file mode 100644
index 00000000000..5c0b256224a
--- /dev/null
+++ b/test/Transforms/JumpThreading/thread-loads.ll
@@ -0,0 +1,34 @@
+; RUN: llvm-as < %s | opt -jump-threading -mem2reg -simplifycfg | llvm-dis | grep {ret i32 1}
+; rdar://6402033
+
+; Test that we can thread through the block with the partially redundant load (%2).
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin7"
+
+define i32 @foo(i32* %P) nounwind {
+entry:
+	%0 = tail call i32 (...)* @f1() nounwind		; <i32> [#uses=1]
+	%1 = icmp eq i32 %0, 0		; <i1> [#uses=1]
+	br i1 %1, label %bb1, label %bb
+
+bb:		; preds = %entry
+	store i32 42, i32* %P, align 4
+	br label %bb1
+
+bb1:		; preds = %entry, %bb
+	%res.0 = phi i32 [ 1, %bb ], [ 0, %entry ]		; <i32> [#uses=2]
+	%2 = load i32* %P, align 4		; <i32> [#uses=1]
+	%3 = icmp sgt i32 %2, 36		; <i1> [#uses=1]
+	br i1 %3, label %bb3, label %bb2
+
+bb2:		; preds = %bb1
+	%4 = tail call i32 (...)* @f2() nounwind		; <i32> [#uses=0]
+	ret i32 %res.0
+
+bb3:		; preds = %bb1
+	ret i32 %res.0
+}
+
+declare i32 @f1(...)
+
+declare i32 @f2(...)
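
A companion sketch for item 2 of the commit message (again illustrative only,
with invented names; not part of the committed change): the locally redundant
case is handled by FindAvailableLoadedValue scanning backwards within the
load's own block.  When it first reaches a store to the same pointer, the
stored value is forwarded and the load is deleted outright.

  ; Before: %v reloads a value that was just stored, a pattern reg2mem
  ; commonly creates.
  bb:
    store i32 %x, i32* %a
    %v = load i32* %a
    %cmp = icmp eq i32 %v, 0
    br i1 %cmp, label %t, label %f

  ; After: the load is erased and its uses rewritten to %x, unblocking the
  ; phi-and-compare patterns that jump threading already understands.
  bb:
    store i32 %x, i32* %a
    %cmp = icmp eq i32 %x, 0
    br i1 %cmp, label %t, label %f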