Teach LSR to optimize more loop exit compares, i.e., change them to use the postinc iv value. Previously LSR would only optimize compares in the loop latch block. Now, if LSR can prove the transformation is safe (and profitable), it can also change compares that are not in the latch block to use postinc values.

Also, if the compare is the only use of the iv, LSR now places the iv increment instruction just before the compare instead of in the latch block.
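
As an illustrative sketch (hypothetical IR, not taken from this commit's tests; %n.post stands for the exit bound adjusted by one stride): a loop whose exit compare sits in the header rather than the latch previously had to test the preinc iv value:

  header:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
    ; ... loop body ...
    %exitcond = icmp eq i64 %iv, %n            ; tests the preinc value
    br i1 %exitcond, label %exit, label %latch
  latch:
    %iv.next = add i64 %iv, 1
    br label %header

When the compare is the iv's only use, the increment can instead be emitted right before it, and the compare rewritten against the postinc value:

  header:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
    ; ... loop body ...
    %iv.next = add i64 %iv, 1                  ; increment now precedes the compare
    %exitcond = icmp eq i64 %iv.next, %n.post  ; tests the postinc value
    br i1 %exitcond, label %exit, label %latch
  latch:
    br label %header

Since %iv now dies at the increment, the live ranges of the iv and its increment can be coalesced into one register; on x86-64 the iv update then lands next to the conditional branch, which is what the new test below checks (decq immediately followed by jne).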


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@71485 91177308-0d34-0410-b5e6-96231b3b80d8
Evan Cheng
2009-05-11 22:33:01 +00:00
parent 6dc4ade595
commit 5792f51e12
4 changed files with 275 additions and 45 deletions


@@ -43,6 +43,7 @@ STATISTIC(NumVariable, "Number of PHIs with variable strides");
 STATISTIC(NumEliminated, "Number of strides eliminated");
 STATISTIC(NumShadow, "Number of Shadow IVs optimized");
 STATISTIC(NumImmSunk, "Number of common expr immediates sunk into uses");
+STATISTIC(NumLoopCond, "Number of loop terminating conds optimized");
 
 static cl::opt<bool> EnableFullLSRMode("enable-full-lsr",
                                        cl::init(false),
@@ -122,6 +123,10 @@ namespace {
     /// particular stride.
     std::map<SCEVHandle, IVsOfOneStride> IVsByStride;
 
+    /// StrideNoReuse - Keep track of all the strides whose ivs cannot be
+    /// reused (nor should they be rewritten to reuse other strides).
+    SmallSet<SCEVHandle, 4> StrideNoReuse;
+
     /// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
     /// We use this to iterate over the IVUsesByStride collection without being
     /// dependent on random ordering of pointers in the process.
@@ -184,7 +189,7 @@ namespace {
     SCEVHandle CheckForIVReuse(bool, bool, bool, const SCEVHandle&,
                                IVExpr&, const Type*,
                                const std::vector<BasedUser>& UsersToProcess);
-    bool ValidStride(bool, int64_t,
+    bool ValidScale(bool, int64_t,
                      const std::vector<BasedUser>& UsersToProcess);
     SCEVHandle CollectIVUsers(const SCEVHandle &Stride,
                               IVUsersOfOneStride &Uses,
@@ -213,6 +218,7 @@ namespace {
                                       SCEVHandle Stride,
                                       SCEVHandle CommonExprs,
                                       Value *CommonBaseV,
+                                      Instruction *IVIncInsertPt,
                                       const Loop *L,
                                       SCEVExpander &PreheaderRewriter);
     void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
@@ -1122,11 +1128,10 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
   return Result;
 }
 
-/// ValidStride - Check whether the given Scale is valid for all loads and
+/// ValidScale - Check whether the given Scale is valid for all loads and
 /// stores in UsersToProcess.
 ///
-bool LoopStrengthReduce::ValidStride(bool HasBaseReg,
-                                     int64_t Scale,
+bool LoopStrengthReduce::ValidScale(bool HasBaseReg, int64_t Scale,
                      const std::vector<BasedUser>& UsersToProcess) {
   if (!TLI)
     return true;
@@ -1186,13 +1191,17 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
                                 const SCEVHandle &Stride,
                                 IVExpr &IV, const Type *Ty,
                                 const std::vector<BasedUser>& UsersToProcess) {
+  if (StrideNoReuse.count(Stride))
+    return SE->getIntegerSCEV(0, Stride->getType());
+
   if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Stride)) {
     int64_t SInt = SC->getValue()->getSExtValue();
     for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e;
          ++NewStride) {
       std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
         IVsByStride.find(StrideOrder[NewStride]);
-      if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first))
+      if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first) ||
+          StrideNoReuse.count(SI->first))
         continue;
       int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
       if (SI->first != Stride &&
@@ -1206,7 +1215,7 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
       // multiplications.
       if (Scale == 1 ||
           (AllUsesAreAddresses &&
-           ValidStride(HasBaseReg, Scale, UsersToProcess)))
+           ValidScale(HasBaseReg, Scale, UsersToProcess)))
         for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
                IE = SI->second.IVs.end(); II != IE; ++II)
           // FIXME: Only handle base == 0 for now.
@@ -1452,6 +1461,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
 /// Return the created phi node.
 ///
 static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
+                                Instruction *IVIncInsertPt,
                                 const Loop *L,
                                 SCEVExpander &Rewriter) {
   assert(Start->isLoopInvariant(L) && "New PHI start is not loop invariant!");
@@ -1475,16 +1485,17 @@ static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
     IncAmount = Rewriter.SE.getNegativeSCEV(Step);
 
   // Insert an add instruction right before the terminator corresponding
-  // to the back-edge.
+  // to the back-edge or just before the only use. The location is determined
+  // by the caller and passed in as IVIncInsertPt.
   Value *StepV = Rewriter.expandCodeFor(IncAmount, Ty,
                                         Preheader->getTerminator());
   Instruction *IncV;
   if (isNegative) {
     IncV = BinaryOperator::CreateSub(PN, StepV, "lsr.iv.next",
-                                     LatchBlock->getTerminator());
+                                     IVIncInsertPt);
   } else {
     IncV = BinaryOperator::CreateAdd(PN, StepV, "lsr.iv.next",
-                                     LatchBlock->getTerminator());
+                                     IVIncInsertPt);
   }
   if (!isa<ConstantInt>(StepV)) ++NumVariable;
@@ -1541,6 +1552,7 @@ LoopStrengthReduce::PrepareToStrengthReduceFully(
 
   // Rewrite the UsersToProcess records, creating a separate PHI for each
   // unique Base value.
+  Instruction *IVIncInsertPt = L->getLoopLatch()->getTerminator();
   for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) {
     // TODO: The uses are grouped by base, but not sorted. We arbitrarily
     // pick the first Imm value here to start with, and adjust it for the
@@ -1548,7 +1560,7 @@ LoopStrengthReduce::PrepareToStrengthReduceFully(
     SCEVHandle Imm = UsersToProcess[i].Imm;
     SCEVHandle Base = UsersToProcess[i].Base;
     SCEVHandle Start = SE->getAddExpr(CommonExprs, Base, Imm);
-    PHINode *Phi = InsertAffinePhi(Start, Stride, L,
+    PHINode *Phi = InsertAffinePhi(Start, Stride, IVIncInsertPt, L,
                                    PreheaderRewriter);
     // Loop over all the users with the same base.
     do {
@@ -1561,6 +1573,18 @@
   }
 }
 
+/// FindIVIncInsertPt - Return the location to insert the increment instruction.
+/// If the only use is a use of the postinc value (it must be the loop
+/// termination condition), then insert it just before the use.
+static Instruction *FindIVIncInsertPt(std::vector<BasedUser> &UsersToProcess,
+                                      const Loop *L) {
+  if (UsersToProcess.size() == 1 &&
+      UsersToProcess[0].isUseOfPostIncrementedValue &&
+      L->contains(UsersToProcess[0].Inst->getParent()))
+    return UsersToProcess[0].Inst;
+  return L->getLoopLatch()->getTerminator();
+}
+
 /// PrepareToStrengthReduceWithNewPhi - Insert a new induction variable for the
 /// given users to share.
 ///
@@ -1570,12 +1594,13 @@ LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
                                         SCEVHandle Stride,
                                         SCEVHandle CommonExprs,
                                         Value *CommonBaseV,
+                                        Instruction *IVIncInsertPt,
                                         const Loop *L,
                                         SCEVExpander &PreheaderRewriter) {
   DOUT << "  Inserting new PHI:\n";
 
   PHINode *Phi = InsertAffinePhi(SE->getUnknown(CommonBaseV),
-                                 Stride, L,
+                                 Stride, IVIncInsertPt, L,
                                  PreheaderRewriter);
 
   // Remember this in case a later stride is multiple of this.
@@ -1590,8 +1615,8 @@ LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
   DOUT << "\n";
 }
 
-/// PrepareToStrengthReduceWithNewPhi - Prepare for the given users to reuse
-/// an induction variable with a stride that is a factor of the current
+/// PrepareToStrengthReduceFromSmallerStride - Prepare for the given users to
+/// reuse an induction variable with a stride that is a factor of the current
 /// induction variable.
 ///
 void
@@ -1727,6 +1752,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
   BasicBlock *Preheader = L->getLoopPreheader();
   Instruction *PreInsertPt = Preheader->getTerminator();
   BasicBlock *LatchBlock = L->getLoopLatch();
+  Instruction *IVIncInsertPt = LatchBlock->getTerminator();
 
   Value *CommonBaseV = Constant::getNullValue(ReplacedTy);
@@ -1755,13 +1781,15 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
                                      AllUsesAreOutsideLoop,
                                      Stride, ReuseIV, ReplacedTy,
                                      UsersToProcess);
-    if (isa<SCEVConstant>(RewriteFactor) &&
-        cast<SCEVConstant>(RewriteFactor)->isZero())
-      PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs,
-                                        CommonBaseV, L, PreheaderRewriter);
-    else
+    if (!RewriteFactor->isZero())
       PrepareToStrengthReduceFromSmallerStride(UsersToProcess, CommonBaseV,
                                                ReuseIV, PreInsertPt);
+    else {
+      IVIncInsertPt = FindIVIncInsertPt(UsersToProcess, L);
+      PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs,
+                                        CommonBaseV, IVIncInsertPt,
+                                        L, PreheaderRewriter);
+    }
   }
 
   // Process all the users now, replacing their strided uses with
@@ -1800,7 +1828,12 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
     // FIXME: Use emitted users to emit other users.
     BasedUser &User = UsersToProcess.back();
 
-    DOUT << "    Examining use ";
+    DOUT << "    Examining ";
+    if (User.isUseOfPostIncrementedValue)
+      DOUT << "postinc";
+    else
+      DOUT << "preinc";
+    DOUT << " use ";
     DEBUG(WriteAsOperand(*DOUT, UsersToProcess.back().OperandValToReplace,
                          /*PrintType=*/false));
     DOUT << " in Inst: " << *(User.Inst);
@@ -1810,11 +1843,12 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
     Value *RewriteOp = User.Phi;
     if (User.isUseOfPostIncrementedValue) {
       RewriteOp = User.Phi->getIncomingValueForBlock(LatchBlock);
-
       // If this user is in the loop, make sure it is the last thing in the
-      // loop to ensure it is dominated by the increment.
-      if (L->contains(User.Inst->getParent()))
-        User.Inst->moveBefore(LatchBlock->getTerminator());
+      // loop to ensure it is dominated by the increment. In case it's the
+      // only use of the iv, the increment instruction is already before the
+      // use.
+      if (L->contains(User.Inst->getParent()) && User.Inst != IVIncInsertPt)
+        User.Inst->moveBefore(IVIncInsertPt);
     }
 
     SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp);
@@ -2085,7 +2119,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
       // if it's likely the new stride uses will be rewritten using the
       // stride of the compare instruction.
       if (AllUsesAreAddresses &&
-          ValidStride(!CommonExprs->isZero(), Scale, UsersToProcess))
+          ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
         continue;
 
       // If scale is negative, use swapped predicate unless it's testing
@@ -2304,8 +2338,8 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
     if (!DestTy) continue;
 
     if (TLI) {
-      /* If target does not support DestTy natively then do not apply
-         this transformation. */
+      // If target does not support DestTy natively then do not apply
+      // this transformation.
       MVT DVT = TLI->getValueType(DestTy);
       if (!TLI->isTypeLegal(DVT)) continue;
     }
@@ -2380,8 +2414,6 @@ void LoopStrengthReduce::OptimizeIndvars(Loop *L) {
   // TODO: implement optzns here.
 
   OptimizeShadowIV(L);
-
-  OptimizeLoopTermCond(L);
 }
 
 /// OptimizeLoopTermCond - Change loop terminating condition to use the
@@ -2391,23 +2423,78 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
   // can, we want to change it to use a post-incremented version of its
   // induction variable, to allow coalescing the live ranges for the IV into
   // one register value.
-  PHINode *SomePHI = cast<PHINode>(L->getHeader()->begin());
-  BasicBlock *Preheader = L->getLoopPreheader();
-  BasicBlock *LatchBlock =
-    SomePHI->getIncomingBlock(SomePHI->getIncomingBlock(0) == Preheader);
-  BranchInst *TermBr = dyn_cast<BranchInst>(LatchBlock->getTerminator());
-  if (!TermBr || TermBr->isUnconditional() ||
-      !isa<ICmpInst>(TermBr->getCondition()))
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  BasicBlock *ExitBlock = L->getExitingBlock();
+  if (!ExitBlock)
+    // Multiple exits, just look at the exit in the latch block if there is one.
+    ExitBlock = LatchBlock;
+  BranchInst *TermBr = dyn_cast<BranchInst>(ExitBlock->getTerminator());
+  if (!TermBr)
+    return;
+  if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
     return;
-  ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
 
   // Search IVUsesByStride to find Cond's IVUse if there is one.
   IVStrideUse *CondUse = 0;
   const SCEVHandle *CondStride = 0;
-
+  ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
   if (!FindIVUserForCond(Cond, CondUse, CondStride))
     return; // setcc doesn't use the IV.
+
+  if (ExitBlock != LatchBlock) {
+    if (!Cond->hasOneUse())
+      // See below, we don't want the condition to be cloned.
+      return;
+
+    // If exiting block is the latch block, we know it's safe and profitable to
+    // transform the icmp to use post-inc iv. Otherwise do so only if it would
+    // not reuse another iv and its iv would be reused by other uses. We are
+    // optimizing for the case where the icmp is the only use of the iv.
+    IVUsersOfOneStride &StrideUses = IVUsesByStride[*CondStride];
+    for (unsigned i = 0, e = StrideUses.Users.size(); i != e; ++i) {
+      if (StrideUses.Users[i].User == Cond)
+        continue;
+      if (!StrideUses.Users[i].isUseOfPostIncrementedValue)
+        return;
+    }
+
+    // FIXME: This is expensive, and worse still ChangeCompareStride does a
+    // similar check. Can we perform all the icmp related transformations after
+    // StrengthReduceStridedIVUsers?
+    if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride)) {
+      int64_t SInt = SC->getValue()->getSExtValue();
+      for (unsigned NewStride = 0, ee = StrideOrder.size(); NewStride != ee;
+           ++NewStride) {
+        std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
+          IVUsesByStride.find(StrideOrder[NewStride]);
+        if (!isa<SCEVConstant>(SI->first) || SI->first == *CondStride)
+          continue;
+        int64_t SSInt =
+          cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+        if (SSInt == SInt)
+          return; // This can definitely be reused.
+        if (unsigned(abs(SSInt)) < SInt || (SSInt % SInt) != 0)
+          continue;
+        int64_t Scale = SSInt / SInt;
+        bool AllUsesAreAddresses = true;
+        bool AllUsesAreOutsideLoop = true;
+        std::vector<BasedUser> UsersToProcess;
+        SCEVHandle CommonExprs = CollectIVUsers(SI->first, SI->second, L,
+                                                AllUsesAreAddresses,
+                                                AllUsesAreOutsideLoop,
+                                                UsersToProcess);
+        // Avoid rewriting the compare instruction with an iv of new stride
+        // if it's likely the new stride uses will be rewritten using the
+        // stride of the compare instruction.
+        if (AllUsesAreAddresses &&
+            ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
+          return;
+      }
+    }
+
+    StrideNoReuse.insert(*CondStride);
+  }
 
   // If the trip count is computed in terms of an smax (due to ScalarEvolution
   // being unable to find a sufficient guard, for example), change the loop
   // comparison to use SLT instead of NE.
@@ -2415,6 +2502,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
 
   // If possible, change stride and operands of the compare instruction to
   // eliminate one stride.
+  if (ExitBlock == LatchBlock)
     Cond = ChangeCompareStride(L, Cond, CondUse, CondStride);
 
   // It's possible for the setcc instruction to be anywhere in the loop, and
@@ -2442,6 +2530,8 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
     CondUse->Offset = SE->getMinusSCEV(CondUse->Offset, *CondStride);
     CondUse->isUseOfPostIncrementedValue = true;
     Changed = true;
+
+    ++NumLoopCond;
   }
 
 // OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding
@@ -2582,6 +2672,11 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
   // computation of some other indvar to decide when to terminate the loop.
   OptimizeIndvars(L);
 
+  // Change loop terminating condition to use the postinc iv when possible
+  // and optimize loop terminating compare. FIXME: Move this after
+  // StrengthReduceStridedIVUsers?
+  OptimizeLoopTermCond(L);
+
   // FIXME: We can shrink overlarge IV's here. e.g. if the code has
   // computation in i64 values and the target doesn't support i64, demote
   // the computation to 32-bit if safe.
@@ -2616,6 +2711,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
     IVUsesByStride.clear();
     IVsByStride.clear();
     StrideOrder.clear();
+    StrideNoReuse.clear();
 
     // Clean up after ourselves
     if (!DeadInsts.empty())


@@ -0,0 +1,134 @@
; RUN: llvm-as < %s | llc -march=x86-64 | %prcontext decq 1 | grep jne
@Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5]
@Te1 = external global [256 x i32] ; <[256 x i32]*> [#uses=4]
@Te3 = external global [256 x i32] ; <[256 x i32]*> [#uses=2]
define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r) nounwind ssp {
entry:
%0 = load i32* %rk, align 4 ; <i32> [#uses=1]
%1 = getelementptr i32* %rk, i64 1 ; <i32*> [#uses=1]
%2 = load i32* %1, align 4 ; <i32> [#uses=1]
%tmp15 = add i32 %r, -1 ; <i32> [#uses=1]
%tmp.16 = zext i32 %tmp15 to i64 ; <i64> [#uses=2]
br label %bb
bb: ; preds = %bb1, %entry
%indvar = phi i64 [ 0, %entry ], [ %indvar.next, %bb1 ] ; <i64> [#uses=3]
%s1.0 = phi i32 [ %2, %entry ], [ %56, %bb1 ] ; <i32> [#uses=2]
%s0.0 = phi i32 [ %0, %entry ], [ %43, %bb1 ] ; <i32> [#uses=2]
%tmp18 = shl i64 %indvar, 4 ; <i64> [#uses=4]
%rk26 = bitcast i32* %rk to i8* ; <i8*> [#uses=6]
%3 = lshr i32 %s0.0, 24 ; <i32> [#uses=1]
%4 = zext i32 %3 to i64 ; <i64> [#uses=1]
%5 = getelementptr [256 x i32]* @Te0, i64 0, i64 %4 ; <i32*> [#uses=1]
%6 = load i32* %5, align 4 ; <i32> [#uses=1]
%7 = lshr i32 %s1.0, 16 ; <i32> [#uses=1]
%8 = and i32 %7, 255 ; <i32> [#uses=1]
%9 = zext i32 %8 to i64 ; <i64> [#uses=1]
%10 = getelementptr [256 x i32]* @Te1, i64 0, i64 %9 ; <i32*> [#uses=1]
%11 = load i32* %10, align 4 ; <i32> [#uses=1]
%ctg2.sum2728 = or i64 %tmp18, 8 ; <i64> [#uses=1]
%12 = getelementptr i8* %rk26, i64 %ctg2.sum2728 ; <i8*> [#uses=1]
%13 = bitcast i8* %12 to i32* ; <i32*> [#uses=1]
%14 = load i32* %13, align 4 ; <i32> [#uses=1]
%15 = xor i32 %11, %6 ; <i32> [#uses=1]
%16 = xor i32 %15, %14 ; <i32> [#uses=3]
%17 = lshr i32 %s1.0, 24 ; <i32> [#uses=1]
%18 = zext i32 %17 to i64 ; <i64> [#uses=1]
%19 = getelementptr [256 x i32]* @Te0, i64 0, i64 %18 ; <i32*> [#uses=1]
%20 = load i32* %19, align 4 ; <i32> [#uses=1]
%21 = and i32 %s0.0, 255 ; <i32> [#uses=1]
%22 = zext i32 %21 to i64 ; <i64> [#uses=1]
%23 = getelementptr [256 x i32]* @Te3, i64 0, i64 %22 ; <i32*> [#uses=1]
%24 = load i32* %23, align 4 ; <i32> [#uses=1]
%ctg2.sum2930 = or i64 %tmp18, 12 ; <i64> [#uses=1]
%25 = getelementptr i8* %rk26, i64 %ctg2.sum2930 ; <i8*> [#uses=1]
%26 = bitcast i8* %25 to i32* ; <i32*> [#uses=1]
%27 = load i32* %26, align 4 ; <i32> [#uses=1]
%28 = xor i32 %24, %20 ; <i32> [#uses=1]
%29 = xor i32 %28, %27 ; <i32> [#uses=4]
%30 = lshr i32 %16, 24 ; <i32> [#uses=1]
%31 = zext i32 %30 to i64 ; <i64> [#uses=1]
%32 = getelementptr [256 x i32]* @Te0, i64 0, i64 %31 ; <i32*> [#uses=1]
%33 = load i32* %32, align 4 ; <i32> [#uses=2]
%exitcond = icmp eq i64 %indvar, %tmp.16 ; <i1> [#uses=1]
br i1 %exitcond, label %bb2, label %bb1
bb1: ; preds = %bb
%ctg2.sum31 = add i64 %tmp18, 16 ; <i64> [#uses=1]
%34 = getelementptr i8* %rk26, i64 %ctg2.sum31 ; <i8*> [#uses=1]
%35 = bitcast i8* %34 to i32* ; <i32*> [#uses=1]
%36 = lshr i32 %29, 16 ; <i32> [#uses=1]
%37 = and i32 %36, 255 ; <i32> [#uses=1]
%38 = zext i32 %37 to i64 ; <i64> [#uses=1]
%39 = getelementptr [256 x i32]* @Te1, i64 0, i64 %38 ; <i32*> [#uses=1]
%40 = load i32* %39, align 4 ; <i32> [#uses=1]
%41 = load i32* %35, align 4 ; <i32> [#uses=1]
%42 = xor i32 %40, %33 ; <i32> [#uses=1]
%43 = xor i32 %42, %41 ; <i32> [#uses=1]
%44 = lshr i32 %29, 24 ; <i32> [#uses=1]
%45 = zext i32 %44 to i64 ; <i64> [#uses=1]
%46 = getelementptr [256 x i32]* @Te0, i64 0, i64 %45 ; <i32*> [#uses=1]
%47 = load i32* %46, align 4 ; <i32> [#uses=1]
%48 = and i32 %16, 255 ; <i32> [#uses=1]
%49 = zext i32 %48 to i64 ; <i64> [#uses=1]
%50 = getelementptr [256 x i32]* @Te3, i64 0, i64 %49 ; <i32*> [#uses=1]
%51 = load i32* %50, align 4 ; <i32> [#uses=1]
%ctg2.sum32 = add i64 %tmp18, 20 ; <i64> [#uses=1]
%52 = getelementptr i8* %rk26, i64 %ctg2.sum32 ; <i8*> [#uses=1]
%53 = bitcast i8* %52 to i32* ; <i32*> [#uses=1]
%54 = load i32* %53, align 4 ; <i32> [#uses=1]
%55 = xor i32 %51, %47 ; <i32> [#uses=1]
%56 = xor i32 %55, %54 ; <i32> [#uses=1]
%indvar.next = add i64 %indvar, 1 ; <i64> [#uses=1]
br label %bb
bb2: ; preds = %bb
%tmp10 = shl i64 %tmp.16, 4 ; <i64> [#uses=2]
%ctg2.sum = add i64 %tmp10, 16 ; <i64> [#uses=1]
%tmp1213 = getelementptr i8* %rk26, i64 %ctg2.sum ; <i8*> [#uses=1]
%57 = bitcast i8* %tmp1213 to i32* ; <i32*> [#uses=1]
%58 = and i32 %33, -16777216 ; <i32> [#uses=1]
%59 = lshr i32 %29, 16 ; <i32> [#uses=1]
%60 = and i32 %59, 255 ; <i32> [#uses=1]
%61 = zext i32 %60 to i64 ; <i64> [#uses=1]
%62 = getelementptr [256 x i32]* @Te1, i64 0, i64 %61 ; <i32*> [#uses=1]
%63 = load i32* %62, align 4 ; <i32> [#uses=1]
%64 = and i32 %63, 16711680 ; <i32> [#uses=1]
%65 = or i32 %64, %58 ; <i32> [#uses=1]
%66 = load i32* %57, align 4 ; <i32> [#uses=1]
%67 = xor i32 %65, %66 ; <i32> [#uses=2]
%68 = lshr i32 %29, 8 ; <i32> [#uses=1]
%69 = zext i32 %68 to i64 ; <i64> [#uses=1]
%70 = getelementptr [256 x i32]* @Te0, i64 0, i64 %69 ; <i32*> [#uses=1]
%71 = load i32* %70, align 4 ; <i32> [#uses=1]
%72 = and i32 %71, -16777216 ; <i32> [#uses=1]
%73 = and i32 %16, 255 ; <i32> [#uses=1]
%74 = zext i32 %73 to i64 ; <i64> [#uses=1]
%75 = getelementptr [256 x i32]* @Te1, i64 0, i64 %74 ; <i32*> [#uses=1]
%76 = load i32* %75, align 4 ; <i32> [#uses=1]
%77 = and i32 %76, 16711680 ; <i32> [#uses=1]
%78 = or i32 %77, %72 ; <i32> [#uses=1]
%ctg2.sum25 = add i64 %tmp10, 20 ; <i64> [#uses=1]
%79 = getelementptr i8* %rk26, i64 %ctg2.sum25 ; <i8*> [#uses=1]
%80 = bitcast i8* %79 to i32* ; <i32*> [#uses=1]
%81 = load i32* %80, align 4 ; <i32> [#uses=1]
%82 = xor i32 %78, %81 ; <i32> [#uses=2]
%83 = lshr i32 %67, 24 ; <i32> [#uses=1]
%84 = trunc i32 %83 to i8 ; <i8> [#uses=1]
store i8 %84, i8* %out, align 1
%85 = lshr i32 %67, 16 ; <i32> [#uses=1]
%86 = trunc i32 %85 to i8 ; <i8> [#uses=1]
%87 = getelementptr i8* %out, i64 1 ; <i8*> [#uses=1]
store i8 %86, i8* %87, align 1
%88 = getelementptr i8* %out, i64 4 ; <i8*> [#uses=1]
%89 = lshr i32 %82, 24 ; <i32> [#uses=1]
%90 = trunc i32 %89 to i8 ; <i8> [#uses=1]
store i8 %90, i8* %88, align 1
%91 = lshr i32 %82, 16 ; <i32> [#uses=1]
%92 = trunc i32 %91 to i8 ; <i8> [#uses=1]
%93 = getelementptr i8* %out, i64 5 ; <i8*> [#uses=1]
store i8 %92, i8* %93, align 1
ret void
}


@@ -16,7 +16,7 @@
 ;}
 
-define i32 @t(i32 %a, i32 %b) {
+define i32 @t(i32 %a, i32 %b) nounwind {
 entry:
 	%tmp1434 = icmp eq i32 %a, %b		; <i1> [#uses=1]
 	br i1 %tmp1434, label %bb17, label %bb.outer


@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 | grep xor | count 2
+; RUN: llvm-as < %s | llc -march=x86 | grep 4294967295 | grep mov | count 2
 
 %struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
 %struct.ImgT = type { i8, i8*, i8*, %struct.FILE*, i32, i32, i32, i32, i8*, double*, float*, float*, float*, i32*, double, double, i32*, double*, i32*, i32* }