diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 5234da71a8d..f50f9b5a33c 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -32,6 +32,7 @@
 #define DEBUG_TYPE "ctrloops"
 #include "PPC.h"
 #include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
 #include "llvm/Constants.h"
 #include "llvm/PassSupport.h"
 #include "llvm/ADT/DenseMap.h"
@@ -82,13 +83,14 @@ namespace {
     /// getCanonicalInductionVariable - Check to see if the loop has a canonical
     /// induction variable.
     /// Should be defined in MachineLoop. Based upon version in class Loop.
-    MachineInstr *getCanonicalInductionVariable(MachineLoop *L,
-                                                MachineInstr *&IOp) const;
+    void getCanonicalInductionVariable(MachineLoop *L,
+                              SmallVector<MachineInstr *, 4> &IVars,
+                              SmallVector<MachineInstr *, 4> &IOps) const;
 
     /// getTripCount - Return a loop-invariant LLVM register indicating the
     /// number of times the loop will be executed.  If the trip-count cannot
     /// be determined, this return null.
-    CountValue *getTripCount(MachineLoop *L, bool &WordCmp,
+    CountValue *getTripCount(MachineLoop *L,
                              SmallVector<MachineInstr *, 2> &OldInsts) const;
 
     /// isInductionOperation - Return true if the instruction matches the
@@ -175,12 +177,12 @@ namespace {
 
 /// isCompareEquals - Returns true if the instruction is a compare equals
 /// instruction with an immediate operand.
-static bool isCompareEqualsImm(const MachineInstr *MI, bool &WordCmp) {
-  if (MI->getOpcode() == PPC::CMPWI || MI->getOpcode() == PPC::CMPLWI) {
-    WordCmp = true;
+static bool isCompareEqualsImm(const MachineInstr *MI, bool &SignedCmp) {
+  if (MI->getOpcode() == PPC::CMPWI || MI->getOpcode() == PPC::CMPDI) {
+    SignedCmp = true;
     return true;
-  } else if (MI->getOpcode() == PPC::CMPDI || MI->getOpcode() == PPC::CMPLDI) {
-    WordCmp = false;
+  } else if (MI->getOpcode() == PPC::CMPLWI || MI->getOpcode() == PPC::CMPLDI) {
+    SignedCmp = false;
     return true;
   }
 
@@ -227,26 +229,27 @@ bool PPCCTRLoops::runOnMachineFunction(MachineFunction &MF) {
 /// the machine.
 /// This method assumes that the IndVarSimplify pass has been run by 'opt'.
 ///
-MachineInstr
-*PPCCTRLoops::getCanonicalInductionVariable(MachineLoop *L,
-                                            MachineInstr *&IOp) const {
+void
+PPCCTRLoops::getCanonicalInductionVariable(MachineLoop *L,
+                                  SmallVector<MachineInstr *, 4> &IVars,
+                                  SmallVector<MachineInstr *, 4> &IOps) const {
   MachineBasicBlock *TopMBB = L->getTopBlock();
   MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
   assert(PI != TopMBB->pred_end() &&
          "Loop must have more than one incoming edge!");
   MachineBasicBlock *Backedge = *PI++;
-  if (PI == TopMBB->pred_end()) return 0;  // dead loop
+  if (PI == TopMBB->pred_end()) return;  // dead loop
   MachineBasicBlock *Incoming = *PI++;
-  if (PI != TopMBB->pred_end()) return 0;  // multiple backedges?
+  if (PI != TopMBB->pred_end()) return;  // multiple backedges?
 
   // make sure there is one incoming and one backedge and determine which
   // is which.
   if (L->contains(Incoming)) {
     if (L->contains(Backedge))
-      return 0;
+      return;
     std::swap(Incoming, Backedge);
   } else if (!L->contains(Backedge))
-    return 0;
+    return;
 
   // Loop over all of the PHI nodes, looking for a canonical induction variable:
   //   - The PHI node is "reg1 = PHI reg2, BB1, reg3, BB2".
@@ -263,13 +266,13 @@ MachineInstr
         // Check if the definition is an induction operation.
         MachineInstr *DI = MRI->getVRegDef(MPhi->getOperand(i).getReg());
         if (isInductionOperation(DI, DefReg)) {
-          IOp = DI;
-          return MPhi;
+          IOps.push_back(DI);
+          IVars.push_back(MPhi);
         }
       }
     }
   }
-  return 0;
+  return;
 }
 
 /// getTripCount - Return a loop-invariant LLVM value indicating the
@@ -283,66 +286,100 @@ MachineInstr
 ///
 /// Based upon getTripCount in LoopInfo.
 ///
-CountValue *PPCCTRLoops::getTripCount(MachineLoop *L, bool &WordCmp,
+CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
                            SmallVector<MachineInstr *, 2> &OldInsts) const {
+  MachineBasicBlock *LastMBB = L->getExitingBlock();
+  // Don't generate a CTR loop if the loop has more than one exit.
+  if (LastMBB == 0)
+    return 0;
+
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+  if (LastI->getOpcode() != PPC::BCC)
+    return 0;
+
+  // We need to make sure that this compare is defining the condition
+  // register actually used by the terminating branch.
+
+  unsigned PredReg = LastI->getOperand(1).getReg();
+  DEBUG(dbgs() << "Examining loop with first terminator: " << *LastI);
+
+  unsigned PredCond = LastI->getOperand(0).getImm();
+  if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE)
+    return 0;
+
   // Check that the loop has a induction variable.
-  MachineInstr *IOp;
-  MachineInstr *IV_Inst = getCanonicalInductionVariable(L, IOp);
-  if (IV_Inst == 0) return 0;
+  SmallVector<MachineInstr *, 4> IVars, IOps;
+  getCanonicalInductionVariable(L, IVars, IOps);
+  for (unsigned i = 0; i < IVars.size(); ++i) {
+    MachineInstr *IOp = IOps[i];
+    MachineInstr *IV_Inst = IVars[i];
 
-  // Canonical loops will end with a 'cmpwi/cmpdi cr, IV, Imm',
-  //  if Imm is 0, get the count from the PHI opnd
-  //  if Imm is -M, than M is the count
-  //  Otherwise, Imm is the count
-  MachineOperand *IV_Opnd;
-  const MachineOperand *InitialValue;
-  if (!L->contains(IV_Inst->getOperand(2).getMBB())) {
-    InitialValue = &IV_Inst->getOperand(1);
-    IV_Opnd = &IV_Inst->getOperand(3);
-  } else {
-    InitialValue = &IV_Inst->getOperand(3);
-    IV_Opnd = &IV_Inst->getOperand(1);
-  }
+    // Canonical loops will end with a 'cmpwi/cmpdi cr, IV, Imm',
+    //  if Imm is 0, get the count from the PHI opnd
+    //  if Imm is -M, than M is the count
+    //  Otherwise, Imm is the count
+    MachineOperand *IV_Opnd;
+    const MachineOperand *InitialValue;
+    if (!L->contains(IV_Inst->getOperand(2).getMBB())) {
+      InitialValue = &IV_Inst->getOperand(1);
+      IV_Opnd = &IV_Inst->getOperand(3);
+    } else {
+      InitialValue = &IV_Inst->getOperand(3);
+      IV_Opnd = &IV_Inst->getOperand(1);
+    }
 
-  // Look for the cmp instruction to determine if we
-  // can get a useful trip count.  The trip count can
-  // be either a register or an immediate.  The location
-  // of the value depends upon the type (reg or imm).
-  while ((IV_Opnd = IV_Opnd->getNextOperandForReg())) {
-    MachineInstr *MI = IV_Opnd->getParent();
-    if (L->contains(MI) && isCompareEqualsImm(MI, WordCmp)) {
-      OldInsts.push_back(MI);
-      OldInsts.push_back(IOp);
+    DEBUG(dbgs() << "Considering:\n");
+    DEBUG(dbgs() << "  induction operation: " << *IOp);
+    DEBUG(dbgs() << "  induction variable: " << *IV_Inst);
+    DEBUG(dbgs() << "  initial value: " << *InitialValue << "\n");
+  
+    // Look for the cmp instruction to determine if we
+    // can get a useful trip count.  The trip count can
+    // be either a register or an immediate.  The location
+    // of the value depends upon the type (reg or imm).
+    while ((IV_Opnd = IV_Opnd->getNextOperandForReg())) {
+      bool SignedCmp;
+      MachineInstr *MI = IV_Opnd->getParent();
+      if (L->contains(MI) && isCompareEqualsImm(MI, SignedCmp) &&
+          MI->getOperand(0).getReg() == PredReg) {
 
-      const MachineOperand &MO = MI->getOperand(2);
-      assert(MO.isImm() && "IV Cmp Operand should be an immediate");
-      int64_t ImmVal = MO.getImm();
+        OldInsts.push_back(MI);
+        OldInsts.push_back(IOp);
+ 
+        DEBUG(dbgs() << "  compare: " << *MI);
+ 
+        const MachineOperand &MO = MI->getOperand(2);
+        assert(MO.isImm() && "IV Cmp Operand should be an immediate");
 
-      const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg());
-      assert(L->contains(IV_DefInstr->getParent()) &&
-             "IV definition should occurs in loop");
-      int64_t iv_value = IV_DefInstr->getOperand(2).getImm();
-
-      if (ImmVal == 0) {
-        // Make sure the induction variable changes by one on each iteration.
-        if (iv_value != 1 && iv_value != -1) {
-          return 0;
-        }
-        return new CountValue(InitialValue->getReg(), iv_value > 0);
-      } else {
+        int64_t ImmVal;
+        if (SignedCmp)
+          ImmVal = (short) MO.getImm();
+        else
+          ImmVal = MO.getImm();
+  
+        const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg());
+        assert(L->contains(IV_DefInstr->getParent()) &&
+               "IV definition should occurs in loop");
+        int64_t iv_value = (short) IV_DefInstr->getOperand(2).getImm();
+  
         assert(InitialValue->isReg() && "Expecting register for init value");
-        const MachineInstr *DefInstr = MRI->getVRegDef(InitialValue->getReg());
-
+        unsigned InitialValueReg = InitialValue->getReg();
+  
+        const MachineInstr *DefInstr = MRI->getVRegDef(InitialValueReg);
+  
         // Here we need to look for an immediate load (an li or lis/ori pair).
         if (DefInstr && (DefInstr->getOpcode() == PPC::ORI8 ||
                          DefInstr->getOpcode() == PPC::ORI)) {
-          int64_t start = DefInstr->getOperand(2).getImm();
+          int64_t start = (short) DefInstr->getOperand(2).getImm();
           const MachineInstr *DefInstr2 =
             MRI->getVRegDef(DefInstr->getOperand(0).getReg());
           if (DefInstr2 && (DefInstr2->getOpcode() == PPC::LIS8 ||
                             DefInstr2->getOpcode() == PPC::LIS)) {
-            start |= DefInstr2->getOperand(1).getImm() << 16;
+            DEBUG(dbgs() << "  initial constant: " << *DefInstr);
+            DEBUG(dbgs() << "  initial constant: " << *DefInstr2);
 
+            start |= int64_t(short(DefInstr2->getOperand(1).getImm())) << 16;
+  
             int64_t count = ImmVal - start;
             if ((count % iv_value) != 0) {
               return 0;
@@ -351,12 +388,23 @@ CountValue *PPCCTRLoops::getTripCount(MachineLoop *L, bool &WordCmp,
           }
         } else if (DefInstr && (DefInstr->getOpcode() == PPC::LI8 ||
                                 DefInstr->getOpcode() == PPC::LI)) {
-          int64_t count = ImmVal - DefInstr->getOperand(1).getImm();
+          DEBUG(dbgs() << "  initial constant: " << *DefInstr);
+
+          int64_t count = ImmVal - int64_t(short(DefInstr->getOperand(1).getImm()));
           if ((count % iv_value) != 0) {
             return 0;
           }
           return new CountValue(count/iv_value);
+        } else if (iv_value == 1 || iv_value == -1) {
+          // We can't determine a constant starting value.
+          if (ImmVal == 0) {
+            return new CountValue(InitialValueReg, iv_value > 0);
+          }
+          // FIXME: handle non-zero end value.
         }
+        // FIXME: handle non-unit increments (we might not want to introduce division
+        // but we can handle some 2^n cases with shifts).
+  
       }
     }
   }
@@ -524,10 +572,9 @@ bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
     return Changed;
   }
 
-  bool WordCmp;
   SmallVector<MachineInstr *, 2> OldInsts;
   // Are we able to determine the trip count for the loop?
-  CountValue *TripCount = getTripCount(L, WordCmp, OldInsts);
+  CountValue *TripCount = getTripCount(L, OldInsts);
   if (TripCount == 0) {
     DEBUG(dbgs() << "failed to get trip count!\n");
     return false;
@@ -575,14 +622,21 @@ bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
   const PPCSubtarget &Subtarget = MF->getTarget().getSubtarget<PPCSubtarget>();
   bool isPPC64 = Subtarget.isPPC64();
 
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
+
   unsigned CountReg;
   if (TripCount->isReg()) {
     // Create a copy of the loop count register.
-    const TargetRegisterClass *RC =
+    const TargetRegisterClass *SrcRC =
       MF->getRegInfo().getRegClass(TripCount->getReg());
     CountReg = MF->getRegInfo().createVirtualRegister(RC);
+    unsigned CopyOp = (isPPC64 && SrcRC == GPRC) ?
+                        (unsigned) PPC::EXTSW_32_64 :
+                        (unsigned) TargetOpcode::COPY;
     BuildMI(*Preheader, InsertPos, dl,
-            TII->get(TargetOpcode::COPY), CountReg).addReg(TripCount->getReg());
+            TII->get(CopyOp), CountReg).addReg(TripCount->getReg());
     if (TripCount->isNeg()) {
       unsigned CountReg1 = CountReg;
       CountReg = MF->getRegInfo().createVirtualRegister(RC);
@@ -590,26 +644,12 @@ bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
               TII->get(isPPC64 ? PPC::NEG8 : PPC::NEG),
                        CountReg).addReg(CountReg1);
     }
-
-    // On a 64-bit system, if the original comparison was only 32-bit, then
-    // mask out the higher-order part of the count.
-    if (isPPC64 && WordCmp) {
-       unsigned CountReg1 = CountReg;
-       CountReg = MF->getRegInfo().createVirtualRegister(RC);
-       BuildMI(*Preheader, InsertPos, dl,
-               TII->get(PPC::RLDICL), CountReg).addReg(CountReg1
-              ).addImm(0).addImm(32);
-    }
   } else {
     assert(TripCount->isImm() && "Expecting immedate vaule for trip count");
     // Put the trip count in a register for transfer into the count register.
-    const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
-    const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
-    const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
 
     int64_t CountImm = TripCount->getImm();
-    if (TripCount->isNeg())
-      CountImm = -CountImm;
+    assert(!TripCount->isNeg() && "Constant trip count must be positive");
 
     CountReg = MF->getRegInfo().createVirtualRegister(RC);
     if (CountImm > 0xFFFF) {
@@ -665,6 +705,7 @@ bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
                  (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(BranchTarget);
 
   // Conditional branch; just delete it.
+  DEBUG(dbgs() << "Removing old branch: " << *LastI);
   LastMBB->erase(LastI);
 
   delete TripCount;
diff --git a/test/CodeGen/PowerPC/ctrloop-s000.ll b/test/CodeGen/PowerPC/ctrloop-s000.ll
new file mode 100644
index 00000000000..dcea06f29e7
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-s000.ll
@@ -0,0 +1,156 @@
+; ModuleID = 'tsc_s000.c'
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+
+@Y = common global [16000 x double] zeroinitializer, align 32
+@X = common global [16000 x double] zeroinitializer, align 32
+@Z = common global [16000 x double] zeroinitializer, align 32
+@U = common global [16000 x double] zeroinitializer, align 32
+@V = common global [16000 x double] zeroinitializer, align 32
+@aa = common global [256 x [256 x double]] zeroinitializer, align 32
+@bb = common global [256 x [256 x double]] zeroinitializer, align 32
+@cc = common global [256 x [256 x double]] zeroinitializer, align 32
+@array = common global [65536 x double] zeroinitializer, align 32
+@x = common global [16000 x double] zeroinitializer, align 32
+@temp = common global double 0.000000e+00, align 8
+@temp_int = common global i32 0, align 4
+@a = common global [16000 x double] zeroinitializer, align 32
+@b = common global [16000 x double] zeroinitializer, align 32
+@c = common global [16000 x double] zeroinitializer, align 32
+@d = common global [16000 x double] zeroinitializer, align 32
+@e = common global [16000 x double] zeroinitializer, align 32
+@tt = common global [256 x [256 x double]] zeroinitializer, align 32
+@indx = common global [16000 x i32] zeroinitializer, align 32
+@xx = common global double* null, align 8
+@yy = common global double* null, align 8
+
+define i32 @s000() nounwind {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.end, %entry
+  %nl.010 = phi i32 [ 0, %entry ], [ %inc7, %for.end ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next.15, %for.body3 ]
+  %arrayidx = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 32, !tbaa !0
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv
+  store double %add, double* %arrayidx5, align 32, !tbaa !0
+  %indvars.iv.next11 = or i64 %indvars.iv, 1
+  %arrayidx.1 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next11
+  %1 = load double* %arrayidx.1, align 8, !tbaa !0
+  %add.1 = fadd double %1, 1.000000e+00
+  %arrayidx5.1 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next11
+  store double %add.1, double* %arrayidx5.1, align 8, !tbaa !0
+  %indvars.iv.next.112 = or i64 %indvars.iv, 2
+  %arrayidx.2 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.112
+  %2 = load double* %arrayidx.2, align 16, !tbaa !0
+  %add.2 = fadd double %2, 1.000000e+00
+  %arrayidx5.2 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.112
+  store double %add.2, double* %arrayidx5.2, align 16, !tbaa !0
+  %indvars.iv.next.213 = or i64 %indvars.iv, 3
+  %arrayidx.3 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.213
+  %3 = load double* %arrayidx.3, align 8, !tbaa !0
+  %add.3 = fadd double %3, 1.000000e+00
+  %arrayidx5.3 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.213
+  store double %add.3, double* %arrayidx5.3, align 8, !tbaa !0
+  %indvars.iv.next.314 = or i64 %indvars.iv, 4
+  %arrayidx.4 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.314
+  %4 = load double* %arrayidx.4, align 32, !tbaa !0
+  %add.4 = fadd double %4, 1.000000e+00
+  %arrayidx5.4 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.314
+  store double %add.4, double* %arrayidx5.4, align 32, !tbaa !0
+  %indvars.iv.next.415 = or i64 %indvars.iv, 5
+  %arrayidx.5 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.415
+  %5 = load double* %arrayidx.5, align 8, !tbaa !0
+  %add.5 = fadd double %5, 1.000000e+00
+  %arrayidx5.5 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.415
+  store double %add.5, double* %arrayidx5.5, align 8, !tbaa !0
+  %indvars.iv.next.516 = or i64 %indvars.iv, 6
+  %arrayidx.6 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.516
+  %6 = load double* %arrayidx.6, align 16, !tbaa !0
+  %add.6 = fadd double %6, 1.000000e+00
+  %arrayidx5.6 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.516
+  store double %add.6, double* %arrayidx5.6, align 16, !tbaa !0
+  %indvars.iv.next.617 = or i64 %indvars.iv, 7
+  %arrayidx.7 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.617
+  %7 = load double* %arrayidx.7, align 8, !tbaa !0
+  %add.7 = fadd double %7, 1.000000e+00
+  %arrayidx5.7 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.617
+  store double %add.7, double* %arrayidx5.7, align 8, !tbaa !0
+  %indvars.iv.next.718 = or i64 %indvars.iv, 8
+  %arrayidx.8 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.718
+  %8 = load double* %arrayidx.8, align 32, !tbaa !0
+  %add.8 = fadd double %8, 1.000000e+00
+  %arrayidx5.8 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.718
+  store double %add.8, double* %arrayidx5.8, align 32, !tbaa !0
+  %indvars.iv.next.819 = or i64 %indvars.iv, 9
+  %arrayidx.9 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.819
+  %9 = load double* %arrayidx.9, align 8, !tbaa !0
+  %add.9 = fadd double %9, 1.000000e+00
+  %arrayidx5.9 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.819
+  store double %add.9, double* %arrayidx5.9, align 8, !tbaa !0
+  %indvars.iv.next.920 = or i64 %indvars.iv, 10
+  %arrayidx.10 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.920
+  %10 = load double* %arrayidx.10, align 16, !tbaa !0
+  %add.10 = fadd double %10, 1.000000e+00
+  %arrayidx5.10 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.920
+  store double %add.10, double* %arrayidx5.10, align 16, !tbaa !0
+  %indvars.iv.next.1021 = or i64 %indvars.iv, 11
+  %arrayidx.11 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1021
+  %11 = load double* %arrayidx.11, align 8, !tbaa !0
+  %add.11 = fadd double %11, 1.000000e+00
+  %arrayidx5.11 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1021
+  store double %add.11, double* %arrayidx5.11, align 8, !tbaa !0
+  %indvars.iv.next.1122 = or i64 %indvars.iv, 12
+  %arrayidx.12 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1122
+  %12 = load double* %arrayidx.12, align 32, !tbaa !0
+  %add.12 = fadd double %12, 1.000000e+00
+  %arrayidx5.12 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1122
+  store double %add.12, double* %arrayidx5.12, align 32, !tbaa !0
+  %indvars.iv.next.1223 = or i64 %indvars.iv, 13
+  %arrayidx.13 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1223
+  %13 = load double* %arrayidx.13, align 8, !tbaa !0
+  %add.13 = fadd double %13, 1.000000e+00
+  %arrayidx5.13 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1223
+  store double %add.13, double* %arrayidx5.13, align 8, !tbaa !0
+  %indvars.iv.next.1324 = or i64 %indvars.iv, 14
+  %arrayidx.14 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1324
+  %14 = load double* %arrayidx.14, align 16, !tbaa !0
+  %add.14 = fadd double %14, 1.000000e+00
+  %arrayidx5.14 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1324
+  store double %add.14, double* %arrayidx5.14, align 16, !tbaa !0
+  %indvars.iv.next.1425 = or i64 %indvars.iv, 15
+  %arrayidx.15 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1425
+  %15 = load double* %arrayidx.15, align 8, !tbaa !0
+  %add.15 = fadd double %15, 1.000000e+00
+  %arrayidx5.15 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1425
+  store double %add.15, double* %arrayidx5.15, align 8, !tbaa !0
+  %indvars.iv.next.15 = add i64 %indvars.iv, 16
+  %lftr.wideiv.15 = trunc i64 %indvars.iv.next.15 to i32
+  %exitcond.15 = icmp eq i32 %lftr.wideiv.15, 16000
+  br i1 %exitcond.15, label %for.end, label %for.body3
+
+for.end:                                          ; preds = %for.body3
+  %call = tail call i32 @dummy(double* getelementptr inbounds ([16000 x double]* @X, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @Y, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @Z, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @U, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @V, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @aa, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @bb, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @cc, i64 0, i64 0), double 0.000000e+00) nounwind
+  %inc7 = add nsw i32 %nl.010, 1
+  %exitcond = icmp eq i32 %inc7, 400000
+  br i1 %exitcond, label %for.end8, label %for.cond1.preheader
+
+for.end8:                                         ; preds = %for.end
+  ret i32 0
+
+; CHECK: @s000
+; CHECK: mtctr
+; CHECK: bdnz
+}
+
+declare i32 @dummy(double*, double*, double*, double*, double*, [256 x double]*, [256 x double]*, [256 x double]*, double)
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/ctrloop-sums.ll b/test/CodeGen/PowerPC/ctrloop-sums.ll
new file mode 100644
index 00000000000..eae8c38eee0
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-sums.ll
@@ -0,0 +1,134 @@
+; ModuleID = 'SingleSource/Regression/C/sumarray2d.c'
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+
+@.str = private unnamed_addr constant [23 x i8] c"Sum(Array[%d,%d] = %d\0A\00", align 1
+
+define i32 @SumArray([100 x i32]* nocapture %Array, i32 %NumI, i32 %NumJ) nounwind readonly {
+entry:
+  %cmp12 = icmp eq i32 %NumI, 0
+  br i1 %cmp12, label %for.end8, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp29 = icmp eq i32 %NumJ, 0
+  br i1 %cmp29, label %for.inc6, label %for.body3.lr.ph.us
+
+for.inc6.us:                                      ; preds = %for.body3.us
+  %indvars.iv.next17 = add i64 %indvars.iv16, 1
+  %lftr.wideiv18 = trunc i64 %indvars.iv.next17 to i32
+  %exitcond19 = icmp eq i32 %lftr.wideiv18, %NumI
+  br i1 %exitcond19, label %for.end8, label %for.body3.lr.ph.us
+
+for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
+  %indvars.iv = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next, %for.body3.us ]
+  %Result.111.us = phi i32 [ %Result.014.us, %for.body3.lr.ph.us ], [ %add.us, %for.body3.us ]
+  %arrayidx5.us = getelementptr inbounds [100 x i32]* %Array, i64 %indvars.iv16, i64 %indvars.iv
+  %0 = load i32* %arrayidx5.us, align 4, !tbaa !0
+  %add.us = add nsw i32 %0, %Result.111.us
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %NumJ
+  br i1 %exitcond, label %for.inc6.us, label %for.body3.us
+
+for.body3.lr.ph.us:                               ; preds = %for.inc6.us, %for.cond1.preheader.lr.ph
+  %indvars.iv16 = phi i64 [ %indvars.iv.next17, %for.inc6.us ], [ 0, %for.cond1.preheader.lr.ph ]
+  %Result.014.us = phi i32 [ %add.us, %for.inc6.us ], [ 0, %for.cond1.preheader.lr.ph ]
+  br label %for.body3.us
+
+for.inc6:                                         ; preds = %for.inc6, %for.cond1.preheader.lr.ph
+  %i.013 = phi i32 [ %inc7, %for.inc6 ], [ 0, %for.cond1.preheader.lr.ph ]
+  %inc7 = add i32 %i.013, 1
+  %exitcond20 = icmp eq i32 %inc7, %NumI
+  br i1 %exitcond20, label %for.end8, label %for.inc6
+
+for.end8:                                         ; preds = %for.inc6.us, %for.inc6, %entry
+  %Result.0.lcssa = phi i32 [ 0, %entry ], [ %add.us, %for.inc6.us ], [ 0, %for.inc6 ]
+  ret i32 %Result.0.lcssa
+; CHECK: @SumArray
+; CHECK: mtctr
+; CHECK: bdnz
+}
+
+define i32 @main() nounwind {
+entry:
+  %Array = alloca [100 x [100 x i32]], align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv33 = phi i64 [ 0, %entry ], [ %indvars.iv.next34, %for.body ]
+  %0 = trunc i64 %indvars.iv33 to i32
+  %sub = sub i32 0, %0
+  %arrayidx2 = getelementptr inbounds [100 x [100 x i32]]* %Array, i64 0, i64 %indvars.iv33, i64 %indvars.iv33
+  store i32 %sub, i32* %arrayidx2, align 4, !tbaa !0
+  %indvars.iv.next34 = add i64 %indvars.iv33, 1
+  %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+  %exitcond36 = icmp eq i32 %lftr.wideiv35, 100
+  br i1 %exitcond36, label %for.cond6.preheader, label %for.body
+
+for.cond6.preheader:                              ; preds = %for.body, %for.inc17
+  %indvars.iv29 = phi i64 [ %indvars.iv.next30, %for.inc17 ], [ 0, %for.body ]
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.inc14, %for.cond6.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond6.preheader ], [ %indvars.iv.next, %for.inc14 ]
+  %1 = trunc i64 %indvars.iv to i32
+  %2 = trunc i64 %indvars.iv29 to i32
+  %cmp9 = icmp eq i32 %1, %2
+  br i1 %cmp9, label %for.inc14, label %if.then
+
+if.then:                                          ; preds = %for.body8
+  %3 = add i64 %indvars.iv, %indvars.iv29
+  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]]* %Array, i64 0, i64 %indvars.iv29, i64 %indvars.iv
+  %4 = trunc i64 %3 to i32
+  store i32 %4, i32* %arrayidx13, align 4, !tbaa !0
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body8, %if.then
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv27 = trunc i64 %indvars.iv.next to i32
+  %exitcond28 = icmp eq i32 %lftr.wideiv27, 100
+  br i1 %exitcond28, label %for.inc17, label %for.body8
+
+for.inc17:                                        ; preds = %for.inc14
+  %indvars.iv.next30 = add i64 %indvars.iv29, 1
+  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+  %exitcond32 = icmp eq i32 %lftr.wideiv31, 100
+  br i1 %exitcond32, label %for.body3.lr.ph.us.i, label %for.cond6.preheader
+
+for.inc6.us.i:                                    ; preds = %for.body3.us.i
+  %indvars.iv.next17.i = add i64 %indvars.iv16.i, 1
+  %lftr.wideiv24 = trunc i64 %indvars.iv.next17.i to i32
+  %exitcond25 = icmp eq i32 %lftr.wideiv24, 100
+  br i1 %exitcond25, label %SumArray.exit, label %for.body3.lr.ph.us.i
+
+for.body3.us.i:                                   ; preds = %for.body3.lr.ph.us.i, %for.body3.us.i
+  %indvars.iv.i = phi i64 [ 0, %for.body3.lr.ph.us.i ], [ %indvars.iv.next.i, %for.body3.us.i ]
+  %Result.111.us.i = phi i32 [ %Result.014.us.i, %for.body3.lr.ph.us.i ], [ %add.us.i, %for.body3.us.i ]
+  %arrayidx5.us.i = getelementptr inbounds [100 x [100 x i32]]* %Array, i64 0, i64 %indvars.iv16.i, i64 %indvars.iv.i
+  %5 = load i32* %arrayidx5.us.i, align 4, !tbaa !0
+  %add.us.i = add nsw i32 %5, %Result.111.us.i
+  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next.i to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 100
+  br i1 %exitcond, label %for.inc6.us.i, label %for.body3.us.i
+
+for.body3.lr.ph.us.i:                             ; preds = %for.inc17, %for.inc6.us.i
+  %indvars.iv16.i = phi i64 [ %indvars.iv.next17.i, %for.inc6.us.i ], [ 0, %for.inc17 ]
+  %Result.014.us.i = phi i32 [ %add.us.i, %for.inc6.us.i ], [ 0, %for.inc17 ]
+  br label %for.body3.us.i
+
+SumArray.exit:                                    ; preds = %for.inc6.us.i
+  %call20 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([23 x i8]* @.str, i64 0, i64 0), i32 100, i32 100, i32 %add.us.i) nounwind
+  ret i32 0
+
+; CHECK: @main
+; CHECK: mtctr
+; CHECK: bdnz
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}