diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 9c5f3aab9b8..911b84dc7de 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -688,7 +688,7 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
   unsigned OpNum = Ops[0];
   unsigned Opc = MI->getOpcode();
   MachineInstr *NewMI = NULL;
-  if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) {
+  if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { // FIXME: tMOVgpr2gpr etc.?
     // If it is updating CPSR, then it cannot be folded.
     if (MI->getOperand(4).getReg() != ARM::CPSR || MI->getOperand(4).isDead()) {
       unsigned Pred = MI->getOperand(2).getImm();
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 97b9ad1dc3b..a72e9dd3bd6 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -1289,11 +1289,12 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {
           AFI->getDPRCalleeSavedAreaOffset()||
           hasFP(MF)) {
         if (NumBytes) {
-          unsigned SUBriOpc = isARM ? ARM::SUBri : ARM::t2SUBri;
-          BuildMI(MBB, MBBI, dl, TII.get(SUBriOpc), ARM::SP)
-            .addReg(FramePtr)
-            .addImm(NumBytes)
-            .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+          if (isARM)
+            emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+                                    ARMCC::AL, 0, TII);
+          else
+            emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+                                    ARMCC::AL, 0, TII);
         } else {
           // Thumb2 or ARM.
           unsigned MOVrOpc = isARM ? ARM::MOVr : ARM::t2MOVr;
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index d996b24ddf0..04f4dd3ca49 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -122,6 +122,8 @@ private:
   SDNode *SelectARMIndexedLoad(SDValue Op);
   SDNode *SelectT2IndexedLoad(SDValue Op);
 
+  /// SelectDYN_ALLOC - Select dynamic alloc for Thumb.
+  SDNode *SelectDYN_ALLOC(SDValue Op);
 
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
@@ -868,6 +870,59 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDValue Op) {
   return NULL;
 }
 
+SDNode *ARMDAGToDAGISel::SelectDYN_ALLOC(SDValue Op) {
+  SDNode *N = Op.getNode();
+  DebugLoc dl = N->getDebugLoc();
+  MVT VT = Op.getValueType();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  SDValue Align = Op.getOperand(2);
+  SDValue SP = CurDAG->getRegister(ARM::SP, MVT::i32);
+  int32_t AlignVal = cast<ConstantSDNode>(Align)->getSExtValue();
+  if (AlignVal < 0)
+    // We need to align the stack. Use Thumb1 tAND which is the only thumb
+    // instruction that can read and write SP. This matches to a pseudo
+    // instruction that has a chain to ensure the result is written back to
+    // the stack pointer.
+    SP = SDValue(CurDAG->getTargetNode(ARM::tANDsp, dl, VT, SP, Align), 0);
+
+  bool isC = isa<ConstantSDNode>(Size);
+  uint32_t C = isC ? cast<ConstantSDNode>(Size)->getZExtValue() : ~0UL;
+  // Handle the most common case for both Thumb1 and Thumb2:
+  // tSUBspi - immediate is between 0 ... 508 inclusive.
+  if (C <= 508 && ((C & 3) == 0))
+    // FIXME: tSUBspi encode scale 4 implicitly.
+    return CurDAG->SelectNodeTo(N, ARM::tSUBspi_, VT, MVT::Other, SP,
+                                CurDAG->getTargetConstant(C/4, MVT::i32),
+                                Chain);
+
+  if (Subtarget->isThumb1Only()) {
+    // Use tADDrSPr since Thumb1 does not have a sub r, sp, r. ARMISelLowering
+    // should have negated the size operand already. FIXME: We can't insert
+    // new target independent node at this stage so we are forced to negate
+    // it earlier. Is there a better solution? 
+    return CurDAG->SelectNodeTo(N, ARM::tADDspr_, VT, MVT::Other, SP, Size,
+                                Chain);
+  } else if (Subtarget->isThumb2()) {
+    if (isC && Predicate_t2_so_imm(Size.getNode())) {
+      // t2SUBrSPi
+      SDValue Ops[] = { SP, CurDAG->getTargetConstant(C, MVT::i32), Chain };
+      return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPi_, VT, MVT::Other, Ops, 3);
+    } else if (isC && Predicate_imm0_4095(Size.getNode())) {
+      // t2SUBrSPi12
+      SDValue Ops[] = { SP, CurDAG->getTargetConstant(C, MVT::i32), Chain };
+      return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPi12_, VT, MVT::Other, Ops, 3);
+    } else {
+      // t2SUBrSPs
+      SDValue Ops[] = { SP, Size,
+                        getI32Imm(ARM_AM::getSORegOpc(ARM_AM::lsl,0)), Chain };
+      return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPs_, VT, MVT::Other, Ops, 4);
+    }
+  }
+
+  // FIXME: Add ADD / SUB sp instructions for ARM.
+  return 0;
+}
 
 SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
   SDNode *N = Op.getNode();
@@ -941,25 +996,8 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
       return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
     }
   }
-  case ISD::ADD: {
-    if (!Subtarget->isThumb1Only())
-      break;
-    // Select add sp, c to tADDhirr.
-    SDValue N0 = Op.getOperand(0);
-    SDValue N1 = Op.getOperand(1);
-    RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(Op.getOperand(0));
-    RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(Op.getOperand(1));
-    if (LHSR && LHSR->getReg() == ARM::SP) {
-      std::swap(N0, N1);
-      std::swap(LHSR, RHSR);
-    }
-    if (RHSR && RHSR->getReg() == ARM::SP) {
-      SDValue Val = SDValue(CurDAG->getTargetNode(ARM::tMOVtgpr2gpr, dl,
-                                                  Op.getValueType(), N0, N0),0);
-      return CurDAG->SelectNodeTo(N, ARM::tADDhirr, Op.getValueType(), Val, N1);
-    }
-    break;
-  }
+  case ARMISD::DYN_ALLOC:
+    return SelectDYN_ALLOC(Op);
   case ISD::MUL:
     if (Subtarget->isThumb1Only())
       break;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0bfe213cf04..4922f7d12e7 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -307,7 +307,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,   Expand);
+  if (Subtarget->isThumb())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
   setOperationAction(ISD::MEMBARRIER,         MVT::Other, Expand);
 
   if (!Subtarget->hasV6Ops() && !Subtarget->isThumb2()) {
@@ -435,6 +438,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
 
   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
 
+  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
+
   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
   case ARMISD::VCGE:          return "ARMISD::VCGE";
   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
@@ -1398,6 +1403,53 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
 }
 
+SDValue
+ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  MVT VT = Node->getValueType(0);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  SDValue Align = Op.getOperand(2);
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+
+  unsigned AlignVal = cast<ConstantSDNode>(Align)->getZExtValue();
+  unsigned StackAlign = getTargetMachine().getFrameInfo()->getStackAlignment();
+  if (AlignVal > StackAlign)
+    // Do this now since selection pass cannot introduce new target
+    // independent node.
+    Align = DAG.getConstant(-(uint64_t)AlignVal, VT);
+
+  // In Thumb1 mode, there isn't a "sub r, sp, r" instruction, we will end up
+  // using a "add r, sp, r" instead. Negate the size now so we don't have to
+  // do even more horrible hack later.
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  if (AFI->isThumb1OnlyFunction()) {
+    bool Negate = true;
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size);
+    if (C) {
+      uint32_t Val = C->getZExtValue();
+      if (Val <= 508 && ((Val & 3) == 0))
+        Negate = false;
+    }
+    if (Negate)
+      Size = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), Size);
+  }
+
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other);
+  SDValue Ops1[] = { Chain, Size, Align };
+  SDValue Res = DAG.getNode(ARMISD::DYN_ALLOC, dl, VTList, Ops1, 3);
+  Chain = Res.getValue(1);
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
+                             DAG.getIntPtrConstant(0, true), SDValue());
+  SDValue Ops2[] = { Res, Chain };
+  return DAG.getMergeValues(Ops2, 2, dl);
+}
+
 SDValue
 ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                         SDValue &Root, SelectionDAG &DAG,
@@ -2396,6 +2448,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG, Subtarget);
   case ISD::BR_CC:         return LowerBR_CC(Op, DAG, Subtarget);
   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::VASTART:       return LowerVASTART(Op, DAG, VarArgsFrameIndex);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
@@ -2454,7 +2507,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   switch (MI->getOpcode()) {
-  default: assert(false && "Unexpected instr type to insert");
+  default:
+    llvm_unreachable("Unexpected instr type to insert");
   case ARM::tMOVCCr: {
     // To "insert" a SELECT_CC instruction, we actually have to insert the
     // diamond control-flow pattern.  The incoming instruction knows the
@@ -2509,6 +2563,78 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
     return BB;
   }
+
+  case ARM::tANDsp:
+  case ARM::tADDspr_:
+  case ARM::tSUBspi_:
+  case ARM::t2SUBrSPi_:
+  case ARM::t2SUBrSPi12_:
+  case ARM::t2SUBrSPs_: {
+    MachineFunction *MF = BB->getParent();
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned SrcReg = MI->getOperand(1).getReg();
+    bool DstIsDead = MI->getOperand(0).isDead();
+    bool SrcIsKill = MI->getOperand(1).isKill();
+
+    if (SrcReg != ARM::SP) {
+      // Copy the source to SP from virtual register.
+      const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg);
+      unsigned CopyOpc = (RC == ARM::tGPRRegisterClass)
+        ? ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr;
+      BuildMI(BB, dl, TII->get(CopyOpc), ARM::SP)
+        .addReg(SrcReg, getKillRegState(SrcIsKill));
+    }
+
+    unsigned OpOpc = 0;
+    bool NeedPred = false, NeedCC = false, NeedOp3 = false;
+    switch (MI->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected pseudo instruction!");
+    case ARM::tANDsp:
+      OpOpc = ARM::tAND;
+      NeedPred = true;
+      break;
+    case ARM::tADDspr_:
+      OpOpc = ARM::tADDspr;
+      break;
+    case ARM::tSUBspi_:
+      OpOpc = ARM::tSUBspi;
+      break;
+    case ARM::t2SUBrSPi_:
+      OpOpc = ARM::t2SUBrSPi;
+      NeedPred = true; NeedCC = true;
+      break;
+    case ARM::t2SUBrSPi12_:
+      OpOpc = ARM::t2SUBrSPi12;
+      NeedPred = true;
+      break;
+    case ARM::t2SUBrSPs_:
+      OpOpc = ARM::t2SUBrSPs;
+      NeedPred = true; NeedCC = true; NeedOp3 = true;
+      break;
+    }
+    MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(OpOpc), ARM::SP);
+    if (OpOpc == ARM::tAND)
+      AddDefaultT1CC(MIB);
+    MIB.addReg(ARM::SP);
+    MIB.addOperand(MI->getOperand(2));
+    if (NeedOp3)
+      MIB.addOperand(MI->getOperand(3));
+    if (NeedPred)
+      AddDefaultPred(MIB);
+    if (NeedCC)
+      AddDefaultCC(MIB);
+
+    // Copy the result from SP to virtual register.
+    const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg);
+    unsigned CopyOpc = (RC == ARM::tGPRRegisterClass)
+      ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr;
+    BuildMI(BB, dl, TII->get(CopyOpc))
+      .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead))
+      .addReg(ARM::SP);
+    MF->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+    return BB;
+  }
   }
 }
 
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 4fe4d8bf943..4649c18de8d 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -65,11 +65,13 @@ namespace llvm {
       FMRRD,        // double to two gprs.
       FMDRR,        // Two gprs to double.
 
-      EH_SJLJ_SETJMP,    // SjLj exception handling setjmp
-      EH_SJLJ_LONGJMP,   // SjLj exception handling longjmp
+      EH_SJLJ_SETJMP,    // SjLj exception handling setjmp.
+      EH_SJLJ_LONGJMP,   // SjLj exception handling longjmp.
 
       THREAD_POINTER,
 
+      DYN_ALLOC,    // Dynamic allocation on the stack.
+
       VCEQ,         // Vector compare equal.
       VCGE,         // Vector compare greater than or equal.
       VCGEU,        // Vector compare unsigned greater than or equal.
@@ -255,6 +257,7 @@ namespace llvm {
     SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG);
     SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG);
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
 
     SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
                                       SDValue Chain,
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 5dfc4fc25a8..9b54e67bedf 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -138,18 +138,37 @@ def tADDrPCi : T1I<(outs tGPR:$dst), (ins i32imm:$rhs), IIC_iALU,
                   "add $dst, pc, $rhs * 4", []>;
 
 // ADD rd, sp, #imm8
-// FIXME: hard code sp?
 def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, i32imm:$rhs), IIC_iALU,
                   "add $dst, $sp, $rhs * 4 @ addrspi", []>;
 
 // ADD sp, sp, #imm7
-// FIXME: hard code sp?
-def tADDspi : T1It<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALU,
+def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALU,
                   "add $dst, $rhs * 4", []>;
 
-// FIXME: Make use of the following?
+// SUB sp, sp, #imm7
+def tSUBspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALU,
+                  "sub $dst, $rhs * 4", []>;
+
 // ADD rm, sp, rm
+def tADDrSPr : TI<(outs GPR:$dst), (ins GPR:$sp, GPR:$rhs), IIC_iALU,
+                  "add $dst, $sp, $rhs", []>;
+
 // ADD sp, rm
+def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALU,
+                  "add $dst, $rhs", []>;
+
+// Pseudo instruction that will expand into a tSUBspi + a copy.
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+def tSUBspi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
+               NoItinerary, "@ sub $dst, $rhs * 4", []>;
+
+def tADDspr_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
+               NoItinerary, "@ add $dst, $rhs", []>;
+
+let Defs = [CPSR] in
+def tANDsp : PseudoInst<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+             NoItinerary, "@ and $dst, $rhs", []>;
+} // usesCustomDAGSchedInserter
 
 //===----------------------------------------------------------------------===//
 //  Control Flow Instructions.
@@ -549,9 +568,6 @@ def tSUBrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALU,
 
 // TODO: A7-96: STMIA - store multiple.
 
-def tSUBspi : T1It<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALU,
-                  "sub $dst, $rhs * 4", []>;
-
 // sign-extend byte
 def tSXTB  : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iALU,
                   "sxtb", " $dst, $src",
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 9305c8ad6b6..11b0454802c 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -75,7 +75,8 @@ def imm1_31 : PatLeaf<(i32 imm), [{
 }]>;
 
 /// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095].
-def imm0_4095 : PatLeaf<(i32 imm), [{
+def imm0_4095 : Operand<i32>,
+                PatLeaf<(i32 imm), [{
   return (uint32_t)N->getZExtValue() < 4096;
 }]>;
 
@@ -239,7 +240,7 @@ multiclass T2I_bin_ii12rs<string opc, PatFrag opnode, bit Commutable = 0> {
                  opc, ".w $dst, $lhs, $rhs",
                  [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>;
    // 12-bit imm
-   def ri12 : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALU,
+   def ri12 : T2sI<(outs GPR:$dst), (ins GPR:$lhs, imm0_4095:$rhs), IIC_iALU,
                    !strconcat(opc, "w"), " $dst, $lhs, $rhs",
                    [(set GPR:$dst, (opnode GPR:$lhs, imm0_4095:$rhs))]>;
    // register
@@ -431,6 +432,39 @@ def t2LEApcrelJT : T2XI<(outs GPR:$dst),
                         (ins i32imm:$label, i32imm:$id, pred:$p), IIC_iALU,
                         "adr$p.w $dst, #${label}_${id:no_hash}", []>;
 
+
+// ADD r, sp, {so_imm|i12}
+def t2ADDrSPi   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), IIC_iALU,
+                       "add", ".w $dst, $sp, $imm", []>;
+def t2ADDrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), IIC_iALU,
+                       "addw", " $dst, $sp, $imm", []>;
+
+// ADD r, sp, so_reg
+def t2ADDrSPs   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), IIC_iALU,
+                       "add", ".w $dst, $sp, $rhs", []>;
+
+// SUB r, sp, {so_imm|i12}
+def t2SUBrSPi   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), IIC_iALU,
+                       "sub", ".w $dst, $sp, $imm", []>;
+def t2SUBrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), IIC_iALU,
+                       "subw", " $dst, $sp, $imm", []>;
+
+// SUB r, sp, so_reg
+def t2SUBrSPs   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), IIC_iALU,
+                       "sub", " $dst, $sp, $rhs", []>;
+
+
+// Pseudo instruction that will expand into a t2SUBrSPi + a copy.
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+def t2SUBrSPi_   : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
+                   NoItinerary, "@ sub.w $dst, $sp, $imm", []>;
+def t2SUBrSPi12_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
+                   NoItinerary, "@ subw $dst, $sp, $imm", []>;
+def t2SUBrSPs_   : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
+                   NoItinerary, "@ sub $dst, $sp, $rhs", []>;
+} // usesCustomDAGSchedInserter
+
+
 //===----------------------------------------------------------------------===//
 //  Load / store Instructions.
 //
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index a81b790f9dc..ea80e47589d 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -206,9 +206,13 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
       if (NewBase == 0)
         return false;
     }
-    int BaseOpc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
+    int BaseOpc = !isThumb2
+      ? ARM::ADDri
+      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
     if (Offset < 0) {
-      BaseOpc = isThumb2 ? ARM::t2SUBri : ARM::SUBri;
+      BaseOpc = !isThumb2
+        ? ARM::SUBri
+        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
       Offset = - Offset;
     }
     int ImmedOffset = isThumb2
@@ -329,6 +333,9 @@ static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
   if (!MI)
     return false;
   if (MI->getOpcode() != ARM::t2SUBri &&
+      MI->getOpcode() != ARM::t2SUBrSPi &&
+      MI->getOpcode() != ARM::t2SUBrSPi12 &&
+      MI->getOpcode() != ARM::tSUBspi &&
       MI->getOpcode() != ARM::SUBri)
     return false;
 
@@ -336,9 +343,10 @@ static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
   if (Bytes <= 0 || (Limit && Bytes >= Limit))
     return false;
 
+  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
   return (MI->getOperand(0).getReg() == Base &&
           MI->getOperand(1).getReg() == Base &&
-          MI->getOperand(2).getImm() == Bytes &&
+          (MI->getOperand(2).getImm()*Scale) == Bytes &&
           getInstrPredicate(MI, MyPredReg) == Pred &&
           MyPredReg == PredReg);
 }
@@ -350,6 +358,9 @@ static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
   if (!MI)
     return false;
   if (MI->getOpcode() != ARM::t2ADDri &&
+      MI->getOpcode() != ARM::t2ADDrSPi &&
+      MI->getOpcode() != ARM::t2ADDrSPi12 &&
+      MI->getOpcode() != ARM::tADDspi &&
       MI->getOpcode() != ARM::ADDri)
     return false;
 
@@ -357,9 +368,10 @@ static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
     // Make sure the offset fits in 8 bits.
     return false;
 
+  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
   return (MI->getOperand(0).getReg() == Base &&
           MI->getOperand(1).getReg() == Base &&
-          MI->getOperand(2).getImm() == Bytes &&
+          (MI->getOperand(2).getImm()*Scale) == Bytes &&
           getInstrPredicate(MI, MyPredReg) == Pred &&
           MyPredReg == PredReg);
 }
diff --git a/lib/Target/ARM/README-Thumb2.txt b/lib/Target/ARM/README-Thumb2.txt
index 651d39311e0..48b52781228 100644
--- a/lib/Target/ARM/README-Thumb2.txt
+++ b/lib/Target/ARM/README-Thumb2.txt
@@ -1,3 +1,7 @@
 //===---------------------------------------------------------------------===//
 // Random ideas for the ARM backend (Thumb2 specific).
 //===---------------------------------------------------------------------===//
+
+We should be using ADD / SUB rd, sp, rm <shift> instructions.
+
+copyRegToReg should use tMOVgpr2gpr instead of t2MOVr?
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index cf2d09912d4..2b329e03db3 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -65,11 +65,15 @@ Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
 
   if (DestRC == ARM::GPRRegisterClass &&
       SrcRC == ARM::GPRRegisterClass) {
-    AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2MOVr),
-                                        DestReg).addReg(SrcReg)));
+    // FIXME: Just use tMOVgpr2gpr since it's shorter?
+    if (SrcReg == ARM::SP || DestReg == ARM::SP)
+      BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg);
+    else
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2MOVr),
+                                          DestReg).addReg(SrcReg)));
     return true;
   } else if (DestRC == ARM::GPRRegisterClass &&
-      SrcRC == ARM::tGPRRegisterClass) {
+             SrcRC == ARM::tGPRRegisterClass) {
     BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg);
     return true;
   } else if (DestRC == ARM::tGPRRegisterClass &&
@@ -162,26 +166,62 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
   }
 
   while (NumBytes) {
-    unsigned Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
     unsigned ThisVal = NumBytes;
-    if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
-      NumBytes = 0;
-    } else if (ThisVal < 4096) {
-      Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
-      NumBytes = 0;
+    unsigned Opc = 0;
+    if (DestReg == ARM::SP && BaseReg != ARM::SP) {
+      // mov sp, rn. Note t2MOVr cannot be used.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr),DestReg).addReg(BaseReg);
+      BaseReg = ARM::SP;
+      continue;
+    }
+
+    if (BaseReg == ARM::SP) {
+      // sub sp, sp, #imm7
+      if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
+        assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
+        Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+        // FIXME: Fix Thumb1 immediate encoding.
+        BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+          .addReg(BaseReg).addImm(ThisVal/4);
+        NumBytes = 0;
+        continue;
+      }
+
+      // sub rd, sp, so_imm
+      Opc = isSub ? ARM::t2SUBrSPi : ARM::t2ADDrSPi;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
     } else {
-      // FIXME: Move this to ARMAddressingModes.h?
-      unsigned RotAmt = CountLeadingZeros_32(ThisVal);
-      ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
-      NumBytes &= ~ThisVal;
-      assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
-             "Bit extraction didn't work?");
+      assert(DestReg != ARM::SP && BaseReg != ARM::SP);
+      Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else if (ThisVal < 4096) {
+        Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
     }
 
     // Build the new ADD / SUB.
-    BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
-      .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
-      .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+    AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+                                .addReg(BaseReg, RegState::Kill)
+                                .addImm(ThisVal)));
+
     BaseReg = DestReg;
   }
 }
@@ -288,7 +328,6 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
                               unsigned FrameReg, int Offset,
                               const ARMBaseInstrInfo &TII) {
   unsigned Opcode = MI.getOpcode();
-  unsigned NewOpc = Opcode;
   const TargetInstrDesc &Desc = MI.getDesc();
   unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
   bool isSub = false;
@@ -299,9 +338,12 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
   
   if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
     Offset += MI.getOperand(FrameRegIdx+1).getImm();
+
+    bool isSP = FrameReg == ARM::SP;
     if (Offset == 0) {
       // Turn it into a move.
-      MI.setDesc(TII.get(ARM::t2MOVr));
+      unsigned NewOpc = isSP ? ARM::tMOVgpr2gpr : ARM::t2MOVr;
+      MI.setDesc(TII.get(NewOpc));
       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
       MI.RemoveOperand(FrameRegIdx+1);
       return 0;
@@ -310,23 +352,23 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
     if (Offset < 0) {
       Offset = -Offset;
       isSub = true;
-      MI.setDesc(TII.get(ARM::t2SUBri));
+      MI.setDesc(TII.get(isSP ? ARM::t2SUBrSPi : ARM::t2SUBri));
+    } else {
+      MI.setDesc(TII.get(isSP ? ARM::t2ADDrSPi : ARM::t2ADDri));
     }
 
     // Common case: small offset, fits into instruction.
     if (ARM_AM::getT2SOImmVal(Offset) != -1) {
-      NewOpc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
-      if (NewOpc != Opcode)
-        MI.setDesc(TII.get(NewOpc));
       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
       MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
       return 0;
     }
     // Another common case: imm12.
     if (Offset < 4096) {
-      NewOpc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
-      if (NewOpc != Opcode)
-        MI.setDesc(TII.get(NewOpc));
+      unsigned NewOpc = isSP
+        ? (isSub ? ARM::t2SUBrSPi12 : ARM::t2ADDrSPi12)
+        : (isSub ? ARM::t2SUBri12   : ARM::t2ADDri12);
+      MI.setDesc(TII.get(NewOpc));
       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
       MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
       return 0;
@@ -346,7 +388,7 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
   } else {
     // AddrModeT2_so cannot handle any offset. If there is no offset
     // register then we change to an immediate version.
-    NewOpc = Opcode;
+    unsigned NewOpc = Opcode;
     if (AddrMode == ARMII::AddrModeT2_so) {
       unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg();
       if (OffsetReg != 0) {
diff --git a/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll b/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll
new file mode 100644
index 00000000000..bdc23bab7c8
--- /dev/null
+++ b/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll
@@ -0,0 +1,24 @@
+; RUN: llvm-as < %s | llc -mtriple=thumbv7-none-linux-gnueabi | FileCheck %s
+; PR4659
+; PR4682
+
+define hidden arm_aapcscc i32 @__gcov_execlp(i8* %path, i8* %arg, ...) nounwind {
+entry:
+; CHECK: __gcov_execlp:
+; CHECK: mov sp, r7
+; CHECK: sub sp, #1 * 4
+	call arm_aapcscc  void @__gcov_flush() nounwind
+	br i1 undef, label %bb5, label %bb
+
+bb:		; preds = %bb, %entry
+	br i1 undef, label %bb5, label %bb
+
+bb5:		; preds = %bb, %entry
+	%0 = alloca i8*, i32 undef, align 4		; <i8**> [#uses=1]
+	%1 = call arm_aapcscc  i32 @execvp(i8* %path, i8** %0) nounwind		; <i32> [#uses=1]
+	ret i32 %1
+}
+
+declare hidden arm_aapcscc void @__gcov_flush()
+
+declare arm_aapcscc i32 @execvp(i8*, i8**) nounwind
diff --git a/test/CodeGen/Thumb2/large-stack.ll b/test/CodeGen/Thumb2/large-stack.ll
index 60604f020ff..d183da44c12 100644
--- a/test/CodeGen/Thumb2/large-stack.ll
+++ b/test/CodeGen/Thumb2/large-stack.ll
@@ -2,7 +2,7 @@
 
 define void @test1() {
 ; CHECK: test1:
-; CHECK: sub.w sp, sp, #256
+; CHECK: sub sp, #64 * 4
     %tmp = alloca [ 64 x i32 ] , align 4
     ret void
 }
@@ -10,7 +10,7 @@ define void @test1() {
 define void @test2() {
 ; CHECK: test2:
 ; CHECK: sub.w sp, sp, #4160
-; CHECK: sub.w sp, sp, #8
+; CHECK: sub sp, #2 * 4
     %tmp = alloca [ 4168 x i8 ] , align 4
     ret void
 }
@@ -18,7 +18,7 @@ define void @test2() {
 define i32 @test3() {
 ; CHECK: test3:
 ; CHECK: sub.w sp, sp, #805306368
-; CHECK: sub.w sp, sp, #16
+; CHECK: sub sp, #4 * 4
     %retval = alloca i32, align 4
     %tmp = alloca i32, align 4
     %a = alloca [805306369 x i8], align 16