diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index bef191fc11e..b6f8c320ba5 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -111,8 +111,12 @@ namespace { /// Index into the basic block where the merged instruction will be /// inserted. (See MemOpQueueEntry.Position) unsigned InsertPos; + /// Whether the instructions can be merged into a ldm/stm instruction. + bool CanMergeToLSMulti; + /// Whether the instructions can be merged into a ldrd/strd instruction. + bool CanMergeToLSDouble; }; - BumpPtrAllocator Allocator; + SpecificBumpPtrAllocator<MergeCandidate> Allocator; SmallVector<const MergeCandidate*,4> Candidates; void moveLiveRegsBefore(const MachineBasicBlock &MBB, @@ -122,11 +126,14 @@ namespace { MachineBasicBlock::iterator MBBI, DebugLoc DL, unsigned Base, unsigned WordOffset, ARMCC::CondCodes Pred, unsigned PredReg); - MachineInstr *MergeOps(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, int Offset, - unsigned Base, bool BaseKill, unsigned Opcode, - ARMCC::CondCodes Pred, unsigned PredReg, DebugLoc DL, - ArrayRef<std::pair<unsigned, bool>> Regs); + MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs); + MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const; void FormCandidates(const MemOpQueue &MemOps); MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand); bool FixInvalidRegPairOp(MachineBasicBlock &MBB, @@ -555,12 +562,10 @@ static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs, /// Create and insert a LDM or STM with Base as base register and registers in /// Regs as the register operands that would be loaded / stored.
It returns /// true if the transformation is done. -MachineInstr * -ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, int Offset, - unsigned Base, bool BaseKill, unsigned Opcode, - ARMCC::CondCodes Pred, unsigned PredReg, DebugLoc DL, - ArrayRef<std::pair<unsigned, bool>> Regs) { +MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) { unsigned NumRegs = Regs.size(); assert(NumRegs > 1); @@ -749,6 +754,28 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, return MIB.getInstr(); } +MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const { + bool IsLoad = isi32Load(Opcode); + assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store"); + unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8; + + assert(Regs.size() == 2); + MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL, + TII->get(LoadStoreOpcode)); + if (IsLoad) { + MIB.addReg(Regs[0].first, RegState::Define) + .addReg(Regs[1].first, RegState::Define); + } else { + MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second)) + .addReg(Regs[1].first, getKillRegState(Regs[1].second)); + } + MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + return MIB.getInstr(); +} + /// Call MergeOps and update MemOps and merges accordingly on success.
MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { const MachineInstr *First = Cand.Instrs.front(); @@ -799,7 +826,12 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg); DebugLoc DL = First->getDebugLoc(); - MachineInstr *Merged = MergeOps(MBB, InsertBefore, Offset, Base, BaseKill, + MachineInstr *Merged = nullptr; + if (Cand.CanMergeToLSDouble) + Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill, + Opcode, Pred, PredReg, DL, Regs); + if (!Merged && Cand.CanMergeToLSMulti) + Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill, Opcode, Pred, PredReg, DL, Regs); if (!Merged) return nullptr; @@ -861,6 +893,13 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { return Merged; } +static bool isValidLSDoubleOffset(int Offset) { + unsigned Value = abs(Offset); + // t2LDRDi8/t2STRDi8 supports an 8 bit immediate which is internally + // multiplied by 4. + return (Value % 4) == 0 && Value < 1024; +} + /// Find candidates for load/store multiple merge in list of MemOpQueueEntries. void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { const MachineInstr *FirstMI = MemOps[0].MI; @@ -880,29 +919,55 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { unsigned Latest = SIndex; unsigned Earliest = SIndex; unsigned Count = 1; + bool CanMergeToLSDouble = + STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset); + // ARM errata 602117: LDRD with base in list may result in incorrect base + // register when interrupted or faulted. + if (STI->isCortexM3() && isi32Load(Opcode) && + PReg == getLoadStoreBaseOp(*MI).getReg()) + CanMergeToLSDouble = false; - // Merge additional instructions fulfilling LDM/STM constraints. + bool CanMergeToLSMulti = true; + // On swift vldm/vstm starting with an odd register number as that needs + // more uops than single vldrs. 
+ if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1) + CanMergeToLSMulti = false; + + // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it + // deprecated; LDM to PC is fine but cannot happen here. + if (PReg == ARM::SP || PReg == ARM::PC) + CanMergeToLSMulti = CanMergeToLSDouble = false; + + // Merge following instructions where possible. for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) { int NewOffset = MemOps[I].Offset; if (NewOffset != Offset + (int)Size) break; const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI); unsigned Reg = MO.getReg(); - if (Reg == ARM::SP) - break; - unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg); - // Register numbers must be in ascending order. - if (RegNum <= PRegNum) - break; - // For VFP / NEON load/store multiples, the registers must be consecutive - // and within the limit on the number of registers per instruction. - if (!isNotVFP && RegNum != PRegNum+1) - break; - // On Swift we don't want vldm/vstm to start with a odd register num - // because Q register unaligned vldm/vstm need more uops. - if (!isNotVFP && STI->isSwift() && Count == 1 && (PRegNum % 2) == 1) + if (Reg == ARM::SP || Reg == ARM::PC) break; + // See if the current load/store may be part of a multi load/store. + unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg); + bool PartOfLSMulti = CanMergeToLSMulti; + if (PartOfLSMulti) { + // Register numbers must be in ascending order. + if (RegNum <= PRegNum) + PartOfLSMulti = false; + // For VFP / NEON load/store multiples, the registers must be + // consecutive and within the limit on the number of registers per + // instruction. + else if (!isNotVFP && RegNum != PRegNum+1) + PartOfLSMulti = false; + } + // See if the current load/store may be part of a double load/store. 
+ bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1; + + if (!PartOfLSMulti && !PartOfLSDouble) + break; + CanMergeToLSMulti &= PartOfLSMulti; + CanMergeToLSDouble &= PartOfLSDouble; // Track MemOp with latest and earliest position (Positions are // counted in reverse). unsigned Position = MemOps[I].Position; @@ -916,12 +981,16 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { } // Form a candidate from the Ops collected so far. - MergeCandidate *Candidate = new(Allocator) MergeCandidate; + MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate; for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C) Candidate->Instrs.push_back(MemOps[C].MI); Candidate->LatestMIIdx = Latest - SIndex; Candidate->EarliestMIIdx = Earliest - SIndex; Candidate->InsertPos = MemOps[Latest].Position; + if (Count == 1) + CanMergeToLSMulti = CanMergeToLSDouble = false; + Candidate->CanMergeToLSMulti = CanMergeToLSMulti; + Candidate->CanMergeToLSDouble = CanMergeToLSDouble; Candidates.push_back(Candidate); // Continue after the chain. SIndex += Count; @@ -1653,12 +1722,14 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // Go through list of candidates and merge. bool Changed = false; for (const MergeCandidate *Candidate : Candidates) { - if (Candidate->Instrs.size() > 1) { + if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) { MachineInstr *Merged = MergeOpsUpdate(*Candidate); // Merge preceding/trailing base inc/dec into the merged op. 
if (Merged) { - MergeBaseUpdateLSMultiple(Merged); Changed = true; + unsigned Opcode = Merged->getOpcode(); + if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8) + MergeBaseUpdateLSMultiple(Merged); } else { for (MachineInstr *MI : Candidate->Instrs) { if (MergeBaseUpdateLoadStore(MI)) @@ -1738,7 +1809,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { Modified |= MergeReturnIntoLDM(MBB); } - Allocator.Reset(); + Allocator.DestroyAll(); return Modified; } diff --git a/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll b/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll index c93d2a2d34f..ac5b6f9c970 100644 --- a/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll +++ b/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll @@ -25,8 +25,7 @@ entry: ;CHECK: push {r7, lr} ;CHECK: sub sp, #4 ;CHECK: add r0, sp, #12 - ;CHECK: str r2, [sp, #16] - ;CHECK: str r1, [sp, #12] + ;CHECK: strd r1, r2, [sp, #12] ;CHECK: bl fooUseStruct call void @fooUseStruct(%st_t* %p1) ret void diff --git a/test/CodeGen/ARM/byval-align.ll b/test/CodeGen/ARM/byval-align.ll index a26b5a79575..8a506280dd5 100644 --- a/test/CodeGen/ARM/byval-align.ll +++ b/test/CodeGen/ARM/byval-align.ll @@ -28,8 +28,7 @@ define i32 @test_align8(i8*, [4 x i32]* byval align 8 %b) { ; CHECK: push {r4, r7, lr} ; CHECK: add r7, sp, #4 -; CHECK-DAG: str r2, [r7, #8] -; CHECK-DAG: str r3, [r7, #12] +; CHECK: strd r2, r3, [r7, #8] ; CHECK: ldr r0, [r7, #8] diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll index f3e13671ac3..5411618ed86 100644 --- a/test/CodeGen/ARM/ldrd.ll +++ b/test/CodeGen/ARM/ldrd.ll @@ -3,6 +3,7 @@ ; rdar://6949835 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK +; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift 
| FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK ; Magic ARM pair hints works best with linearscan / fast. @@ -110,5 +111,25 @@ entry: ret void } +; CHECK-LABEL: strd_spill_ldrd_reload: +; A8: strd r1, r0, [sp] +; M3: strd r1, r0, [sp] +; BASIC: strd r1, r0, [sp] +; GREEDY: strd r0, r1, [sp] +; CHECK: @ InlineAsm Start +; CHECK: @ InlineAsm End +; A8: ldrd r2, r1, [sp] +; M3: ldrd r2, r1, [sp] +; BASIC: ldrd r2, r1, [sp] +; GREEDY: ldrd r1, r2, [sp] +; CHECK: bl{{x?}} _extfunc +define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) { + ; force %v0 and %v1 to be spilled + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{lr}"() + ; force the reloaded %v0, %v1 into different registers + call void @extfunc(i32 0, i32 %v0, i32 %v1, i32 7) + ret void +} + declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll index 191db1e20a2..f6f8d562350 100644 --- a/test/CodeGen/ARM/memset-inline.ll +++ b/test/CodeGen/ARM/memset-inline.ll @@ -4,8 +4,7 @@ define void @t1(i8* nocapture %c) nounwind optsize { entry: ; CHECK-LABEL: t1: ; CHECK: movs r1, #0 -; CHECK: str r1, [r0] -; CHECK: str r1, [r0, #4] +; CHECK: strd r1, r1, [r0] ; CHECK: str r1, [r0, #8] call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false) ret void diff --git a/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll index 96c5fb8961e..fe335df7a1a 100644 --- a/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll +++ b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll @@ -5,16 +5,20 @@ target triple = "thumbv7--linux-gnueabi" declare i8* @llvm.returnaddress(i32) -define i32* @wrong-t2stmia-size-reduction(i32* %addr, i32 %val0) minsize { +define i32* @wrong-t2stmia-size-reduction(i32* %addr, i32 %val0, i32 %val1) minsize { store i32 %val0, i32* %addr %addr1 = 
getelementptr i32, i32* %addr, i32 1 + %addr2 = getelementptr i32, i32* %addr, i32 2 %lr = call i8* @llvm.returnaddress(i32 0) %lr32 = ptrtoint i8* %lr to i32 - store i32 %lr32, i32* %addr1 - %addr2 = getelementptr i32, i32* %addr1, i32 1 - ret i32* %addr2 + store i32 %val1, i32* %addr1 + store i32 %lr32, i32* %addr2 + + %addr3 = getelementptr i32, i32* %addr, i32 3 + ret i32* %addr3 } -; Check that stm writes two registers. The bug caused one of registers (LR, +; Check that stm writes three registers. The bug caused one of registers (LR, ; which invalid for Thumb1 form of STMIA instruction) to be dropped. -; CHECK: stm{{[^,]*}}, {{{.*,.*}}} +; CHECK-LABEL: wrong-t2stmia-size-reduction: +; CHECK: stm{{[^,]*}}, {{{.*,.*,.*}}} diff --git a/test/CodeGen/Thumb2/aapcs.ll b/test/CodeGen/Thumb2/aapcs.ll index 21af8c119b0..299562fe4c5 100644 --- a/test/CodeGen/Thumb2/aapcs.ll +++ b/test/CodeGen/Thumb2/aapcs.ll @@ -33,8 +33,7 @@ define float @float_on_stack(double %a, double %b, double %c, double %d, double define double @double_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) { ; CHECK-LABEL: double_on_stack: -; SOFT: ldr r0, [sp, #48] -; SOFT: ldr r1, [sp, #52] +; SOFT: ldrd r0, r1, [sp, #48] ; HARD: vldr d0, [sp] ; CHECK-NEXT: bx lr ret double %i @@ -42,8 +41,7 @@ define double @double_on_stack(double %a, double %b, double %c, double %d, doubl define double @double_not_split(double %a, double %b, double %c, double %d, double %e, double %f, double %g, float %h, double %i) { ; CHECK-LABEL: double_not_split: -; SOFT: ldr r0, [sp, #48] -; SOFT: ldr r1, [sp, #52] +; SOFT: ldrd r0, r1, [sp, #48] ; HARD: vldr d0, [sp] ; CHECK-NEXT: bx lr ret double %i