diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
index 4f3f18ed499..d4d636d3479 100644
--- a/lib/Target/SystemZ/SystemZ.td
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -40,8 +40,8 @@ include "SystemZOperands.td"
 include "SystemZPatterns.td"
 include "SystemZInstrFormats.td"
 include "SystemZInstrInfo.td"
-include "SystemZInstrFP.td"
 include "SystemZInstrVector.td"
+include "SystemZInstrFP.td"
 
 def SystemZInstrInfo : InstrInfo {}
 
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 026a75f2140..f16488063af 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -80,6 +80,27 @@ static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
                                  Context);
 }
 
+// MI loads the high part of a vector from memory. Return an instruction
+// that uses replicating vector load Opcode to do the same thing.
+static MCInst lowerSubvectorLoad(const MachineInstr *MI, unsigned Opcode) {
+  return MCInstBuilder(Opcode)
+    .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+    .addReg(MI->getOperand(1).getReg())
+    .addImm(MI->getOperand(2).getImm())
+    .addReg(MI->getOperand(3).getReg());
+}
+
+// MI stores the high part of a vector to memory. Return an instruction
+// that uses elemental vector store Opcode to do the same thing.
+static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) {
+  return MCInstBuilder(Opcode)
+    .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+    .addReg(MI->getOperand(1).getReg())
+    .addImm(MI->getOperand(2).getImm())
+    .addReg(MI->getOperand(3).getReg())
+    .addImm(0);
+}
+
 void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   SystemZMCInstLower Lower(MF->getContext(), *this);
   MCInst LoweredMI;
@@ -158,6 +179,29 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()));
     break;
 
+  case SystemZ::VLR32:
+  case SystemZ::VLR64:
+    LoweredMI = MCInstBuilder(SystemZ::VLR)
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()));
+    break;
+
+  case SystemZ::VL32:
+    LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF);
+    break;
+
+  case SystemZ::VL64:
+    LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG);
+    break;
+
+  case SystemZ::VST32:
+    LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF);
+    break;
+
+  case SystemZ::VST64:
+    LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEG);
+    break;
+
   case SystemZ::LFER:
     LoweredMI = MCInstBuilder(SystemZ::VLGVF)
       .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg()))
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 391cb8c6fc9..ff79a48179f 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -91,9 +91,14 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
     addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
   else
     addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
-  addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
-  addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
-  addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+  addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
+  if (Subtarget.hasVector()) {
+    addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
+    addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
+  } else {
+    addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
+    addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+  }
   addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
 
   if (Subtarget.hasVector()) {
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index efa29fa9c00..27fbd7df288 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -46,9 +46,14 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
   defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>;
   defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>;
 }
-defm : CompareZeroFP<LTEBRCompare, FP32>;
-defm : CompareZeroFP<LTDBRCompare, FP64>;
-defm : CompareZeroFP<LTXBRCompare, FP128>;
+// Note that the comparison against zero operation is not available if we
+// have vector support, since load-and-test instructions will partially
+// clobber the target (vector) register.
+let Predicates = [FeatureNoVector] in {
+  defm : CompareZeroFP<LTEBRCompare, FP32>;
+  defm : CompareZeroFP<LTDBRCompare, FP64>;
+  defm : CompareZeroFP<LTXBRCompare, FP128>;
+}
 
 // Moves between 64-bit integer and floating-point registers.
 def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>;
@@ -98,6 +103,9 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
   defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
   defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
 
+  // For z13 we prefer LDE over LE to avoid partial register dependencies.
+  def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
+
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
   let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index dc9dfa801fd..71eb9986499 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2151,10 +2151,13 @@ class PrefetchRILPC<string mnemonic, bits<12> opcode,
 
 // A floating-point load-and test operation. Create both a normal unary
 // operation and one that acts as a comparison against zero.
+// Note that the comparison against zero operation is not available if we
+// have vector support, since load-and-test instructions will partially
+// clobber the target (vector) register.
 multiclass LoadAndTestRRE<string mnemonic, bits<16> opcode,
                           RegisterOperand cls> {
   def "" : UnaryRRE<mnemonic, opcode, null_frag, cls, cls>;
-  let isCodeGenOnly = 1 in
+  let isCodeGenOnly = 1, Predicates = [FeatureNoVector] in
     def Compare : CompareRRE<mnemonic, opcode, null_frag, cls, cls>;
 }
 
@@ -2401,6 +2404,23 @@ class Alias<int size, dag outs, dag ins, list<dag> pattern>
 class UnaryAliasVRS<RegisterOperand cls1, RegisterOperand cls2>
   : Alias<6, (outs cls1:$src1), (ins cls2:$src2), []>;
 
+// An alias of a UnaryVRR*, but with different register sizes.
+class UnaryAliasVRR<SDPatternOperator operator, TypedReg tr1, TypedReg tr2>
+  : Alias<6, (outs tr1.op:$V1), (ins tr2.op:$V2),
+          [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]>;
+
+// An alias of a UnaryVRX, but with different register sizes.
+class UnaryAliasVRX<SDPatternOperator operator, TypedReg tr,
+                    AddressingMode mode>
+  : Alias<6, (outs tr.op:$V1), (ins mode:$XBD2),
+          [(set tr.op:$V1, (tr.vt (operator mode:$XBD2)))]>;
+
+// An alias of a StoreVRX, but with different register sizes.
+class StoreAliasVRX<SDPatternOperator operator, TypedReg tr,
+                    AddressingMode mode>
+  : Alias<6, (outs), (ins tr.op:$V1, mode:$XBD2),
+          [(operator (tr.vt tr.op:$V1), mode:$XBD2)]>;
+
 // An alias of a BinaryRI, but with different register sizes.
 class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
                     Immediate imm>
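As a concrete illustration of what restricting CompareZeroFP (and the LoadAndTestRRE compare variant) to FeatureNoVector means in practice, consider the small IR sample below; the function name is illustrative only and is not part of the patch. A compare against zero like this is still expected to use ltdbr on z10, while on z13 the backend materializes zero and uses an ordinary compare instead, which is exactly what the updated f8 in fp-cmp-02.ll further down checks (lzdr %f1 followed by cdbr %f0, %f1).

define i64 @cmp_zero_sample(i64 %a, i64 %b, double %f) {
  %cond = fcmp oeq double %f, 0.0
  %res = select i1 %cond, i64 %a, i64 %b
  ret i64 %res
}
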
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 63101a9d000..8dbd9df32b7 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -578,6 +578,10 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     Opcode = SystemZ::LDR;
   else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg))
     Opcode = SystemZ::LXR;
+  else if (SystemZ::VR32BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::VLR32;
+  else if (SystemZ::VR64BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::VLR64;
   else if (SystemZ::VR128BitRegClass.contains(DestReg, SrcReg))
     Opcode = SystemZ::VLR;
   else
@@ -1118,6 +1122,12 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
   } else if (RC == &SystemZ::FP128BitRegClass) {
     LoadOpcode = SystemZ::LX;
     StoreOpcode = SystemZ::STX;
+  } else if (RC == &SystemZ::VR32BitRegClass) {
+    LoadOpcode = SystemZ::VL32;
+    StoreOpcode = SystemZ::VST32;
+  } else if (RC == &SystemZ::VR64BitRegClass) {
+    LoadOpcode = SystemZ::VL64;
+    StoreOpcode = SystemZ::VST64;
   } else if (RC == &SystemZ::VF128BitRegClass ||
              RC == &SystemZ::VR128BitRegClass) {
     LoadOpcode = SystemZ::VL;
diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td
index b6c8042b3c8..8abaeb69a20 100644
--- a/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/lib/Target/SystemZ/SystemZInstrVector.td
@@ -14,6 +14,8 @@ let Predicates = [FeatureVector] in {
 
   // Register move.
   def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>;
+  def VLR32 : UnaryAliasVRR<null_frag, v32eb, v32eb>;
+  def VLR64 : UnaryAliasVRR<null_frag, v64db, v64db>;
 
   // Load GR from VR element.
   def VLGVB : BinaryVRSc<"vlgvb", 0xE721, null_frag, v128b, 0>;
@@ -123,6 +125,13 @@ let Predicates = [FeatureVector] in {
   def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)),
             (VLREPG bdxaddr12only:$addr)>;
 
+  // Use VLREP to load subvectors.  These patterns use "12pair" because
+  // LEY and LDY offer full 20-bit displacement fields.  It's often better
+  // to use those instructions rather than force a 20-bit displacement
+  // into a GPR temporary.
+  def VL32 : UnaryAliasVRX<load, v32eb, bdxaddr12pair>;
+  def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
+
   // Load logical element and zero.
   def VLLEZB : UnaryVRX<"vllezb", 0xE704, z_vllezi8, v128b, 1, 0>;
   def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>;
@@ -193,6 +202,13 @@ let Predicates = [FeatureVector] in {
                         imm32zx1:$index),
             (VSTEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
 
+  // Use VSTE to store subvectors.  These patterns use "12pair" because
+  // STEY and STDY offer full 20-bit displacement fields.  It's often better
+  // to use those instructions rather than force a 20-bit displacement
+  // into a GPR temporary.
+  def VST32 : StoreAliasVRX<store, v32eb, bdxaddr12pair>;
+  def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;
+
   // Scatter element.
   def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>;
   def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>;
@@ -778,7 +794,7 @@ multiclass VectorRounding<Instruction insn, TypedReg tr> {
 let Predicates = [FeatureVector] in {
   // Add.
   def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>;
-  def WFADB : BinaryVRRc<"wfadb", 0xE7E3, null_frag, v64db, v64db, 3, 8>;
+  def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>;
 
   // Convert from fixed 64-bit.
   def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>;
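To see why WFADB and the other W-form defs below now carry real selection patterns instead of null_frag, consider a scalar operation whose operands start out in full vector registers; the function name here is illustrative, and the same shape is added as f6 of vec-add-01.ll later in this patch. Registers such as %v24 and %v26 have no 4-bit FPR encoding, so the classic adbr cannot be used, and the expected output is a single wfadb %f0, %v24, %v26.

define double @fadd_from_vectors(<2 x double> %val1, <2 x double> %val2) {
  %scalar1 = extractelement <2 x double> %val1, i32 0
  %scalar2 = extractelement <2 x double> %val2, i32 0
  %sum = fadd double %scalar1, %scalar2
  ret double %sum
}
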
@@ -804,53 +820,55 @@ let Predicates = [FeatureVector] in {
 
   // Divide.
   def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>;
-  def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, null_frag, v64db, v64db, 3, 8>;
+  def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>;
 
   // Load FP integer.
   def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, null_frag, v128db, v128db, 3, 0>;
   def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>;
   defm : VectorRounding<VFIDB, v128db>;
+  defm : VectorRounding<WFIDB, v64db>;
 
   // Load lengthened.
   def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
-  def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, null_frag, v64db, v32eb, 2, 8>;
+  def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fextend, v64db, v32eb, 2, 8>;
 
   // Load rounded,
   def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
   def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
   def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
+  def : FPConversion<WLEDB, fround, v32eb, v64db, 0, 0>;
 
   // Multiply.
   def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
-  def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, null_frag, v64db, v64db, 3, 8>;
+  def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>;
 
   // Multiply and add.
   def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>;
-  def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, null_frag, v64db, v64db, 8, 3>;
+  def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>;
 
   // Multiply and subtract.
   def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>;
-  def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, null_frag, v64db, v64db, 8, 3>;
+  def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>;
 
   // Load complement,
   def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>;
-  def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 0>;
+  def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>;
 
   // Load negative.
   def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>;
-  def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 1>;
+  def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>;
 
   // Load positive.
   def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>;
-  def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 2>;
+  def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>;
 
   // Square root.
   def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>;
-  def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, null_frag, v64db, v64db, 3, 8>;
+  def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>;
 
   // Subtract.
   def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>;
-  def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, null_frag, v64db, v64db, 3, 8>;
+  def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>;
 
   // Test data class immediate.
   let Defs = [CC] in {
@@ -866,7 +884,7 @@ let Predicates = [FeatureVector] in {
 
   // Compare scalar.
   let Defs = [CC] in
-    def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, null_frag, v64db, 3>;
+    def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>;
 
   // Compare and signal scalar.
   let Defs = [CC] in
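Before the SystemZShortenInst.cpp changes that follow, here is a sketch of the conversion case they handle; the function name is illustrative and it mirrors f1 of fp-conv-01.ll below. On z13, instruction selection now produces WLEDB for the truncation; when both registers receive 4-bit encodings, shortenFPConv is expected to rewrite it to LEDBRA, swapping the operand order from (destination, source, exact-suppress, rounding-mode) to (destination, rounding-mode, source, exact-suppress), hence the ledbra %f0, 0, %f2, 0 that the test checks.

define float @trunc_sample(double %d1, double %d2) {
  %res = fptrunc double %d2 to float
  ret float %res
}
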
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index ec7a8c40d18..d1a17c5500d 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -15,6 +15,7 @@
 
 #include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 
 using namespace llvm;
 
@@ -36,6 +37,10 @@ public:
 private:
   bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
                   unsigned LLIxL, unsigned LLIxH);
+  bool shortenOn0(MachineInstr &MI, unsigned Opcode);
+  bool shortenOn01(MachineInstr &MI, unsigned Opcode);
+  bool shortenOn001(MachineInstr &MI, unsigned Opcode);
+  bool shortenFPConv(MachineInstr &MI, unsigned Opcode);
 
   const SystemZInstrInfo *TII;
 
@@ -97,6 +102,64 @@ bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap,
   return false;
 }
 
+// Change MI's opcode to Opcode if register operand 0 has a 4-bit encoding.
+bool SystemZShortenInst::shortenOn0(MachineInstr &MI, unsigned Opcode) {
+  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16) {
+    MI.setDesc(TII->get(Opcode));
+    return true;
+  }
+  return false;
+}
+
+// Change MI's opcode to Opcode if register operands 0 and 1 have a
+// 4-bit encoding.
+bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) {
+  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
+      SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) {
+    MI.setDesc(TII->get(Opcode));
+    return true;
+  }
+  return false;
+}
+
+// Change MI's opcode to Opcode if register operands 0, 1 and 2 have a
+// 4-bit encoding and if operands 0 and 1 are tied.
+bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) {
+  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
+      MI.getOperand(1).getReg() == MI.getOperand(0).getReg() &&
+      SystemZMC::getFirstReg(MI.getOperand(2).getReg()) < 16) {
+    MI.setDesc(TII->get(Opcode));
+    return true;
+  }
+  return false;
+}
+
+// MI is a vector-style conversion instruction with the operand order:
+// destination, source, exact-suppress, rounding-mode.  If both registers
+// have a 4-bit encoding then change it to Opcode, which has operand order:
+// destination, rounding-mode, source, exact-suppress.
+bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
+  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
+      SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) {
+    MachineOperand Dest(MI.getOperand(0));
+    MachineOperand Src(MI.getOperand(1));
+    MachineOperand Suppress(MI.getOperand(2));
+    MachineOperand Mode(MI.getOperand(3));
+    MI.RemoveOperand(3);
+    MI.RemoveOperand(2);
+    MI.RemoveOperand(1);
+    MI.RemoveOperand(0);
+    MI.setDesc(TII->get(Opcode));
+    MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
+      .addOperand(Dest)
+      .addOperand(Mode)
+      .addOperand(Src)
+      .addOperand(Suppress);
+    return true;
+  }
+  return false;
+}
+
 // Process all instructions in MBB. Return true if something changed.
 bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
   bool Changed = false;
@@ -117,13 +180,83 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
 
   // Iterate backwards through the block looking for instructions to change.
for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) { MachineInstr &MI = *MBBI; - unsigned Opcode = MI.getOpcode(); - if (Opcode == SystemZ::IILF) + switch (MI.getOpcode()) { + case SystemZ::IILF: Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL, SystemZ::LLILH); - else if (Opcode == SystemZ::IIHF) + break; + + case SystemZ::IIHF: Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL, SystemZ::LLIHH); + break; + + case SystemZ::WFADB: + Changed |= shortenOn001(MI, SystemZ::ADBR); + break; + + case SystemZ::WFDDB: + Changed |= shortenOn001(MI, SystemZ::DDBR); + break; + + case SystemZ::WFIDB: + Changed |= shortenFPConv(MI, SystemZ::FIDBRA); + break; + + case SystemZ::WLDEB: + Changed |= shortenOn01(MI, SystemZ::LDEBR); + break; + + case SystemZ::WLEDB: + Changed |= shortenFPConv(MI, SystemZ::LEDBRA); + break; + + case SystemZ::WFMDB: + Changed |= shortenOn001(MI, SystemZ::MDBR); + break; + + case SystemZ::WFLCDB: + Changed |= shortenOn01(MI, SystemZ::LCDBR); + break; + + case SystemZ::WFLNDB: + Changed |= shortenOn01(MI, SystemZ::LNDBR); + break; + + case SystemZ::WFLPDB: + Changed |= shortenOn01(MI, SystemZ::LPDBR); + break; + + case SystemZ::WFSQDB: + Changed |= shortenOn01(MI, SystemZ::SQDBR); + break; + + case SystemZ::WFSDB: + Changed |= shortenOn001(MI, SystemZ::SDBR); + break; + + case SystemZ::WFCDB: + Changed |= shortenOn01(MI, SystemZ::CDBR); + break; + + case SystemZ::VL32: + // For z13 we prefer LDE over LE to avoid partial register dependencies. + Changed |= shortenOn0(MI, SystemZ::LDE32); + break; + + case SystemZ::VST32: + Changed |= shortenOn0(MI, SystemZ::STE); + break; + + case SystemZ::VL64: + Changed |= shortenOn0(MI, SystemZ::LD); + break; + + case SystemZ::VST64: + Changed |= shortenOn0(MI, SystemZ::STD); + break; + } + unsigned UsedLow = 0; unsigned UsedHigh = 0; for (auto MOI = MI.operands_begin(), MOE = MI.operands_end(); diff --git a/test/CodeGen/SystemZ/fp-abs-01.ll b/test/CodeGen/SystemZ/fp-abs-01.ll index d14a92acae8..3b143d93315 100644 --- a/test/CodeGen/SystemZ/fp-abs-01.ll +++ b/test/CodeGen/SystemZ/fp-abs-01.ll @@ -1,6 +1,7 @@ ; Test floating-point absolute. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test f32. declare float @llvm.fabs.f32(float %f) diff --git a/test/CodeGen/SystemZ/fp-abs-02.ll b/test/CodeGen/SystemZ/fp-abs-02.ll index deec8c32b4a..e831ddb86fe 100644 --- a/test/CodeGen/SystemZ/fp-abs-02.ll +++ b/test/CodeGen/SystemZ/fp-abs-02.ll @@ -1,6 +1,7 @@ ; Test negated floating-point absolute. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test f32. declare float @llvm.fabs.f32(float %f) diff --git a/test/CodeGen/SystemZ/fp-add-02.ll b/test/CodeGen/SystemZ/fp-add-02.ll index 07c7462020f..5be1ad79d45 100644 --- a/test/CodeGen/SystemZ/fp-add-02.ll +++ b/test/CodeGen/SystemZ/fp-add-02.ll @@ -1,7 +1,8 @@ ; Test 64-bit floating-point addition. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s - +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s declare double @foo() ; Check register addition. 
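A note on the test updates that begin here: both RUN lines share the bare CHECK prefix, so every unprefixed check must still hold for the z13 build; only checks tied to the scalar memory forms, such as the adb %f0, 160(%r15) in f7 in the next hunk, are narrowed to CHECK-SCALAR, presumably because with f64 values living in vector registers the reload is no longer guaranteed to be folded into adb. For a plain register-register case (hypothetical function below, equivalent to the register addition this file checks), both builds are still expected to end up with adbr: z10 selects it directly, while z13 selects wfadb and relies on SystemZShortenInst to shrink it once both operands sit in %f0-%f15.

define double @fadd_sample(double %f1, double %f2) {
  %sum = fadd double %f1, %f2
  ret double %sum
}
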
@@ -76,7 +77,7 @@ define double @f6(double %f1, double *%base, i64 %index) { define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: adb %f0, 160(%r15) +; CHECK-SCALAR: adb %f0, 160(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-cmp-02.ll b/test/CodeGen/SystemZ/fp-cmp-02.ll index 95af309e795..94a256777c7 100644 --- a/test/CodeGen/SystemZ/fp-cmp-02.ll +++ b/test/CodeGen/SystemZ/fp-cmp-02.ll @@ -1,7 +1,10 @@ ; Test 64-bit floating-point comparison. The tests assume a z10 implementation ; of select, using conditional branches rather than LOCGR. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare double @foo() @@ -9,8 +12,9 @@ declare double @foo() define i64 @f1(i64 %a, i64 %b, double %f1, double %f2) { ; CHECK-LABEL: f1: ; CHECK: cdbr %f0, %f2 -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %cond = fcmp oeq double %f1, %f2 %res = select i1 %cond, i64 %a, i64 %b @@ -21,8 +25,9 @@ define i64 @f1(i64 %a, i64 %b, double %f1, double %f2) { define i64 @f2(i64 %a, i64 %b, double %f1, double *%ptr) { ; CHECK-LABEL: f2: ; CHECK: cdb %f0, 0(%r4) -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %f2 = load double , double *%ptr %cond = fcmp oeq double %f1, %f2 @@ -34,8 +39,9 @@ define i64 @f2(i64 %a, i64 %b, double %f1, double *%ptr) { define i64 @f3(i64 %a, i64 %b, double %f1, double *%base) { ; CHECK-LABEL: f3: ; CHECK: cdb %f0, 4088(%r4) -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr double, double *%base, i64 511 %f2 = load double , double *%ptr @@ -50,8 +56,9 @@ define i64 @f4(i64 %a, i64 %b, double %f1, double *%base) { ; CHECK-LABEL: f4: ; CHECK: aghi %r4, 4096 ; CHECK: cdb %f0, 0(%r4) -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr double, double *%base, i64 512 %f2 = load double , double *%ptr @@ -65,8 +72,9 @@ define i64 @f5(i64 %a, i64 %b, double %f1, double *%base) { ; CHECK-LABEL: f5: ; CHECK: aghi %r4, -8 ; CHECK: cdb %f0, 0(%r4) -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr double, double *%base, i64 -1 %f2 = load double , double *%ptr @@ -80,8 +88,9 @@ define i64 @f6(i64 %a, i64 %b, double %f1, double *%base, i64 %index) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r5, 3 ; CHECK: cdb %f0, 800(%r1,%r4) -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr1 = getelementptr double, double *%base, i64 %index %ptr2 = getelementptr double, double *%ptr1, i64 100 @@ -95,7 +104,7 @@ define i64 @f6(i64 %a, i64 %b, double %f1, double *%base, i64 %index) { define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; 
CHECK: brasl %r14, foo@PLT -; CHECK: cdb {{%f[0-9]+}}, 160(%r15) +; CHECK-SCALAR: cdb {{%f[0-9]+}}, 160(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 @@ -152,9 +161,12 @@ define double @f7(double *%ptr0) { ; Check comparison with zero. define i64 @f8(i64 %a, i64 %b, double %f) { ; CHECK-LABEL: f8: -; CHECK: ltdbr %f0, %f0 -; CHECK-NEXT: je -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR: ltdbr %f0, %f0 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR: lzdr %f1 +; CHECK-VECTOR-NEXT: cdbr %f0, %f1 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %cond = fcmp oeq double %f, 0.0 %res = select i1 %cond, i64 %a, i64 %b @@ -165,8 +177,9 @@ define i64 @f8(i64 %a, i64 %b, double %f) { define i64 @f9(i64 %a, i64 %b, double %f2, double *%ptr) { ; CHECK-LABEL: f9: ; CHECK: cdb %f0, 0(%r4) -; CHECK-NEXT: jl {{\.L.*}} -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: jl +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnl %r2, %r3 ; CHECK: br %r14 %f1 = load double , double *%ptr %cond = fcmp ogt double %f1, %f2 diff --git a/test/CodeGen/SystemZ/fp-conv-01.ll b/test/CodeGen/SystemZ/fp-conv-01.ll index ebc174afada..06740ed4b4a 100644 --- a/test/CodeGen/SystemZ/fp-conv-01.ll +++ b/test/CodeGen/SystemZ/fp-conv-01.ll @@ -1,11 +1,15 @@ ; Test floating-point truncations. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s ; Test f64->f32. define float @f1(double %d1, double %d2) { ; CHECK-LABEL: f1: -; CHECK: ledbr %f0, %f2 +; CHECK-SCALAR: ledbr %f0, %f2 +; CHECK-VECTOR: ledbra %f0, 0, %f2, 0 ; CHECK: br %r14 %res = fptrunc double %d2 to float ret float %res @@ -50,8 +54,10 @@ define double @f4(fp128 *%ptr) { define void @f5(double *%dst, fp128 *%ptr, double %d1, double %d2) { ; CHECK-LABEL: f5: ; CHECK: ldxbr %f1, %f1 -; CHECK: adbr %f1, %f2 -; CHECK: std %f1, 0(%r2) +; CHECK-SCALAR: adbr %f1, %f2 +; CHECK-SCALAR: std %f1, 0(%r2) +; CHECK-VECTOR: wfadb [[REG:%f[0-9]+]], %f1, %f2 +; CHECK-VECTOR: std [[REG]], 0(%r2) ; CHECK: br %r14 %val = load fp128 , fp128 *%ptr %conv = fptrunc fp128 %val to double diff --git a/test/CodeGen/SystemZ/fp-conv-02.ll b/test/CodeGen/SystemZ/fp-conv-02.ll index e9376ba6973..be32bfe7ba9 100644 --- a/test/CodeGen/SystemZ/fp-conv-02.ll +++ b/test/CodeGen/SystemZ/fp-conv-02.ll @@ -1,6 +1,8 @@ ; Test extensions of f32 to f64. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Check register extension. define double @f1(float %val) { @@ -74,7 +76,7 @@ define double @f6(float *%base, i64 %index) { ; to use LDEB if possible. define void @f7(double *%ptr1, float *%ptr2) { ; CHECK-LABEL: f7: -; CHECK: ldeb {{%f[0-9]+}}, 16{{[04]}}(%r15) +; CHECK-SCALAR: ldeb {{%f[0-9]+}}, 16{{[04]}}(%r15) ; CHECK: br %r14 %val0 = load volatile float , float *%ptr2 %val1 = load volatile float , float *%ptr2 diff --git a/test/CodeGen/SystemZ/fp-div-02.ll b/test/CodeGen/SystemZ/fp-div-02.ll index 82eeb480602..f120e7c923d 100644 --- a/test/CodeGen/SystemZ/fp-div-02.ll +++ b/test/CodeGen/SystemZ/fp-div-02.ll @@ -1,6 +1,8 @@ ; Test 64-bit floating-point division. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s declare double @foo() @@ -76,7 +78,7 @@ define double @f6(double %f1, double *%base, i64 %index) { define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: ddb %f0, 160(%r15) +; CHECK-SCALAR: ddb %f0, 160(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-move-01.ll b/test/CodeGen/SystemZ/fp-move-01.ll index 31a8fc55d77..843b1b6a6e6 100644 --- a/test/CodeGen/SystemZ/fp-move-01.ll +++ b/test/CodeGen/SystemZ/fp-move-01.ll @@ -1,11 +1,13 @@ ; Test moves between FPRs. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test f32 moves. define float @f1(float %a, float %b) { ; CHECK-LABEL: f1: ; CHECK: ler %f0, %f2 +; CHECK: br %r14 ret float %b } @@ -13,6 +15,7 @@ define float @f1(float %a, float %b) { define double @f2(double %a, double %b) { ; CHECK-LABEL: f2: ; CHECK: ldr %f0, %f2 +; CHECK: br %r14 ret double %b } @@ -22,6 +25,7 @@ define void @f3(fp128 *%x) { ; CHECK-LABEL: f3: ; CHECK: lxr ; CHECK: axbr +; CHECK: br %r14 %val = load volatile fp128 , fp128 *%x %sum = fadd fp128 %val, %val store volatile fp128 %sum, fp128 *%x diff --git a/test/CodeGen/SystemZ/fp-move-04.ll b/test/CodeGen/SystemZ/fp-move-04.ll index d3728d0e585..6650419b2c3 100644 --- a/test/CodeGen/SystemZ/fp-move-04.ll +++ b/test/CodeGen/SystemZ/fp-move-04.ll @@ -1,6 +1,7 @@ ; Test 64-bit floating-point loads. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test the low end of the LD range. define double @f1(double *%src) { diff --git a/test/CodeGen/SystemZ/fp-move-07.ll b/test/CodeGen/SystemZ/fp-move-07.ll index c3ad2a59f66..5361002a97e 100644 --- a/test/CodeGen/SystemZ/fp-move-07.ll +++ b/test/CodeGen/SystemZ/fp-move-07.ll @@ -1,6 +1,7 @@ ; Test 64-bit floating-point stores. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test the low end of the STD range. define void @f1(double *%src, double %val) { diff --git a/test/CodeGen/SystemZ/fp-move-11.ll b/test/CodeGen/SystemZ/fp-move-11.ll new file mode 100644 index 00000000000..ce45019425c --- /dev/null +++ b/test/CodeGen/SystemZ/fp-move-11.ll @@ -0,0 +1,110 @@ +; Test 32-bit floating-point loads for z13. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test that we use LDE instead of LE - low end of the LE range. +define float @f1(float *%src) { +; CHECK-LABEL: f1: +; CHECK: lde %f0, 0(%r2) +; CHECK: br %r14 + %val = load float, float *%src + ret float %val +} + +; Test that we use LDE instead of LE - high end of the LE range. +define float @f2(float *%src) { +; CHECK-LABEL: f2: +; CHECK: lde %f0, 4092(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 1023 + %val = load float, float *%ptr + ret float %val +} + +; Check the next word up, which should use LEY instead of LDE. 
+define float @f3(float *%src) { +; CHECK-LABEL: f3: +; CHECK: ley %f0, 4096(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 1024 + %val = load float, float *%ptr + ret float %val +} + +; Check the high end of the aligned LEY range. +define float @f4(float *%src) { +; CHECK-LABEL: f4: +; CHECK: ley %f0, 524284(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 131071 + %val = load float, float *%ptr + ret float %val +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define float @f5(float *%src) { +; CHECK-LABEL: f5: +; CHECK: agfi %r2, 524288 +; CHECK: lde %f0, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 131072 + %val = load float, float *%ptr + ret float %val +} + +; Check the high end of the negative aligned LEY range. +define float @f6(float *%src) { +; CHECK-LABEL: f6: +; CHECK: ley %f0, -4(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 -1 + %val = load float, float *%ptr + ret float %val +} + +; Check the low end of the LEY range. +define float @f7(float *%src) { +; CHECK-LABEL: f7: +; CHECK: ley %f0, -524288(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 -131072 + %val = load float, float *%ptr + ret float %val +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define float @f8(float *%src) { +; CHECK-LABEL: f8: +; CHECK: agfi %r2, -524292 +; CHECK: lde %f0, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%src, i64 -131073 + %val = load float, float *%ptr + ret float %val +} + +; Check that LDE allows an index. +define float @f9(i64 %src, i64 %index) { +; CHECK-LABEL: f9: +; CHECK: lde %f0, 4092({{%r3,%r2|%r2,%r3}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 4092 + %ptr = inttoptr i64 %add2 to float * + %val = load float, float *%ptr + ret float %val +} + +; Check that LEY allows an index. +define float @f10(i64 %src, i64 %index) { +; CHECK-LABEL: f10: +; CHECK: ley %f0, 4096({{%r3,%r2|%r2,%r3}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 4096 + %ptr = inttoptr i64 %add2 to float * + %val = load float, float *%ptr + ret float %val +} diff --git a/test/CodeGen/SystemZ/fp-mul-03.ll b/test/CodeGen/SystemZ/fp-mul-03.ll index 701304ef3ee..0d52121f41c 100644 --- a/test/CodeGen/SystemZ/fp-mul-03.ll +++ b/test/CodeGen/SystemZ/fp-mul-03.ll @@ -1,6 +1,8 @@ ; Test multiplication of two f64s, producing an f64 result. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s declare double @foo() @@ -76,7 +78,7 @@ define double @f6(double %f1, double *%base, i64 %index) { define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: mdb %f0, 160(%r15) +; CHECK-SCALAR: mdb %f0, 160(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-mul-07.ll b/test/CodeGen/SystemZ/fp-mul-07.ll index b1d0ae3c520..e0b4a5c5d78 100644 --- a/test/CodeGen/SystemZ/fp-mul-07.ll +++ b/test/CodeGen/SystemZ/fp-mul-07.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare double @llvm.fma.f64(double %f1, double %f2, double %f3) define double @f1(double %f1, double %f2, double %acc) { ; CHECK-LABEL: f1: -; CHECK: madbr %f4, %f0, %f2 -; CHECK: ldr %f0, %f4 +; CHECK-SCALAR: madbr %f4, %f0, %f2 +; CHECK-SCALAR: ldr %f0, %f4 +; CHECK-VECTOR: wfmadb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %res = call double @llvm.fma.f64 (double %f1, double %f2, double %acc) ret double %res diff --git a/test/CodeGen/SystemZ/fp-mul-09.ll b/test/CodeGen/SystemZ/fp-mul-09.ll index f2eadf55ff3..927a8064823 100644 --- a/test/CodeGen/SystemZ/fp-mul-09.ll +++ b/test/CodeGen/SystemZ/fp-mul-09.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare double @llvm.fma.f64(double %f1, double %f2, double %f3) define double @f1(double %f1, double %f2, double %acc) { ; CHECK-LABEL: f1: -; CHECK: msdbr %f4, %f0, %f2 -; CHECK: ldr %f0, %f4 +; CHECK-SCALAR: msdbr %f4, %f0, %f2 +; CHECK-SCALAR: ldr %f0, %f4 +; CHECK-VECTOR: wfmsdb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %negacc = fsub double -0.0, %acc %res = call double @llvm.fma.f64 (double %f1, double %f2, double %negacc) diff --git a/test/CodeGen/SystemZ/fp-neg-01.ll b/test/CodeGen/SystemZ/fp-neg-01.ll index 927bcd44d02..fe2e5f67cf5 100644 --- a/test/CodeGen/SystemZ/fp-neg-01.ll +++ b/test/CodeGen/SystemZ/fp-neg-01.ll @@ -1,6 +1,7 @@ ; Test floating-point negation. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; Test f32. define float @f1(float %f) { diff --git a/test/CodeGen/SystemZ/fp-round-02.ll b/test/CodeGen/SystemZ/fp-round-02.ll index bd5419dad1d..428261478dc 100644 --- a/test/CodeGen/SystemZ/fp-round-02.ll +++ b/test/CodeGen/SystemZ/fp-round-02.ll @@ -1,6 +1,9 @@ ; Test rounding functions for z196 and above. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s ; Test rint for f32. declare float @llvm.rint.f32(float %f) @@ -16,7 +19,8 @@ define float @f1(float %f) { declare double @llvm.rint.f64(double %f) define double @f2(double %f) { ; CHECK-LABEL: f2: -; CHECK: fidbr %f0, 0, %f0 +; CHECK-SCALAR: fidbr %f0, 0, %f0 +; CHECK-VECTOR: fidbra %f0, 0, %f0, 0 ; CHECK: br %r14 %res = call double @llvm.rint.f64(double %f) ret double %res diff --git a/test/CodeGen/SystemZ/fp-sqrt-02.ll b/test/CodeGen/SystemZ/fp-sqrt-02.ll index a6d987b0d76..a162466064e 100644 --- a/test/CodeGen/SystemZ/fp-sqrt-02.ll +++ b/test/CodeGen/SystemZ/fp-sqrt-02.ll @@ -1,6 +1,8 @@ ; Test 64-bit square root. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s declare double @llvm.sqrt.f64(double %f) declare double @sqrt(double) @@ -77,7 +79,7 @@ define double @f6(double *%base, i64 %index) { ; to use SQDB if possible. define void @f7(double *%ptr) { ; CHECK-LABEL: f7: -; CHECK: sqdb {{%f[0-9]+}}, 160(%r15) +; CHECK-SCALAR: sqdb {{%f[0-9]+}}, 160(%r15) ; CHECK: br %r14 %val0 = load volatile double , double *%ptr %val1 = load volatile double , double *%ptr diff --git a/test/CodeGen/SystemZ/fp-sub-02.ll b/test/CodeGen/SystemZ/fp-sub-02.ll index f59ec0a31d7..143baac23e1 100644 --- a/test/CodeGen/SystemZ/fp-sub-02.ll +++ b/test/CodeGen/SystemZ/fp-sub-02.ll @@ -1,6 +1,8 @@ ; Test 64-bit floating-point subtraction. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s declare double @foo() @@ -76,7 +78,7 @@ define double @f6(double %f1, double *%base, i64 %index) { define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: sdb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: sdb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/frame-03.ll b/test/CodeGen/SystemZ/frame-03.ll index 029c6d6d37d..21b8fdb0d67 100644 --- a/test/CodeGen/SystemZ/frame-03.ll +++ b/test/CodeGen/SystemZ/frame-03.ll @@ -2,7 +2,7 @@ ; uses a different register class, but the set of saved and restored ; registers should be the same. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ; This function should require all FPRs, but no other spill slots. ; We need to save and restore 8 of the 16 FPRs, so the frame size diff --git a/test/CodeGen/SystemZ/frame-07.ll b/test/CodeGen/SystemZ/frame-07.ll index 253bbc26c1f..dd810142962 100644 --- a/test/CodeGen/SystemZ/frame-07.ll +++ b/test/CodeGen/SystemZ/frame-07.ll @@ -1,7 +1,7 @@ ; Test the saving and restoring of FPRs in large frames. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s -; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck -check-prefix=CHECK-NOFP %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s ; Test a frame size that requires some FPRs to be saved and loaded using ; the 20-bit STDY and LDY while others can use the 12-bit STD and LD. diff --git a/test/CodeGen/SystemZ/frame-17.ll b/test/CodeGen/SystemZ/frame-17.ll index 485297a2b21..502e541bafc 100644 --- a/test/CodeGen/SystemZ/frame-17.ll +++ b/test/CodeGen/SystemZ/frame-17.ll @@ -1,6 +1,6 @@ ; Test spilling of FPRs. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ; We need to save and restore 8 of the 16 FPRs and allocate an additional ; 4-byte spill slot, rounded to 8 bytes. The frame size should be exactly diff --git a/test/CodeGen/SystemZ/frame-20.ll b/test/CodeGen/SystemZ/frame-20.ll new file mode 100644 index 00000000000..8d601c6f6d5 --- /dev/null +++ b/test/CodeGen/SystemZ/frame-20.ll @@ -0,0 +1,445 @@ +; Like frame-03.ll, but for z13. In this case we have 16 more registers +; available. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; This function should require all FPRs, but no other spill slots. +; We need to save and restore 8 of the 16 FPRs, so the frame size +; should be exactly 160 + 8 * 8 = 224. The CFA offset is 160 +; (the caller-allocated part of the frame) + 224. +define void @f1(double *%ptr) { +; CHECK-LABEL: f1: +; CHECK: aghi %r15, -224 +; CHECK: .cfi_def_cfa_offset 384 +; CHECK: std %f8, 216(%r15) +; CHECK: std %f9, 208(%r15) +; CHECK: std %f10, 200(%r15) +; CHECK: std %f11, 192(%r15) +; CHECK: std %f12, 184(%r15) +; CHECK: std %f13, 176(%r15) +; CHECK: std %f14, 168(%r15) +; CHECK: std %f15, 160(%r15) +; CHECK: .cfi_offset %f8, -168 +; CHECK: .cfi_offset %f9, -176 +; CHECK: .cfi_offset %f10, -184 +; CHECK: .cfi_offset %f11, -192 +; CHECK: .cfi_offset %f12, -200 +; CHECK: .cfi_offset %f13, -208 +; CHECK: .cfi_offset %f14, -216 +; CHECK: .cfi_offset %f15, -224 +; CHECK-DAG: ld %f0, 0(%r2) +; CHECK-DAG: ld %f7, 0(%r2) +; CHECK-DAG: ld %f8, 0(%r2) +; CHECK-DAG: ld %f15, 0(%r2) +; CHECK-DAG: vlrepg %v16, 0(%r2) +; CHECK-DAG: vlrepg %v23, 0(%r2) +; CHECK-DAG: vlrepg %v24, 0(%r2) +; CHECK-DAG: vlrepg %v31, 0(%r2) +; CHECK: ld %f8, 216(%r15) +; CHECK: ld %f9, 208(%r15) +; CHECK: ld %f10, 200(%r15) +; CHECK: ld %f11, 192(%r15) +; CHECK: ld %f12, 184(%r15) +; CHECK: ld %f13, 176(%r15) +; CHECK: ld %f14, 168(%r15) +; CHECK: ld %f15, 160(%r15) +; CHECK: aghi %r15, 224 +; CHECK: br %r14 + %l0 = load volatile double, double *%ptr + %l1 = load volatile double, double *%ptr + %l2 = load volatile double, double *%ptr + %l3 = load volatile double, double *%ptr + %l4 = load volatile double, double *%ptr + %l5 = load volatile double, double *%ptr + %l6 = load volatile double, double *%ptr + %l7 = load volatile double, double *%ptr + %l8 = load volatile double, double *%ptr + %l9 = load volatile double, double *%ptr + %l10 = load volatile double, double *%ptr + %l11 = load volatile double, double *%ptr + %l12 = load volatile double, double *%ptr + %l13 = load volatile double, double *%ptr + %l14 = load volatile double, double *%ptr + %l15 = load volatile double, double *%ptr + %l16 = load volatile double, double *%ptr + %l17 = load volatile double, 
double *%ptr + %l18 = load volatile double, double *%ptr + %l19 = load volatile double, double *%ptr + %l20 = load volatile double, double *%ptr + %l21 = load volatile double, double *%ptr + %l22 = load volatile double, double *%ptr + %l23 = load volatile double, double *%ptr + %l24 = load volatile double, double *%ptr + %l25 = load volatile double, double *%ptr + %l26 = load volatile double, double *%ptr + %l27 = load volatile double, double *%ptr + %l28 = load volatile double, double *%ptr + %l29 = load volatile double, double *%ptr + %l30 = load volatile double, double *%ptr + %l31 = load volatile double, double *%ptr + %acc0 = fsub double %l0, %l0 + %acc1 = fsub double %l1, %acc0 + %acc2 = fsub double %l2, %acc1 + %acc3 = fsub double %l3, %acc2 + %acc4 = fsub double %l4, %acc3 + %acc5 = fsub double %l5, %acc4 + %acc6 = fsub double %l6, %acc5 + %acc7 = fsub double %l7, %acc6 + %acc8 = fsub double %l8, %acc7 + %acc9 = fsub double %l9, %acc8 + %acc10 = fsub double %l10, %acc9 + %acc11 = fsub double %l11, %acc10 + %acc12 = fsub double %l12, %acc11 + %acc13 = fsub double %l13, %acc12 + %acc14 = fsub double %l14, %acc13 + %acc15 = fsub double %l15, %acc14 + %acc16 = fsub double %l16, %acc15 + %acc17 = fsub double %l17, %acc16 + %acc18 = fsub double %l18, %acc17 + %acc19 = fsub double %l19, %acc18 + %acc20 = fsub double %l20, %acc19 + %acc21 = fsub double %l21, %acc20 + %acc22 = fsub double %l22, %acc21 + %acc23 = fsub double %l23, %acc22 + %acc24 = fsub double %l24, %acc23 + %acc25 = fsub double %l25, %acc24 + %acc26 = fsub double %l26, %acc25 + %acc27 = fsub double %l27, %acc26 + %acc28 = fsub double %l28, %acc27 + %acc29 = fsub double %l29, %acc28 + %acc30 = fsub double %l30, %acc29 + %acc31 = fsub double %l31, %acc30 + store volatile double %acc0, double *%ptr + store volatile double %acc1, double *%ptr + store volatile double %acc2, double *%ptr + store volatile double %acc3, double *%ptr + store volatile double %acc4, double *%ptr + store volatile double %acc5, double *%ptr + store volatile double %acc6, double *%ptr + store volatile double %acc7, double *%ptr + store volatile double %acc8, double *%ptr + store volatile double %acc9, double *%ptr + store volatile double %acc10, double *%ptr + store volatile double %acc11, double *%ptr + store volatile double %acc12, double *%ptr + store volatile double %acc13, double *%ptr + store volatile double %acc14, double *%ptr + store volatile double %acc15, double *%ptr + store volatile double %acc16, double *%ptr + store volatile double %acc17, double *%ptr + store volatile double %acc18, double *%ptr + store volatile double %acc19, double *%ptr + store volatile double %acc20, double *%ptr + store volatile double %acc21, double *%ptr + store volatile double %acc22, double *%ptr + store volatile double %acc23, double *%ptr + store volatile double %acc24, double *%ptr + store volatile double %acc25, double *%ptr + store volatile double %acc26, double *%ptr + store volatile double %acc27, double *%ptr + store volatile double %acc28, double *%ptr + store volatile double %acc29, double *%ptr + store volatile double %acc30, double *%ptr + store volatile double %acc31, double *%ptr + ret void +} + +; Like f1, but requires one fewer FPR. We allocate in numerical order, +; so %f15 is the one that gets dropped. 
+define void @f2(double *%ptr) { +; CHECK-LABEL: f2: +; CHECK: aghi %r15, -216 +; CHECK: .cfi_def_cfa_offset 376 +; CHECK: std %f8, 208(%r15) +; CHECK: std %f9, 200(%r15) +; CHECK: std %f10, 192(%r15) +; CHECK: std %f11, 184(%r15) +; CHECK: std %f12, 176(%r15) +; CHECK: std %f13, 168(%r15) +; CHECK: std %f14, 160(%r15) +; CHECK: .cfi_offset %f8, -168 +; CHECK: .cfi_offset %f9, -176 +; CHECK: .cfi_offset %f10, -184 +; CHECK: .cfi_offset %f11, -192 +; CHECK: .cfi_offset %f12, -200 +; CHECK: .cfi_offset %f13, -208 +; CHECK: .cfi_offset %f14, -216 +; CHECK-NOT: %v15 +; CHECK-NOT: %f15 +; CHECK: ld %f8, 208(%r15) +; CHECK: ld %f9, 200(%r15) +; CHECK: ld %f10, 192(%r15) +; CHECK: ld %f11, 184(%r15) +; CHECK: ld %f12, 176(%r15) +; CHECK: ld %f13, 168(%r15) +; CHECK: ld %f14, 160(%r15) +; CHECK: aghi %r15, 216 +; CHECK: br %r14 + %l0 = load volatile double, double *%ptr + %l1 = load volatile double, double *%ptr + %l2 = load volatile double, double *%ptr + %l3 = load volatile double, double *%ptr + %l4 = load volatile double, double *%ptr + %l5 = load volatile double, double *%ptr + %l6 = load volatile double, double *%ptr + %l7 = load volatile double, double *%ptr + %l8 = load volatile double, double *%ptr + %l9 = load volatile double, double *%ptr + %l10 = load volatile double, double *%ptr + %l11 = load volatile double, double *%ptr + %l12 = load volatile double, double *%ptr + %l13 = load volatile double, double *%ptr + %l14 = load volatile double, double *%ptr + %l16 = load volatile double, double *%ptr + %l17 = load volatile double, double *%ptr + %l18 = load volatile double, double *%ptr + %l19 = load volatile double, double *%ptr + %l20 = load volatile double, double *%ptr + %l21 = load volatile double, double *%ptr + %l22 = load volatile double, double *%ptr + %l23 = load volatile double, double *%ptr + %l24 = load volatile double, double *%ptr + %l25 = load volatile double, double *%ptr + %l26 = load volatile double, double *%ptr + %l27 = load volatile double, double *%ptr + %l28 = load volatile double, double *%ptr + %l29 = load volatile double, double *%ptr + %l30 = load volatile double, double *%ptr + %l31 = load volatile double, double *%ptr + %acc0 = fsub double %l0, %l0 + %acc1 = fsub double %l1, %acc0 + %acc2 = fsub double %l2, %acc1 + %acc3 = fsub double %l3, %acc2 + %acc4 = fsub double %l4, %acc3 + %acc5 = fsub double %l5, %acc4 + %acc6 = fsub double %l6, %acc5 + %acc7 = fsub double %l7, %acc6 + %acc8 = fsub double %l8, %acc7 + %acc9 = fsub double %l9, %acc8 + %acc10 = fsub double %l10, %acc9 + %acc11 = fsub double %l11, %acc10 + %acc12 = fsub double %l12, %acc11 + %acc13 = fsub double %l13, %acc12 + %acc14 = fsub double %l14, %acc13 + %acc16 = fsub double %l16, %acc14 + %acc17 = fsub double %l17, %acc16 + %acc18 = fsub double %l18, %acc17 + %acc19 = fsub double %l19, %acc18 + %acc20 = fsub double %l20, %acc19 + %acc21 = fsub double %l21, %acc20 + %acc22 = fsub double %l22, %acc21 + %acc23 = fsub double %l23, %acc22 + %acc24 = fsub double %l24, %acc23 + %acc25 = fsub double %l25, %acc24 + %acc26 = fsub double %l26, %acc25 + %acc27 = fsub double %l27, %acc26 + %acc28 = fsub double %l28, %acc27 + %acc29 = fsub double %l29, %acc28 + %acc30 = fsub double %l30, %acc29 + %acc31 = fsub double %l31, %acc30 + store volatile double %acc0, double *%ptr + store volatile double %acc1, double *%ptr + store volatile double %acc2, double *%ptr + store volatile double %acc3, double *%ptr + store volatile double %acc4, double *%ptr + store volatile double %acc5, double *%ptr + store volatile 
double %acc6, double *%ptr + store volatile double %acc7, double *%ptr + store volatile double %acc8, double *%ptr + store volatile double %acc9, double *%ptr + store volatile double %acc10, double *%ptr + store volatile double %acc11, double *%ptr + store volatile double %acc12, double *%ptr + store volatile double %acc13, double *%ptr + store volatile double %acc14, double *%ptr + store volatile double %acc16, double *%ptr + store volatile double %acc17, double *%ptr + store volatile double %acc18, double *%ptr + store volatile double %acc19, double *%ptr + store volatile double %acc20, double *%ptr + store volatile double %acc21, double *%ptr + store volatile double %acc22, double *%ptr + store volatile double %acc23, double *%ptr + store volatile double %acc24, double *%ptr + store volatile double %acc25, double *%ptr + store volatile double %acc26, double *%ptr + store volatile double %acc27, double *%ptr + store volatile double %acc28, double *%ptr + store volatile double %acc29, double *%ptr + store volatile double %acc30, double *%ptr + store volatile double %acc31, double *%ptr + ret void +} + +; Like f1, but should require only one call-saved FPR. +define void @f3(double *%ptr) { +; CHECK-LABEL: f3: +; CHECK: aghi %r15, -168 +; CHECK: .cfi_def_cfa_offset 328 +; CHECK: std %f8, 160(%r15) +; CHECK: .cfi_offset %f8, -168 +; CHECK-NOT: {{%[fv]9}} +; CHECK-NOT: {{%[fv]1[0-5]}} +; CHECK: ld %f8, 160(%r15) +; CHECK: aghi %r15, 168 +; CHECK: br %r14 + %l0 = load volatile double, double *%ptr + %l1 = load volatile double, double *%ptr + %l2 = load volatile double, double *%ptr + %l3 = load volatile double, double *%ptr + %l4 = load volatile double, double *%ptr + %l5 = load volatile double, double *%ptr + %l6 = load volatile double, double *%ptr + %l7 = load volatile double, double *%ptr + %l8 = load volatile double, double *%ptr + %l16 = load volatile double, double *%ptr + %l17 = load volatile double, double *%ptr + %l18 = load volatile double, double *%ptr + %l19 = load volatile double, double *%ptr + %l20 = load volatile double, double *%ptr + %l21 = load volatile double, double *%ptr + %l22 = load volatile double, double *%ptr + %l23 = load volatile double, double *%ptr + %l24 = load volatile double, double *%ptr + %l25 = load volatile double, double *%ptr + %l26 = load volatile double, double *%ptr + %l27 = load volatile double, double *%ptr + %l28 = load volatile double, double *%ptr + %l29 = load volatile double, double *%ptr + %l30 = load volatile double, double *%ptr + %l31 = load volatile double, double *%ptr + %acc0 = fsub double %l0, %l0 + %acc1 = fsub double %l1, %acc0 + %acc2 = fsub double %l2, %acc1 + %acc3 = fsub double %l3, %acc2 + %acc4 = fsub double %l4, %acc3 + %acc5 = fsub double %l5, %acc4 + %acc6 = fsub double %l6, %acc5 + %acc7 = fsub double %l7, %acc6 + %acc8 = fsub double %l8, %acc7 + %acc16 = fsub double %l16, %acc8 + %acc17 = fsub double %l17, %acc16 + %acc18 = fsub double %l18, %acc17 + %acc19 = fsub double %l19, %acc18 + %acc20 = fsub double %l20, %acc19 + %acc21 = fsub double %l21, %acc20 + %acc22 = fsub double %l22, %acc21 + %acc23 = fsub double %l23, %acc22 + %acc24 = fsub double %l24, %acc23 + %acc25 = fsub double %l25, %acc24 + %acc26 = fsub double %l26, %acc25 + %acc27 = fsub double %l27, %acc26 + %acc28 = fsub double %l28, %acc27 + %acc29 = fsub double %l29, %acc28 + %acc30 = fsub double %l30, %acc29 + %acc31 = fsub double %l31, %acc30 + store volatile double %acc0, double *%ptr + store volatile double %acc1, double *%ptr + store volatile double 
%acc2, double *%ptr + store volatile double %acc3, double *%ptr + store volatile double %acc4, double *%ptr + store volatile double %acc5, double *%ptr + store volatile double %acc6, double *%ptr + store volatile double %acc7, double *%ptr + store volatile double %acc8, double *%ptr + store volatile double %acc16, double *%ptr + store volatile double %acc17, double *%ptr + store volatile double %acc18, double *%ptr + store volatile double %acc19, double *%ptr + store volatile double %acc20, double *%ptr + store volatile double %acc21, double *%ptr + store volatile double %acc22, double *%ptr + store volatile double %acc23, double *%ptr + store volatile double %acc24, double *%ptr + store volatile double %acc25, double *%ptr + store volatile double %acc26, double *%ptr + store volatile double %acc27, double *%ptr + store volatile double %acc28, double *%ptr + store volatile double %acc29, double *%ptr + store volatile double %acc30, double *%ptr + store volatile double %acc31, double *%ptr + ret void +} + +; This function should use all call-clobbered FPRs and vector registers +; but no call-saved ones. It shouldn't need to create a frame. +define void @f4(double *%ptr) { +; CHECK-LABEL: f4: +; CHECK-NOT: %r15 +; CHECK-NOT: {{%[fv][89]}} +; CHECK-NOT: {{%[fv]1[0-5]}} +; CHECK: br %r14 + %l0 = load volatile double, double *%ptr + %l1 = load volatile double, double *%ptr + %l2 = load volatile double, double *%ptr + %l3 = load volatile double, double *%ptr + %l4 = load volatile double, double *%ptr + %l5 = load volatile double, double *%ptr + %l6 = load volatile double, double *%ptr + %l7 = load volatile double, double *%ptr + %l16 = load volatile double, double *%ptr + %l17 = load volatile double, double *%ptr + %l18 = load volatile double, double *%ptr + %l19 = load volatile double, double *%ptr + %l20 = load volatile double, double *%ptr + %l21 = load volatile double, double *%ptr + %l22 = load volatile double, double *%ptr + %l23 = load volatile double, double *%ptr + %l24 = load volatile double, double *%ptr + %l25 = load volatile double, double *%ptr + %l26 = load volatile double, double *%ptr + %l27 = load volatile double, double *%ptr + %l28 = load volatile double, double *%ptr + %l29 = load volatile double, double *%ptr + %l30 = load volatile double, double *%ptr + %l31 = load volatile double, double *%ptr + %acc0 = fsub double %l0, %l0 + %acc1 = fsub double %l1, %acc0 + %acc2 = fsub double %l2, %acc1 + %acc3 = fsub double %l3, %acc2 + %acc4 = fsub double %l4, %acc3 + %acc5 = fsub double %l5, %acc4 + %acc6 = fsub double %l6, %acc5 + %acc7 = fsub double %l7, %acc6 + %acc16 = fsub double %l16, %acc7 + %acc17 = fsub double %l17, %acc16 + %acc18 = fsub double %l18, %acc17 + %acc19 = fsub double %l19, %acc18 + %acc20 = fsub double %l20, %acc19 + %acc21 = fsub double %l21, %acc20 + %acc22 = fsub double %l22, %acc21 + %acc23 = fsub double %l23, %acc22 + %acc24 = fsub double %l24, %acc23 + %acc25 = fsub double %l25, %acc24 + %acc26 = fsub double %l26, %acc25 + %acc27 = fsub double %l27, %acc26 + %acc28 = fsub double %l28, %acc27 + %acc29 = fsub double %l29, %acc28 + %acc30 = fsub double %l30, %acc29 + %acc31 = fsub double %l31, %acc30 + store volatile double %acc0, double *%ptr + store volatile double %acc1, double *%ptr + store volatile double %acc2, double *%ptr + store volatile double %acc3, double *%ptr + store volatile double %acc4, double *%ptr + store volatile double %acc5, double *%ptr + store volatile double %acc6, double *%ptr + store volatile double %acc7, double *%ptr + store 
+  store volatile double %acc16, double *%ptr
+  store volatile double %acc17, double *%ptr
+  store volatile double %acc18, double *%ptr
+  store volatile double %acc19, double *%ptr
+  store volatile double %acc20, double *%ptr
+  store volatile double %acc21, double *%ptr
+  store volatile double %acc22, double *%ptr
+  store volatile double %acc23, double *%ptr
+  store volatile double %acc24, double *%ptr
+  store volatile double %acc25, double *%ptr
+  store volatile double %acc26, double *%ptr
+  store volatile double %acc27, double *%ptr
+  store volatile double %acc28, double *%ptr
+  store volatile double %acc29, double *%ptr
+  store volatile double %acc30, double *%ptr
+  store volatile double %acc31, double *%ptr
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/vec-abs-05.ll b/test/CodeGen/SystemZ/vec-abs-05.ll
index 89142b21854..63210f87b94 100644
--- a/test/CodeGen/SystemZ/vec-abs-05.ll
+++ b/test/CodeGen/SystemZ/vec-abs-05.ll
@@ -1,7 +1,8 @@
-; Test v2f64 absolute.
+; Test f64 and v2f64 absolute.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 
+declare double @llvm.fabs.f64(double)
 declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
 
 ; Test a plain absolute.
@@ -22,3 +23,24 @@ define <2 x double> @f2(<2 x double> %val) {
   %ret = fsub <2 x double> <double -0.0, double -0.0>, %abs
   ret <2 x double> %ret
 }
+
+; Test an f64 absolute that uses vector registers.
+define double @f3(<2 x double> %val) {
+; CHECK-LABEL: f3:
+; CHECK: wflpdb %f0, %v24
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %val, i32 0
+  %ret = call double @llvm.fabs.f64(double %scalar)
+  ret double %ret
+}
+
+; Test an f64 negative absolute that uses vector registers.
+define double @f4(<2 x double> %val) {
+; CHECK-LABEL: f4:
+; CHECK: wflndb %f0, %v24
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %val, i32 0
+  %abs = call double @llvm.fabs.f64(double %scalar)
+  %ret = fsub double -0.0, %abs
+  ret double %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-add-01.ll b/test/CodeGen/SystemZ/vec-add-01.ll
index 1de2aa2a1b9..31703437767 100644
--- a/test/CodeGen/SystemZ/vec-add-01.ll
+++ b/test/CodeGen/SystemZ/vec-add-01.ll
@@ -47,3 +47,14 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1,
   %ret = fadd <2 x double> %val1, %val2
   ret <2 x double> %ret
 }
+
+; Test an f64 addition that uses vector registers.
+define double @f6(<2 x double> %val1, <2 x double> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: wfadb %f0, %v24, %v26
+; CHECK: br %r14
+  %scalar1 = extractelement <2 x double> %val1, i32 0
+  %scalar2 = extractelement <2 x double> %val2, i32 0
+  %ret = fadd double %scalar1, %scalar2
+  ret double %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-cmp-06.ll b/test/CodeGen/SystemZ/vec-cmp-06.ll
index bdb8744631a..eef57555b48 100644
--- a/test/CodeGen/SystemZ/vec-cmp-06.ll
+++ b/test/CodeGen/SystemZ/vec-cmp-06.ll
@@ -1,4 +1,4 @@
-; Test v2f64 comparisons.
+; Test f64 and v2f64 comparisons.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 
@@ -335,3 +335,15 @@ define <2 x double> @f28(<2 x double> %val1, <2 x double> %val2,
   %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
   ret <2 x double> %ret
 }
+
+; Test an f64 comparison that uses vector registers.
+define i64 @f29(i64 %a, i64 %b, double %f1, <2 x double> %vec) {
+; CHECK-LABEL: f29:
+; CHECK: wfcdb %f0, %v24
+; CHECK-NEXT: locgrne %r2, %r3
+; CHECK: br %r14
+  %f2 = extractelement <2 x double> %vec, i32 0
+  %cond = fcmp oeq double %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/vec-conv-02.ll b/test/CodeGen/SystemZ/vec-conv-02.ll
index ceccfc60b37..ab84389f3c8 100644
--- a/test/CodeGen/SystemZ/vec-conv-02.ll
+++ b/test/CodeGen/SystemZ/vec-conv-02.ll
@@ -11,3 +11,23 @@ define void @f1(<2 x double> %val, <2 x float> *%ptr) {
   store <2 x float> %res, <2 x float> *%ptr
   ret void
 }
+
+; Test conversion of an f64 in a vector register to an f32.
+define float @f2(<2 x double> %vec) {
+; CHECK-LABEL: f2:
+; CHECK: wledb %f0, %v24
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %vec, i32 0
+  %ret = fptrunc double %scalar to float
+  ret float %ret
+}
+
+; Test conversion of an f32 in a vector register to an f64.
+define double @f3(<4 x float> %vec) {
+; CHECK-LABEL: f3:
+; CHECK: wldeb %f0, %v24
+; CHECK: br %r14
+  %scalar = extractelement <4 x float> %vec, i32 0
+  %ret = fpext float %scalar to double
+  ret double %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-div-01.ll b/test/CodeGen/SystemZ/vec-div-01.ll
index 5666444e9da..506d40861d3 100644
--- a/test/CodeGen/SystemZ/vec-div-01.ll
+++ b/test/CodeGen/SystemZ/vec-div-01.ll
@@ -70,3 +70,14 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1,
   %ret = fdiv <2 x double> %val1, %val2
   ret <2 x double> %ret
 }
+
+; Test an f64 division that uses vector registers.
+define double @f6(<2 x double> %val1, <2 x double> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: wfddb %f0, %v24, %v26
+; CHECK: br %r14
+  %scalar1 = extractelement <2 x double> %val1, i32 0
+  %scalar2 = extractelement <2 x double> %val2, i32 0
+  %ret = fdiv double %scalar1, %scalar2
+  ret double %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-mul-01.ll b/test/CodeGen/SystemZ/vec-mul-01.ll
index d0018fa1f8c..5ecc30d4427 100644
--- a/test/CodeGen/SystemZ/vec-mul-01.ll
+++ b/test/CodeGen/SystemZ/vec-mul-01.ll
@@ -47,3 +47,14 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1,
   %ret = fmul <2 x double> %val1, %val2
   ret <2 x double> %ret
 }
+
+; Test an f64 multiplication that uses vector registers.
+define double @f6(<2 x double> %val1, <2 x double> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: wfmdb %f0, %v24, %v26
+; CHECK: br %r14
+  %scalar1 = extractelement <2 x double> %val1, i32 0
+  %scalar2 = extractelement <2 x double> %val2, i32 0
+  %ret = fmul double %scalar1, %scalar2
+  ret double %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-neg-01.ll b/test/CodeGen/SystemZ/vec-neg-01.ll
index 491e24bb34f..b1389ce4d6d 100644
--- a/test/CodeGen/SystemZ/vec-neg-01.ll
+++ b/test/CodeGen/SystemZ/vec-neg-01.ll
@@ -46,3 +46,13 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val) {
   %ret = fsub <2 x double> <double -0.0, double -0.0>, %val
   ret <2 x double> %ret
 }
+
+; Test an f64 negation that uses vector registers.
+define double @f6(<2 x double> %val) {
+; CHECK-LABEL: f6:
+; CHECK: wflcdb %f0, %v24
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %val, i32 0
+  %ret = fsub double -0.0, %scalar
+  ret double %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-round-01.ll b/test/CodeGen/SystemZ/vec-round-01.ll
index 284b83e96f7..82718276bb0 100644
--- a/test/CodeGen/SystemZ/vec-round-01.ll
+++ b/test/CodeGen/SystemZ/vec-round-01.ll
@@ -2,6 +2,12 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 
+declare double @llvm.rint.f64(double)
+declare double @llvm.nearbyint.f64(double)
+declare double @llvm.floor.f64(double)
+declare double @llvm.ceil.f64(double)
+declare double @llvm.trunc.f64(double)
+declare double @llvm.round.f64(double)
 declare <2 x double> @llvm.rint.v2f64(<2 x double>)
 declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>)
 declare <2 x double> @llvm.floor.v2f64(<2 x double>)
@@ -56,3 +62,57 @@ define <2 x double> @f6(<2 x double> %val) {
   %res = call <2 x double> @llvm.round.v2f64(<2 x double> %val)
   ret <2 x double> %res
 }
+
+define double @f7(<2 x double> %val) {
+; CHECK-LABEL: f7:
+; CHECK: wfidb %f0, %v24, 0, 0
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %val, i32 0
+  %res = call double @llvm.rint.f64(double %scalar)
+  ret double %res
+}
+
+define double @f8(<2 x double> %val) {
+; CHECK-LABEL: f8:
+; CHECK: wfidb %f0, %v24, 4, 0
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %val, i32 0
+  %res = call double @llvm.nearbyint.f64(double %scalar)
+  ret double %res
+}
+
+define double @f9(<2 x double> %val) {
+; CHECK-LABEL: f9:
+; CHECK: wfidb %f0, %v24, 4, 7
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %val, i32 0
+  %res = call double @llvm.floor.f64(double %scalar)
+  ret double %res
+}
+
+define double @f10(<2 x double> %val) {
+; CHECK-LABEL: f10:
+; CHECK: wfidb %f0, %v24, 4, 6
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %val, i32 0
+  %res = call double @llvm.ceil.f64(double %scalar)
+  ret double %res
+}
+
+define double @f11(<2 x double> %val) {
+; CHECK-LABEL: f11:
+; CHECK: wfidb %f0, %v24, 4, 5
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %val, i32 0
+  %res = call double @llvm.trunc.f64(double %scalar)
+  ret double %res
+}
+
+define double @f12(<2 x double> %val) {
+; CHECK-LABEL: f12:
+; CHECK: wfidb %f0, %v24, 4, 1
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %val, i32 0
+  %res = call double @llvm.round.f64(double %scalar)
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/vec-sqrt-01.ll b/test/CodeGen/SystemZ/vec-sqrt-01.ll
index 0160c24a749..5c3ffb3b064 100644
--- a/test/CodeGen/SystemZ/vec-sqrt-01.ll
+++ b/test/CodeGen/SystemZ/vec-sqrt-01.ll
@@ -1,7 +1,8 @@
-; Test v2f64 square root.
+; Test f64 and v2f64 square root.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 
+declare double @llvm.sqrt.f64(double)
 declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
 
 define <2 x double> @f1(<2 x double> %val) {
@@ -11,3 +12,12 @@
   %ret = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %val)
   ret <2 x double> %ret
 }
+
+define double @f2(<2 x double> %val) {
+; CHECK-LABEL: f2:
+; CHECK: wfsqdb %f0, %v24
+; CHECK: br %r14
+  %scalar = extractelement <2 x double> %val, i32 0
+  %ret = call double @llvm.sqrt.f64(double %scalar)
+  ret double %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-sub-01.ll b/test/CodeGen/SystemZ/vec-sub-01.ll
index aabf1c9be4a..5620ebcb8c4 100644
--- a/test/CodeGen/SystemZ/vec-sub-01.ll
+++ b/test/CodeGen/SystemZ/vec-sub-01.ll
@@ -74,3 +74,14 @@ define <2 x double> @f6(<2 x double> %dummy, <2 x double> %val1,
   %ret = fsub <2 x double> %val1, %val2
   ret <2 x double> %ret
 }
+
+; Test an f64 subtraction that uses vector registers.
+define double @f7(<2 x double> %val1, <2 x double> %val2) {
+; CHECK-LABEL: f7:
+; CHECK: wfsdb %f0, %v24, %v26
+; CHECK: br %r14
+  %scalar1 = extractelement <2 x double> %val1, i32 0
+  %scalar2 = extractelement <2 x double> %val2, i32 0
+  %ret = fsub double %scalar1, %scalar2
+  ret double %ret
+}