[SystemZ] Add CodeGen support for scalar f64 ops in vector registers

The z13 vector facility includes some instructions that operate only on the
high f64 in a v2f64, effectively extending the FP register set from 16
to 32 registers.  It's still better to use the old instructions if the
operands happen to fit though, since the older instructions have a shorter
encoding.

Based on a patch by Richard Sandiford.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236524 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Ulrich Weigand 2015-05-05 19:28:34 +00:00
parent 878c6281d3
commit cf0fa9b9dd
40 changed files with 1102 additions and 80 deletions

View File

@ -40,8 +40,8 @@ include "SystemZOperands.td"
include "SystemZPatterns.td"
include "SystemZInstrFormats.td"
include "SystemZInstrInfo.td"
include "SystemZInstrFP.td"
include "SystemZInstrVector.td"
include "SystemZInstrFP.td"
def SystemZInstrInfo : InstrInfo {}

View File

@ -80,6 +80,27 @@ static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
Context);
}
// MI loads the high part of a vector from memory. Return an instruction
// that uses replicating vector load Opcode to do the same thing.
//
// Operand layout of MI (established by the UnaryAliasVRX pseudo):
//   0: destination subvector register (VR32/VR64)
//   1-3: base register, displacement immediate, index register (BDX address).
// The destination is widened to the containing VR128 so that the VLREP*
// replicating load writes the full vector register.
static MCInst lowerSubvectorLoad(const MachineInstr *MI, unsigned Opcode) {
return MCInstBuilder(Opcode)
.addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
.addReg(MI->getOperand(1).getReg())
.addImm(MI->getOperand(2).getImm())
.addReg(MI->getOperand(3).getReg());
}
// MI stores the high part of a vector to memory. Return an instruction
// that uses elemental vector store Opcode to do the same thing.
//
// Operand layout of MI (established by the StoreAliasVRX pseudo):
//   0: source subvector register (VR32/VR64)
//   1-3: base register, displacement immediate, index register (BDX address).
// The trailing immediate 0 selects element 0 of the widened VR128, which
// holds the scalar value being stored.
static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) {
return MCInstBuilder(Opcode)
.addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
.addReg(MI->getOperand(1).getReg())
.addImm(MI->getOperand(2).getImm())
.addReg(MI->getOperand(3).getReg())
.addImm(0);
}
void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
SystemZMCInstLower Lower(MF->getContext(), *this);
MCInst LoweredMI;
@ -158,6 +179,29 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()));
break;
case SystemZ::VLR32:
case SystemZ::VLR64:
LoweredMI = MCInstBuilder(SystemZ::VLR)
.addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
.addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()));
break;
case SystemZ::VL32:
LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF);
break;
case SystemZ::VL64:
LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG);
break;
case SystemZ::VST32:
LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF);
break;
case SystemZ::VST64:
LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEG);
break;
case SystemZ::LFER:
LoweredMI = MCInstBuilder(SystemZ::VLGVF)
.addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg()))

View File

@ -91,9 +91,14 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
else
addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
if (Subtarget.hasVector()) {
addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
} else {
addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
}
addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
if (Subtarget.hasVector()) {

View File

@ -46,9 +46,14 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>;
defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>;
}
defm : CompareZeroFP<LTEBRCompare, FP32>;
defm : CompareZeroFP<LTDBRCompare, FP64>;
defm : CompareZeroFP<LTXBRCompare, FP128>;
// Note that the comparison against zero operation is not available if we
// have vector support, since load-and-test instructions will partially
// clobber the target (vector) register.
let Predicates = [FeatureNoVector] in {
defm : CompareZeroFP<LTEBRCompare, FP32>;
defm : CompareZeroFP<LTDBRCompare, FP64>;
defm : CompareZeroFP<LTXBRCompare, FP128>;
}
// Moves between 64-bit integer and floating-point registers.
def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>;
@ -98,6 +103,9 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
// For z13 we prefer LDE over LE to avoid partial register dependencies.
def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
// These instructions are split after register allocation, so we don't
// want a custom inserter.
let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {

View File

@ -2151,10 +2151,13 @@ class PrefetchRILPC<string mnemonic, bits<12> opcode,
// A floating-point load-and-test operation. Create both a normal unary
// operation and one that acts as a comparison against zero.
// Note that the comparison against zero operation is not available if we
// have vector support, since load-and-test instructions will partially
// clobber the target (vector) register.
multiclass LoadAndTestRRE<string mnemonic, bits<16> opcode,
RegisterOperand cls> {
def "" : UnaryRRE<mnemonic, opcode, null_frag, cls, cls>;
let isCodeGenOnly = 1 in
let isCodeGenOnly = 1, Predicates = [FeatureNoVector] in
def Compare : CompareRRE<mnemonic, opcode, null_frag, cls, cls>;
}
@ -2401,6 +2404,23 @@ class Alias<int size, dag outs, dag ins, list<dag> pattern>
class UnaryAliasVRS<RegisterOperand cls1, RegisterOperand cls2>
: Alias<6, (outs cls1:$src1), (ins cls2:$src2), []>;
// An alias of a UnaryVRR*, but with different register sizes.
class UnaryAliasVRR<SDPatternOperator operator, TypedReg tr1, TypedReg tr2>
: Alias<6, (outs tr1.op:$V1), (ins tr2.op:$V2),
[(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]>;
// An alias of a UnaryVRX, but with different register sizes.
class UnaryAliasVRX<SDPatternOperator operator, TypedReg tr,
AddressingMode mode = bdxaddr12only>
: Alias<6, (outs tr.op:$V1), (ins mode:$XBD2),
[(set tr.op:$V1, (tr.vt (operator mode:$XBD2)))]>;
// An alias of a StoreVRX, but with different register sizes.
class StoreAliasVRX<SDPatternOperator operator, TypedReg tr,
AddressingMode mode = bdxaddr12only>
: Alias<6, (outs), (ins tr.op:$V1, mode:$XBD2),
[(operator (tr.vt tr.op:$V1), mode:$XBD2)]>;
// An alias of a BinaryRI, but with different register sizes.
class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
Immediate imm>

View File

@ -578,6 +578,10 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opcode = SystemZ::LDR;
else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::LXR;
else if (SystemZ::VR32BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::VLR32;
else if (SystemZ::VR64BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::VLR64;
else if (SystemZ::VR128BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::VLR;
else
@ -1118,6 +1122,12 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
} else if (RC == &SystemZ::FP128BitRegClass) {
LoadOpcode = SystemZ::LX;
StoreOpcode = SystemZ::STX;
} else if (RC == &SystemZ::VR32BitRegClass) {
LoadOpcode = SystemZ::VL32;
StoreOpcode = SystemZ::VST32;
} else if (RC == &SystemZ::VR64BitRegClass) {
LoadOpcode = SystemZ::VL64;
StoreOpcode = SystemZ::VST64;
} else if (RC == &SystemZ::VF128BitRegClass ||
RC == &SystemZ::VR128BitRegClass) {
LoadOpcode = SystemZ::VL;

View File

@ -14,6 +14,8 @@
let Predicates = [FeatureVector] in {
// Register move.
def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>;
def VLR32 : UnaryAliasVRR<null_frag, v32eb, v32eb>;
def VLR64 : UnaryAliasVRR<null_frag, v64db, v64db>;
// Load GR from VR element.
def VLGVB : BinaryVRSc<"vlgvb", 0xE721, null_frag, v128b, 0>;
@ -123,6 +125,13 @@ let Predicates = [FeatureVector] in {
def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)),
(VLREPG bdxaddr12only:$addr)>;
// Use VLREP to load subvectors. These patterns use "12pair" because
// LEY and LDY offer full 20-bit displacement fields. It's often better
// to use those instructions rather than force a 20-bit displacement
// into a GPR temporary.
def VL32 : UnaryAliasVRX<load, v32eb, bdxaddr12pair>;
def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
// Load logical element and zero.
def VLLEZB : UnaryVRX<"vllezb", 0xE704, z_vllezi8, v128b, 1, 0>;
def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>;
@ -193,6 +202,13 @@ let Predicates = [FeatureVector] in {
imm32zx1:$index),
(VSTEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
// Use VSTE to store subvectors. These patterns use "12pair" because
// STEY and STDY offer full 20-bit displacement fields. It's often better
// to use those instructions rather than force a 20-bit displacement
// into a GPR temporary.
def VST32 : StoreAliasVRX<store, v32eb, bdxaddr12pair>;
def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;
// Scatter element.
def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>;
def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>;
@ -778,7 +794,7 @@ multiclass VectorRounding<Instruction insn, TypedReg tr> {
let Predicates = [FeatureVector] in {
// Add.
def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>;
def WFADB : BinaryVRRc<"wfadb", 0xE7E3, null_frag, v64db, v64db, 3, 8>;
def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>;
// Convert from fixed 64-bit.
def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>;
@ -804,53 +820,55 @@ let Predicates = [FeatureVector] in {
// Divide.
def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>;
def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, null_frag, v64db, v64db, 3, 8>;
def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>;
// Load FP integer.
def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, null_frag, v128db, v128db, 3, 0>;
def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>;
defm : VectorRounding<VFIDB, v128db>;
defm : VectorRounding<WFIDB, v64db>;
// Load lengthened.
def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, null_frag, v64db, v32eb, 2, 8>;
def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fextend, v64db, v32eb, 2, 8>;
// Load rounded,
def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
def : FPConversion<WLEDB, fround, v32eb, v64db, 0, 0>;
// Multiply.
def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, null_frag, v64db, v64db, 3, 8>;
def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>;
// Multiply and add.
def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>;
def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, null_frag, v64db, v64db, 8, 3>;
def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>;
// Multiply and subtract.
def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>;
def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, null_frag, v64db, v64db, 8, 3>;
def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>;
// Load complement,
def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>;
def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 0>;
def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>;
// Load negative.
def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>;
def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 1>;
def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>;
// Load positive.
def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>;
def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, null_frag, v64db, v64db, 3, 8, 2>;
def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>;
// Square root.
def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>;
def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, null_frag, v64db, v64db, 3, 8>;
def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>;
// Subtract.
def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>;
def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, null_frag, v64db, v64db, 3, 8>;
def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>;
// Test data class immediate.
let Defs = [CC] in {
@ -866,7 +884,7 @@ let Predicates = [FeatureVector] in {
let Predicates = [FeatureVector] in {
// Compare scalar.
let Defs = [CC] in
def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, null_frag, v64db, 3>;
def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>;
// Compare and signal scalar.
let Defs = [CC] in

View File

@ -15,6 +15,7 @@
#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
@ -36,6 +37,10 @@ public:
private:
bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
unsigned LLIxL, unsigned LLIxH);
bool shortenOn0(MachineInstr &MI, unsigned Opcode);
bool shortenOn01(MachineInstr &MI, unsigned Opcode);
bool shortenOn001(MachineInstr &MI, unsigned Opcode);
bool shortenFPConv(MachineInstr &MI, unsigned Opcode);
const SystemZInstrInfo *TII;
@ -97,6 +102,64 @@ bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap,
return false;
}
// Change MI's opcode to Opcode if register operand 0 has a 4-bit encoding,
// i.e. if it lives in one of the first 16 FP registers and can therefore
// use the shorter non-vector instruction form. Returns true on success.
bool SystemZShortenInst::shortenOn0(MachineInstr &MI, unsigned Opcode) {
  // Bail out when the register needs the extended (vector) encoding.
  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) >= 16)
    return false;
  MI.setDesc(TII->get(Opcode));
  return true;
}
// Change MI's opcode to Opcode if register operands 0 and 1 both have a
// 4-bit encoding (i.e. both fit in the first 16 FP registers, so the
// shorter non-vector instruction form applies). Returns true on success.
bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) {
  // Both registers must be encodable in 4 bits for the short form.
  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) >= 16 ||
      SystemZMC::getFirstReg(MI.getOperand(1).getReg()) >= 16)
    return false;
  MI.setDesc(TII->get(Opcode));
  return true;
}
// Change MI's opcode to Opcode if register operands 0, 1 and 2 have a
// 4-bit encoding and if operands 0 and 1 are tied (the two-address form
// requires the destination to equal the first source). Returns true on
// success.
bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) {
  const unsigned DestReg = MI.getOperand(0).getReg();
  // The short two-operand form overwrites its first source, so it is only
  // usable when the destination and first source are the same register,
  // and all registers involved fit the 4-bit encoding.
  if (MI.getOperand(1).getReg() != DestReg)
    return false;
  if (SystemZMC::getFirstReg(DestReg) >= 16 ||
      SystemZMC::getFirstReg(MI.getOperand(2).getReg()) >= 16)
    return false;
  MI.setDesc(TII->get(Opcode));
  return true;
}
// MI is a vector-style conversion instruction with the operand order:
// destination, source, exact-suppress, rounding-mode. If both registers
// have a 4-bit encoding then change it to Opcode, which has operand order:
// destination, rounding-mode, source, exact-suppress.
//
// Returns true if MI was rewritten. Because the two instruction forms
// order their operands differently, the operands are detached (copied,
// then removed highest-index-first so earlier indices stay valid) and
// re-added in the order Opcode expects.
bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) {
// Copy the operands before stripping them from MI.
MachineOperand Dest(MI.getOperand(0));
MachineOperand Src(MI.getOperand(1));
MachineOperand Suppress(MI.getOperand(2));
MachineOperand Mode(MI.getOperand(3));
// Remove from the back so remaining indices are not shifted.
MI.RemoveOperand(3);
MI.RemoveOperand(2);
MI.RemoveOperand(1);
MI.RemoveOperand(0);
MI.setDesc(TII->get(Opcode));
// Re-attach in the operand order required by the short form.
MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
.addOperand(Dest)
.addOperand(Mode)
.addOperand(Src)
.addOperand(Suppress);
return true;
}
return false;
}
// Process all instructions in MBB. Return true if something changed.
bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
@ -117,13 +180,83 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
// Iterate backwards through the block looking for instructions to change.
for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
if (Opcode == SystemZ::IILF)
switch (MI.getOpcode()) {
case SystemZ::IILF:
Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL,
SystemZ::LLILH);
else if (Opcode == SystemZ::IIHF)
break;
case SystemZ::IIHF:
Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL,
SystemZ::LLIHH);
break;
case SystemZ::WFADB:
Changed |= shortenOn001(MI, SystemZ::ADBR);
break;
case SystemZ::WFDDB:
Changed |= shortenOn001(MI, SystemZ::DDBR);
break;
case SystemZ::WFIDB:
Changed |= shortenFPConv(MI, SystemZ::FIDBRA);
break;
case SystemZ::WLDEB:
Changed |= shortenOn01(MI, SystemZ::LDEBR);
break;
case SystemZ::WLEDB:
Changed |= shortenFPConv(MI, SystemZ::LEDBRA);
break;
case SystemZ::WFMDB:
Changed |= shortenOn001(MI, SystemZ::MDBR);
break;
case SystemZ::WFLCDB:
Changed |= shortenOn01(MI, SystemZ::LCDBR);
break;
case SystemZ::WFLNDB:
Changed |= shortenOn01(MI, SystemZ::LNDBR);
break;
case SystemZ::WFLPDB:
Changed |= shortenOn01(MI, SystemZ::LPDBR);
break;
case SystemZ::WFSQDB:
Changed |= shortenOn01(MI, SystemZ::SQDBR);
break;
case SystemZ::WFSDB:
Changed |= shortenOn001(MI, SystemZ::SDBR);
break;
case SystemZ::WFCDB:
Changed |= shortenOn01(MI, SystemZ::CDBR);
break;
case SystemZ::VL32:
// For z13 we prefer LDE over LE to avoid partial register dependencies.
Changed |= shortenOn0(MI, SystemZ::LDE32);
break;
case SystemZ::VST32:
Changed |= shortenOn0(MI, SystemZ::STE);
break;
case SystemZ::VL64:
Changed |= shortenOn0(MI, SystemZ::LD);
break;
case SystemZ::VST64:
Changed |= shortenOn0(MI, SystemZ::STD);
break;
}
unsigned UsedLow = 0;
unsigned UsedHigh = 0;
for (auto MOI = MI.operands_begin(), MOE = MI.operands_end();

View File

@ -1,6 +1,7 @@
; Test floating-point absolute.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test f32.
declare float @llvm.fabs.f32(float %f)

View File

@ -1,6 +1,7 @@
; Test negated floating-point absolute.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test f32.
declare float @llvm.fabs.f32(float %f)

View File

@ -1,7 +1,8 @@
; Test 64-bit floating-point addition.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @foo()
; Check register addition.
@ -76,7 +77,7 @@ define double @f6(double %f1, double *%base, i64 %index) {
define double @f7(double *%ptr0) {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, foo@PLT
; CHECK: adb %f0, 160(%r15)
; CHECK-SCALAR: adb %f0, 160(%r15)
; CHECK: br %r14
%ptr1 = getelementptr double, double *%ptr0, i64 2
%ptr2 = getelementptr double, double *%ptr0, i64 4

View File

@ -1,7 +1,10 @@
; Test 64-bit floating-point comparison. The tests assume a z10 implementation
; of select, using conditional branches rather than LOCGR.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
declare double @foo()
@ -9,8 +12,9 @@ declare double @foo()
define i64 @f1(i64 %a, i64 %b, double %f1, double %f2) {
; CHECK-LABEL: f1:
; CHECK: cdbr %f0, %f2
; CHECK-NEXT: je
; CHECK: lgr %r2, %r3
; CHECK-SCALAR-NEXT: je
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%cond = fcmp oeq double %f1, %f2
%res = select i1 %cond, i64 %a, i64 %b
@ -21,8 +25,9 @@ define i64 @f1(i64 %a, i64 %b, double %f1, double %f2) {
define i64 @f2(i64 %a, i64 %b, double %f1, double *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cdb %f0, 0(%r4)
; CHECK-NEXT: je
; CHECK: lgr %r2, %r3
; CHECK-SCALAR-NEXT: je
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%f2 = load double , double *%ptr
%cond = fcmp oeq double %f1, %f2
@ -34,8 +39,9 @@ define i64 @f2(i64 %a, i64 %b, double %f1, double *%ptr) {
define i64 @f3(i64 %a, i64 %b, double %f1, double *%base) {
; CHECK-LABEL: f3:
; CHECK: cdb %f0, 4088(%r4)
; CHECK-NEXT: je
; CHECK: lgr %r2, %r3
; CHECK-SCALAR-NEXT: je
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 511
%f2 = load double , double *%ptr
@ -50,8 +56,9 @@ define i64 @f4(i64 %a, i64 %b, double %f1, double *%base) {
; CHECK-LABEL: f4:
; CHECK: aghi %r4, 4096
; CHECK: cdb %f0, 0(%r4)
; CHECK-NEXT: je
; CHECK: lgr %r2, %r3
; CHECK-SCALAR-NEXT: je
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 512
%f2 = load double , double *%ptr
@ -65,8 +72,9 @@ define i64 @f5(i64 %a, i64 %b, double %f1, double *%base) {
; CHECK-LABEL: f5:
; CHECK: aghi %r4, -8
; CHECK: cdb %f0, 0(%r4)
; CHECK-NEXT: je
; CHECK: lgr %r2, %r3
; CHECK-SCALAR-NEXT: je
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 -1
%f2 = load double , double *%ptr
@ -80,8 +88,9 @@ define i64 @f6(i64 %a, i64 %b, double %f1, double *%base, i64 %index) {
; CHECK-LABEL: f6:
; CHECK: sllg %r1, %r5, 3
; CHECK: cdb %f0, 800(%r1,%r4)
; CHECK-NEXT: je
; CHECK: lgr %r2, %r3
; CHECK-SCALAR-NEXT: je
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%ptr1 = getelementptr double, double *%base, i64 %index
%ptr2 = getelementptr double, double *%ptr1, i64 100
@ -95,7 +104,7 @@ define i64 @f6(i64 %a, i64 %b, double %f1, double *%base, i64 %index) {
define double @f7(double *%ptr0) {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, foo@PLT
; CHECK: cdb {{%f[0-9]+}}, 160(%r15)
; CHECK-SCALAR: cdb {{%f[0-9]+}}, 160(%r15)
; CHECK: br %r14
%ptr1 = getelementptr double, double *%ptr0, i64 2
%ptr2 = getelementptr double, double *%ptr0, i64 4
@ -152,9 +161,12 @@ define double @f7(double *%ptr0) {
; Check comparison with zero.
define i64 @f8(i64 %a, i64 %b, double %f) {
; CHECK-LABEL: f8:
; CHECK: ltdbr %f0, %f0
; CHECK-NEXT: je
; CHECK: lgr %r2, %r3
; CHECK-SCALAR: ltdbr %f0, %f0
; CHECK-SCALAR-NEXT: je
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR: lzdr %f1
; CHECK-VECTOR-NEXT: cdbr %f0, %f1
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
%cond = fcmp oeq double %f, 0.0
%res = select i1 %cond, i64 %a, i64 %b
@ -165,8 +177,9 @@ define i64 @f8(i64 %a, i64 %b, double %f) {
define i64 @f9(i64 %a, i64 %b, double %f2, double *%ptr) {
; CHECK-LABEL: f9:
; CHECK: cdb %f0, 0(%r4)
; CHECK-NEXT: jl {{\.L.*}}
; CHECK: lgr %r2, %r3
; CHECK-SCALAR-NEXT: jl
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrnl %r2, %r3
; CHECK: br %r14
%f1 = load double , double *%ptr
%cond = fcmp ogt double %f1, %f2

View File

@ -1,11 +1,15 @@
; Test floating-point truncations.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
; Test f64->f32.
define float @f1(double %d1, double %d2) {
; CHECK-LABEL: f1:
; CHECK: ledbr %f0, %f2
; CHECK-SCALAR: ledbr %f0, %f2
; CHECK-VECTOR: ledbra %f0, 0, %f2, 0
; CHECK: br %r14
%res = fptrunc double %d2 to float
ret float %res
@ -50,8 +54,10 @@ define double @f4(fp128 *%ptr) {
define void @f5(double *%dst, fp128 *%ptr, double %d1, double %d2) {
; CHECK-LABEL: f5:
; CHECK: ldxbr %f1, %f1
; CHECK: adbr %f1, %f2
; CHECK: std %f1, 0(%r2)
; CHECK-SCALAR: adbr %f1, %f2
; CHECK-SCALAR: std %f1, 0(%r2)
; CHECK-VECTOR: wfadb [[REG:%f[0-9]+]], %f1, %f2
; CHECK-VECTOR: std [[REG]], 0(%r2)
; CHECK: br %r14
%val = load fp128 , fp128 *%ptr
%conv = fptrunc fp128 %val to double

View File

@ -1,6 +1,8 @@
; Test extensions of f32 to f64.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Check register extension.
define double @f1(float %val) {
@ -74,7 +76,7 @@ define double @f6(float *%base, i64 %index) {
; to use LDEB if possible.
define void @f7(double *%ptr1, float *%ptr2) {
; CHECK-LABEL: f7:
; CHECK: ldeb {{%f[0-9]+}}, 16{{[04]}}(%r15)
; CHECK-SCALAR: ldeb {{%f[0-9]+}}, 16{{[04]}}(%r15)
; CHECK: br %r14
%val0 = load volatile float , float *%ptr2
%val1 = load volatile float , float *%ptr2

View File

@ -1,6 +1,8 @@
; Test 64-bit floating-point division.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @foo()
@ -76,7 +78,7 @@ define double @f6(double %f1, double *%base, i64 %index) {
define double @f7(double *%ptr0) {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, foo@PLT
; CHECK: ddb %f0, 160(%r15)
; CHECK-SCALAR: ddb %f0, 160(%r15)
; CHECK: br %r14
%ptr1 = getelementptr double, double *%ptr0, i64 2
%ptr2 = getelementptr double, double *%ptr0, i64 4

View File

@ -1,11 +1,13 @@
; Test moves between FPRs.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test f32 moves.
define float @f1(float %a, float %b) {
; CHECK-LABEL: f1:
; CHECK: ler %f0, %f2
; CHECK: br %r14
ret float %b
}
@ -13,6 +15,7 @@ define float @f1(float %a, float %b) {
define double @f2(double %a, double %b) {
; CHECK-LABEL: f2:
; CHECK: ldr %f0, %f2
; CHECK: br %r14
ret double %b
}
@ -22,6 +25,7 @@ define void @f3(fp128 *%x) {
; CHECK-LABEL: f3:
; CHECK: lxr
; CHECK: axbr
; CHECK: br %r14
%val = load volatile fp128 , fp128 *%x
%sum = fadd fp128 %val, %val
store volatile fp128 %sum, fp128 *%x

View File

@ -1,6 +1,7 @@
; Test 64-bit floating-point loads.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test the low end of the LD range.
define double @f1(double *%src) {

View File

@ -1,6 +1,7 @@
; Test 64-bit floating-point stores.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test the low end of the STD range.
define void @f1(double *%src, double %val) {

View File

@ -0,0 +1,110 @@
; Test 32-bit floating-point loads for z13.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test that we use LDE instead of LE - low end of the LE range.
define float @f1(float *%src) {
; CHECK-LABEL: f1:
; CHECK: lde %f0, 0(%r2)
; CHECK: br %r14
%val = load float, float *%src
ret float %val
}
; Test that we use LDE instead of LE - high end of the LE range.
define float @f2(float *%src) {
; CHECK-LABEL: f2:
; CHECK: lde %f0, 4092(%r2)
; CHECK: br %r14
%ptr = getelementptr float, float *%src, i64 1023
%val = load float, float *%ptr
ret float %val
}
; Check the next word up, which should use LEY instead of LDE.
define float @f3(float *%src) {
; CHECK-LABEL: f3:
; CHECK: ley %f0, 4096(%r2)
; CHECK: br %r14
%ptr = getelementptr float, float *%src, i64 1024
%val = load float, float *%ptr
ret float %val
}
; Check the high end of the aligned LEY range.
define float @f4(float *%src) {
; CHECK-LABEL: f4:
; CHECK: ley %f0, 524284(%r2)
; CHECK: br %r14
%ptr = getelementptr float, float *%src, i64 131071
%val = load float, float *%ptr
ret float %val
}
; Check the next word up, which needs separate address logic.
; Other sequences besides this one would be OK.
define float @f5(float *%src) {
; CHECK-LABEL: f5:
; CHECK: agfi %r2, 524288
; CHECK: lde %f0, 0(%r2)
; CHECK: br %r14
%ptr = getelementptr float, float *%src, i64 131072
%val = load float, float *%ptr
ret float %val
}
; Check the high end of the negative aligned LEY range.
define float @f6(float *%src) {
; CHECK-LABEL: f6:
; CHECK: ley %f0, -4(%r2)
; CHECK: br %r14
%ptr = getelementptr float, float *%src, i64 -1
%val = load float, float *%ptr
ret float %val
}
; Check the low end of the LEY range.
define float @f7(float *%src) {
; CHECK-LABEL: f7:
; CHECK: ley %f0, -524288(%r2)
; CHECK: br %r14
%ptr = getelementptr float, float *%src, i64 -131072
%val = load float, float *%ptr
ret float %val
}
; Check the next word down, which needs separate address logic.
; Other sequences besides this one would be OK.
define float @f8(float *%src) {
; CHECK-LABEL: f8:
; CHECK: agfi %r2, -524292
; CHECK: lde %f0, 0(%r2)
; CHECK: br %r14
%ptr = getelementptr float, float *%src, i64 -131073
%val = load float, float *%ptr
ret float %val
}
; Check that LDE allows an index.
define float @f9(i64 %src, i64 %index) {
; CHECK-LABEL: f9:
; CHECK: lde %f0, 4092({{%r3,%r2|%r2,%r3}})
; CHECK: br %r14
%add1 = add i64 %src, %index
%add2 = add i64 %add1, 4092
%ptr = inttoptr i64 %add2 to float *
%val = load float, float *%ptr
ret float %val
}
; Check that LEY allows an index.
define float @f10(i64 %src, i64 %index) {
; CHECK-LABEL: f10:
; CHECK: ley %f0, 4096({{%r3,%r2|%r2,%r3}})
; CHECK: br %r14
%add1 = add i64 %src, %index
%add2 = add i64 %add1, 4096
%ptr = inttoptr i64 %add2 to float *
%val = load float, float *%ptr
ret float %val
}

View File

@ -1,6 +1,8 @@
; Test multiplication of two f64s, producing an f64 result.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @foo()
@ -76,7 +78,7 @@ define double @f6(double %f1, double *%base, i64 %index) {
define double @f7(double *%ptr0) {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, foo@PLT
; CHECK: mdb %f0, 160(%r15)
; CHECK-SCALAR: mdb %f0, 160(%r15)
; CHECK: br %r14
%ptr1 = getelementptr double, double *%ptr0, i64 2
%ptr2 = getelementptr double, double *%ptr0, i64 4

View File

@ -1,11 +1,15 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
declare double @llvm.fma.f64(double %f1, double %f2, double %f3)
define double @f1(double %f1, double %f2, double %acc) {
; CHECK-LABEL: f1:
; CHECK: madbr %f4, %f0, %f2
; CHECK: ldr %f0, %f4
; CHECK-SCALAR: madbr %f4, %f0, %f2
; CHECK-SCALAR: ldr %f0, %f4
; CHECK-VECTOR: wfmadb %f0, %f0, %f2, %f4
; CHECK: br %r14
%res = call double @llvm.fma.f64 (double %f1, double %f2, double %acc)
ret double %res

View File

@ -1,11 +1,15 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
declare double @llvm.fma.f64(double %f1, double %f2, double %f3)
define double @f1(double %f1, double %f2, double %acc) {
; CHECK-LABEL: f1:
; CHECK: msdbr %f4, %f0, %f2
; CHECK: ldr %f0, %f4
; CHECK-SCALAR: msdbr %f4, %f0, %f2
; CHECK-SCALAR: ldr %f0, %f4
; CHECK-VECTOR: wfmsdb %f0, %f0, %f2, %f4
; CHECK: br %r14
%negacc = fsub double -0.0, %acc
%res = call double @llvm.fma.f64 (double %f1, double %f2, double %negacc)

View File

@ -1,6 +1,7 @@
; Test floating-point negation.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test f32.
define float @f1(float %f) {

View File

@ -1,6 +1,9 @@
; Test rounding functions for z196 and above.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
; Test rint for f32.
declare float @llvm.rint.f32(float %f)
@ -16,7 +19,8 @@ define float @f1(float %f) {
declare double @llvm.rint.f64(double %f)
define double @f2(double %f) {
; CHECK-LABEL: f2:
; CHECK: fidbr %f0, 0, %f0
; CHECK-SCALAR: fidbr %f0, 0, %f0
; CHECK-VECTOR: fidbra %f0, 0, %f0, 0
; CHECK: br %r14
%res = call double @llvm.rint.f64(double %f)
ret double %res

View File

@ -1,6 +1,8 @@
; Test 64-bit square root.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @llvm.sqrt.f64(double %f)
declare double @sqrt(double)
@ -77,7 +79,7 @@ define double @f6(double *%base, i64 %index) {
; to use SQDB if possible.
define void @f7(double *%ptr) {
; CHECK-LABEL: f7:
; CHECK: sqdb {{%f[0-9]+}}, 160(%r15)
; CHECK-SCALAR: sqdb {{%f[0-9]+}}, 160(%r15)
; CHECK: br %r14
%val0 = load volatile double , double *%ptr
%val1 = load volatile double , double *%ptr

View File

@ -1,6 +1,8 @@
; Test 64-bit floating-point subtraction.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @foo()
@ -76,7 +78,7 @@ define double @f6(double %f1, double *%base, i64 %index) {
define double @f7(double *%ptr0) {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, foo@PLT
; CHECK: sdb %f0, 16{{[04]}}(%r15)
; CHECK-SCALAR: sdb %f0, 16{{[04]}}(%r15)
; CHECK: br %r14
%ptr1 = getelementptr double, double *%ptr0, i64 2
%ptr2 = getelementptr double, double *%ptr0, i64 4

View File

@ -2,7 +2,7 @@
; uses a different register class, but the set of saved and restored
; registers should be the same.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; This function should require all FPRs, but no other spill slots.
; We need to save and restore 8 of the 16 FPRs, so the frame size

View File

@ -1,7 +1,7 @@
; Test the saving and restoring of FPRs in large frames.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck -check-prefix=CHECK-NOFP %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
; Test a frame size that requires some FPRs to be saved and loaded using
; the 20-bit STDY and LDY while others can use the 12-bit STD and LD.

View File

@ -1,6 +1,6 @@
; Test spilling of FPRs.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; We need to save and restore 8 of the 16 FPRs and allocate an additional
; 4-byte spill slot, rounded to 8 bytes. The frame size should be exactly

View File

@ -0,0 +1,445 @@
; Like frame-03.ll, but for z13. In this case we have 16 more registers
; available.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; This function should require all FPRs, but no other spill slots.
; We need to save and restore 8 of the 16 FPRs, so the frame size
; should be exactly 160 + 8 * 8 = 224. The CFA offset is 160
; (the caller-allocated part of the frame) + 224.
; All 32 loaded values are live at once across the chain of dependent
; subtractions below, so the allocator has to use every FP/vector
; register.  The CHECK-DAG lines sample both ends of each range: plain
; ld loads for the FPR halves %f0-%f15 and the replicating vlrepg load
; for the vector-only registers %v16-%v31 (whose high f64 is used).
define void @f1(double *%ptr) {
; CHECK-LABEL: f1:
; CHECK: aghi %r15, -224
; CHECK: .cfi_def_cfa_offset 384
; CHECK: std %f8, 216(%r15)
; CHECK: std %f9, 208(%r15)
; CHECK: std %f10, 200(%r15)
; CHECK: std %f11, 192(%r15)
; CHECK: std %f12, 184(%r15)
; CHECK: std %f13, 176(%r15)
; CHECK: std %f14, 168(%r15)
; CHECK: std %f15, 160(%r15)
; CHECK: .cfi_offset %f8, -168
; CHECK: .cfi_offset %f9, -176
; CHECK: .cfi_offset %f10, -184
; CHECK: .cfi_offset %f11, -192
; CHECK: .cfi_offset %f12, -200
; CHECK: .cfi_offset %f13, -208
; CHECK: .cfi_offset %f14, -216
; CHECK: .cfi_offset %f15, -224
; CHECK-DAG: ld %f0, 0(%r2)
; CHECK-DAG: ld %f7, 0(%r2)
; CHECK-DAG: ld %f8, 0(%r2)
; CHECK-DAG: ld %f15, 0(%r2)
; CHECK-DAG: vlrepg %v16, 0(%r2)
; CHECK-DAG: vlrepg %v23, 0(%r2)
; CHECK-DAG: vlrepg %v24, 0(%r2)
; CHECK-DAG: vlrepg %v31, 0(%r2)
; CHECK: ld %f8, 216(%r15)
; CHECK: ld %f9, 208(%r15)
; CHECK: ld %f10, 200(%r15)
; CHECK: ld %f11, 192(%r15)
; CHECK: ld %f12, 184(%r15)
; CHECK: ld %f13, 176(%r15)
; CHECK: ld %f14, 168(%r15)
; CHECK: ld %f15, 160(%r15)
; CHECK: aghi %r15, 224
; CHECK: br %r14
; Volatile loads: one value per register, none can be folded or merged.
%l0 = load volatile double, double *%ptr
%l1 = load volatile double, double *%ptr
%l2 = load volatile double, double *%ptr
%l3 = load volatile double, double *%ptr
%l4 = load volatile double, double *%ptr
%l5 = load volatile double, double *%ptr
%l6 = load volatile double, double *%ptr
%l7 = load volatile double, double *%ptr
%l8 = load volatile double, double *%ptr
%l9 = load volatile double, double *%ptr
%l10 = load volatile double, double *%ptr
%l11 = load volatile double, double *%ptr
%l12 = load volatile double, double *%ptr
%l13 = load volatile double, double *%ptr
%l14 = load volatile double, double *%ptr
%l15 = load volatile double, double *%ptr
%l16 = load volatile double, double *%ptr
%l17 = load volatile double, double *%ptr
%l18 = load volatile double, double *%ptr
%l19 = load volatile double, double *%ptr
%l20 = load volatile double, double *%ptr
%l21 = load volatile double, double *%ptr
%l22 = load volatile double, double *%ptr
%l23 = load volatile double, double *%ptr
%l24 = load volatile double, double *%ptr
%l25 = load volatile double, double *%ptr
%l26 = load volatile double, double *%ptr
%l27 = load volatile double, double *%ptr
%l28 = load volatile double, double *%ptr
%l29 = load volatile double, double *%ptr
%l30 = load volatile double, double *%ptr
%l31 = load volatile double, double *%ptr
; Serial dependence: each result feeds the next subtraction, so every
; %l value stays live until its own fsub executes.
%acc0 = fsub double %l0, %l0
%acc1 = fsub double %l1, %acc0
%acc2 = fsub double %l2, %acc1
%acc3 = fsub double %l3, %acc2
%acc4 = fsub double %l4, %acc3
%acc5 = fsub double %l5, %acc4
%acc6 = fsub double %l6, %acc5
%acc7 = fsub double %l7, %acc6
%acc8 = fsub double %l8, %acc7
%acc9 = fsub double %l9, %acc8
%acc10 = fsub double %l10, %acc9
%acc11 = fsub double %l11, %acc10
%acc12 = fsub double %l12, %acc11
%acc13 = fsub double %l13, %acc12
%acc14 = fsub double %l14, %acc13
%acc15 = fsub double %l15, %acc14
%acc16 = fsub double %l16, %acc15
%acc17 = fsub double %l17, %acc16
%acc18 = fsub double %l18, %acc17
%acc19 = fsub double %l19, %acc18
%acc20 = fsub double %l20, %acc19
%acc21 = fsub double %l21, %acc20
%acc22 = fsub double %l22, %acc21
%acc23 = fsub double %l23, %acc22
%acc24 = fsub double %l24, %acc23
%acc25 = fsub double %l25, %acc24
%acc26 = fsub double %l26, %acc25
%acc27 = fsub double %l27, %acc26
%acc28 = fsub double %l28, %acc27
%acc29 = fsub double %l29, %acc28
%acc30 = fsub double %l30, %acc29
%acc31 = fsub double %l31, %acc30
; Volatile stores force every accumulator to be materialized.
store volatile double %acc0, double *%ptr
store volatile double %acc1, double *%ptr
store volatile double %acc2, double *%ptr
store volatile double %acc3, double *%ptr
store volatile double %acc4, double *%ptr
store volatile double %acc5, double *%ptr
store volatile double %acc6, double *%ptr
store volatile double %acc7, double *%ptr
store volatile double %acc8, double *%ptr
store volatile double %acc9, double *%ptr
store volatile double %acc10, double *%ptr
store volatile double %acc11, double *%ptr
store volatile double %acc12, double *%ptr
store volatile double %acc13, double *%ptr
store volatile double %acc14, double *%ptr
store volatile double %acc15, double *%ptr
store volatile double %acc16, double *%ptr
store volatile double %acc17, double *%ptr
store volatile double %acc18, double *%ptr
store volatile double %acc19, double *%ptr
store volatile double %acc20, double *%ptr
store volatile double %acc21, double *%ptr
store volatile double %acc22, double *%ptr
store volatile double %acc23, double *%ptr
store volatile double %acc24, double *%ptr
store volatile double %acc25, double *%ptr
store volatile double %acc26, double *%ptr
store volatile double %acc27, double *%ptr
store volatile double %acc28, double *%ptr
store volatile double %acc29, double *%ptr
store volatile double %acc30, double *%ptr
store volatile double %acc31, double *%ptr
ret void
}
; Like f1, but requires one fewer FPR. We allocate in numerical order,
; so %f15 is the one that gets dropped.
; 31 live values instead of 32: %l15/%acc15 are omitted, so one fewer
; register is needed.  Since allocation is in numerical order, the
; dropped register should be %f15 / %v15 (checked with CHECK-NOT), and
; the frame shrinks by one 8-byte save slot (224 -> 216).
define void @f2(double *%ptr) {
; CHECK-LABEL: f2:
; CHECK: aghi %r15, -216
; CHECK: .cfi_def_cfa_offset 376
; CHECK: std %f8, 208(%r15)
; CHECK: std %f9, 200(%r15)
; CHECK: std %f10, 192(%r15)
; CHECK: std %f11, 184(%r15)
; CHECK: std %f12, 176(%r15)
; CHECK: std %f13, 168(%r15)
; CHECK: std %f14, 160(%r15)
; CHECK: .cfi_offset %f8, -168
; CHECK: .cfi_offset %f9, -176
; CHECK: .cfi_offset %f10, -184
; CHECK: .cfi_offset %f11, -192
; CHECK: .cfi_offset %f12, -200
; CHECK: .cfi_offset %f13, -208
; CHECK: .cfi_offset %f14, -216
; CHECK-NOT: %v15
; CHECK-NOT: %f15
; CHECK: ld %f8, 208(%r15)
; CHECK: ld %f9, 200(%r15)
; CHECK: ld %f10, 192(%r15)
; CHECK: ld %f11, 184(%r15)
; CHECK: ld %f12, 176(%r15)
; CHECK: ld %f13, 168(%r15)
; CHECK: ld %f14, 160(%r15)
; CHECK: aghi %r15, 216
; CHECK: br %r14
; Volatile loads (note: no %l15).
%l0 = load volatile double, double *%ptr
%l1 = load volatile double, double *%ptr
%l2 = load volatile double, double *%ptr
%l3 = load volatile double, double *%ptr
%l4 = load volatile double, double *%ptr
%l5 = load volatile double, double *%ptr
%l6 = load volatile double, double *%ptr
%l7 = load volatile double, double *%ptr
%l8 = load volatile double, double *%ptr
%l9 = load volatile double, double *%ptr
%l10 = load volatile double, double *%ptr
%l11 = load volatile double, double *%ptr
%l12 = load volatile double, double *%ptr
%l13 = load volatile double, double *%ptr
%l14 = load volatile double, double *%ptr
%l16 = load volatile double, double *%ptr
%l17 = load volatile double, double *%ptr
%l18 = load volatile double, double *%ptr
%l19 = load volatile double, double *%ptr
%l20 = load volatile double, double *%ptr
%l21 = load volatile double, double *%ptr
%l22 = load volatile double, double *%ptr
%l23 = load volatile double, double *%ptr
%l24 = load volatile double, double *%ptr
%l25 = load volatile double, double *%ptr
%l26 = load volatile double, double *%ptr
%l27 = load volatile double, double *%ptr
%l28 = load volatile double, double *%ptr
%l29 = load volatile double, double *%ptr
%l30 = load volatile double, double *%ptr
%l31 = load volatile double, double *%ptr
; Dependent subtraction chain keeps all 31 values live simultaneously.
%acc0 = fsub double %l0, %l0
%acc1 = fsub double %l1, %acc0
%acc2 = fsub double %l2, %acc1
%acc3 = fsub double %l3, %acc2
%acc4 = fsub double %l4, %acc3
%acc5 = fsub double %l5, %acc4
%acc6 = fsub double %l6, %acc5
%acc7 = fsub double %l7, %acc6
%acc8 = fsub double %l8, %acc7
%acc9 = fsub double %l9, %acc8
%acc10 = fsub double %l10, %acc9
%acc11 = fsub double %l11, %acc10
%acc12 = fsub double %l12, %acc11
%acc13 = fsub double %l13, %acc12
%acc14 = fsub double %l14, %acc13
%acc16 = fsub double %l16, %acc14
%acc17 = fsub double %l17, %acc16
%acc18 = fsub double %l18, %acc17
%acc19 = fsub double %l19, %acc18
%acc20 = fsub double %l20, %acc19
%acc21 = fsub double %l21, %acc20
%acc22 = fsub double %l22, %acc21
%acc23 = fsub double %l23, %acc22
%acc24 = fsub double %l24, %acc23
%acc25 = fsub double %l25, %acc24
%acc26 = fsub double %l26, %acc25
%acc27 = fsub double %l27, %acc26
%acc28 = fsub double %l28, %acc27
%acc29 = fsub double %l29, %acc28
%acc30 = fsub double %l30, %acc29
%acc31 = fsub double %l31, %acc30
; Volatile stores keep every accumulator observable.
store volatile double %acc0, double *%ptr
store volatile double %acc1, double *%ptr
store volatile double %acc2, double *%ptr
store volatile double %acc3, double *%ptr
store volatile double %acc4, double *%ptr
store volatile double %acc5, double *%ptr
store volatile double %acc6, double *%ptr
store volatile double %acc7, double *%ptr
store volatile double %acc8, double *%ptr
store volatile double %acc9, double *%ptr
store volatile double %acc10, double *%ptr
store volatile double %acc11, double *%ptr
store volatile double %acc12, double *%ptr
store volatile double %acc13, double *%ptr
store volatile double %acc14, double *%ptr
store volatile double %acc16, double *%ptr
store volatile double %acc17, double *%ptr
store volatile double %acc18, double *%ptr
store volatile double %acc19, double *%ptr
store volatile double %acc20, double *%ptr
store volatile double %acc21, double *%ptr
store volatile double %acc22, double *%ptr
store volatile double %acc23, double *%ptr
store volatile double %acc24, double *%ptr
store volatile double %acc25, double *%ptr
store volatile double %acc26, double *%ptr
store volatile double %acc27, double *%ptr
store volatile double %acc28, double *%ptr
store volatile double %acc29, double *%ptr
store volatile double %acc30, double *%ptr
store volatile double %acc31, double *%ptr
ret void
}
; Like f1, but should require only one call-saved FPR.
; 25 live values: 8 call-clobbered FPRs (%f0-%f7) plus %f8 plus the 16
; call-clobbered vector-only registers %v16-%v31.  Only %f8 is
; call-saved, so the frame holds exactly one 8-byte save slot (168) and
; no register from %f9/%v9 upward may appear (CHECK-NOT).
define void @f3(double *%ptr) {
; CHECK-LABEL: f3:
; CHECK: aghi %r15, -168
; CHECK: .cfi_def_cfa_offset 328
; CHECK: std %f8, 160(%r15)
; CHECK: .cfi_offset %f8, -168
; CHECK-NOT: {{%[fv]9}}
; CHECK-NOT: {{%[fv]1[0-5]}}
; CHECK: ld %f8, 160(%r15)
; CHECK: aghi %r15, 168
; CHECK: br %r14
; Volatile loads: %l0-%l8 target FPRs, %l16-%l31 the vector-only regs.
%l0 = load volatile double, double *%ptr
%l1 = load volatile double, double *%ptr
%l2 = load volatile double, double *%ptr
%l3 = load volatile double, double *%ptr
%l4 = load volatile double, double *%ptr
%l5 = load volatile double, double *%ptr
%l6 = load volatile double, double *%ptr
%l7 = load volatile double, double *%ptr
%l8 = load volatile double, double *%ptr
%l16 = load volatile double, double *%ptr
%l17 = load volatile double, double *%ptr
%l18 = load volatile double, double *%ptr
%l19 = load volatile double, double *%ptr
%l20 = load volatile double, double *%ptr
%l21 = load volatile double, double *%ptr
%l22 = load volatile double, double *%ptr
%l23 = load volatile double, double *%ptr
%l24 = load volatile double, double *%ptr
%l25 = load volatile double, double *%ptr
%l26 = load volatile double, double *%ptr
%l27 = load volatile double, double *%ptr
%l28 = load volatile double, double *%ptr
%l29 = load volatile double, double *%ptr
%l30 = load volatile double, double *%ptr
%l31 = load volatile double, double *%ptr
; Dependent chain keeps all 25 values live at once.
%acc0 = fsub double %l0, %l0
%acc1 = fsub double %l1, %acc0
%acc2 = fsub double %l2, %acc1
%acc3 = fsub double %l3, %acc2
%acc4 = fsub double %l4, %acc3
%acc5 = fsub double %l5, %acc4
%acc6 = fsub double %l6, %acc5
%acc7 = fsub double %l7, %acc6
%acc8 = fsub double %l8, %acc7
%acc16 = fsub double %l16, %acc8
%acc17 = fsub double %l17, %acc16
%acc18 = fsub double %l18, %acc17
%acc19 = fsub double %l19, %acc18
%acc20 = fsub double %l20, %acc19
%acc21 = fsub double %l21, %acc20
%acc22 = fsub double %l22, %acc21
%acc23 = fsub double %l23, %acc22
%acc24 = fsub double %l24, %acc23
%acc25 = fsub double %l25, %acc24
%acc26 = fsub double %l26, %acc25
%acc27 = fsub double %l27, %acc26
%acc28 = fsub double %l28, %acc27
%acc29 = fsub double %l29, %acc28
%acc30 = fsub double %l30, %acc29
%acc31 = fsub double %l31, %acc30
; Volatile stores force all accumulators to be materialized.
store volatile double %acc0, double *%ptr
store volatile double %acc1, double *%ptr
store volatile double %acc2, double *%ptr
store volatile double %acc3, double *%ptr
store volatile double %acc4, double *%ptr
store volatile double %acc5, double *%ptr
store volatile double %acc6, double *%ptr
store volatile double %acc7, double *%ptr
store volatile double %acc8, double *%ptr
store volatile double %acc16, double *%ptr
store volatile double %acc17, double *%ptr
store volatile double %acc18, double *%ptr
store volatile double %acc19, double *%ptr
store volatile double %acc20, double *%ptr
store volatile double %acc21, double *%ptr
store volatile double %acc22, double *%ptr
store volatile double %acc23, double *%ptr
store volatile double %acc24, double *%ptr
store volatile double %acc25, double *%ptr
store volatile double %acc26, double *%ptr
store volatile double %acc27, double *%ptr
store volatile double %acc28, double *%ptr
store volatile double %acc29, double *%ptr
store volatile double %acc30, double *%ptr
store volatile double %acc31, double *%ptr
ret void
}
; This function should use all call-clobbered FPRs and vector registers
; but no call-saved ones. It shouldn't need to create a frame.
; 24 live values fit exactly in the call-clobbered set: %f0-%f7 plus
; the vector-only registers %v16-%v31.  No call-saved register is
; touched and no stack frame should be created (CHECK-NOT on %r15).
define void @f4(double *%ptr) {
; CHECK-LABEL: f4:
; CHECK-NOT: %r15
; CHECK-NOT: {{%[fv][89]}}
; CHECK-NOT: {{%[fv]1[0-5]}}
; CHECK: br %r14
; Volatile loads: %l0-%l7 target FPRs, %l16-%l31 the vector-only regs.
%l0 = load volatile double, double *%ptr
%l1 = load volatile double, double *%ptr
%l2 = load volatile double, double *%ptr
%l3 = load volatile double, double *%ptr
%l4 = load volatile double, double *%ptr
%l5 = load volatile double, double *%ptr
%l6 = load volatile double, double *%ptr
%l7 = load volatile double, double *%ptr
%l16 = load volatile double, double *%ptr
%l17 = load volatile double, double *%ptr
%l18 = load volatile double, double *%ptr
%l19 = load volatile double, double *%ptr
%l20 = load volatile double, double *%ptr
%l21 = load volatile double, double *%ptr
%l22 = load volatile double, double *%ptr
%l23 = load volatile double, double *%ptr
%l24 = load volatile double, double *%ptr
%l25 = load volatile double, double *%ptr
%l26 = load volatile double, double *%ptr
%l27 = load volatile double, double *%ptr
%l28 = load volatile double, double *%ptr
%l29 = load volatile double, double *%ptr
%l30 = load volatile double, double *%ptr
%l31 = load volatile double, double *%ptr
; Dependent chain keeps all 24 values live at once.
%acc0 = fsub double %l0, %l0
%acc1 = fsub double %l1, %acc0
%acc2 = fsub double %l2, %acc1
%acc3 = fsub double %l3, %acc2
%acc4 = fsub double %l4, %acc3
%acc5 = fsub double %l5, %acc4
%acc6 = fsub double %l6, %acc5
%acc7 = fsub double %l7, %acc6
%acc16 = fsub double %l16, %acc7
%acc17 = fsub double %l17, %acc16
%acc18 = fsub double %l18, %acc17
%acc19 = fsub double %l19, %acc18
%acc20 = fsub double %l20, %acc19
%acc21 = fsub double %l21, %acc20
%acc22 = fsub double %l22, %acc21
%acc23 = fsub double %l23, %acc22
%acc24 = fsub double %l24, %acc23
%acc25 = fsub double %l25, %acc24
%acc26 = fsub double %l26, %acc25
%acc27 = fsub double %l27, %acc26
%acc28 = fsub double %l28, %acc27
%acc29 = fsub double %l29, %acc28
%acc30 = fsub double %l30, %acc29
%acc31 = fsub double %l31, %acc30
; Volatile stores force all accumulators to be materialized.
store volatile double %acc0, double *%ptr
store volatile double %acc1, double *%ptr
store volatile double %acc2, double *%ptr
store volatile double %acc3, double *%ptr
store volatile double %acc4, double *%ptr
store volatile double %acc5, double *%ptr
store volatile double %acc6, double *%ptr
store volatile double %acc7, double *%ptr
store volatile double %acc16, double *%ptr
store volatile double %acc17, double *%ptr
store volatile double %acc18, double *%ptr
store volatile double %acc19, double *%ptr
store volatile double %acc20, double *%ptr
store volatile double %acc21, double *%ptr
store volatile double %acc22, double *%ptr
store volatile double %acc23, double *%ptr
store volatile double %acc24, double *%ptr
store volatile double %acc25, double *%ptr
store volatile double %acc26, double *%ptr
store volatile double %acc27, double *%ptr
store volatile double %acc28, double *%ptr
store volatile double %acc29, double *%ptr
store volatile double %acc30, double *%ptr
store volatile double %acc31, double *%ptr
ret void
}

View File

@ -1,7 +1,8 @@
; Test v2f64 absolute.
; Test f64 and v2f64 absolute.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @llvm.fabs.f64(double)
declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
; Test a plain absolute.
@ -22,3 +23,24 @@ define <2 x double> @f2(<2 x double> %val) {
%ret = fsub <2 x double> <double -0.0, double -0.0>, %abs
ret <2 x double> %ret
}
; Test an f64 absolute that uses vector registers.
define double @f3(<2 x double> %val) {
; CHECK-LABEL: f3:
; CHECK: wflpdb %f0, %v24
; CHECK: br %r14
; Element 0 is the high f64 of the vector argument %v24, so the fabs
; should be done directly with the vector-facility wflpdb rather than
; extracting the value into an FPR first.
%scalar = extractelement <2 x double> %val, i32 0
%ret = call double @llvm.fabs.f64(double %scalar)
ret double %ret
}
; Test an f64 negative absolute that uses vector registers.
define double @f4(<2 x double> %val) {
; CHECK-LABEL: f4:
; CHECK: wflndb %f0, %v24
; CHECK: br %r14
; fabs followed by negation should fold into the single negative-abs
; instruction wflndb, operating on the high f64 of %v24.
%scalar = extractelement <2 x double> %val, i32 0
%abs = call double @llvm.fabs.f64(double %scalar)
%ret = fsub double -0.0, %abs
ret double %ret
}

View File

@ -47,3 +47,14 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1,
%ret = fadd <2 x double> %val1, %val2
ret <2 x double> %ret
}
; Test an f64 addition that uses vector registers.
define double @f6(<2 x double> %val1, <2 x double> %val2) {
; CHECK-LABEL: f6:
; CHECK: wfadb %f0, %v24, %v26
; CHECK: br %r14
; Both scalars are the high f64s of the vector arguments %v24/%v26, so
; the add should use the vector-facility wfadb directly instead of
; extracting the operands into FPRs.
%scalar1 = extractelement <2 x double> %val1, i32 0
%scalar2 = extractelement <2 x double> %val2, i32 0
%ret = fadd double %scalar1, %scalar2
ret double %ret
}

View File

@ -1,4 +1,4 @@
; Test v2f64 comparisons.
; Test f64 and v2f64 comparisons.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
@ -335,3 +335,15 @@ define <2 x double> @f28(<2 x double> %val1, <2 x double> %val2,
%ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
ret <2 x double> %ret
}
; Test an f64 comparison that uses vector registers.
define i64 @f29(i64 %a, i64 %b, double %f1, <2 x double> %vec) {
; CHECK-LABEL: f29:
; CHECK: wfcdb %f0, %v24
; CHECK-NEXT: locgrne %r2, %r3
; CHECK: br %r14
; wfcdb compares %f1 against the high f64 of %v24; locgrne then
; replaces %r2 (= %a) with %r3 (= %b) when the operands were unequal,
; implementing the select below.
%f2 = extractelement <2 x double> %vec, i32 0
%cond = fcmp oeq double %f1, %f2
%res = select i1 %cond, i64 %a, i64 %b
ret i64 %res
}

View File

@ -11,3 +11,23 @@ define void @f1(<2 x double> %val, <2 x float> *%ptr) {
store <2 x float> %res, <2 x float> *%ptr
ret void
}
; Test conversion of an f64 in a vector register to an f32.
define float @f2(<2 x double> %vec) {
; CHECK-LABEL: f2:
; CHECK: wledb %f0, %v24
; CHECK: br %r14
; The truncation should read the high f64 of %v24 directly via the
; vector-facility round-to-short wledb instruction.
%scalar = extractelement <2 x double> %vec, i32 0
%ret = fptrunc double %scalar to float
ret float %ret
}
; Test conversion of an f32 in a vector register to an f64.
define double @f3(<4 x float> %vec) {
; CHECK-LABEL: f3:
; CHECK: wldeb %f0, %v24
; CHECK: br %r14
; The extension should read element 0 of the v4f32 argument %v24
; directly via the vector-facility load-lengthened wldeb instruction.
%scalar = extractelement <4 x float> %vec, i32 0
%ret = fpext float %scalar to double
ret double %ret
}

View File

@ -70,3 +70,14 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1,
%ret = fdiv <2 x double> %val1, %val2
ret <2 x double> %ret
}
; Test an f64 division that uses vector registers.
define double @f6(<2 x double> %val1, <2 x double> %val2) {
; CHECK-LABEL: f6:
; CHECK: wfddb %f0, %v24, %v26
; CHECK: br %r14
; Both scalars are the high f64s of %v24/%v26, so the division should
; use the vector-facility wfddb directly, with no FPR extraction.
%scalar1 = extractelement <2 x double> %val1, i32 0
%scalar2 = extractelement <2 x double> %val2, i32 0
%ret = fdiv double %scalar1, %scalar2
ret double %ret
}

View File

@ -47,3 +47,14 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1,
%ret = fmul <2 x double> %val1, %val2
ret <2 x double> %ret
}
; Test an f64 multiplication that uses vector registers.
define double @f6(<2 x double> %val1, <2 x double> %val2) {
; CHECK-LABEL: f6:
; CHECK: wfmdb %f0, %v24, %v26
; CHECK: br %r14
; Both scalars are the high f64s of %v24/%v26, so the multiplication
; should use the vector-facility wfmdb directly, with no FPR
; extraction.
%scalar1 = extractelement <2 x double> %val1, i32 0
%scalar2 = extractelement <2 x double> %val2, i32 0
%ret = fmul double %scalar1, %scalar2
ret double %ret
}

View File

@ -46,3 +46,13 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val) {
%ret = fsub <2 x double> <double -0.0, double -0.0>, %val
ret <2 x double> %ret
}
; Test an f64 negation that uses vector registers.
define double @f6(<2 x double> %val) {
; CHECK-LABEL: f6:
; CHECK: wflcdb %f0, %v24
; CHECK: br %r14
; Negating the high f64 of %v24 should use the vector-facility
; load-complement wflcdb rather than extracting into an FPR first.
%scalar = extractelement <2 x double> %val, i32 0
%ret = fsub double -0.0, %scalar
ret double %ret
}

View File

@ -2,6 +2,12 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @llvm.rint.f64(double)
declare double @llvm.nearbyint.f64(double)
declare double @llvm.floor.f64(double)
declare double @llvm.ceil.f64(double)
declare double @llvm.trunc.f64(double)
declare double @llvm.round.f64(double)
declare <2 x double> @llvm.rint.v2f64(<2 x double>)
declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>)
declare <2 x double> @llvm.floor.v2f64(<2 x double>)
@ -56,3 +62,57 @@ define <2 x double> @f6(<2 x double> %val) {
%res = call <2 x double> @llvm.round.v2f64(<2 x double> %val)
ret <2 x double> %res
}
; rint on the high f64 of %v24: wfidb with mask 0 (IEEE inexact not
; suppressed, matching rint) and rounding method 0 (current FPC mode).
define double @f7(<2 x double> %val) {
; CHECK-LABEL: f7:
; CHECK: wfidb %f0, %v24, 0, 0
; CHECK: br %r14
%scalar = extractelement <2 x double> %val, i32 0
%res = call double @llvm.rint.f64(double %scalar)
ret double %res
}
; nearbyint on the high f64 of %v24: like rint, but mask 4 suppresses
; the inexact exception, as nearbyint requires.
define double @f8(<2 x double> %val) {
; CHECK-LABEL: f8:
; CHECK: wfidb %f0, %v24, 4, 0
; CHECK: br %r14
%scalar = extractelement <2 x double> %val, i32 0
%res = call double @llvm.nearbyint.f64(double %scalar)
ret double %res
}
; floor on the high f64 of %v24: wfidb with inexact suppressed (4) and
; rounding method 7 (toward minus infinity).
define double @f9(<2 x double> %val) {
; CHECK-LABEL: f9:
; CHECK: wfidb %f0, %v24, 4, 7
; CHECK: br %r14
%scalar = extractelement <2 x double> %val, i32 0
%res = call double @llvm.floor.f64(double %scalar)
ret double %res
}
; ceil on the high f64 of %v24: wfidb with inexact suppressed (4) and
; rounding method 6 (toward plus infinity).
define double @f10(<2 x double> %val) {
; CHECK-LABEL: f10:
; CHECK: wfidb %f0, %v24, 4, 6
; CHECK: br %r14
%scalar = extractelement <2 x double> %val, i32 0
%res = call double @llvm.ceil.f64(double %scalar)
ret double %res
}
; trunc on the high f64 of %v24: wfidb with inexact suppressed (4) and
; rounding method 5 (toward zero).
define double @f11(<2 x double> %val) {
; CHECK-LABEL: f11:
; CHECK: wfidb %f0, %v24, 4, 5
; CHECK: br %r14
%scalar = extractelement <2 x double> %val, i32 0
%res = call double @llvm.trunc.f64(double %scalar)
ret double %res
}
; round on the high f64 of %v24: wfidb with inexact suppressed (4) and
; rounding method 1 (to nearest, ties away from zero).
define double @f12(<2 x double> %val) {
; CHECK-LABEL: f12:
; CHECK: wfidb %f0, %v24, 4, 1
; CHECK: br %r14
%scalar = extractelement <2 x double> %val, i32 0
%res = call double @llvm.round.f64(double %scalar)
ret double %res
}

View File

@ -1,7 +1,8 @@
; Test v2f64 square root.
; Test f64 and v2f64 square root.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
declare double @llvm.sqrt.f64(double)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
define <2 x double> @f1(<2 x double> %val) {
@ -11,3 +12,12 @@ define <2 x double> @f1(<2 x double> %val) {
%ret = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %val)
ret <2 x double> %ret
}
define double @f2(<2 x double> %val) {
; CHECK-LABEL: f2:
; CHECK: wfsqdb %f0, %v24
; CHECK: br %r14
; The square root of the high f64 of %v24 should use the
; vector-facility wfsqdb directly, with no FPR extraction.
%scalar = extractelement <2 x double> %val, i32 0
%ret = call double @llvm.sqrt.f64(double %scalar)
ret double %ret
}

View File

@ -74,3 +74,14 @@ define <2 x double> @f6(<2 x double> %dummy, <2 x double> %val1,
%ret = fsub <2 x double> %val1, %val2
ret <2 x double> %ret
}
; Test an f64 subtraction that uses vector registers.
define double @f7(<2 x double> %val1, <2 x double> %val2) {
; CHECK-LABEL: f7:
; CHECK: wfsdb %f0, %v24, %v26
; CHECK: br %r14
; Both scalars are the high f64s of %v24/%v26, so the subtraction
; should use the vector-facility wfsdb directly, with no FPR
; extraction.
%scalar1 = extractelement <2 x double> %val1, i32 0
%scalar2 = extractelement <2 x double> %val2, i32 0
%ret = fsub double %scalar1, %scalar2
ret double %ret
}