[AArch64] Add an FP load balancing pass for Cortex-A57

For best-case performance on Cortex-A57, we should try to use a balanced mix of odd and even D-registers when performing a critical sequence of independent, non-quadword FP/ASIMD floating-point multiply or multiply-accumulate operations. This pass attempts to detect situations where the register allocation may adversely affect this load balancing and to change the registers used so as to better utilize the CPU. Ideally we'd just take each multiply or multiply-accumulate in turn and allocate it alternating even or odd registers. However, multiply-accumulates are most efficiently performed in the same functional unit as their accumulation operand. Therefore this pass tries to find maximal sequences ("Chains") of multiply-accumulates linked via their accumulation operand, and assign them all the same "color" (oddness/evenness). This optimization affects S-register and D-register floating point multiplies and FMADD/FMAs, as well as vector (floating point only) muls and FMADD/FMA. Q register instructions (and 128-bit vector instructions) are not affected. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@215199 91177308-0d34-0410-b5e6-96231b3b80d8
2024-12-15 04:30:12 +00:00 · 2014-08-08 12:33:21 +00:00 · 2014-08-08 12:33:21 +00:00 · 3a106a2813
commit 3a106a2813
parent b56b256896
5 changed files with 1023 additions and 0 deletions
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@ -37,6 +37,7 @@ FunctionPass *createAArch64ExpandPseudoPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
 ModulePass *createAArch64PromoteConstantPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
+FunctionPass *createAArch64A57FPLoadBalancing();
 /// \brief Creates an ARM-specific Target Transformation Info pass.
 ImmutablePass *
 createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@ -0,0 +1,694 @@
+//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// For best-case performance on Cortex-A57, we should try to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply or
+// multiply-accumulate operations.
+//
+// This pass attempts to detect situations where the register allocation may
+// adversely affect this load balancing and to change the registers used so as
+// to better utilize the CPU.
+//
+// Ideally we'd just take each multiply or multiply-accumulate in turn and
+// allocate it alternating even or odd registers. However, multiply-accumulates
+// are most efficiently performed in the same functional unit as their
+// accumulation operand. Therefore this pass tries to find maximal sequences
+// ("Chains") of multiply-accumulates linked via their accumulation operand,
+// and assign them all the same "color" (oddness/evenness).
+//
+// This optimization affects S-register and D-register floating point
+// multiplies and FMADD/FMAs, as well as vector (floating point only) muls and
+// FMADD/FMA. Q register instructions (and 128-bit vector instructions) are
+// not affected.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <list>
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-a57-fp-load-balancing"
+
+// Enforce the algorithm to use the scavenged register even when the original
+// destination register is the correct color. Used for testing.
+static cl::opt<bool>
+TransformAll("aarch64-a57-fp-load-balancing-force-all",
+             cl::desc("Always modify dest registers regardless of color"),
+             cl::init(false), cl::Hidden);
+
+// Never use the balance information obtained from chains - return a specific
+// color always. Used for testing.
+static cl::opt<unsigned>
+OverrideBalance("aarch64-a57-fp-load-balancing-override",
+              cl::desc("Ignore balance information, always return "
+                       "(1: Even, 2: Odd)."),
+              cl::init(0), cl::Hidden);
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs?
+static bool isMul(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AArch64::FMULSrr:
+  case AArch64::FNMULSrr:
+  case AArch64::FMULDrr:
+  case AArch64::FNMULDrr:
+
+  case AArch64::FMULv2f32:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs?
+static bool isMla(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AArch64::FMSUBSrrr:
+  case AArch64::FMADDSrrr:
+  case AArch64::FNMSUBSrrr:
+  case AArch64::FNMADDSrrr:
+  case AArch64::FMSUBDrrr:
+  case AArch64::FMADDDrrr:
+  case AArch64::FNMSUBDrrr:
+  case AArch64::FNMADDDrrr:
+
+  case AArch64::FMLAv2f32:
+  case AArch64::FMLSv2f32:
+    return true;
+  default:
+    return false;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A "color", which is either even or odd. Yes, these aren't really colors
+/// but the algorithm is conceptually doing two-color graph coloring.
+enum class Color { Even, Odd };
+static const char *ColorNames[2] = { "Even", "Odd" };
+
+class Chain;
+
+class AArch64A57FPLoadBalancing : public MachineFunctionPass {
+  const AArch64InstrInfo *TII;
+  MachineRegisterInfo *MRI;
+  const TargetRegisterInfo *TRI;
+  RegisterClassInfo RCI;
+
+public:
+  static char ID;
+  explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  const char *getPassName() const override {
+    return "A57 FP Anti-dependency breaker";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool runOnBasicBlock(MachineBasicBlock &MBB);
+  bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB,
+                     int &Balance);
+  bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB);
+  int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB);
+  void scanInstruction(MachineInstr *MI, unsigned Idx,
+                       std::map<unsigned, Chain*> &Chains,
+                       std::set<Chain*> &ChainSet);
+  void maybeKillChain(MachineOperand &MO, unsigned Idx,
+                      std::map<unsigned, Chain*> &RegChains);
+  Color getColor(unsigned Register);
+  Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L);
+};
+char AArch64A57FPLoadBalancing::ID = 0;
+
+/// A Chain is a sequence of instructions that are linked together by 
+/// an accumulation operand. For example:
+///
+///   fmul d0<def>, ?
+///   fmla d1<def>, ?, ?, d0<kill>
+///   fmla d2<def>, ?, ?, d1<kill>
+///
+/// There may be other instructions interleaved in the sequence that
+/// do not belong to the chain. These other instructions must not use
+/// the "chain" register at any point.
+///
+/// We currently only support chains where the "chain" operand is killed
+/// at each link in the chain for simplicity.
+/// A chain has three important instructions - Start, Last and Kill.
+///   * The start instruction is the first instruction in the chain.
+///   * Last is the final instruction in the chain.
+///   * Kill may or may not be defined. If defined, Kill is the instruction
+///     where the outgoing value of the Last instruction is killed.
+///     This information is important as if we know the outgoing value is
+///     killed with no intervening uses, we can safely change its register.
+///
+/// Without a kill instruction, we must assume the outgoing value escapes
+/// beyond our model and either must not change its register or must
+/// create a fixup FMOV to keep the old register value consistent.
+///
+class Chain {
+public:
+  /// The important (marker) instructions.
+  MachineInstr *StartInst, *LastInst, *KillInst;
+  /// The index, from the start of the basic block, that each marker
+  /// appears. These are stored so we can do quick interval tests.
+  unsigned StartInstIdx, LastInstIdx, KillInstIdx;
+  /// All instructions in the chain.
+  std::set<MachineInstr*> Insts;
+  /// True if KillInst cannot be modified. If this is true,
+  /// we cannot change LastInst's outgoing register.
+  /// This will be true for tied values and regmasks.
+  bool KillIsImmutable;
+  /// The "color" of LastInst. This will be the preferred chain color,
+  /// as changing intermediate nodes is easy but changing the last
+  /// instruction can be more tricky.
+  Color LastColor;
+
+  Chain(MachineInstr *MI, unsigned Idx, Color C) :
+  StartInst(MI), LastInst(MI), KillInst(NULL),
+  StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0),
+  LastColor(C) {
+    Insts.insert(MI);
+  }
+
+  /// Add a new instruction into the chain. The instruction's dest operand
+  /// has the given color.
+  void add(MachineInstr *MI, unsigned Idx, Color C) {
+    LastInst = MI;
+    LastInstIdx = Idx;
+    LastColor = C;
+
+    Insts.insert(MI);
+  }
+
+  /// Return true if MI is a member of the chain.
+  bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; }
+
+  /// Return the number of instructions in the chain.
+  unsigned size() const {
+    return Insts.size();
+  }
+
+  /// Inform the chain that its last active register (the dest register of
+  /// LastInst) is killed by MI with no intervening uses or defs.
+  void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) {
+    KillInst = MI;
+    KillInstIdx = Idx;
+    KillIsImmutable = Immutable;
+  }
+
+  /// Return the first instruction in the chain.
+  MachineInstr *getStart() const { return StartInst; }
+  /// Return the last instruction in the chain.
+  MachineInstr *getLast() const { return LastInst; }
+  /// Return the "kill" instruction (as set with setKill()) or NULL.
+  MachineInstr *getKill() const { return KillInst; }
+  /// Return an instruction that can be used as an iterator for the end
+  /// of the chain. This is the maximum of KillInst (if set) and LastInst.
+  MachineInstr *getEnd() const {
+    return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst);
+  }
+
+  /// Can the Kill instruction (assuming one exists) be modified?
+  bool isKillImmutable() const { return KillIsImmutable; }
+
+  /// Return the preferred color of this chain.
+  Color getPreferredColor() {
+    if (OverrideBalance != 0)
+      return OverrideBalance == 1 ? Color::Even : Color::Odd;
+    return LastColor;
+  }
+
+  /// Return true if this chain (StartInst..KillInst) overlaps with Other.
+  bool rangeOverlapsWith(Chain *Other) {
+    unsigned End = KillInst ? KillInstIdx : LastInstIdx;
+    unsigned OtherEnd = Other->KillInst ?
+      Other->KillInstIdx : Other->LastInstIdx;
+
+    return StartInstIdx <= OtherEnd && Other->StartInstIdx <= End;
+  }
+
+  /// Return true if this chain starts before Other.
+  bool startsBefore(Chain *Other) {
+    return StartInstIdx < Other->StartInstIdx;
+  }
+
+  /// Return true if the group will require a fixup MOV at the end.
+  bool requiresFixup() const {
+    return (getKill() && isKillImmutable()) || !getKill();
+  }
+
+  /// Return a simple string representation of the chain.
+  std::string str() const {
+    std::string S;
+    raw_string_ostream OS(S);
+    
+    OS << "{";
+    StartInst->print(OS, NULL, true);
+    OS << " -> ";
+    LastInst->print(OS, NULL, true);
+    if (KillInst) {
+      OS << " (kill @ ";
+      KillInst->print(OS, NULL, true);
+      OS << ")";
+    }
+    OS << "}";
+
+    return OS.str();
+  }
+
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
+  bool Changed = false;
+  DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
+
+  const TargetMachine &TM = F.getTarget();
+  MRI = &F.getRegInfo();
+  TRI = F.getRegInfo().getTargetRegisterInfo();
+  TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  RCI.runOnMachineFunction(F);
+
+  for (auto &MBB : F) {
+    Changed |= runOnBasicBlock(MBB);
+  }
+
+  return Changed;
+}
+
+bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+  // First, scan the basic block producing a set of chains.
+
+  // The currently "active" chains - chains that can be added to and haven't
+  // been killed yet. This is keyed by register - all chains can only have one
+  // "link" register between each inst in the chain.
+  std::map<unsigned, Chain*> ActiveChains;
+  std::set<Chain*> AllChains;
+  unsigned Idx = 0;
+  for (auto &MI : MBB)
+    scanInstruction(&MI, Idx++, ActiveChains, AllChains);
+
+  DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n");
+
+  // Group the chains into disjoint sets based on their liveness range. This is
+  // a poor-man's version of graph coloring. Ideally we'd create an interference
+  // graph and perform full-on graph coloring on that, but;
+  //   (a) That's rather heavyweight for only two colors.
+  //   (b) We expect multiple disjoint interference regions - in practice the live
+  //       range of chains is quite small and they are clustered between loads
+  //       and stores.
+  EquivalenceClasses<Chain*> EC;
+  for (auto *I : AllChains)
+    EC.insert(I);
+
+  for (auto *I : AllChains) {
+    for (auto *J : AllChains) {
+      if (I != J && I->rangeOverlapsWith(J))
+        EC.unionSets(I, J);
+    }
+  }
+  DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");
+
+  // Now we assume that every member of an equivalence class interferes
+  // with every other member of that class, and with no members of other classes.
+
+  // Convert the EquivalenceClasses to a simpler set of sets.
+  std::vector<std::vector<Chain*> > V;
+  for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
+    std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end());
+    if (Cs.empty()) continue;
+    V.push_back(Cs);
+  }
+
+  // Now we have a set of sets, order them by start address so
+  // we can iterate over them sequentially.
+  std::sort(V.begin(), V.end(),
+            [](const std::vector<Chain*> &A,
+               const std::vector<Chain*> &B) {
+      return A.front()->startsBefore(B.front());
+    });
+
+  // As we only have two colors, we can track the global (BB-level) balance of
+  // odds versus evens. We aim to keep this near zero to keep both execution
+  // units fed.
+  // Positive means we're even-heavy, negative we're odd-heavy.
+  //
+  // FIXME: If chains have interdependencies, for example:
+  //   mul r0, r1, r2
+  //   mul r3, r0, r1
+  // We do not model this and may color each one differently, assuming we'll
+  // get ILP when we obviously can't. This hasn't been seen to be a problem
+  // in practice so far, so we simplify the algorithm by ignoring it.
+  int Parity = 0;
+
+  for (auto &I : V)
+    Changed |= colorChainSet(I, MBB, Parity);
+
+  for (auto *C : AllChains)
+    delete C;
+
+  return Changed;
+}
+
+Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor,
+                                                  std::vector<Chain*> &L) {
+  if (L.empty())
+    return nullptr;
+
+  // We try and get the best candidate from L to color next, given that our
+  // preferred color is "PreferredColor". L is ordered from larger to smaller
+  // chains. It is beneficial to color the large chains before the small chains,
+  // but if we can't find a chain of the maximum length with the preferred color,
+  // we fuzz the size and look for slightly smaller chains before giving up and
+  // returning a chain that must be recolored.
+
+  // FIXME: Does this need to be configurable?
+  const unsigned SizeFuzz = 1;
+  unsigned MinSize = L.front()->size() - SizeFuzz;
+  for (auto I = L.begin(), E = L.end(); I != E; ++I) {
+    if ((*I)->size() <= MinSize) {
+      // We've gone past the size limit. Return the previous item.
+      Chain *Ch = *--I;
+      L.erase(I);
+      return Ch;
+    }
+
+    if ((*I)->getPreferredColor() == PreferredColor) {
+      Chain *Ch = *I;
+      L.erase(I);
+      return Ch;
+    }
+  }
+  
+  // Bailout case - just return the first item.
+  Chain *Ch = L.front();
+  L.erase(L.begin());
+  return Ch;
+}
+
+bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
+                                              MachineBasicBlock &MBB,
+                                              int &Parity) {
+  bool Changed = false;
+  DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");
+
+  // Sort by descending size order so that we allocate the most important
+  // sets first.
+  // Tie-break equivalent sizes by sorting chains requiring fixups before
+  // those without fixups. The logic here is that we should look at the
+  // chains that we cannot change before we look at those we can,
+  // so the parity counter is updated and we know what color we should
+  // change them to!
+  std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
+      if (G1->size() != G2->size())
+        return G1->size() > G2->size();
+      return G1->requiresFixup() > G2->requiresFixup();
+    });
+
+  Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
+  while (Chain *G = getAndEraseNext(PreferredColor, GV)) {
+    // Start off by assuming we'll color to our own preferred color.
+    Color C = PreferredColor;
+    if (Parity == 0)
+      // But if we really don't care, use the chain's preferred color.
+      C = G->getPreferredColor();
+
+    DEBUG(dbgs() << " - Parity=" << Parity << ", Color="
+          << ColorNames[(int)C] << "\n");
+
+    // If we'll need a fixup FMOV, don't bother. Testing has shown that this
+    // happens infrequently and when it does it has at least a 50% chance of
+    // slowing code down instead of speeding it up.
+    if (G->requiresFixup() && C != G->getPreferredColor()) {
+      C = G->getPreferredColor();
+      DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; "
+            "color remains " << ColorNames[(int)C] << "\n");
+    }
+
+    Changed |= colorChain(G, C, MBB);
+
+    Parity += (C == Color::Even) ? G->size() : -G->size();
+    PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
+  }
+
+  return Changed;
+}
+
+int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
+                                                MachineBasicBlock &MBB) {
+  RegScavenger RS;
+  RS.enterBasicBlock(&MBB);
+  RS.forward(MachineBasicBlock::iterator(G->getStart()));
+
+  // Can we find an appropriate register that is available throughout the life 
+  // of the chain?
+  unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
+  BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
+  for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
+       I != E; ++I) {
+    RS.forward(I);
+    AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
+
+    // Remove any registers clobbered by a regmask.
+    for (auto J : I->operands()) {
+      if (J.isRegMask())
+        AvailableRegs.clearBitsNotInMask(J.getRegMask());
+    }
+  }
+
+  // Make sure we allocate in-order, to get the cheapest registers first.
+  auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
+  for (auto Reg : Ord) {
+    if (!AvailableRegs[Reg])
+      continue;
+    if ((C == Color::Even && (Reg % 2) == 0) ||
+        (C == Color::Odd && (Reg % 2) == 1))
+      return Reg;
+  }
+
+  return -1;
+}
+
+bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
+                                           MachineBasicBlock &MBB) {
+  bool Changed = false;
+  DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
+        << ColorNames[(int)C] << ")\n");
+
+  // Try and obtain a free register of the right class. Without a register
+  // to play with we cannot continue.
+  int Reg = scavengeRegister(G, C, MBB);
+  if (Reg == -1) {
+    DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
+    return false;
+  }
+  DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n");
+
+  std::map<unsigned, unsigned> Substs;
+  for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
+       I != E; ++I) {
+    if (!G->contains(I) &&
+        (&*I != G->getKill() || G->isKillImmutable()))
+      continue;
+
+    // I is a member of G, or I is a mutable instruction that kills G.
+
+    std::vector<unsigned> ToErase;
+    for (auto &U : I->operands()) {
+      if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
+        unsigned OrigReg = U.getReg();
+        U.setReg(Substs[OrigReg]);
+        if (U.isKill())
+          // Don't erase straight away, because there may be other operands
+          // that also reference this substitution!
+          ToErase.push_back(OrigReg);
+      } else if (U.isRegMask()) {
+        for (auto J : Substs) {
+          if (U.clobbersPhysReg(J.first))
+            ToErase.push_back(J.first);
+        }
+      }
+    }
+    // Now it's safe to remove the substs identified earlier.
+    for (auto J : ToErase)
+      Substs.erase(J);
+
+    // Only change the def if this isn't the last instruction.
+    if (&*I != G->getKill()) {
+      MachineOperand &MO = I->getOperand(0);
+
+      bool Change = TransformAll || getColor(MO.getReg()) != C;
+      if (G->requiresFixup() && &*I == G->getLast())
+        Change = false;
+
+      if (Change) {
+        Substs[MO.getReg()] = Reg;
+        MO.setReg(Reg);
+        MRI->setPhysRegUsed(Reg);
+
+        Changed = true;
+      }
+    }
+  }
+  assert(Substs.size() == 0 && "No substitutions should be left active!");
+
+  if (G->getKill()) {
+    DEBUG(dbgs() << " - Kill instruction seen.\n");
+  } else {
+    // We didn't have a kill instruction, but we didn't seem to need to change
+    // the destination register anyway.
+    DEBUG(dbgs() << " - Destination register not changed.\n");
+  }
+  return Changed;
+}
+
+void AArch64A57FPLoadBalancing::
+scanInstruction(MachineInstr *MI, unsigned Idx, 
+                std::map<unsigned, Chain*> &ActiveChains,
+                std::set<Chain*> &AllChains) {
+  // Inspect "MI", updating ActiveChains and AllChains.
+
+  if (isMul(MI)) {
+
+    for (auto &I : MI->operands())
+      maybeKillChain(I, Idx, ActiveChains);
+
+    // Create a new chain. Multiplies don't require forwarding so can go on any
+    // unit.
+    unsigned DestReg = MI->getOperand(0).getReg();
+
+    DEBUG(dbgs() << "New chain started for register "
+          << TRI->getName(DestReg) << " at " << *MI);
+
+    Chain *G = new Chain(MI, Idx, getColor(DestReg));
+    ActiveChains[DestReg] = G;
+    AllChains.insert(G);
+
+  } else if (isMla(MI)) {
+
+    // It is beneficial to keep MLAs on the same functional unit as their
+    // accumulator operand.
+    unsigned DestReg  = MI->getOperand(0).getReg();
+    unsigned AccumReg = MI->getOperand(3).getReg();
+
+    maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
+    maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
+    if (DestReg != AccumReg)
+      maybeKillChain(MI->getOperand(0), Idx, ActiveChains);
+
+    if (ActiveChains.find(AccumReg) != ActiveChains.end()) {
+      DEBUG(dbgs() << "Chain found for accumulator register "
+            << TRI->getName(AccumReg) << " in MI " << *MI);
+
+      // For simplicity we only chain together sequences of MULs/MLAs where the
+      // accumulator register is killed on each instruction. This means we don't
+      // need to track other uses of the registers we want to rewrite.
+      //
+      // FIXME: We could extend to handle the non-kill cases for more coverage.
+      if (MI->getOperand(3).isKill()) {
+        // Add to chain.
+        DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
+        ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg));
+        // Handle cases where the destination is not the same as the accumulator.
+        ActiveChains[DestReg] = ActiveChains[AccumReg];
+        return;
+      }
+
+      DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't "
+            << "marked <kill>!\n");
+      maybeKillChain(MI->getOperand(3), Idx, ActiveChains);
+    }
+
+    DEBUG(dbgs() << "Creating new chain for dest register "
+          << TRI->getName(DestReg) << "\n");
+    Chain *G = new Chain(MI, Idx, getColor(DestReg));
+    ActiveChains[DestReg] = G;
+    AllChains.insert(G);
+
+  } else {
+
+    // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs
+    // lists.
+    for (auto &I : MI->operands())
+      maybeKillChain(I, Idx, ActiveChains);
+
+  }
+}
+
+void AArch64A57FPLoadBalancing::
+maybeKillChain(MachineOperand &MO, unsigned Idx,
+               std::map<unsigned, Chain*> &ActiveChains) {
+  // Given an operand and the set of active chains (keyed by register),
+  // determine if a chain should be ended and remove from ActiveChains.
+  MachineInstr *MI = MO.getParent();
+
+  if (MO.isReg()) {
+
+    // If this is a KILL of a current chain, record it.
+    if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) {
+      DEBUG(dbgs() << "Kill seen for chain " << TRI->getName(MO.getReg())
+            << "\n");
+      ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied());
+    }
+    ActiveChains.erase(MO.getReg());
+
+  } else if (MO.isRegMask()) {
+
+    for (auto I = ActiveChains.begin(), E = ActiveChains.end();
+         I != E; ++I) {
+      if (MO.clobbersPhysReg(I->first)) {
+        DEBUG(dbgs() << "Kill (regmask) seen for chain "
+              << TRI->getName(I->first) << "\n");
+        I->second->setKill(MI, Idx, /*Immutable=*/true);
+        ActiveChains.erase(I);
+      }
+    }
+
+  }
+}
+
+Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) {
+  if ((TRI->getEncodingValue(Reg) % 2) == 0)
+    return Color::Even;
+  else
+    return Color::Odd;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to the passmanager.
+FunctionPass *llvm::createAArch64A57FPLoadBalancing() {
+  return new AArch64A57FPLoadBalancing();
+}
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@ -204,6 +204,10 @@ bool AArch64PassConfig::addPostRegAlloc() {
  // Change dead register definitions to refer to the zero register.
  if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
    addPass(createAArch64DeadRegisterDefinitions());
+  if (TM->getOptLevel() != CodeGenOpt::None &&
+      TM->getSubtarget<AArch64Subtarget>().isCortexA57())
+    // Improve performance for some FP/SIMD code for A57.
+    addPass(createAArch64A57FPLoadBalancing());
  return true;
 }

--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@ -15,6 +15,7 @@ tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
 add_public_tablegen_target(AArch64CommonTableGen)

 add_llvm_target(AArch64CodeGen
+  AArch64A57FPLoadBalancing.cpp
  AArch64AddressTypePromotion.cpp
  AArch64AdvSIMDScalarPass.cpp
  AArch64AsmPrinter.cpp
--- a/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@ -0,0 +1,323 @@
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+
+; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
+; our test strategy is to:
+;   * Force the pass to always perform register swapping even if the dest register is of the
+;     correct color already (-force-all)
+;   * Force the pass to ignore all hints it obtained from regalloc (-deterministic-balance),
+;     and run it twice, once where it always hints odd, and once where it always hints even.
+;
+; We then use regex magic to check that in the two cases the register allocation is
+; different; this is what gives us the testing coverage and distinguishes cases where
+; the pass has done some work versus accidental regalloc.
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; Non-overlapping groups - shouldn't need any changing at all.
+
+; CHECK-LABEL: f1:
+; CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]]
+; CHECK-ODD: fmadd [[x:d[0-9]*[13579]]]
+; CHECK: fmadd [[x]]
+; CHECK: fmsub [[x]]
+; CHECK: fmadd [[x]]
+; CHECK: str [[x]]
+
+define void @f1(double* nocapture readonly %p, double* nocapture %q) #0 {
+entry:
+  %0 = load double* %p, align 8
+  %arrayidx1 = getelementptr inbounds double* %p, i64 1
+  %1 = load double* %arrayidx1, align 8
+  %arrayidx2 = getelementptr inbounds double* %p, i64 2
+  %2 = load double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double* %p, i64 3
+  %3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %p, i64 4
+  %4 = load double* %arrayidx4, align 8
+  %mul = fmul fast double %0, %1
+  %add = fadd fast double %mul, %4
+  %mul5 = fmul fast double %1, %2
+  %add6 = fadd fast double %mul5, %add
+  %mul7 = fmul fast double %1, %3
+  %sub = fsub fast double %add6, %mul7
+  %mul8 = fmul fast double %2, %3
+  %add9 = fadd fast double %mul8, %sub
+  store double %add9, double* %q, align 8
+  %arrayidx11 = getelementptr inbounds double* %p, i64 5
+  %5 = load double* %arrayidx11, align 8
+  %arrayidx12 = getelementptr inbounds double* %p, i64 6
+  %6 = load double* %arrayidx12, align 8
+  %arrayidx13 = getelementptr inbounds double* %p, i64 7
+  %7 = load double* %arrayidx13, align 8
+  %mul15 = fmul fast double %6, %7
+  %mul16 = fmul fast double %0, %5
+  %add17 = fadd fast double %mul16, %mul15
+  %mul18 = fmul fast double %5, %6
+  %add19 = fadd fast double %mul18, %add17
+  %arrayidx20 = getelementptr inbounds double* %q, i64 1
+  store double %add19, double* %arrayidx20, align 8
+  ret void
+}
+
+; Overlapping groups - coloring needed.
+
+; CHECK-LABEL: f2:
+; CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]]
+; CHECK-EVEN: fmul [[y:d[0-9]*[13579]]]
+; CHECK-ODD: fmadd [[x:d[0-9]*[13579]]]
+; CHECK-ODD: fmul [[y:d[0-9]*[02468]]]
+; CHECK: fmadd [[x]]
+; CHECK: fmadd [[y]]
+; CHECK: fmsub [[x]]
+; CHECK: fmadd [[y]]
+; CHECK: fmadd [[x]]
+; CHECK: stp [[x]], [[y]]
+
+define void @f2(double* nocapture readonly %p, double* nocapture %q) #0 {
+entry:
+  %0 = load double* %p, align 8
+  %arrayidx1 = getelementptr inbounds double* %p, i64 1
+  %1 = load double* %arrayidx1, align 8
+  %arrayidx2 = getelementptr inbounds double* %p, i64 2
+  %2 = load double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double* %p, i64 3
+  %3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %p, i64 4
+  %4 = load double* %arrayidx4, align 8
+  %arrayidx5 = getelementptr inbounds double* %p, i64 5
+  %5 = load double* %arrayidx5, align 8
+  %arrayidx6 = getelementptr inbounds double* %p, i64 6
+  %6 = load double* %arrayidx6, align 8
+  %arrayidx7 = getelementptr inbounds double* %p, i64 7
+  %7 = load double* %arrayidx7, align 8
+  %mul = fmul fast double %0, %1
+  %add = fadd fast double %mul, %7
+  %mul8 = fmul fast double %5, %6
+  %mul9 = fmul fast double %1, %2
+  %add10 = fadd fast double %mul9, %add
+  %mul11 = fmul fast double %3, %4
+  %add12 = fadd fast double %mul11, %mul8
+  %mul13 = fmul fast double %1, %3
+  %sub = fsub fast double %add10, %mul13
+  %mul14 = fmul fast double %4, %5
+  %add15 = fadd fast double %mul14, %add12
+  %mul16 = fmul fast double %2, %3
+  %add17 = fadd fast double %mul16, %sub
+  store double %add17, double* %q, align 8
+  %arrayidx19 = getelementptr inbounds double* %q, i64 1
+  store double %add15, double* %arrayidx19, align 8
+  ret void
+}
+
+; Dest register is live on block exit - fixup needed.
+
+; CHECK-LABEL: f3:
+; CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]]
+; CHECK-ODD: fmadd [[x:d[0-9]*[13579]]]
+; CHECK: fmadd [[x]]
+; CHECK: fmsub [[x]]
+; CHECK: fmadd [[y:d[0-9]+]], {{.*}}, [[x]]
+; CHECK: str [[y]]
+
+define void @f3(double* nocapture readonly %p, double* nocapture %q) #0 {
+entry:
+  %0 = load double* %p, align 8
+  %arrayidx1 = getelementptr inbounds double* %p, i64 1
+  %1 = load double* %arrayidx1, align 8
+  %arrayidx2 = getelementptr inbounds double* %p, i64 2
+  %2 = load double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double* %p, i64 3
+  %3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %p, i64 4
+  %4 = load double* %arrayidx4, align 8
+  %mul = fmul fast double %0, %1
+  %add = fadd fast double %mul, %4
+  %mul5 = fmul fast double %1, %2
+  %add6 = fadd fast double %mul5, %add
+  %mul7 = fmul fast double %1, %3
+  %sub = fsub fast double %add6, %mul7
+  %mul8 = fmul fast double %2, %3
+  %add9 = fadd fast double %mul8, %sub
+  %cmp = fcmp oeq double %3, 0.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  tail call void bitcast (void (...)* @g to void ()*)() #2
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  store double %add9, double* %q, align 8
+  ret void
+}
+
+declare void @g(...) #1
+
+; Single precision version of f2.
+
+; CHECK-LABEL: f4:
+; CHECK-EVEN: fmadd [[x:s[0-9]*[02468]]]
+; CHECK-EVEN: fmul [[y:s[0-9]*[13579]]]
+; CHECK-ODD: fmadd [[x:s[0-9]*[13579]]]
+; CHECK-ODD: fmul [[y:s[0-9]*[02468]]]
+; CHECK: fmadd [[x]]
+; CHECK: fmadd [[y]]
+; CHECK: fmsub [[x]]
+; CHECK: fmadd [[y]]
+; CHECK: fmadd [[x]]
+; CHECK: stp [[x]], [[y]]
+
+define void @f4(float* nocapture readonly %p, float* nocapture %q) #0 {
+entry:
+  %0 = load float* %p, align 4
+  %arrayidx1 = getelementptr inbounds float* %p, i64 1
+  %1 = load float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float* %p, i64 2
+  %2 = load float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float* %p, i64 3
+  %3 = load float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float* %p, i64 4
+  %4 = load float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds float* %p, i64 5
+  %5 = load float* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds float* %p, i64 6
+  %6 = load float* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds float* %p, i64 7
+  %7 = load float* %arrayidx7, align 4
+  %mul = fmul fast float %0, %1
+  %add = fadd fast float %mul, %7
+  %mul8 = fmul fast float %5, %6
+  %mul9 = fmul fast float %1, %2
+  %add10 = fadd fast float %mul9, %add
+  %mul11 = fmul fast float %3, %4
+  %add12 = fadd fast float %mul11, %mul8
+  %mul13 = fmul fast float %1, %3
+  %sub = fsub fast float %add10, %mul13
+  %mul14 = fmul fast float %4, %5
+  %add15 = fadd fast float %mul14, %add12
+  %mul16 = fmul fast float %2, %3
+  %add17 = fadd fast float %mul16, %sub
+  store float %add17, float* %q, align 4
+  %arrayidx19 = getelementptr inbounds float* %q, i64 1
+  store float %add15, float* %arrayidx19, align 4
+  ret void
+}
+
+; Single precision version of f3
+
+; CHECK-LABEL: f5:
+; CHECK-EVEN: fmadd [[x:s[0-9]*[02468]]]
+; CHECK-ODD: fmadd [[x:s[0-9]*[13579]]]
+; CHECK: fmadd [[x]]
+; CHECK: fmsub [[x]]
+; CHECK: fmadd [[y:s[0-9]+]], {{.*}}, [[x]]
+; CHECK: str [[y]]
+
+define void @f5(float* nocapture readonly %p, float* nocapture %q) #0 {
+entry:
+  %0 = load float* %p, align 4
+  %arrayidx1 = getelementptr inbounds float* %p, i64 1
+  %1 = load float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float* %p, i64 2
+  %2 = load float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float* %p, i64 3
+  %3 = load float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float* %p, i64 4
+  %4 = load float* %arrayidx4, align 4
+  %mul = fmul fast float %0, %1
+  %add = fadd fast float %mul, %4
+  %mul5 = fmul fast float %1, %2
+  %add6 = fadd fast float %mul5, %add
+  %mul7 = fmul fast float %1, %3
+  %sub = fsub fast float %add6, %mul7
+  %mul8 = fmul fast float %2, %3
+  %add9 = fadd fast float %mul8, %sub
+  %cmp = fcmp oeq float %3, 0.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  tail call void bitcast (void (...)* @g to void ()*)() #2
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  store float %add9, float* %q, align 4
+  ret void
+}
+
+; Test that regmask clobbering stops a chain sequence.
+
+; CHECK-LABEL: f6:
+; CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]]
+; CHECK-ODD: fmadd [[x:d[0-9]*[13579]]]
+; CHECK: fmadd [[x]]
+; CHECK: fmsub [[x]]
+; CHECK: fmadd d0, {{.*}}, [[x]]
+; CHECK: bl hh
+; CHECK: str d0
+
+define void @f6(double* nocapture readonly %p, double* nocapture %q) #0 {
+entry:
+  %0 = load double* %p, align 8
+  %arrayidx1 = getelementptr inbounds double* %p, i64 1
+  %1 = load double* %arrayidx1, align 8
+  %arrayidx2 = getelementptr inbounds double* %p, i64 2
+  %2 = load double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double* %p, i64 3
+  %3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %p, i64 4
+  %4 = load double* %arrayidx4, align 8
+  %mul = fmul fast double %0, %1
+  %add = fadd fast double %mul, %4
+  %mul5 = fmul fast double %1, %2
+  %add6 = fadd fast double %mul5, %add
+  %mul7 = fmul fast double %1, %3
+  %sub = fsub fast double %add6, %mul7
+  %mul8 = fmul fast double %2, %3
+  %add9 = fadd fast double %mul8, %sub
+  %call = tail call double @hh(double %add9) #2
+  store double %call, double* %q, align 8
+  ret void
+}
+
+declare double @hh(double) #1
+
+; Check that we correctly deal with repeated operands.
+; The following testcase creates:
+;   %D1<def> = FADDDrr %D0<kill>, %D0
+; We'll get a crash if we naively look at the first operand, remove it
+; from the substitution list then look at the second operand.
+
+; CHECK: fmadd [[x:d[0-9]+]]
+; CHECK: fadd d1, [[x]], [[x]]
+
+define void @f7(double* nocapture readonly %p, double* nocapture %q) #0 {
+entry:
+  %0 = load double* %p, align 8
+  %arrayidx1 = getelementptr inbounds double* %p, i64 1
+  %1 = load double* %arrayidx1, align 8
+  %arrayidx2 = getelementptr inbounds double* %p, i64 2
+  %2 = load double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double* %p, i64 3
+  %3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %p, i64 4
+  %4 = load double* %arrayidx4, align 8
+  %mul = fmul fast double %0, %1
+  %add = fadd fast double %mul, %4
+  %mul5 = fmul fast double %1, %2
+  %add6 = fadd fast double %mul5, %add
+  %mul7 = fmul fast double %1, %3
+  %sub = fsub fast double %add6, %mul7
+  %mul8 = fmul fast double %2, %3
+  %add9 = fadd fast double %mul8, %sub
+  %add10 = fadd fast double %add9, %add9
+  call void @hhh(double 0.0, double %add10)
+  ret void
+}
+
+declare void @hhh(double, double)
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+