diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 3e85bacbda2..425a9028488 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -39,6 +39,7 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64AddressTypePromotionPass(); FunctionPass *createAArch64A57FPLoadBalancing(); +FunctionPass *createAArch64A57PBQPRegAlloc(); /// \brief Creates an ARM-specific Target Transformation Info pass. ImmutablePass * createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM); diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp new file mode 100644 index 00000000000..c4787f4afd6 --- /dev/null +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -0,0 +1,413 @@ +//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This file contains the AArch64 / Cortex-A57 specific register allocation +// constraints for use by the PBQP register allocator. +// +// It is essentially a transcription of what is contained in +// AArch64A57FPLoadBalancing, which tries to use a balanced +// mix of odd and even D-registers when performing a critical sequence of +// independent, non-quadword FP/ASIMD floating-point multiply-accumulates. +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "aarch64-pbqp" + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegAllocPBQP.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +#define PBQP_BUILDER PBQPBuilderWithCoalescing +//#define PBQP_BUILDER PBQPBuilder + +using namespace llvm; + +namespace { + +bool isFPReg(unsigned reg) { + return AArch64::FPR32RegClass.contains(reg) || + AArch64::FPR64RegClass.contains(reg) || + AArch64::FPR128RegClass.contains(reg); +}; + +bool isOdd(unsigned reg) { + switch (reg) { + default: + llvm_unreachable("Register is not from the expected class !"); + case AArch64::S1: + case AArch64::S3: + case AArch64::S5: + case AArch64::S7: + case AArch64::S9: + case AArch64::S11: + case AArch64::S13: + case AArch64::S15: + case AArch64::S17: + case AArch64::S19: + case AArch64::S21: + case AArch64::S23: + case AArch64::S25: + case AArch64::S27: + case AArch64::S29: + case AArch64::S31: + case AArch64::D1: + case AArch64::D3: + case AArch64::D5: + case AArch64::D7: + case AArch64::D9: + case AArch64::D11: + case AArch64::D13: + case AArch64::D15: + case AArch64::D17: + case AArch64::D19: + case AArch64::D21: + case AArch64::D23: + case AArch64::D25: + case AArch64::D27: + case AArch64::D29: + case AArch64::D31: + case AArch64::Q1: + case AArch64::Q3: + case AArch64::Q5: + case AArch64::Q7: + case AArch64::Q9: + case AArch64::Q11: + case AArch64::Q13: + case AArch64::Q15: + case AArch64::Q17: + case AArch64::Q19: + case AArch64::Q21: + case AArch64::Q23: + case AArch64::Q25: + case AArch64::Q27: + case AArch64::Q29: + case AArch64::Q31: + return true; + case AArch64::S0: + case AArch64::S2: + case AArch64::S4: + case AArch64::S6: + case AArch64::S8: + case AArch64::S10: + case AArch64::S12: + case AArch64::S14: + case AArch64::S16: + case AArch64::S18: + case AArch64::S20: + case AArch64::S22: + case AArch64::S24: + case AArch64::S26: + case AArch64::S28: + case AArch64::S30: + case AArch64::D0: + case AArch64::D2: + case AArch64::D4: + case AArch64::D6: + case AArch64::D8: + case AArch64::D10: + case AArch64::D12: + case AArch64::D14: + case AArch64::D16: + case AArch64::D18: + case AArch64::D20: + case AArch64::D22: + case AArch64::D24: + case AArch64::D26: + case AArch64::D28: + case AArch64::D30: + case AArch64::Q0: + case AArch64::Q2: + case AArch64::Q4: + case AArch64::Q6: + case AArch64::Q8: + case AArch64::Q10: + case AArch64::Q12: + case AArch64::Q14: + case AArch64::Q16: + case AArch64::Q18: + case AArch64::Q20: + case AArch64::Q22: + case AArch64::Q24: + case AArch64::Q26: + case AArch64::Q28: + case AArch64::Q30: + return false; + + } +} + +bool haveSameParity(unsigned reg1, unsigned reg2) { + assert(isFPReg(reg1) && "Expecting an FP register for reg1"); + assert(isFPReg(reg2) && "Expecting an FP register for reg2"); + + return isOdd(reg1) == isOdd(reg2); +} + +class A57PBQPBuilder : public PBQP_BUILDER { +public: + A57PBQPBuilder() : PBQP_BUILDER(), TRI(nullptr), LIs(nullptr), Chains() {} + + // Build a PBQP instance to represent the register allocation problem for + // the given MachineFunction. + std::unique_ptr + build(MachineFunction *MF, const LiveIntervals *LI, + const MachineBlockFrequencyInfo *blockInfo, + const RegSet &VRegs) override; + +private: + const AArch64RegisterInfo *TRI; + const LiveIntervals *LIs; + SmallSetVector Chains; + + // Return true if reg is a physical register + bool isPhysicalReg(unsigned reg) const { + return TRI->isPhysicalRegister(reg); + } + + // Add the accumulator chaining constraint, inside the chain, i.e. so that + // parity(Rd) == parity(Ra). + // \return true if a constraint was added + bool addIntraChainConstraint(PBQPRAProblem *p, unsigned Rd, unsigned Ra); + + // Add constraints between existing chains + void addInterChainConstraint(PBQPRAProblem *p, unsigned Rd, unsigned Ra); +}; +} // Anonymous namespace + +bool A57PBQPBuilder::addIntraChainConstraint(PBQPRAProblem *p, unsigned Rd, + unsigned Ra) { + if (Rd == Ra) + return false; + + if (isPhysicalReg(Rd) || isPhysicalReg(Ra)) { + dbgs() << "Rd is a physical reg:" << isPhysicalReg(Rd) << '\n'; + dbgs() << "Ra is a physical reg:" << isPhysicalReg(Ra) << '\n'; + return false; + } + + const PBQPRAProblem::AllowedSet *vRdAllowed = &p->getAllowedSet(Rd); + const PBQPRAProblem::AllowedSet *vRaAllowed = &p->getAllowedSet(Ra); + + PBQPRAGraph &g = p->getGraph(); + PBQPRAGraph::NodeId node1 = p->getNodeForVReg(Rd); + PBQPRAGraph::NodeId node2 = p->getNodeForVReg(Ra); + PBQPRAGraph::EdgeId edge = g.findEdge(node1, node2); + + // The edge does not exist. Create one with the appropriate interference + // costs. + if (edge == g.invalidEdgeId()) { + const LiveInterval &ld = LIs->getInterval(Rd); + const LiveInterval &la = LIs->getInterval(Ra); + bool livesOverlap = ld.overlaps(la); + + PBQP::Matrix costs(vRdAllowed->size() + 1, vRaAllowed->size() + 1, 0); + for (unsigned i = 0; i != vRdAllowed->size(); ++i) { + unsigned pRd = (*vRdAllowed)[i]; + for (unsigned j = 0; j != vRaAllowed->size(); ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (livesOverlap && TRI->regsOverlap(pRd, pRa)) + costs[i + 1][j + 1] = std::numeric_limits::infinity(); + else + costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 0.0 : 1.0; + } + } + g.addEdge(node1, node2, std::move(costs)); + return true; + } + + if (g.getEdgeNode1Id(edge) == node2) { + std::swap(node1, node2); + std::swap(vRdAllowed, vRaAllowed); + } + + // Enforce minCost(sameParity(RaClass)) > maxCost(otherParity(RdClass)) + PBQP::Matrix costs(g.getEdgeCosts(edge)); + for (unsigned i = 0; i != vRdAllowed->size(); ++i) { + unsigned pRd = (*vRdAllowed)[i]; + + // Get the maximum cost (excluding unallocatable reg) for same parity + // registers + PBQP::PBQPNum sameParityMax = std::numeric_limits::min(); + for (unsigned j = 0; j != vRaAllowed->size(); ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (haveSameParity(pRd, pRa)) + if (costs[i + 1][j + 1] != + std::numeric_limits::infinity() && + costs[i + 1][j + 1] > sameParityMax) + sameParityMax = costs[i + 1][j + 1]; + } + + // Ensure all registers with a different parity have a higher cost + // than sameParityMax + for (unsigned j = 0; j != vRaAllowed->size(); ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (!haveSameParity(pRd, pRa)) + if (sameParityMax > costs[i + 1][j + 1]) + costs[i + 1][j + 1] = sameParityMax + 1.0; + } + } + g.setEdgeCosts(edge, costs); + + return true; +} + +void +A57PBQPBuilder::addInterChainConstraint(PBQPRAProblem *p, unsigned Rd, + unsigned Ra) { + // Do some Chain management + if (Chains.count(Ra)) { + if (Rd != Ra) { + DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to " + << PrintReg(Rd, TRI) << '\n';); + Chains.remove(Ra); + Chains.insert(Rd); + } + } else { + DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI) + << '\n';); + Chains.insert(Rd); + } + + const LiveInterval &ld = LIs->getInterval(Rd); + for (auto r : Chains) { + // Skip self + if (r == Rd) + continue; + + const LiveInterval &lr = LIs->getInterval(r); + if (ld.overlaps(lr)) { + const PBQPRAProblem::AllowedSet *vRdAllowed = &p->getAllowedSet(Rd); + const PBQPRAProblem::AllowedSet *vRrAllowed = &p->getAllowedSet(r); + + PBQPRAGraph &g = p->getGraph(); + PBQPRAGraph::NodeId node1 = p->getNodeForVReg(Rd); + PBQPRAGraph::NodeId node2 = p->getNodeForVReg(r); + PBQPRAGraph::EdgeId edge = g.findEdge(node1, node2); + assert(edge != g.invalidEdgeId() && + "PBQP error ! The edge should exist !"); + + DEBUG(dbgs() << "Refining constraint !\n";); + + if (g.getEdgeNode1Id(edge) == node2) { + std::swap(node1, node2); + std::swap(vRdAllowed, vRrAllowed); + } + + // Enforce that cost is higher with all other Chains of the same parity + PBQP::Matrix costs(g.getEdgeCosts(edge)); + for (unsigned i = 0; i != vRdAllowed->size(); ++i) { + unsigned pRd = (*vRdAllowed)[i]; + + // Get the maximum cost (excluding unallocatable reg) for all other + // parity registers + PBQP::PBQPNum sameParityMax = std::numeric_limits::min(); + for (unsigned j = 0; j != vRrAllowed->size(); ++j) { + unsigned pRa = (*vRrAllowed)[j]; + if (!haveSameParity(pRd, pRa)) + if (costs[i + 1][j + 1] != + std::numeric_limits::infinity() && + costs[i + 1][j + 1] > sameParityMax) + sameParityMax = costs[i + 1][j + 1]; + } + + // Ensure all registers with same parity have a higher cost + // than sameParityMax + for (unsigned j = 0; j != vRrAllowed->size(); ++j) { + unsigned pRa = (*vRrAllowed)[j]; + if (haveSameParity(pRd, pRa)) + if (sameParityMax > costs[i + 1][j + 1]) + costs[i + 1][j + 1] = sameParityMax + 1.0; + } + } + g.setEdgeCosts(edge, costs); + } + } +} + +std::unique_ptr +A57PBQPBuilder::build(MachineFunction *MF, const LiveIntervals *LI, + const MachineBlockFrequencyInfo *blockInfo, + const RegSet &VRegs) { + std::unique_ptr p = + PBQP_BUILDER::build(MF, LI, blockInfo, VRegs); + + TRI = static_cast( + MF->getTarget().getSubtargetImpl()->getRegisterInfo()); + LIs = LI; + + DEBUG(MF->dump();); + + for (MachineFunction::const_iterator mbbItr = MF->begin(), mbbEnd = MF->end(); + mbbItr != mbbEnd; ++mbbItr) { + const MachineBasicBlock *MBB = &*mbbItr; + Chains.clear(); // FIXME: really needed ? Could not work at MF level ? + + for (MachineBasicBlock::const_iterator miItr = MBB->begin(), + miEnd = MBB->end(); + miItr != miEnd; ++miItr) { + const MachineInstr *MI = &*miItr; + switch (MI->getOpcode()) { + case AArch64::FMSUBSrrr: + case AArch64::FMADDSrrr: + case AArch64::FNMSUBSrrr: + case AArch64::FNMADDSrrr: + case AArch64::FMSUBDrrr: + case AArch64::FMADDDrrr: + case AArch64::FNMSUBDrrr: + case AArch64::FNMADDDrrr: { + unsigned Rd = MI->getOperand(0).getReg(); + unsigned Ra = MI->getOperand(3).getReg(); + + if (addIntraChainConstraint(p.get(), Rd, Ra)) + addInterChainConstraint(p.get(), Rd, Ra); + break; + } + + case AArch64::FMLAv2f32: + case AArch64::FMLSv2f32: { + unsigned Rd = MI->getOperand(0).getReg(); + addInterChainConstraint(p.get(), Rd, Rd); + break; + } + + default: + // Forget Chains which have been killed + for (auto r : Chains) { + SmallVector toDel; + if (MI->killsRegister(r)) { + DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at "; + MI->print(dbgs());); + toDel.push_back(r); + } + + while (!toDel.empty()) { + Chains.remove(toDel.back()); + toDel.pop_back(); + } + } + } + } + } + + return p; +} + +// Factory function used by AArch64TargetMachine to add the pass to the +// passmanager. +FunctionPass *llvm::createAArch64A57PBQPRegAlloc() { + std::unique_ptr builder = llvm::make_unique(); + return createPBQPRegisterAllocator(std::move(builder), nullptr); +} diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index e867524226b..1f5978198e4 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -13,6 +13,7 @@ #include "AArch64.h" #include "AArch64TargetMachine.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/PassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" @@ -73,6 +74,10 @@ EnableCondOpt("aarch64-condopt", cl::desc("Enable the condition optimizer pass"), cl::init(true), cl::Hidden); +static cl::opt +EnablePBQP("aarch64-pbqp", cl::Hidden, + cl::desc("Use PBQP register allocator (experimental)"), + cl::init(false)); extern "C" void LLVMInitializeAArch64Target() { // Register the target. @@ -90,8 +95,14 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL, bool LittleEndian) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, *this, LittleEndian) { + Subtarget(TT, CPU, FS, *this, LittleEndian), + usingPBQP(false) { initAsmInfo(); + + if (EnablePBQP && Subtarget.isCortexA57() && OL != CodeGenOpt::None) { + usingPBQP = true; + RegisterRegAlloc::setDefault(createAArch64A57PBQPRegAlloc); + } } void AArch64leTargetMachine::anchor() { } @@ -216,7 +227,8 @@ bool AArch64PassConfig::addPostRegAlloc() { if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) addPass(createAArch64DeadRegisterDefinitions()); if (TM->getOptLevel() != CodeGenOpt::None && - TM->getSubtarget().isCortexA57()) + TM->getSubtarget().isCortexA57() && + !static_cast(TM)->isPBQPUsed()) // Improve performance for some FP/SIMD code for A57. addPass(createAArch64A57FPLoadBalancing()); return true; diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 101d10c29a8..42d7dc57328 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -40,6 +40,12 @@ public: /// \brief Register AArch64 analysis passes with a pass manager. void addAnalysisPasses(PassManagerBase &PM) override; + + /// \brief Query if the PBQP register allocator is being used + bool isPBQPUsed() const { return usingPBQP; } + +private: + bool usingPBQP; }; // AArch64leTargetMachine - AArch64 little endian target machine. diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index ebf7ae0d347..d68a2de94b6 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -34,6 +34,7 @@ add_llvm_target(AArch64CodeGen AArch64LoadStoreOptimizer.cpp AArch64MCInstLower.cpp AArch64PromoteConstant.cpp + AArch64PBQPRegAlloc.cpp AArch64RegisterInfo.cpp AArch64SelectionDAGInfo.cpp AArch64StorePairSuppress.cpp diff --git a/test/CodeGen/AArch64/PBQP.ll b/test/CodeGen/AArch64/PBQP.ll new file mode 100644 index 00000000000..491998e1e30 --- /dev/null +++ b/test/CodeGen/AArch64/PBQP.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -aarch64-pbqp -o - %s | FileCheck %s + +define i32 @foo(i32 %a) { +; CHECK-LABEL: foo: +; CHECK: bl bar +; CHECK-NEXT: bl baz + %call = call i32 @bar(i32 %a) + %call1 = call i32 @baz(i32 %call) + ret i32 %call1 +} + +declare i32 @bar(i32) +declare i32 @baz(i32) +