[AArch64] Add experimental PBQP support

This adds target specific support for using the PBQP register allocator on the
AArch64, for the A57 cpu.

By default, the PBQP allocator is not used, unless explicitely required
on the command line with "-aarch64-pbqp".

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217504 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Arnaud A. de Grandmaison 2014-09-10 14:06:10 +00:00
parent 3babc141b2
commit 438669ca81
6 changed files with 449 additions and 2 deletions

View File

@ -39,6 +39,7 @@ ModulePass *createAArch64PromoteConstantPass();
FunctionPass *createAArch64ConditionOptimizerPass();
FunctionPass *createAArch64AddressTypePromotionPass();
FunctionPass *createAArch64A57FPLoadBalancing();
FunctionPass *createAArch64A57PBQPRegAlloc();
/// \brief Creates an ARM-specific Target Transformation Info pass.
ImmutablePass *
createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);

View File

@ -0,0 +1,413 @@
//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This file contains the AArch64 / Cortex-A57 specific register allocation
// constraints for use by the PBQP register allocator.
//
// It is essentially a transcription of what is contained in
// AArch64A57FPLoadBalancing, which tries to use a balanced
// mix of odd and even D-registers when performing a critical sequence of
// independent, non-quadword FP/ASIMD floating-point multiply-accumulates.
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "aarch64-pbqp"
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegAllocPBQP.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#define PBQP_BUILDER PBQPBuilderWithCoalescing
//#define PBQP_BUILDER PBQPBuilder
using namespace llvm;
namespace {
bool isFPReg(unsigned reg) {
return AArch64::FPR32RegClass.contains(reg) ||
AArch64::FPR64RegClass.contains(reg) ||
AArch64::FPR128RegClass.contains(reg);
};
bool isOdd(unsigned reg) {
switch (reg) {
default:
llvm_unreachable("Register is not from the expected class !");
case AArch64::S1:
case AArch64::S3:
case AArch64::S5:
case AArch64::S7:
case AArch64::S9:
case AArch64::S11:
case AArch64::S13:
case AArch64::S15:
case AArch64::S17:
case AArch64::S19:
case AArch64::S21:
case AArch64::S23:
case AArch64::S25:
case AArch64::S27:
case AArch64::S29:
case AArch64::S31:
case AArch64::D1:
case AArch64::D3:
case AArch64::D5:
case AArch64::D7:
case AArch64::D9:
case AArch64::D11:
case AArch64::D13:
case AArch64::D15:
case AArch64::D17:
case AArch64::D19:
case AArch64::D21:
case AArch64::D23:
case AArch64::D25:
case AArch64::D27:
case AArch64::D29:
case AArch64::D31:
case AArch64::Q1:
case AArch64::Q3:
case AArch64::Q5:
case AArch64::Q7:
case AArch64::Q9:
case AArch64::Q11:
case AArch64::Q13:
case AArch64::Q15:
case AArch64::Q17:
case AArch64::Q19:
case AArch64::Q21:
case AArch64::Q23:
case AArch64::Q25:
case AArch64::Q27:
case AArch64::Q29:
case AArch64::Q31:
return true;
case AArch64::S0:
case AArch64::S2:
case AArch64::S4:
case AArch64::S6:
case AArch64::S8:
case AArch64::S10:
case AArch64::S12:
case AArch64::S14:
case AArch64::S16:
case AArch64::S18:
case AArch64::S20:
case AArch64::S22:
case AArch64::S24:
case AArch64::S26:
case AArch64::S28:
case AArch64::S30:
case AArch64::D0:
case AArch64::D2:
case AArch64::D4:
case AArch64::D6:
case AArch64::D8:
case AArch64::D10:
case AArch64::D12:
case AArch64::D14:
case AArch64::D16:
case AArch64::D18:
case AArch64::D20:
case AArch64::D22:
case AArch64::D24:
case AArch64::D26:
case AArch64::D28:
case AArch64::D30:
case AArch64::Q0:
case AArch64::Q2:
case AArch64::Q4:
case AArch64::Q6:
case AArch64::Q8:
case AArch64::Q10:
case AArch64::Q12:
case AArch64::Q14:
case AArch64::Q16:
case AArch64::Q18:
case AArch64::Q20:
case AArch64::Q22:
case AArch64::Q24:
case AArch64::Q26:
case AArch64::Q28:
case AArch64::Q30:
return false;
}
}
bool haveSameParity(unsigned reg1, unsigned reg2) {
assert(isFPReg(reg1) && "Expecting an FP register for reg1");
assert(isFPReg(reg2) && "Expecting an FP register for reg2");
return isOdd(reg1) == isOdd(reg2);
}
class A57PBQPBuilder : public PBQP_BUILDER {
public:
A57PBQPBuilder() : PBQP_BUILDER(), TRI(nullptr), LIs(nullptr), Chains() {}
// Build a PBQP instance to represent the register allocation problem for
// the given MachineFunction.
std::unique_ptr<PBQPRAProblem>
build(MachineFunction *MF, const LiveIntervals *LI,
const MachineBlockFrequencyInfo *blockInfo,
const RegSet &VRegs) override;
private:
const AArch64RegisterInfo *TRI;
const LiveIntervals *LIs;
SmallSetVector<unsigned, 32> Chains;
// Return true if reg is a physical register
bool isPhysicalReg(unsigned reg) const {
return TRI->isPhysicalRegister(reg);
}
// Add the accumulator chaining constraint, inside the chain, i.e. so that
// parity(Rd) == parity(Ra).
// \return true if a constraint was added
bool addIntraChainConstraint(PBQPRAProblem *p, unsigned Rd, unsigned Ra);
// Add constraints between existing chains
void addInterChainConstraint(PBQPRAProblem *p, unsigned Rd, unsigned Ra);
};
} // Anonymous namespace
bool A57PBQPBuilder::addIntraChainConstraint(PBQPRAProblem *p, unsigned Rd,
unsigned Ra) {
if (Rd == Ra)
return false;
if (isPhysicalReg(Rd) || isPhysicalReg(Ra)) {
dbgs() << "Rd is a physical reg:" << isPhysicalReg(Rd) << '\n';
dbgs() << "Ra is a physical reg:" << isPhysicalReg(Ra) << '\n';
return false;
}
const PBQPRAProblem::AllowedSet *vRdAllowed = &p->getAllowedSet(Rd);
const PBQPRAProblem::AllowedSet *vRaAllowed = &p->getAllowedSet(Ra);
PBQPRAGraph &g = p->getGraph();
PBQPRAGraph::NodeId node1 = p->getNodeForVReg(Rd);
PBQPRAGraph::NodeId node2 = p->getNodeForVReg(Ra);
PBQPRAGraph::EdgeId edge = g.findEdge(node1, node2);
// The edge does not exist. Create one with the appropriate interference
// costs.
if (edge == g.invalidEdgeId()) {
const LiveInterval &ld = LIs->getInterval(Rd);
const LiveInterval &la = LIs->getInterval(Ra);
bool livesOverlap = ld.overlaps(la);
PBQP::Matrix costs(vRdAllowed->size() + 1, vRaAllowed->size() + 1, 0);
for (unsigned i = 0; i != vRdAllowed->size(); ++i) {
unsigned pRd = (*vRdAllowed)[i];
for (unsigned j = 0; j != vRaAllowed->size(); ++j) {
unsigned pRa = (*vRaAllowed)[j];
if (livesOverlap && TRI->regsOverlap(pRd, pRa))
costs[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
else
costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 0.0 : 1.0;
}
}
g.addEdge(node1, node2, std::move(costs));
return true;
}
if (g.getEdgeNode1Id(edge) == node2) {
std::swap(node1, node2);
std::swap(vRdAllowed, vRaAllowed);
}
// Enforce minCost(sameParity(RaClass)) > maxCost(otherParity(RdClass))
PBQP::Matrix costs(g.getEdgeCosts(edge));
for (unsigned i = 0; i != vRdAllowed->size(); ++i) {
unsigned pRd = (*vRdAllowed)[i];
// Get the maximum cost (excluding unallocatable reg) for same parity
// registers
PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
for (unsigned j = 0; j != vRaAllowed->size(); ++j) {
unsigned pRa = (*vRaAllowed)[j];
if (haveSameParity(pRd, pRa))
if (costs[i + 1][j + 1] !=
std::numeric_limits<PBQP::PBQPNum>::infinity() &&
costs[i + 1][j + 1] > sameParityMax)
sameParityMax = costs[i + 1][j + 1];
}
// Ensure all registers with a different parity have a higher cost
// than sameParityMax
for (unsigned j = 0; j != vRaAllowed->size(); ++j) {
unsigned pRa = (*vRaAllowed)[j];
if (!haveSameParity(pRd, pRa))
if (sameParityMax > costs[i + 1][j + 1])
costs[i + 1][j + 1] = sameParityMax + 1.0;
}
}
g.setEdgeCosts(edge, costs);
return true;
}
void
A57PBQPBuilder::addInterChainConstraint(PBQPRAProblem *p, unsigned Rd,
unsigned Ra) {
// Do some Chain management
if (Chains.count(Ra)) {
if (Rd != Ra) {
DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to "
<< PrintReg(Rd, TRI) << '\n';);
Chains.remove(Ra);
Chains.insert(Rd);
}
} else {
DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI)
<< '\n';);
Chains.insert(Rd);
}
const LiveInterval &ld = LIs->getInterval(Rd);
for (auto r : Chains) {
// Skip self
if (r == Rd)
continue;
const LiveInterval &lr = LIs->getInterval(r);
if (ld.overlaps(lr)) {
const PBQPRAProblem::AllowedSet *vRdAllowed = &p->getAllowedSet(Rd);
const PBQPRAProblem::AllowedSet *vRrAllowed = &p->getAllowedSet(r);
PBQPRAGraph &g = p->getGraph();
PBQPRAGraph::NodeId node1 = p->getNodeForVReg(Rd);
PBQPRAGraph::NodeId node2 = p->getNodeForVReg(r);
PBQPRAGraph::EdgeId edge = g.findEdge(node1, node2);
assert(edge != g.invalidEdgeId() &&
"PBQP error ! The edge should exist !");
DEBUG(dbgs() << "Refining constraint !\n";);
if (g.getEdgeNode1Id(edge) == node2) {
std::swap(node1, node2);
std::swap(vRdAllowed, vRrAllowed);
}
// Enforce that cost is higher with all other Chains of the same parity
PBQP::Matrix costs(g.getEdgeCosts(edge));
for (unsigned i = 0; i != vRdAllowed->size(); ++i) {
unsigned pRd = (*vRdAllowed)[i];
// Get the maximum cost (excluding unallocatable reg) for all other
// parity registers
PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
for (unsigned j = 0; j != vRrAllowed->size(); ++j) {
unsigned pRa = (*vRrAllowed)[j];
if (!haveSameParity(pRd, pRa))
if (costs[i + 1][j + 1] !=
std::numeric_limits<PBQP::PBQPNum>::infinity() &&
costs[i + 1][j + 1] > sameParityMax)
sameParityMax = costs[i + 1][j + 1];
}
// Ensure all registers with same parity have a higher cost
// than sameParityMax
for (unsigned j = 0; j != vRrAllowed->size(); ++j) {
unsigned pRa = (*vRrAllowed)[j];
if (haveSameParity(pRd, pRa))
if (sameParityMax > costs[i + 1][j + 1])
costs[i + 1][j + 1] = sameParityMax + 1.0;
}
}
g.setEdgeCosts(edge, costs);
}
}
}
std::unique_ptr<PBQPRAProblem>
A57PBQPBuilder::build(MachineFunction *MF, const LiveIntervals *LI,
const MachineBlockFrequencyInfo *blockInfo,
const RegSet &VRegs) {
std::unique_ptr<PBQPRAProblem> p =
PBQP_BUILDER::build(MF, LI, blockInfo, VRegs);
TRI = static_cast<const AArch64RegisterInfo *>(
MF->getTarget().getSubtargetImpl()->getRegisterInfo());
LIs = LI;
DEBUG(MF->dump(););
for (MachineFunction::const_iterator mbbItr = MF->begin(), mbbEnd = MF->end();
mbbItr != mbbEnd; ++mbbItr) {
const MachineBasicBlock *MBB = &*mbbItr;
Chains.clear(); // FIXME: really needed ? Could not work at MF level ?
for (MachineBasicBlock::const_iterator miItr = MBB->begin(),
miEnd = MBB->end();
miItr != miEnd; ++miItr) {
const MachineInstr *MI = &*miItr;
switch (MI->getOpcode()) {
case AArch64::FMSUBSrrr:
case AArch64::FMADDSrrr:
case AArch64::FNMSUBSrrr:
case AArch64::FNMADDSrrr:
case AArch64::FMSUBDrrr:
case AArch64::FMADDDrrr:
case AArch64::FNMSUBDrrr:
case AArch64::FNMADDDrrr: {
unsigned Rd = MI->getOperand(0).getReg();
unsigned Ra = MI->getOperand(3).getReg();
if (addIntraChainConstraint(p.get(), Rd, Ra))
addInterChainConstraint(p.get(), Rd, Ra);
break;
}
case AArch64::FMLAv2f32:
case AArch64::FMLSv2f32: {
unsigned Rd = MI->getOperand(0).getReg();
addInterChainConstraint(p.get(), Rd, Rd);
break;
}
default:
// Forget Chains which have been killed
for (auto r : Chains) {
SmallVector<unsigned, 8> toDel;
if (MI->killsRegister(r)) {
DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at ";
MI->print(dbgs()););
toDel.push_back(r);
}
while (!toDel.empty()) {
Chains.remove(toDel.back());
toDel.pop_back();
}
}
}
}
}
return p;
}
// Factory function used by AArch64TargetMachine to add the pass to the
// passmanager.
FunctionPass *llvm::createAArch64A57PBQPRegAlloc() {
std::unique_ptr<PBQP_BUILDER> builder = llvm::make_unique<A57PBQPBuilder>();
return createPBQPRegisterAllocator(std::move(builder), nullptr);
}

View File

@ -13,6 +13,7 @@
#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/PassManager.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
@ -73,6 +74,10 @@ EnableCondOpt("aarch64-condopt",
cl::desc("Enable the condition optimizer pass"),
cl::init(true), cl::Hidden);
static cl::opt<bool>
EnablePBQP("aarch64-pbqp", cl::Hidden,
cl::desc("Use PBQP register allocator (experimental)"),
cl::init(false));
extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
@ -90,8 +95,14 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL,
bool LittleEndian)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS, *this, LittleEndian) {
Subtarget(TT, CPU, FS, *this, LittleEndian),
usingPBQP(false) {
initAsmInfo();
if (EnablePBQP && Subtarget.isCortexA57() && OL != CodeGenOpt::None) {
usingPBQP = true;
RegisterRegAlloc::setDefault(createAArch64A57PBQPRegAlloc);
}
}
void AArch64leTargetMachine::anchor() { }
@ -216,7 +227,8 @@ bool AArch64PassConfig::addPostRegAlloc() {
if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
addPass(createAArch64DeadRegisterDefinitions());
if (TM->getOptLevel() != CodeGenOpt::None &&
TM->getSubtarget<AArch64Subtarget>().isCortexA57())
TM->getSubtarget<AArch64Subtarget>().isCortexA57() &&
!static_cast<const AArch64TargetMachine *>(TM)->isPBQPUsed())
// Improve performance for some FP/SIMD code for A57.
addPass(createAArch64A57FPLoadBalancing());
return true;

View File

@ -40,6 +40,12 @@ public:
/// \brief Register AArch64 analysis passes with a pass manager.
void addAnalysisPasses(PassManagerBase &PM) override;
/// \brief Query if the PBQP register allocator is being used
bool isPBQPUsed() const { return usingPBQP; }
private:
bool usingPBQP;
};
// AArch64leTargetMachine - AArch64 little endian target machine.

View File

@ -34,6 +34,7 @@ add_llvm_target(AArch64CodeGen
AArch64LoadStoreOptimizer.cpp
AArch64MCInstLower.cpp
AArch64PromoteConstant.cpp
AArch64PBQPRegAlloc.cpp
AArch64RegisterInfo.cpp
AArch64SelectionDAGInfo.cpp
AArch64StorePairSuppress.cpp

View File

@ -0,0 +1,14 @@
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -aarch64-pbqp -o - %s | FileCheck %s
define i32 @foo(i32 %a) {
; CHECK-LABEL: foo:
; CHECK: bl bar
; CHECK-NEXT: bl baz
%call = call i32 @bar(i32 %a)
%call1 = call i32 @baz(i32 %call)
ret i32 %call1
}
declare i32 @bar(i32)
declare i32 @baz(i32)