Add a ARM specific pre-allocation pass that re-schedule loads / stores from

consecutive addresses togther. This makes it easier for the post-allocation pass
to form ldm / stm.

This is step 1. We are still missing a lot of ldm / stm opportunities because
of register allocation are not done in the desired order. More enhancements
coming.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73291 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Evan Cheng 2009-06-13 09:12:55 +00:00
parent daf9e02893
commit e7d6df7353
6 changed files with 347 additions and 28 deletions

View File

@ -240,7 +240,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
if (OptLevel != CodeGenOpt::None) {
PM.add(createMachineLICMPass());
PM.add(createMachineSinkingPass());
printAndVerify(PM, /* allowDoubleDefs= */ true);
printAndVerify(PM, /* allowDoubleDefs= */ false);
}
// Run pre-ra passes.

View File

@ -98,12 +98,12 @@ FunctionPass *createARMCodePrinterPass(raw_ostream &O,
FunctionPass *createARMCodeEmitterPass(ARMTargetMachine &TM,
MachineCodeEmitter &MCE);
FunctionPass *createARMCodeEmitterPass( ARMTargetMachine &TM,
FunctionPass *createARMCodeEmitterPass(ARMTargetMachine &TM,
MachineCodeEmitter &MCE);
FunctionPass *createARMJITCodeEmitterPass( ARMTargetMachine &TM,
FunctionPass *createARMJITCodeEmitterPass(ARMTargetMachine &TM,
JITCodeEmitter &JCE);
FunctionPass *createARMLoadStoreOptimizationPass();
FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
FunctionPass *createARMConstantIslandPass();
} // end namespace llvm;

View File

@ -17,24 +17,31 @@
#include "ARMAddressingModes.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumFLDMGened, "Number of fldm instructions generated");
STATISTIC(NumFSTMGened, "Number of fstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
/// ARMAllocLoadStoreOpt - Post- register allocation pass the combine
/// load / store instructions to form ldm / stm instructions.
namespace {
struct VISIBILITY_HIDDEN ARMLoadStoreOpt : public MachineFunctionPass {
@ -81,12 +88,6 @@ namespace {
char ARMLoadStoreOpt::ID = 0;
}
/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
/// optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass() {
return new ARMLoadStoreOpt();
}
static int getLoadStoreMultipleOpcode(int Opcode) {
switch (Opcode) {
case ARM::LDR:
@ -582,6 +583,23 @@ void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
RS->forward(prior(Loc));
}
static int getMemoryOpOffset(const MachineInstr *MI) {
int Opcode = MI->getOpcode();
bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
unsigned NumOperands = MI->getDesc().getNumOperands();
unsigned OffField = MI->getOperand(NumOperands-3).getImm();
int Offset = isAM2
? ARM_AM::getAM2Offset(OffField) : ARM_AM::getAM5Offset(OffField) * 4;
if (isAM2) {
if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
Offset = -Offset;
} else {
if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
Offset = -Offset;
}
return Offset;
}
/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
@ -606,22 +624,11 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
bool isMemOp = isMemoryOp(MBBI);
if (isMemOp) {
int Opcode = MBBI->getOpcode();
bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
unsigned Size = getLSMultipleTransferSize(MBBI);
unsigned Base = MBBI->getOperand(1).getReg();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
unsigned NumOperands = MBBI->getDesc().getNumOperands();
unsigned OffField = MBBI->getOperand(NumOperands-3).getImm();
int Offset = isAM2
? ARM_AM::getAM2Offset(OffField) : ARM_AM::getAM5Offset(OffField) * 4;
if (isAM2) {
if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
Offset = -Offset;
} else {
if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
Offset = -Offset;
}
int Offset = getMemoryOpOffset(MBBI);
// Watch out for:
// r4 := ldr [r5]
// r5 := ldr [r5, #4]
@ -744,6 +751,17 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
return NumMerges > 0;
}
namespace {
struct OffsetCompare {
bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
int LOffset = getMemoryOpOffset(LHS);
int ROffset = getMemoryOpOffset(RHS);
assert(LHS == RHS || LOffset != ROffset);
return LOffset > ROffset;
}
};
}
/// MergeReturnIntoLDM - If this is a exit BB, try merging the return op
/// (bx lr) into the preceeding stack restore so it directly restore the value
/// of LR into pc.
@ -788,3 +806,277 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
delete RS;
return Modified;
}
/// ARMPreAllocLoadStoreOpt - Pre- register allocation pass that move
/// load / stores from consecutive locations close to make it more
/// likely they will be combined later.
namespace {
struct VISIBILITY_HIDDEN ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
static char ID;
ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {}
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
virtual bool runOnMachineFunction(MachineFunction &Fn);
virtual const char *getPassName() const {
return "ARM pre- register allocation load / store optimization pass";
}
private:
bool RescheduleOps(MachineBasicBlock *MBB,
SmallVector<MachineInstr*, 4> &Ops,
unsigned Base, bool isLd,
DenseMap<MachineInstr*, unsigned> &MI2LocMap);
bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
};
char ARMPreAllocLoadStoreOpt::ID = 0;
}
bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = Fn.getTarget().getInstrInfo();
TRI = Fn.getTarget().getRegisterInfo();
MRI = &Fn.getRegInfo();
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
++MFI)
Modified |= RescheduleLoadStoreInstrs(MFI);
return Modified;
}
static bool IsSafeToMove(bool isLd, unsigned Base,
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator E,
SmallPtrSet<MachineInstr*, 4> MoveOps,
const TargetRegisterInfo *TRI) {
// Are there stores / loads / calls between them?
// FIXME: This is overly conservative. We should make use of alias information
// some day.
while (++I != E) {
const TargetInstrDesc &TID = I->getDesc();
if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
return false;
if (isLd && TID.mayStore())
return false;
if (!isLd) {
if (TID.mayLoad())
return false;
// It's not safe to move the first 'str' down.
// str r1, [r0]
// strh r5, [r0]
// str r4, [r0, #+4]
if (TID.mayStore() && !MoveOps.count(&*I))
return false;
}
for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
MachineOperand &MO = I->getOperand(j);
if (MO.isReg() && MO.isDef() && TRI->regsOverlap(MO.getReg(), Base))
return false;
}
}
return true;
}
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
SmallVector<MachineInstr*, 4> &Ops,
unsigned Base, bool isLd,
DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
bool RetVal = false;
// Sort by offset (in reverse order).
std::sort(Ops.begin(), Ops.end(), OffsetCompare());
// The loads / stores of the same base are in order. Scan them from first to
// last and check for the followins:
// 1. Any def of base.
// 2. Any gaps.
while (Ops.size() > 1) {
unsigned FirstLoc = ~0U;
unsigned LastLoc = 0;
MachineInstr *FirstOp = 0;
MachineInstr *LastOp = 0;
int LastOffset = 0;
unsigned LastBytes = 0;
unsigned NumMove = 0;
for (int i = Ops.size() - 1; i >= 0; --i) {
MachineInstr *Op = Ops[i];
unsigned Loc = MI2LocMap[Op];
if (Loc <= FirstLoc) {
FirstLoc = Loc;
FirstOp = Op;
}
if (Loc >= LastLoc) {
LastLoc = Loc;
LastOp = Op;
}
int Offset = getMemoryOpOffset(Op);
unsigned Bytes = getLSMultipleTransferSize(Op);
if (LastBytes) {
if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
break;
}
LastOffset = Offset;
LastBytes = Bytes;
if (++NumMove == 4)
break;
}
if (NumMove <= 1)
Ops.pop_back();
else {
SmallPtrSet<MachineInstr*, 4> MoveOps;
for (int i = NumMove-1; i >= 0; --i)
MoveOps.insert(Ops[i]);
// Be conservative, if the instructions are too far apart, don't
// move them. We want to limit the increase of register pressure.
bool DoMove = (LastLoc - FirstLoc) < NumMove*4;
if (DoMove)
DoMove = IsSafeToMove(isLd, Base, FirstOp, LastOp, MoveOps, TRI);
if (!DoMove) {
for (unsigned i = 0; i != NumMove; ++i)
Ops.pop_back();
} else {
// This is the new location for the loads / stores.
MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
while (InsertPos != MBB->end() && MoveOps.count(InsertPos))
++InsertPos;
for (unsigned i = 0; i != NumMove; ++i) {
MachineInstr *Op = Ops.back();
Ops.pop_back();
MBB->splice(InsertPos, MBB, Op);
}
NumLdStMoved += NumMove;
RetVal = true;
}
}
}
return RetVal;
}
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
bool RetVal = false;
DenseMap<MachineInstr*, unsigned> MI2LocMap;
DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
SmallVector<unsigned, 4> LdBases;
SmallVector<unsigned, 4> StBases;
unsigned Loc = 0;
MachineBasicBlock::iterator MBBI = MBB->begin();
MachineBasicBlock::iterator E = MBB->end();
while (MBBI != E) {
for (; MBBI != E; ++MBBI) {
MachineInstr *MI = MBBI;
const TargetInstrDesc &TID = MI->getDesc();
if (TID.isCall() || TID.isTerminator()) {
// Stop at barriers.
++MBBI;
break;
}
MI2LocMap[MI] = Loc++;
if (!isMemoryOp(MI))
continue;
unsigned PredReg = 0;
if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
continue;
int Opcode = MI->getOpcode();
bool isLd = Opcode == ARM::LDR ||
Opcode == ARM::FLDS || Opcode == ARM::FLDD;
unsigned Base = MI->getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);
bool StopHere = false;
if (isLd) {
DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
Base2LdsMap.find(Base);
if (BI != Base2LdsMap.end()) {
for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
if (Offset == getMemoryOpOffset(BI->second[i])) {
StopHere = true;
break;
}
}
if (!StopHere)
BI->second.push_back(MI);
} else {
SmallVector<MachineInstr*, 4> MIs;
MIs.push_back(MI);
Base2LdsMap[Base] = MIs;
LdBases.push_back(Base);
}
} else {
DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
Base2StsMap.find(Base);
if (BI != Base2StsMap.end()) {
for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
if (Offset == getMemoryOpOffset(BI->second[i])) {
StopHere = true;
break;
}
}
if (!StopHere)
BI->second.push_back(MI);
} else {
SmallVector<MachineInstr*, 4> MIs;
MIs.push_back(MI);
Base2StsMap[Base] = MIs;
StBases.push_back(Base);
}
}
if (StopHere) {
// Found a duplicate (a base+offset combination that's seen earlier). Backtrack.
--Loc;
break;
}
}
// Re-schedule loads.
for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
unsigned Base = LdBases[i];
SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
if (Lds.size() > 1)
RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
}
// Re-schedule stores.
for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
unsigned Base = StBases[i];
SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
if (Sts.size() > 1)
RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
}
if (MBBI != E) {
Base2LdsMap.clear();
Base2StsMap.clear();
LdBases.clear();
StBases.clear();
}
}
return RetVal;
}
/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
/// optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
if (PreAlloc)
return new ARMPreAllocLoadStoreOpt();
return new ARMLoadStoreOpt();
}

View File

@ -23,6 +23,9 @@
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
static cl::opt<bool>
EnablePreLdStOpti("arm-pre-alloc-loadstore-opti", cl::Hidden,
cl::desc("Enable pre-regalloc load store optimization pass"));
static cl::opt<bool> DisableLdStOpti("disable-arm-loadstore-opti", cl::Hidden,
cl::desc("Disable load store optimization pass"));
static cl::opt<bool> DisableIfConversion("disable-arm-if-conversion",cl::Hidden,
@ -144,6 +147,16 @@ bool ARMTargetMachine::addInstSelector(PassManagerBase &PM,
return false;
}
bool ARMTargetMachine::addPreRegAlloc(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
if (!EnablePreLdStOpti)
return false;
// FIXME: temporarily disabling load / store optimization pass for Thumb mode.
if (OptLevel != CodeGenOpt::None && !DisableLdStOpti && !Subtarget.isThumb())
PM.add(createARMLoadStoreOptimizationPass(true));
return true;
}
bool ARMTargetMachine::addPreEmitPass(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
// FIXME: temporarily disabling load / store optimization pass for Thumb mode.

View File

@ -71,6 +71,7 @@ public:
// Pass Pipeline Configuration
virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addAssemblyEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,

13
test/CodeGen/ARM/stm.ll Normal file
View File

@ -0,0 +1,13 @@
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -mattr=+v6,+vfp2 -arm-pre-alloc-loadstore-opti | grep stm | count 2
@"\01LC" = internal constant [32 x i8] c"Boolean Not: %d %d %d %d %d %d\0A\00", section "__TEXT,__cstring,cstring_literals" ; <[32 x i8]*> [#uses=1]
@"\01LC1" = internal constant [26 x i8] c"Bitwise Not: %d %d %d %d\0A\00", section "__TEXT,__cstring,cstring_literals" ; <[26 x i8]*> [#uses=1]
declare i32 @printf(i8* nocapture, ...) nounwind
define i32 @main() nounwind {
entry:
%0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([26 x i8]* @"\01LC1", i32 0, i32 0), i32 -2, i32 -3, i32 2, i32 -6) nounwind ; <i32> [#uses=0]
%1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([32 x i8]* @"\01LC", i32 0, i32 0), i32 0, i32 1, i32 0, i32 1, i32 0, i32 1) nounwind ; <i32> [#uses=0]
ret i32 0
}