Be careful about scheduling nodes above previous calls. It increases usage of

more callee-saved registers and introduces copies. Only allow it if scheduling
a node above calls would end up lessening register pressure.

Call operands also have added ABI restrictions for register allocation, so be
extra careful with hoisting them above calls.

rdar://9329627


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@130245 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Evan Cheng 2011-04-26 21:31:35 +00:00
parent 90fab0f9d8
commit 554daa67bd
8 changed files with 138 additions and 27 deletions

View File

@ -252,6 +252,7 @@ namespace llvm {
unsigned short Latency; // Node latency.
bool isVRegCycle : 1; // May use and def the same vreg.
bool isCall : 1; // Is a function call.
bool isCallOp : 1; // Is a function call operand.
bool isTwoAddress : 1; // Is a two-address instruction.
bool isCommutable : 1; // Is a commutable instruction.
bool hasPhysRegDefs : 1; // Has physreg defs that are being used.
@ -280,7 +281,7 @@ namespace llvm {
: Node(node), Instr(0), OrigNode(0), NodeNum(nodenum),
NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
isVRegCycle(false), isCall(false), isTwoAddress(false),
isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
isPending(false), isAvailable(false), isScheduled(false),
isScheduleHigh(false), isScheduleLow(false), isCloned(false),
@ -294,7 +295,7 @@ namespace llvm {
: Node(0), Instr(instr), OrigNode(0), NodeNum(nodenum),
NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
isVRegCycle(false), isCall(false), isTwoAddress(false),
isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
isPending(false), isAvailable(false), isScheduled(false),
isScheduleHigh(false), isScheduleLow(false), isCloned(false),
@ -307,7 +308,7 @@ namespace llvm {
: Node(0), Instr(0), OrigNode(0), NodeNum(~0u),
NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
isVRegCycle(false), isCall(false), isTwoAddress(false),
isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
isPending(false), isAvailable(false), isScheduled(false),
isScheduleHigh(false), isScheduleLow(false), isCloned(false),

View File

@ -1732,7 +1732,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
// If SU does not have a register def, schedule it close to its uses
// because it does not lengthen any live ranges.
return 0;
#if 1
return SethiUllmanNumbers[SU->NodeNum];
#else
unsigned Priority = SethiUllmanNumbers[SU->NodeNum];
if (SU->isCallOp) {
// FIXME: This assumes all of the defs are used as call operands.
int NP = (int)Priority - SU->getNode()->getNumValues();
return (NP > 0) ? NP : 0;
}
return Priority;
#endif
}
//===----------------------------------------------------------------------===//
@ -2238,11 +2248,35 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
// Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
unsigned LPriority = SPQ->getNodePriority(left);
unsigned RPriority = SPQ->getNodePriority(right);
// Be really careful about hoisting call operands above previous calls.
// Only allow it if it would reduce register pressure.
if (left->isCall && right->isCallOp) {
unsigned RNumVals = right->getNode()->getNumValues();
RPriority = (RPriority > RNumVals) ? (RPriority - RNumVals) : 0;
}
if (right->isCall && left->isCallOp) {
unsigned LNumVals = left->getNode()->getNumValues();
LPriority = (LPriority > LNumVals) ? (LPriority - LNumVals) : 0;
}
if (LPriority != RPriority) {
DEBUG(++FactorCount[FactStatic]);
return LPriority > RPriority;
}
// If one or both of the nodes are calls and their Sethi-Ullman numbers are
// the same, then keep source order.
if (left->isCall || right->isCall) {
unsigned LOrder = SPQ->getNodeOrdering(left);
unsigned ROrder = SPQ->getNodeOrdering(right);
// Prefer an ordering where the lower the non-zero order number, the higher
// the preference.
if ((LOrder || ROrder) && LOrder != ROrder)
return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
}
// Try to schedule def + use closer when Sethi-Ullman numbers are the same.
// e.g.
// t1 = op t2, c1
@ -2275,7 +2309,14 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
return LScratch > RScratch;
}
if (!DisableSchedCycles) {
// Comparing latency against a call makes little sense unless the node
// is register pressure-neutral.
if ((left->isCall && RPriority > 0) || (right->isCall && LPriority > 0))
return (left->NodeQueueId > right->NodeQueueId);
// Do not compare latencies when one or both of the nodes are calls.
if (!DisableSchedCycles &&
!(left->isCall || right->isCall)) {
int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ);
if (result != 0)
return result > 0;

View File

@ -83,6 +83,7 @@ SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
SU->Latency = Old->Latency;
SU->isVRegCycle = Old->isVRegCycle;
SU->isCall = Old->isCall;
SU->isCallOp = Old->isCallOp;
SU->isTwoAddress = Old->isTwoAddress;
SU->isCommutable = Old->isCommutable;
SU->hasPhysRegDefs = Old->hasPhysRegDefs;
@ -285,6 +286,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
Worklist.push_back(DAG->getRoot().getNode());
Visited.insert(DAG->getRoot().getNode());
SmallVector<SUnit*, 8> CallSUnits;
while (!Worklist.empty()) {
SDNode *NI = Worklist.pop_back_val();
@ -337,6 +339,9 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
if (!HasGlueUse) break;
}
if (NodeSUnit->isCall)
CallSUnits.push_back(NodeSUnit);
// Schedule zero-latency TokenFactor below any nodes that may increase the
// schedule height. Otherwise, ancestors of the TokenFactor may appear to
// have false stalls.
@ -356,6 +361,20 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
// Assign the Latency field of NodeSUnit using target-provided information.
ComputeLatency(NodeSUnit);
}
// Find all call operands.
while (!CallSUnits.empty()) {
SUnit *SU = CallSUnits.pop_back_val();
for (const SDNode *SUNode = SU->getNode(); SUNode;
SUNode = SUNode->getGluedNode()) {
if (SUNode->getOpcode() != ISD::CopyToReg)
continue;
SDNode *SrcN = SUNode->getOperand(2).getNode();
if (isPassiveNode(SrcN)) continue; // Not scheduled.
SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
SrcSU->isCallOp = true;
}
}
}
void ScheduleDAGSDNodes::AddSchedEdges() {

View File

@ -0,0 +1,70 @@
; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=pic -mcpu=cortex-a8 | FileCheck %s
; Do not move the umull above the previous call, which would require the use of
; more callee-saved registers and introduce copies.
; rdar://9329627
%struct.FF = type { i32 (i32*)*, i32 (i32*, i32*, i32, i32, i32, i32)*, i32 (i32, i32, i8*)*, void ()*, i32 (i32, i8*, i32*)*, i32 ()* }
%struct.BD = type { %struct.BD*, i32, i32, i32, i32, i64, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i64, i32)*, [16 x i8], i64, i64 }
@FuncPtr = external hidden unnamed_addr global %struct.FF*
@.str1 = external hidden unnamed_addr constant [6 x i8], align 4
@G = external unnamed_addr global i32
@.str2 = external hidden unnamed_addr constant [58 x i8], align 4
@.str3 = external hidden unnamed_addr constant [58 x i8], align 4
; @test: calls @bar, then on the path through %bb1 calls @Get followed by @foo,
; passing @foo a 64-bit product of two zero-extended i32 loads. The FileCheck
; directives in %bb1 require the umull to appear between the two calls rather
; than before the first one.
define i32 @test() nounwind optsize ssp {
entry:
; Match the function label and one push; the negative directive rejects a
; further push before the next positive match (%bb1).
; CHECK: test:
; CHECK: push
; CHECK-NOT: push
%block_size = alloca i32, align 4
%block_count = alloca i32, align 4
%index_cache = alloca i32, align 4
store i32 0, i32* %index_cache, align 4
%tmp = load i32* @G, align 4
%tmp1 = call i32 @bar(i32 0, i32 0, i32 %tmp) nounwind
; Dispatch on the result of @bar; only the 0 case continues towards %bb1.
switch i32 %tmp1, label %bb8 [
i32 0, label %bb
i32 536870913, label %bb4
i32 536870914, label %bb6
]
bb:
; Reload @G and enter %bb1 only when it is zero.
%tmp2 = load i32* @G, align 4
%tmp4 = icmp eq i32 %tmp2, 0
br i1 %tmp4, label %bb1, label %bb8
bb1:
; Expected order in this block: no umull before the call to Get, then the
; umull, then the call to foo.
; CHECK: %bb1
; CHECK-NOT: umull
; CHECK: blx _Get
; CHECK: umull
; CHECK: blx _foo
%tmp5 = load i32* %block_size, align 4
%tmp6 = load i32* %block_count, align 4
%tmp7 = call %struct.FF* @Get() nounwind
store %struct.FF* %tmp7, %struct.FF** @FuncPtr, align 4
; 64-bit multiply of the two zero-extended loads; its result is the i64
; argument of @foo. Presumably this is the source of the expected umull.
%tmp10 = zext i32 %tmp6 to i64
%tmp11 = zext i32 %tmp5 to i64
%tmp12 = mul nsw i64 %tmp10, %tmp11
%tmp13 = call i32 @foo(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), i64 %tmp12, i32 %tmp5) nounwind
br label %bb8
bb4:
ret i32 0
bb6:
ret i32 1
bb8:
ret i32 -1
}
declare i32 @printf(i8*, ...)
declare %struct.FF* @Get()
declare i32 @foo(i8*, i64, i32)
declare i32 @bar(i32, i32, i32)

View File

@ -58,7 +58,7 @@ entry:
; A8: str r2, [r0, r1, lsl #2]
; A9: test4:
; A9: add r0, r0, r4, lsl #2
; A9: add r0, r0, r{{[0-9]+}}, lsl #2
; A9: ldr r1, [r0]
; A9: str r1, [r0]
%0 = tail call i8* (...)* @malloc(i32 undef) nounwind

View File

@ -1,20 +0,0 @@
; RUN: llc < %s -regalloc=fast -relocation-model=pic | FileCheck %s
target triple = "thumbv6-apple-darwin10"
@fred = internal global i32 0 ; <i32*> [#uses=1]
; @foo: calls @bar and stores its i32 result to the internal global @fred.
; The FileCheck directives expect, in order, a store of r0 to the stack, the
; call to _bar, and a load into r1 from the stack.
define void @foo() nounwind {
entry:
; CHECK: str r0, [sp
%0 = call i32 (...)* @bar() nounwind ; <i32> [#uses=1]
; CHECK: blx _bar
; CHECK: ldr r1, [sp
store i32 %0, i32* @fred, align 4
br label %return
return: ; preds = %entry
ret void
}
declare i32 @bar(...)

View File

@ -13,7 +13,7 @@ define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string<
; CHECK: _ZNKSs7compareERKSs:
; CHECK: it eq
; CHECK-NEXT: subeq r0, r{{[0-9]+}}, r{{[0-9]+}}
; CHECK-NEXT: ldmia.w sp!, {r4, r5, r6, r7, r8, pc}
; CHECK-NEXT: ldmia.w sp!,
entry:
%0 = tail call arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this) ; <i32> [#uses=3]
%1 = tail call arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) ; <i32> [#uses=3]

View File

@ -79,8 +79,8 @@ entry:
; LINUX-NEXT: .L3$pb:
; LINUX: popl
; LINUX: addl $_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L3$pb), %[[REG3:e..]]
; LINUX: movl pfoo@GOT(%[[REG3]]),
; LINUX: calll afoo@PLT
; LINUX: movl pfoo@GOT(%[[REG3]]),
; LINUX: calll *
}