From 554daa67bd1c4f01fb7a00f2f4255a52b81e9fa3 Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Tue, 26 Apr 2011 21:31:35 +0000 Subject: [PATCH] Be careful about scheduling nodes above previous calls. It increase usages of more callee-saved registers and introduce copies. Only allows it if scheduling a node above calls would end up lessen register pressure. Call operands also has added ABI restrictions for register allocation, so be extra careful with hoisting them above calls. rdar://9329627 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@130245 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/ScheduleDAG.h | 7 +- .../SelectionDAG/ScheduleDAGRRList.cpp | 43 +++++++++++- .../SelectionDAG/ScheduleDAGSDNodes.cpp | 19 +++++ test/CodeGen/ARM/2011-04-26-SchedTweak.ll | 70 +++++++++++++++++++ test/CodeGen/ARM/shifter_operand.ll | 2 +- .../2010-01-15-local-alloc-spill-physical.ll | 20 ------ .../Thumb2/2009-10-15-ITBlockBranch.ll | 2 +- test/CodeGen/X86/pic.ll | 2 +- 8 files changed, 138 insertions(+), 27 deletions(-) create mode 100644 test/CodeGen/ARM/2011-04-26-SchedTweak.ll delete mode 100644 test/CodeGen/Thumb/2010-01-15-local-alloc-spill-physical.ll diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index 4e0971fc2ed..2eb3db319dd 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -252,6 +252,7 @@ namespace llvm { unsigned short Latency; // Node latency. bool isVRegCycle : 1; // May use and def the same vreg. bool isCall : 1; // Is a function call. + bool isCallOp : 1; // Is a function call operand. bool isTwoAddress : 1; // Is a two-address instruction. bool isCommutable : 1; // Is a commutable instruction. bool hasPhysRegDefs : 1; // Has physreg defs that are being used. @@ -280,7 +281,7 @@ namespace llvm { : Node(node), Instr(0), OrigNode(0), NodeNum(nodenum), NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0), - isVRegCycle(false), isCall(false), isTwoAddress(false), + isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), isAvailable(false), isScheduled(false), isScheduleHigh(false), isScheduleLow(false), isCloned(false), @@ -294,7 +295,7 @@ namespace llvm { : Node(0), Instr(instr), OrigNode(0), NodeNum(nodenum), NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0), - isVRegCycle(false), isCall(false), isTwoAddress(false), + isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), isAvailable(false), isScheduled(false), isScheduleHigh(false), isScheduleLow(false), isCloned(false), @@ -307,7 +308,7 @@ namespace llvm { : Node(0), Instr(0), OrigNode(0), NodeNum(~0u), NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0), - isVRegCycle(false), isCall(false), isTwoAddress(false), + isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), isAvailable(false), isScheduled(false), isScheduleHigh(false), isScheduleLow(false), isCloned(false), diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 5f28cdb4079..88bd4509b46 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1732,7 +1732,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const { // If SU does not have a register def, schedule it close to its uses // because it does not lengthen any live ranges. return 0; +#if 1 return SethiUllmanNumbers[SU->NodeNum]; +#else + unsigned Priority = SethiUllmanNumbers[SU->NodeNum]; + if (SU->isCallOp) { + // FIXME: This assumes all of the defs are used as call operands. + int NP = (int)Priority - SU->getNode()->getNumValues(); + return (NP > 0) ? NP : 0; + } + return Priority; +#endif } //===----------------------------------------------------------------------===// @@ -2238,11 +2248,35 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { // Prioritize by Sethi-Ulmann number and push CopyToReg nodes down. unsigned LPriority = SPQ->getNodePriority(left); unsigned RPriority = SPQ->getNodePriority(right); + + // Be really careful about hoisting call operands above previous calls. + // Only allows it if it would reduce register pressure. + if (left->isCall && right->isCallOp) { + unsigned RNumVals = right->getNode()->getNumValues(); + RPriority = (RPriority > RNumVals) ? (RPriority - RNumVals) : 0; + } + if (right->isCall && left->isCallOp) { + unsigned LNumVals = left->getNode()->getNumValues(); + LPriority = (LPriority > LNumVals) ? (LPriority - LNumVals) : 0; + } + if (LPriority != RPriority) { DEBUG(++FactorCount[FactStatic]); return LPriority > RPriority; } + // One or both of the nodes are calls and their sethi-ullman numbers are the + // same, then keep source order. + if (left->isCall || right->isCall) { + unsigned LOrder = SPQ->getNodeOrdering(left); + unsigned ROrder = SPQ->getNodeOrdering(right); + + // Prefer an ordering where the lower the non-zero order number, the higher + // the preference. + if ((LOrder || ROrder) && LOrder != ROrder) + return LOrder != 0 && (LOrder < ROrder || ROrder == 0); + } + // Try schedule def + use closer when Sethi-Ullman numbers are the same. // e.g. // t1 = op t2, c1 @@ -2275,7 +2309,14 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { return LScratch > RScratch; } - if (!DisableSchedCycles) { + // Comparing latency against a call makes little sense unless the node + // is register pressure-neutral. + if ((left->isCall && RPriority > 0) || (right->isCall && LPriority > 0)) + return (left->NodeQueueId > right->NodeQueueId); + + // Do not compare latencies when one or both of the nodes are calls. + if (!DisableSchedCycles && + !(left->isCall || right->isCall)) { int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ); if (result != 0) return result > 0; diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 94b8c2f2049..9f2f0121a86 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -83,6 +83,7 @@ SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) { SU->Latency = Old->Latency; SU->isVRegCycle = Old->isVRegCycle; SU->isCall = Old->isCall; + SU->isCallOp = Old->isCallOp; SU->isTwoAddress = Old->isTwoAddress; SU->isCommutable = Old->isCommutable; SU->hasPhysRegDefs = Old->hasPhysRegDefs; @@ -285,6 +286,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { Worklist.push_back(DAG->getRoot().getNode()); Visited.insert(DAG->getRoot().getNode()); + SmallVector CallSUnits; while (!Worklist.empty()) { SDNode *NI = Worklist.pop_back_val(); @@ -337,6 +339,9 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { if (!HasGlueUse) break; } + if (NodeSUnit->isCall) + CallSUnits.push_back(NodeSUnit); + // Schedule zero-latency TokenFactor below any nodes that may increase the // schedule height. Otherwise, ancestors of the TokenFactor may appear to // have false stalls. @@ -356,6 +361,20 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { // Assign the Latency field of NodeSUnit using target-provided information. ComputeLatency(NodeSUnit); } + + // Find all call operands. + while (!CallSUnits.empty()) { + SUnit *SU = CallSUnits.pop_back_val(); + for (const SDNode *SUNode = SU->getNode(); SUNode; + SUNode = SUNode->getGluedNode()) { + if (SUNode->getOpcode() != ISD::CopyToReg) + continue; + SDNode *SrcN = SUNode->getOperand(2).getNode(); + if (isPassiveNode(SrcN)) continue; // Not scheduled. + SUnit *SrcSU = &SUnits[SrcN->getNodeId()]; + SrcSU->isCallOp = true; + } + } } void ScheduleDAGSDNodes::AddSchedEdges() { diff --git a/test/CodeGen/ARM/2011-04-26-SchedTweak.ll b/test/CodeGen/ARM/2011-04-26-SchedTweak.ll new file mode 100644 index 00000000000..ed7dd033204 --- /dev/null +++ b/test/CodeGen/ARM/2011-04-26-SchedTweak.ll @@ -0,0 +1,70 @@ +; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=pic -mcpu=cortex-a8 | FileCheck %s + +; Do not move the umull above previous call which would require use of +; more callee-saved registers and introduce copies. +; rdar://9329627 + +%struct.FF = type { i32 (i32*)*, i32 (i32*, i32*, i32, i32, i32, i32)*, i32 (i32, i32, i8*)*, void ()*, i32 (i32, i8*, i32*)*, i32 ()* } +%struct.BD = type { %struct.BD*, i32, i32, i32, i32, i64, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i64, i32)*, [16 x i8], i64, i64 } + +@FuncPtr = external hidden unnamed_addr global %struct.FF* +@.str1 = external hidden unnamed_addr constant [6 x i8], align 4 +@G = external unnamed_addr global i32 +@.str2 = external hidden unnamed_addr constant [58 x i8], align 4 +@.str3 = external hidden unnamed_addr constant [58 x i8], align 4 + +define i32 @test() nounwind optsize ssp { +entry: +; CHECK: test: +; CHECK: push +; CHECK-NOT: push + %block_size = alloca i32, align 4 + %block_count = alloca i32, align 4 + %index_cache = alloca i32, align 4 + store i32 0, i32* %index_cache, align 4 + %tmp = load i32* @G, align 4 + %tmp1 = call i32 @bar(i32 0, i32 0, i32 %tmp) nounwind + switch i32 %tmp1, label %bb8 [ + i32 0, label %bb + i32 536870913, label %bb4 + i32 536870914, label %bb6 + ] + +bb: + %tmp2 = load i32* @G, align 4 + %tmp4 = icmp eq i32 %tmp2, 0 + br i1 %tmp4, label %bb1, label %bb8 + +bb1: +; CHECK: %bb1 +; CHECK-NOT: umull +; CHECK: blx _Get +; CHECK: umull +; CHECK: blx _foo + %tmp5 = load i32* %block_size, align 4 + %tmp6 = load i32* %block_count, align 4 + %tmp7 = call %struct.FF* @Get() nounwind + store %struct.FF* %tmp7, %struct.FF** @FuncPtr, align 4 + %tmp10 = zext i32 %tmp6 to i64 + %tmp11 = zext i32 %tmp5 to i64 + %tmp12 = mul nsw i64 %tmp10, %tmp11 + %tmp13 = call i32 @foo(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), i64 %tmp12, i32 %tmp5) nounwind + br label %bb8 + +bb4: + ret i32 0 + +bb6: + ret i32 1 + +bb8: + ret i32 -1 +} + +declare i32 @printf(i8*, ...) + +declare %struct.FF* @Get() + +declare i32 @foo(i8*, i64, i32) + +declare i32 @bar(i32, i32, i32) diff --git a/test/CodeGen/ARM/shifter_operand.ll b/test/CodeGen/ARM/shifter_operand.ll index be891bc76d3..f0e2d102610 100644 --- a/test/CodeGen/ARM/shifter_operand.ll +++ b/test/CodeGen/ARM/shifter_operand.ll @@ -58,7 +58,7 @@ entry: ; A8: str r2, [r0, r1, lsl #2] ; A9: test4: -; A9: add r0, r0, r4, lsl #2 +; A9: add r0, r0, r{{[0-9]+}}, lsl #2 ; A9: ldr r1, [r0] ; A9: str r1, [r0] %0 = tail call i8* (...)* @malloc(i32 undef) nounwind diff --git a/test/CodeGen/Thumb/2010-01-15-local-alloc-spill-physical.ll b/test/CodeGen/Thumb/2010-01-15-local-alloc-spill-physical.ll deleted file mode 100644 index fad26693e76..00000000000 --- a/test/CodeGen/Thumb/2010-01-15-local-alloc-spill-physical.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s -regalloc=fast -relocation-model=pic | FileCheck %s - -target triple = "thumbv6-apple-darwin10" - -@fred = internal global i32 0 ; [#uses=1] - -define void @foo() nounwind { -entry: -; CHECK: str r0, [sp - %0 = call i32 (...)* @bar() nounwind ; [#uses=1] -; CHECK: blx _bar -; CHECK: ldr r1, [sp - store i32 %0, i32* @fred, align 4 - br label %return - -return: ; preds = %entry - ret void -} - -declare i32 @bar(...) diff --git a/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll b/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll index 789a8916970..3594424e290 100644 --- a/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll +++ b/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll @@ -13,7 +13,7 @@ define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string< ; CHECK: _ZNKSs7compareERKSs: ; CHECK: it eq ; CHECK-NEXT: subeq r0, r{{[0-9]+}}, r{{[0-9]+}} -; CHECK-NEXT: ldmia.w sp!, {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: ldmia.w sp!, entry: %0 = tail call arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string,std::allocator >"* %this) ; [#uses=3] %1 = tail call arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string,std::allocator >"* %__str) ; [#uses=3] diff --git a/test/CodeGen/X86/pic.ll b/test/CodeGen/X86/pic.ll index dc5fcd78dc8..fb60ac2a60b 100644 --- a/test/CodeGen/X86/pic.ll +++ b/test/CodeGen/X86/pic.ll @@ -79,8 +79,8 @@ entry: ; LINUX-NEXT: .L3$pb: ; LINUX: popl ; LINUX: addl $_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L3$pb), %[[REG3:e..]] -; LINUX: movl pfoo@GOT(%[[REG3]]), ; LINUX: calll afoo@PLT +; LINUX: movl pfoo@GOT(%[[REG3]]), ; LINUX: calll * }