Be careful about scheduling nodes above previous calls. Doing so increases the use of

more callee-saved registers and introduces copies. Only allow it if scheduling a node above calls would end up lessening register pressure. Call operands also have added ABI restrictions for register allocation, so be extra careful with hoisting them above calls. rdar://9329627 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@130245 91177308-0d34-0410-b5e6-96231b3b80d8
2025-01-14 00:32:55 +00:00 · 2011-04-26 21:31:35 +00:00 · 2011-04-26 21:31:35 +00:00 · 554daa67bd
commit 554daa67bd
parent 90fab0f9d8
8 changed files with 138 additions and 27 deletions
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@ -252,6 +252,7 @@ namespace llvm {
    unsigned short Latency;             // Node latency.
    bool isVRegCycle      : 1;          // May use and def the same vreg.
    bool isCall           : 1;          // Is a function call.
+    bool isCallOp         : 1;          // Is a function call operand.
    bool isTwoAddress     : 1;          // Is a two-address instruction.
    bool isCommutable     : 1;          // Is a commutable instruction.
    bool hasPhysRegDefs   : 1;          // Has physreg defs that are being used.
@ -280,7 +281,7 @@ namespace llvm {
      : Node(node), Instr(0), OrigNode(0), NodeNum(nodenum),
        NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
        NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
-        isVRegCycle(false), isCall(false), isTwoAddress(false),
+        isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
        isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
        isPending(false), isAvailable(false), isScheduled(false),
        isScheduleHigh(false), isScheduleLow(false), isCloned(false),
@ -294,7 +295,7 @@ namespace llvm {
      : Node(0), Instr(instr), OrigNode(0), NodeNum(nodenum),
        NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
        NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
-        isVRegCycle(false), isCall(false), isTwoAddress(false),
+        isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
        isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
        isPending(false), isAvailable(false), isScheduled(false),
        isScheduleHigh(false), isScheduleLow(false), isCloned(false),
@ -307,7 +308,7 @@ namespace llvm {
      : Node(0), Instr(0), OrigNode(0), NodeNum(~0u),
        NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
        NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
-        isVRegCycle(false), isCall(false), isTwoAddress(false),
+        isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
        isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
        isPending(false), isAvailable(false), isScheduled(false),
        isScheduleHigh(false), isScheduleLow(false), isCloned(false),
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@ -1732,7 +1732,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
    // If SU does not have a register def, schedule it close to its uses
    // because it does not lengthen any live ranges.
    return 0;
+#if 1
  return SethiUllmanNumbers[SU->NodeNum];
+#else
+  unsigned Priority = SethiUllmanNumbers[SU->NodeNum];
+  if (SU->isCallOp) {
+    // FIXME: This assumes all of the defs are used as call operands.
+    int NP = (int)Priority - SU->getNode()->getNumValues();
+    return (NP > 0) ? NP : 0;
+  }
+  return Priority;
+#endif
 }

 //===----------------------------------------------------------------------===//
@ -2238,11 +2248,35 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
  // Prioritize by Sethi-Ulmann number and push CopyToReg nodes down.
  unsigned LPriority = SPQ->getNodePriority(left);
  unsigned RPriority = SPQ->getNodePriority(right);
+
+  // Be really careful about hoisting call operands above previous calls.
+  // Only allow it if it would reduce register pressure.
+  if (left->isCall && right->isCallOp) {
+    unsigned RNumVals = right->getNode()->getNumValues();
+    RPriority = (RPriority > RNumVals) ? (RPriority - RNumVals) : 0;
+  }
+  if (right->isCall && left->isCallOp) {
+    unsigned LNumVals = left->getNode()->getNumValues();
+    LPriority = (LPriority > LNumVals) ? (LPriority - LNumVals) : 0;
+  }
+
  if (LPriority != RPriority) {
    DEBUG(++FactorCount[FactStatic]);
    return LPriority > RPriority;
  }

+  // If one or both of the nodes are calls and their Sethi-Ullman numbers are
+  // the same, then keep source order.
+  if (left->isCall || right->isCall) {
+    unsigned LOrder = SPQ->getNodeOrdering(left);
+    unsigned ROrder = SPQ->getNodeOrdering(right);
+
+    // Prefer an ordering where the lower the non-zero order number, the higher
+    // the preference.
+    if ((LOrder || ROrder) && LOrder != ROrder)
+      return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
+  }
+
  // Try schedule def + use closer when Sethi-Ullman numbers are the same.
  // e.g.
  // t1 = op t2, c1
@ -2275,7 +2309,14 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
    return LScratch > RScratch;
  }

-  if (!DisableSchedCycles) {
+  // Comparing latency against a call makes little sense unless the node
+  // is register pressure-neutral.
+  if ((left->isCall && RPriority > 0) || (right->isCall && LPriority > 0))
+    return (left->NodeQueueId > right->NodeQueueId);
+
+  // Do not compare latencies when one or both of the nodes are calls.
+  if (!DisableSchedCycles &&
+      !(left->isCall || right->isCall)) {
    int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ);
    if (result != 0)
      return result > 0;
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@ -83,6 +83,7 @@ SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
  SU->Latency = Old->Latency;
  SU->isVRegCycle = Old->isVRegCycle;
  SU->isCall = Old->isCall;
+  SU->isCallOp = Old->isCallOp;
  SU->isTwoAddress = Old->isTwoAddress;
  SU->isCommutable = Old->isCommutable;
  SU->hasPhysRegDefs = Old->hasPhysRegDefs;
@ -285,6 +286,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
  Worklist.push_back(DAG->getRoot().getNode());
  Visited.insert(DAG->getRoot().getNode());

+  SmallVector<SUnit*, 8> CallSUnits;
  while (!Worklist.empty()) {
    SDNode *NI = Worklist.pop_back_val();

@ -337,6 +339,9 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
      if (!HasGlueUse) break;
    }

+    if (NodeSUnit->isCall)
+      CallSUnits.push_back(NodeSUnit);
+
    // Schedule zero-latency TokenFactor below any nodes that may increase the
    // schedule height. Otherwise, ancestors of the TokenFactor may appear to
    // have false stalls.
@ -356,6 +361,20 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
    // Assign the Latency field of NodeSUnit using target-provided information.
    ComputeLatency(NodeSUnit);
  }
+
+  // Find all call operands.
+  while (!CallSUnits.empty()) {
+    SUnit *SU = CallSUnits.pop_back_val();
+    for (const SDNode *SUNode = SU->getNode(); SUNode;
+         SUNode = SUNode->getGluedNode()) {
+      if (SUNode->getOpcode() != ISD::CopyToReg)
+        continue;
+      SDNode *SrcN = SUNode->getOperand(2).getNode();
+      if (isPassiveNode(SrcN)) continue;   // Not scheduled.
+      SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
+      SrcSU->isCallOp = true;
+    }
+  }
 }

 void ScheduleDAGSDNodes::AddSchedEdges() {
--- a/test/CodeGen/ARM/2011-04-26-SchedTweak.ll
+++ b/test/CodeGen/ARM/2011-04-26-SchedTweak.ll
@ -0,0 +1,70 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=pic -mcpu=cortex-a8 | FileCheck %s
+
+; Do not move the umull above previous call which would require use of
+; more callee-saved registers and introduce copies.
+; rdar://9329627
+
+%struct.FF = type { i32 (i32*)*, i32 (i32*, i32*, i32, i32, i32, i32)*, i32 (i32, i32, i8*)*, void ()*, i32 (i32, i8*, i32*)*, i32 ()* }
+%struct.BD = type { %struct.BD*, i32, i32, i32, i32, i64, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i64, i32)*, [16 x i8], i64, i64 }
+
+@FuncPtr = external hidden unnamed_addr global %struct.FF*
+@.str1 = external hidden unnamed_addr constant [6 x i8], align 4
+@G = external unnamed_addr global i32
+@.str2 = external hidden unnamed_addr constant [58 x i8], align 4
+@.str3 = external hidden unnamed_addr constant [58 x i8], align 4
+
+define i32 @test() nounwind optsize ssp {
+entry:
+; CHECK: test:
+; CHECK: push
+; CHECK-NOT: push
+  %block_size = alloca i32, align 4
+  %block_count = alloca i32, align 4
+  %index_cache = alloca i32, align 4
+  store i32 0, i32* %index_cache, align 4
+  %tmp = load i32* @G, align 4
+  %tmp1 = call i32 @bar(i32 0, i32 0, i32 %tmp) nounwind
+  switch i32 %tmp1, label %bb8 [
+    i32 0, label %bb
+    i32 536870913, label %bb4
+    i32 536870914, label %bb6
+  ]
+
+bb:
+  %tmp2 = load i32* @G, align 4
+  %tmp4 = icmp eq i32 %tmp2, 0
+  br i1 %tmp4, label %bb1, label %bb8
+
+bb1:
+; CHECK: %bb1
+; CHECK-NOT: umull
+; CHECK: blx _Get
+; CHECK: umull
+; CHECK: blx _foo
+  %tmp5 = load i32* %block_size, align 4
+  %tmp6 = load i32* %block_count, align 4
+  %tmp7 = call %struct.FF* @Get() nounwind
+  store %struct.FF* %tmp7, %struct.FF** @FuncPtr, align 4
+  %tmp10 = zext i32 %tmp6 to i64
+  %tmp11 = zext i32 %tmp5 to i64
+  %tmp12 = mul nsw i64 %tmp10, %tmp11
+  %tmp13 = call i32 @foo(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), i64 %tmp12, i32 %tmp5) nounwind
+  br label %bb8
+
+bb4:
+  ret i32 0
+
+bb6:
+  ret i32 1
+
+bb8:
+  ret i32 -1
+}
+
+declare i32 @printf(i8*, ...)
+
+declare %struct.FF* @Get()
+
+declare i32 @foo(i8*, i64, i32)
+
+declare i32 @bar(i32, i32, i32)
--- a/test/CodeGen/ARM/shifter_operand.ll
+++ b/test/CodeGen/ARM/shifter_operand.ll
@ -58,7 +58,7 @@ entry:
 ; A8: str r2, [r0, r1, lsl #2]

 ; A9: test4:
-; A9: add r0, r0, r4, lsl #2
+; A9: add r0, r0, r{{[0-9]+}}, lsl #2
 ; A9: ldr r1, [r0]
 ; A9: str r1, [r0]
  %0 = tail call i8* (...)* @malloc(i32 undef) nounwind
--- a/test/CodeGen/Thumb/2010-01-15-local-alloc-spill-physical.ll
+++ b/test/CodeGen/Thumb/2010-01-15-local-alloc-spill-physical.ll
@ -1,20 +0,0 @@
-; RUN: llc < %s -regalloc=fast -relocation-model=pic | FileCheck %s
-
-target triple = "thumbv6-apple-darwin10"
-
-@fred = internal global i32 0              ; <i32*> [#uses=1]
-
-define void @foo() nounwind {
-entry:
-; CHECK: str r0, [sp
-  %0 = call  i32 (...)* @bar() nounwind ; <i32> [#uses=1]
-; CHECK: blx _bar
-; CHECK: ldr r1, [sp
-  store i32 %0, i32* @fred, align 4
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
-
-declare i32 @bar(...)
--- a/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll
+++ b/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll
@ -13,7 +13,7 @@ define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string<
 ; CHECK: _ZNKSs7compareERKSs:
 ; CHECK:      it  eq
 ; CHECK-NEXT: subeq r0, r{{[0-9]+}}, r{{[0-9]+}}
-; CHECK-NEXT: ldmia.w sp!, {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: ldmia.w sp!,
 entry:
  %0 = tail call arm_aapcs_vfpcc  i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this) ; <i32> [#uses=3]
  %1 = tail call arm_aapcs_vfpcc  i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) ; <i32> [#uses=3]
--- a/test/CodeGen/X86/pic.ll
+++ b/test/CodeGen/X86/pic.ll
@ -79,8 +79,8 @@ entry:
 ; LINUX-NEXT: .L3$pb:
 ; LINUX: 	popl
 ; LINUX: 	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L3$pb), %[[REG3:e..]]
-; LINUX: 	movl	pfoo@GOT(%[[REG3]]),
 ; LINUX: 	calll	afoo@PLT
+; LINUX: 	movl	pfoo@GOT(%[[REG3]]),
 ; LINUX: 	calll	*
 }