Add initial support for back-scheduling address computations,
especially in the case of addresses computed from loop induction
variables.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@61075 91177308-0d34-0410-b5e6-96231b3b80d8
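For context, the pattern this commit targets is a loop in which an address is computed from the induction variable and then feeds a load or store. A minimal illustration (not from this commit; an out-of-order x86 target is assumed):

    // Illustration only: the address arithmetic 'a + i' is derived from the
    // induction variable 'i' and feeds the load on every iteration.
    // Back-scheduling moves such address computations earlier in the schedule
    // so the loads that consume them are not stalled waiting for the address.
    float sum(const float *a, int n) {
      float s = 0.0f;
      for (int i = 0; i < n; ++i)
        s += a[i];
      return s;
    }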
@@ -242,6 +242,7 @@ namespace llvm {
     bool isPending : 1;             // True once pending.
     bool isAvailable : 1;           // True once available.
     bool isScheduled : 1;           // True once scheduled.
+    bool isScheduleHigh : 1;        // True if preferable to schedule high.
   private:
     bool isDepthCurrent : 1;        // True if Depth is current.
     bool isHeightCurrent : 1;       // True if Height is current.
@@ -258,7 +259,7 @@ namespace llvm {
       Latency(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0),
       isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false),
       isPending(false), isAvailable(false), isScheduled(false),
-      isDepthCurrent(false), isHeightCurrent(false),
+      isScheduleHigh(false), isDepthCurrent(false), isHeightCurrent(false),
       Depth(0), Height(0),
       CopyDstRC(NULL), CopySrcRC(NULL) {}
@@ -269,7 +270,7 @@ namespace llvm {
       Latency(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0),
       isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false),
       isPending(false), isAvailable(false), isScheduled(false),
-      isDepthCurrent(false), isHeightCurrent(false),
+      isScheduleHigh(false), isDepthCurrent(false), isHeightCurrent(false),
       Depth(0), Height(0),
       CopyDstRC(NULL), CopySrcRC(NULL) {}
@@ -29,6 +29,12 @@ protected: // Can only create subclasses...
   TargetSubtarget();
 public:
   virtual ~TargetSubtarget();
+
+  /// getSpecialAddressLatency - For targets where it is beneficial to
+  /// backschedule instructions that compute addresses, return a value
+  /// indicating the number of scheduling cycles of backscheduling that
+  /// should be attempted.
+  virtual unsigned getSpecialAddressLatency() const { return 0; }
 };
 
 } // End llvm namespace
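The default of 0 disables the feature. A sketch of how a target would opt in, using a hypothetical MyTargetSubtarget (the real X86 override appears at the end of this commit):

    // Hypothetical subclass, for illustration only.
    class MyTargetSubtarget : public TargetSubtarget {
    public:
      // Ask the scheduler to attempt ~100 cycles of backscheduling for
      // instructions that compute addresses.
      virtual unsigned getSpecialAddressLatency() const { return 100; }
    };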
@@ -19,6 +19,14 @@
 using namespace llvm;
 
 bool latency_sort::operator()(const SUnit *LHS, const SUnit *RHS) const {
+  // The isScheduleHigh flag allows nodes with wraparound dependencies that
+  // cannot easily be modeled as edges with latencies to be scheduled as
+  // soon as possible in a top-down schedule.
+  if (LHS->isScheduleHigh && !RHS->isScheduleHigh)
+    return false;
+  if (!LHS->isScheduleHigh && RHS->isScheduleHigh)
+    return true;
+
   unsigned LHSNum = LHS->NodeNum;
   unsigned RHSNum = RHS->NodeNum;
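Note the comparator convention: latency_sort is an ordering predicate for a priority queue that pops the element comparing greatest, so returning false when only LHS is schedule-high ranks LHS above RHS and it surfaces first. A minimal self-contained sketch of that convention (the simplified Node and the tiebreak are assumptions, not the real SUnit or heuristics):

    #include <queue>
    #include <vector>

    struct Node { bool isScheduleHigh; unsigned NodeNum; };

    struct LatencySortSketch {
      bool operator()(const Node *LHS, const Node *RHS) const {
        // Returning true means LHS ranks lower and is popped later; a
        // schedule-high node never compares less than an ordinary node.
        if (LHS->isScheduleHigh && !RHS->isScheduleHigh) return false;
        if (!LHS->isScheduleHigh && RHS->isScheduleHigh) return true;
        return LHS->NodeNum > RHS->NodeNum;  // placeholder tiebreak
      }
    };

    // Usage: std::priority_queue<Node*, std::vector<Node*>, LatencySortSketch> Q;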
@@ -15,6 +15,7 @@
 #define DEBUG_TYPE "sched-instrs"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
@@ -29,6 +30,59 @@
 #include <map>
 using namespace llvm;
 
+namespace {
+  class VISIBILITY_HIDDEN LoopDependencies {
+    const MachineLoopInfo &MLI;
+    const MachineDominatorTree &MDT;
+
+  public:
+    typedef std::map<unsigned, std::pair<const MachineOperand *, unsigned> >
+      LoopDeps;
+    LoopDeps Deps;
+
+    LoopDependencies(const MachineLoopInfo &mli,
+                     const MachineDominatorTree &mdt) :
+      MLI(mli), MDT(mdt) {}
+
+    void VisitLoop(const MachineLoop *Loop) {
+      Deps.clear();
+      MachineBasicBlock *Header = Loop->getHeader();
+      SmallSet<unsigned, 8> LoopLiveIns;
+      for (MachineBasicBlock::livein_iterator LI = Header->livein_begin(),
+           LE = Header->livein_end(); LI != LE; ++LI)
+        LoopLiveIns.insert(*LI);
+
+      VisitRegion(MDT.getNode(Header), Loop, LoopLiveIns);
+    }
+
+  private:
+    void VisitRegion(const MachineDomTreeNode *Node,
+                     const MachineLoop *Loop,
+                     const SmallSet<unsigned, 8> &LoopLiveIns) {
+      MachineBasicBlock *MBB = Node->getBlock();
+      if (!Loop->contains(MBB)) return;
+
+      unsigned Count = 0;
+      for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I, ++Count) {
+        const MachineInstr *MI = I;
+        for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+          const MachineOperand &MO = MI->getOperand(i);
+          if (!MO.isReg() || !MO.isUse())
+            continue;
+          unsigned MOReg = MO.getReg();
+          if (LoopLiveIns.count(MOReg))
+            Deps.insert(std::make_pair(MOReg, std::make_pair(&MO, Count)));
+        }
+      }
+
+      const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
+      for (unsigned I = 0, E = Children.size(); I != E; ++I)
+        VisitRegion(Children[I], Loop, LoopLiveIns);
+    }
+  };
+}
+
 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineBasicBlock *bb,
                                      const TargetMachine &tm,
                                      const MachineLoopInfo &mli,
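One subtlety worth noting: std::map::insert keeps the first entry per key, so Deps ends up recording the first use of each loop-live-in register encountered in dominator-tree preorder, together with that use's instruction index (Count) within its block. A sketch of the consuming side, mirroring the scheduler changes below (assumed flow, names as in this commit's diff):

    LoopDependencies LoopRegs(MLI, MDT);
    LoopRegs.VisitLoop(ML);  // populate Deps for MachineLoop ML
    LoopDependencies::LoopDeps::iterator I = LoopRegs.Deps.find(Reg);
    if (I != LoopRegs.Deps.end()) {
      const MachineOperand *UseMO = I->second.first;  // recorded in-loop use
      unsigned Count = I->second.second;              // index within its block
    }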
@@ -65,9 +119,27 @@ void ScheduleDAGInstrs::BuildSchedUnits() {
   // all the work of the block is done before the terminator.
   SUnit *Terminator = 0;
 
+  LoopDependencies LoopRegs(MLI, MDT);
+
+  // Track which regs are live into a loop, to help guide back-edge-aware
+  // scheduling.
+  SmallSet<unsigned, 8> LoopLiveInRegs;
+  if (MachineLoop *ML = MLI.getLoopFor(BB))
+    if (BB == ML->getLoopLatch()) {
+      MachineBasicBlock *Header = ML->getHeader();
+      for (MachineBasicBlock::livein_iterator I = Header->livein_begin(),
+           E = Header->livein_end(); I != E; ++I)
+        LoopLiveInRegs.insert(*I);
+      LoopRegs.VisitLoop(ML);
+    }
+
   // Check to see if the scheduler cares about latencies.
   bool UnitLatencies = ForceUnitLatencies();
 
+  // Ask the target if address-backscheduling is desirable, and if so how much.
+  unsigned SpecialAddressLatency =
+    TM.getSubtarget<TargetSubtarget>().getSpecialAddressLatency();
+
   for (MachineBasicBlock::iterator MII = BB->end(), MIE = BB->begin();
        MII != MIE; --MII) {
     MachineInstr *MI = prior(MII);
@@ -118,7 +190,21 @@ void ScheduleDAGInstrs::BuildSchedUnits() {
       for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
         SUnit *UseSU = UseList[i];
         if (UseSU != SU) {
-          UseSU->addPred(SDep(SU, SDep::Data, DataLatency, Reg));
+          unsigned LDataLatency = DataLatency;
+          // Optionally add in a special extra latency for nodes that
+          // feed addresses.
+          // TODO: Do this for register aliases too.
+          if (SpecialAddressLatency != 0 && !UnitLatencies) {
+            MachineInstr *UseMI = UseSU->getInstr();
+            const TargetInstrDesc &UseTID = UseMI->getDesc();
+            int RegUseIndex = UseMI->findRegisterUseOperandIdx(Reg);
+            assert(RegUseIndex >= 0 && "UseMI doesn't use register!");
+            if ((UseTID.mayLoad() || UseTID.mayStore()) &&
+                (unsigned)RegUseIndex < UseTID.getNumOperands() &&
+                UseTID.OpInfo[RegUseIndex].isLookupPtrRegClass())
+              LDataLatency += SpecialAddressLatency;
+          }
+          UseSU->addPred(SDep(SU, SDep::Data, LDataLatency, Reg));
         }
       }
       for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
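For intuition, a worked example with assumed numbers:

    // Assumed values, for illustration only:
    //   DataLatency           = 2    (ordinary def-use latency)
    //   SpecialAddressLatency = 200  (the X86 value added later in this commit)
    // If the use is a load or store and the used operand's register class is
    // a lookup pointer register class:
    //   LDataLatency = 2 + 200 = 202
    // The inflated edge latency encourages the list scheduler to place the
    // address computation well before the memory operation that consumes it.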
@@ -130,6 +216,51 @@ void ScheduleDAGInstrs::BuildSchedUnits() {
         }
       }
 
+      // If a def is going to wrap back around to the top of the loop,
+      // backschedule it.
+      // TODO: Blocks in loops without terminators can benefit too.
+      if (!UnitLatencies && Terminator && DefList.empty()) {
+        LoopDependencies::LoopDeps::iterator I = LoopRegs.Deps.find(Reg);
+        if (I != LoopRegs.Deps.end()) {
+          const MachineOperand *UseMO = I->second.first;
+          unsigned Count = I->second.second;
+          const MachineInstr *UseMI = UseMO->getParent();
+          unsigned UseMOIdx = UseMO - &UseMI->getOperand(0);
+          const TargetInstrDesc &UseTID = UseMI->getDesc();
+          // TODO: If we knew the total depth of the region here, we could
+          // handle the case where the whole loop is inside the region but
+          // is large enough that the isScheduleHigh trick isn't needed.
+          if (UseMOIdx < UseTID.getNumOperands()) {
+            // Currently, we only support scheduling regions consisting of
+            // single basic blocks. Check to see if the instruction is in
+            // the same region by checking to see if it has the same parent.
+            if (UseMI->getParent() != MI->getParent()) {
+              unsigned Latency = SU->Latency;
+              if (UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass())
+                Latency += SpecialAddressLatency;
+              // This is a wild guess as to the portion of the latency which
+              // will be overlapped by work done outside the current
+              // scheduling region.
+              Latency -= std::min(Latency, Count);
+              // Add the artificial edge.
+              Terminator->addPred(SDep(SU, SDep::Order, Latency,
+                                       /*Reg=*/0, /*isNormalMemory=*/false,
+                                       /*isMustAlias=*/false,
+                                       /*isArtificial=*/true));
+            } else if (SpecialAddressLatency > 0 &&
+                       UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass()) {
+              // The entire loop body is within the current scheduling region
+              // and the latency of this operation is assumed to be greater
+              // than the latency of the loop.
+              // TODO: Recursively mark data-edge predecessors as
+              // isScheduleHigh too.
+              SU->isScheduleHigh = true;
+            }
+          }
+          LoopRegs.Deps.erase(I);
+        }
+      }
+
       UseList.clear();
       if (!MO.isDead())
         DefList.clear();
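Likewise, a worked example for the wraparound case, with assumed numbers:

    // Assumed values, for illustration only:
    //   SU->Latency           = 3    (the def's own latency)
    //   SpecialAddressLatency = 200  (the use feeds an address)
    //   Count                 = 50   (the recorded use sits 50 instructions
    //                                 into the loop header block)
    // Latency  = 3 + 200             = 203
    // Latency -= std::min(203u, 50u)   // 203 - 50 = 153
    // An artificial order edge of weight 153 from SU to the terminator pushes
    // the def earlier in the block, so its result is ready when control wraps
    // around to the use at the top of the loop.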
@@ -93,6 +93,17 @@ const char *X86Subtarget::getBZeroEntry() const {
   return 0;
 }
 
+/// getSpecialAddressLatency - For targets where it is beneficial to
+/// backschedule instructions that compute addresses, return a value
+/// indicating the number of scheduling cycles of backscheduling that
+/// should be attempted.
+unsigned X86Subtarget::getSpecialAddressLatency() const {
+  // For x86 out-of-order targets, back-schedule address computations so
+  // that loads and stores aren't blocked.
+  // This value was chosen arbitrarily.
+  return 200;
+}
+
 /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
 /// specified arguments. If we can't run cpuid on the host, return true.
 bool X86::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
@@ -191,6 +191,12 @@ public:
   /// memset with zero passed as the second argument. Otherwise it
   /// returns null.
   const char *getBZeroEntry() const;
+
+  /// getSpecialAddressLatency - For targets where it is beneficial to
+  /// backschedule instructions that compute addresses, return a value
+  /// indicating the number of scheduling cycles of backscheduling that
+  /// should be attempted.
+  unsigned getSpecialAddressLatency() const;
 };
 
 namespace X86 {