llvm-6502/lib/Target/ARM/ARMHazardRecognizer.cpp

//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "ARMHazardRecognizer.h"
#include "ARMBaseInstrInfo.h"
#include "ARMSubtarget.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

/// Report whether \p MI reads the register found in operand 0 of \p DefMI.
///
/// Only instructions whose TSFlags domain is VFP or NEON can report a hazard;
/// any other domain yields false.  Within those domains the VFP opcodes
/// VSTRS, VSTRD, VMOVRS and VMOVRRD, and any NEON instruction whose
/// descriptor says mayLoad or mayStore, are exempted and also yield false.
///
/// \param DefMI  instruction whose operand 0 supplies the register to look for
///               (the caller is expected to pass an instruction whose operand
///               0 is a register).
/// \param MI     candidate instruction being checked for a read of that
///               register.
/// \param TRI    register info forwarded to MachineInstr::readsRegister.
static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
                         const TargetRegisterInfo &TRI) {
  // FIXME: Detect integer instructions properly.
  const TargetInstrDesc &Desc = MI->getDesc();
  const unsigned InstrDomain = Desc.TSFlags & ARMII::DomainMask;

  if (InstrDomain == ARMII::DomainVFP) {
    // These four VFP opcodes never count as a hazard.
    switch (MI->getOpcode()) {
    case ARM::VSTRS:
    case ARM::VSTRD:
    case ARM::VMOVRS:
    case ARM::VMOVRRD:
      return false;
    default:
      break;
    }
  } else if (InstrDomain == ARMII::DomainNEON) {
    // NEON memory accesses are exempt as well.
    const bool TouchesMemory = Desc.mayStore() || Desc.mayLoad();
    if (TouchesMemory)
      return false;
  } else {
    // Neither VFP nor NEON: nothing to report.
    return false;
  }

  const unsigned DefReg = DefMI->getOperand(0).getReg();
  return MI->readsRegister(DefReg, &TRI);
}

/// Decide whether issuing \p SU now would hit an ARM-specific hazard.
///
/// Two ARM-only conditions are checked before deferring to the scoreboard:
///  - While ITBlockSize is non-zero, only the instruction stored at
///    ITBlockMIs[ITBlockSize-1] may be issued; anything else is a Hazard.
///  - Special VMLA / VMLS hazards: a VMUL / VADD / VSUB following a
///    VMLA / VMLS will cause a 4 cycle stall.
/// DBG_VALUE instructions bypass both checks.
///
/// \returns Hazard when one of the ARM-specific conditions fires, otherwise
///          whatever ScoreboardHazardRecognizer::getHazardType reports.
ScheduleHazardRecognizer::HazardType
ARMHazardRecognizer::getHazardType(SUnit *SU) {
  MachineInstr *MI = SU->getInstr();

  // Debug values are only subject to the generic scoreboard check.
  if (MI->isDebugValue())
    return ScoreboardHazardRecognizer::getHazardType(SU);

  // Inside an IT block only the next recorded instruction is acceptable.
  if (ITBlockSize && MI != ITBlockMIs[ITBlockSize-1])
    return Hazard;

  const bool InNonGeneralDomain =
    (MI->getDesc().TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral;

  if (LastMI && InNonGeneralDomain) {
    // By default, compare against the most recently emitted instruction.
    MachineInstr *DefMI = LastMI;

    // Skip over one non-VFP / NEON instruction: if the last instruction is a
    // general-domain non-barrier, look at its predecessor in the block (when
    // there is one) instead.
    const TargetInstrDesc &LastDesc = LastMI->getDesc();
    const bool LastIsGeneral =
      (LastDesc.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral;
    if (!LastDesc.isBarrier() && LastIsGeneral) {
      MachineBasicBlock::iterator Prev = LastMI;
      if (Prev != LastMI->getParent()->begin()) {
        --Prev;
        DefMI = &*Prev;
      }
    }

    if (TII.isFpMLxInstruction(DefMI->getOpcode())) {
      if (TII.canCauseFpMLxStall(MI->getOpcode()) ||
          hasRAWHazard(DefMI, MI, TRI)) {
        // Try to schedule another instruction for the next 4 cycles.  The
        // countdown is only armed when it is not already running.
        if (Stalls == 0)
          Stalls = 4;
        return Hazard;
      }
    }
  }

  return ScoreboardHazardRecognizer::getHazardType(SU);
}

/// Clear all ARM-specific tracking state (open IT block size, pending stall
/// countdown, last emitted instruction) and then reset the scoreboard base.
void ARMHazardRecognizer::Reset() {
  ITBlockSize = 0;
  Stalls = 0;
  LastMI = 0;
  ScoreboardHazardRecognizer::Reset();
}

/// Record that \p SU has been issued and update the ARM-specific state.
///
/// If an IT block is currently open, one slot of it is consumed.  Otherwise,
/// if the instruction is a t2IT, the block length is derived from the
/// trailing-zero count of its mask operand (operand 1) and the instructions
/// that follow it are stored in ITBlockMIs in reverse order, so that the next
/// instruction to issue always sits at index ITBlockSize-1.
/// Any non-debug instruction becomes LastMI and cancels a pending stall
/// countdown.  Finally the scoreboard base class is notified.
void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
  MachineInstr *MI = SU->getInstr();

  if (ITBlockSize) {
    // One more instruction of the open IT block has been issued.
    --ITBlockSize;
  } else if (MI->getOpcode() == ARM::t2IT) {
    unsigned Mask = MI->getOperand(1).getImm();
    unsigned NumTZ = CountTrailingZeros_32(Mask);
    assert(NumTZ <= 3 && "Invalid IT mask!");
    ITBlockSize = 4 - NumTZ;

    // Walk forward from the IT instruction, filling the table from the
    // highest slot down to slot 0.
    MachineBasicBlock::iterator Cursor = MI;
    for (unsigned Slot = ITBlockSize; Slot != 0; --Slot) {
      // Advance to the next instruction, skipping any dbg_value instructions.
      ++Cursor;
      while (Cursor->isDebugValue())
        ++Cursor;
      ITBlockMIs[Slot - 1] = &*Cursor;
    }
  }

  // Debug values neither become the "last" instruction nor cancel a stall.
  if (!MI->isDebugValue()) {
    Stalls = 0;
    LastMI = MI;
  }

  ScoreboardHazardRecognizer::EmitInstruction(SU);
}

/// Move forward one cycle.  A running stall countdown is decremented; when it
/// reaches zero LastMI is dropped, and the scoreboard base class is advanced
/// in every case.
void ARMHazardRecognizer::AdvanceCycle() {
  if (Stalls != 0) {
    --Stalls;
    if (Stalls == 0) {
      // Stalled for 4 cycles but still can't schedule any other instructions.
      LastMI = 0;
    }
  }
  ScoreboardHazardRecognizer::AdvanceCycle();
}

/// Reverse hazard checking is not supported by this recognizer; any call
/// lands on llvm_unreachable.
void ARMHazardRecognizer::RecedeCycle() {
  llvm_unreachable("reverse ARM hazard checking unsupported");
}
Making use of VFP / NEON floating point multiply-accumulate / subtraction is difficult on current ARM implementations for a few reasons. 1. Even though a single vmla has latency that is one cycle shorter than a pair of vmul + vadd, a RAW hazard during the first (4? on Cortex-a8) can cause additional pipeline stall. So it's frequently better to single codegen vmul + vadd. 2. A vmla folowed by a vmul, vmadd, or vsub causes the second fp instruction to stall for 4 cycles. We need to schedule them apart. 3. A vmla followed vmla is a special case. Obvious issuing back to back RAW vmla + vmla is very bad. But this isn't ideal either: vmul vadd vmla Instead, we want to expand the second vmla: vmla vmul vadd Even with the 4 cycle vmul stall, the second sequence is still 2 cycles faster. Up to now, isel simply avoid codegen'ing fp vmla / vmls. This works well enough but it isn't the optimial solution. This patch attempts to make it possible to use vmla / vmls in cases where it is profitable. A. Add missing isel predicates which cause vmla to be codegen'ed. B. Make sure the fmul in (fadd (fmul)) has a single use. We don't want to compute a fmul and a fmla. C. Add additional isel checks for vmla, avoid cases where vmla is feeding into fp instructions (except for the #3 exceptional case). D. Add ARM hazard recognizer to model the vmla / vmls hazards. E. Add a special pre-regalloc case to expand vmla / vmls when it's likely the vmla / vmls will trigger one of the special hazards. Work in progress, only A+B are enabled. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120960 91177308-0d34-0410-b5e6-96231b3b80d8 2010-12-05 22:04:16 +00:00			`//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//`
			`//`
			`// The LLVM Compiler Infrastructure`
			`//`
			`// This file is distributed under the University of Illinois Open Source`
			`// License. See LICENSE.TXT for details.`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "ARMHazardRecognizer.h"`
			`#include "ARMBaseInstrInfo.h"`
			`#include "ARMSubtarget.h"`
			`#include "llvm/CodeGen/MachineInstr.h"`
			`#include "llvm/CodeGen/ScheduleDAG.h"`
			`#include "llvm/Target/TargetRegisterInfo.h"`
			`using namespace llvm;`

			`static bool hasRAWHazard(MachineInstr DefMI, MachineInstr MI,`
			`const TargetRegisterInfo &TRI) {`
			`// FIXME: Detect integer instructions properly.`
			`const TargetInstrDesc &TID = MI->getDesc();`
			`unsigned Domain = TID.TSFlags & ARMII::DomainMask;`
			`if (Domain == ARMII::DomainVFP) {`
			`unsigned Opcode = MI->getOpcode();`
			`if (Opcode == ARM::VSTRS \|\| Opcode == ARM::VSTRD \|\|`
			`Opcode == ARM::VMOVRS \|\| Opcode == ARM::VMOVRRD)`
			`return false;`
			`} else if (Domain == ARMII::DomainNEON) {`
			`if (MI->getDesc().mayStore() \|\| MI->getDesc().mayLoad())`
			`return false;`
			`} else`
			`return false;`
			`return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);`
			`}`

			`ScheduleHazardRecognizer::HazardType`
			`ARMHazardRecognizer::getHazardType(SUnit *SU) {`
			`MachineInstr *MI = SU->getInstr();`

			`if (!MI->isDebugValue()) {`
			`if (ITBlockSize && MI != ITBlockMIs[ITBlockSize-1])`
			`return Hazard;`

			`// Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following`
			`// a VMLA / VMLS will cause 4 cycle stall.`
			`const TargetInstrDesc &TID = MI->getDesc();`
			`if (LastMI && (TID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) {`
			`MachineInstr *DefMI = LastMI;`
			`const TargetInstrDesc &LastTID = LastMI->getDesc();`
			`// Skip over one non-VFP / NEON instruction.`
			`if (!LastTID.isBarrier() &&`
			`(LastTID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {`
			`MachineBasicBlock::iterator I = LastMI;`
			`if (I != LastMI->getParent()->begin()) {`
			`I = llvm::prior(I);`
			`DefMI = &*I;`
			`}`
			`}`

			`if (TII.isFpMLxInstruction(DefMI->getOpcode()) &&`
			`(TII.canCauseFpMLxStall(MI->getOpcode()) \|\|`
			`hasRAWHazard(DefMI, MI, TRI))) {`
			`// Try to schedule another instruction for the next 4 cycles.`
			`if (Stalls == 0)`
			`Stalls = 4;`
			`return Hazard;`
			`}`
			`}`
			`}`

Generalize PostRAHazardRecognizer so it can be used in any pass for both forward and backward scheduling. Rename it to ScoreboardHazardRecognizer (Scoreboard is one word). Remove integer division from the scoreboard's critical path. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@121274 91177308-0d34-0410-b5e6-96231b3b80d8 2010-12-08 20:04:29 +00:00			`return ScoreboardHazardRecognizer::getHazardType(SU);`
Making use of VFP / NEON floating point multiply-accumulate / subtraction is difficult on current ARM implementations for a few reasons. 1. Even though a single vmla has latency that is one cycle shorter than a pair of vmul + vadd, a RAW hazard during the first (4? on Cortex-a8) can cause additional pipeline stall. So it's frequently better to single codegen vmul + vadd. 2. A vmla folowed by a vmul, vmadd, or vsub causes the second fp instruction to stall for 4 cycles. We need to schedule them apart. 3. A vmla followed vmla is a special case. Obvious issuing back to back RAW vmla + vmla is very bad. But this isn't ideal either: vmul vadd vmla Instead, we want to expand the second vmla: vmla vmul vadd Even with the 4 cycle vmul stall, the second sequence is still 2 cycles faster. Up to now, isel simply avoid codegen'ing fp vmla / vmls. This works well enough but it isn't the optimial solution. This patch attempts to make it possible to use vmla / vmls in cases where it is profitable. A. Add missing isel predicates which cause vmla to be codegen'ed. B. Make sure the fmul in (fadd (fmul)) has a single use. We don't want to compute a fmul and a fmla. C. Add additional isel checks for vmla, avoid cases where vmla is feeding into fp instructions (except for the #3 exceptional case). D. Add ARM hazard recognizer to model the vmla / vmls hazards. E. Add a special pre-regalloc case to expand vmla / vmls when it's likely the vmla / vmls will trigger one of the special hazards. Work in progress, only A+B are enabled. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120960 91177308-0d34-0410-b5e6-96231b3b80d8 2010-12-05 22:04:16 +00:00			`}`

			`void ARMHazardRecognizer::Reset() {`
			`LastMI = 0;`
			`Stalls = 0;`
			`ITBlockSize = 0;`
Generalize PostRAHazardRecognizer so it can be used in any pass for both forward and backward scheduling. Rename it to ScoreboardHazardRecognizer (Scoreboard is one word). Remove integer division from the scoreboard's critical path. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@121274 91177308-0d34-0410-b5e6-96231b3b80d8 2010-12-08 20:04:29 +00:00			`ScoreboardHazardRecognizer::Reset();`
Making use of VFP / NEON floating point multiply-accumulate / subtraction is difficult on current ARM implementations for a few reasons. 1. Even though a single vmla has latency that is one cycle shorter than a pair of vmul + vadd, a RAW hazard during the first (4? on Cortex-a8) can cause additional pipeline stall. So it's frequently better to single codegen vmul + vadd. 2. A vmla folowed by a vmul, vmadd, or vsub causes the second fp instruction to stall for 4 cycles. We need to schedule them apart. 3. A vmla followed vmla is a special case. Obvious issuing back to back RAW vmla + vmla is very bad. But this isn't ideal either: vmul vadd vmla Instead, we want to expand the second vmla: vmla vmul vadd Even with the 4 cycle vmul stall, the second sequence is still 2 cycles faster. Up to now, isel simply avoid codegen'ing fp vmla / vmls. This works well enough but it isn't the optimial solution. This patch attempts to make it possible to use vmla / vmls in cases where it is profitable. A. Add missing isel predicates which cause vmla to be codegen'ed. B. Make sure the fmul in (fadd (fmul)) has a single use. We don't want to compute a fmul and a fmla. C. Add additional isel checks for vmla, avoid cases where vmla is feeding into fp instructions (except for the #3 exceptional case). D. Add ARM hazard recognizer to model the vmla / vmls hazards. E. Add a special pre-regalloc case to expand vmla / vmls when it's likely the vmla / vmls will trigger one of the special hazards. Work in progress, only A+B are enabled. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120960 91177308-0d34-0410-b5e6-96231b3b80d8 2010-12-05 22:04:16 +00:00			`}`

			`void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {`
			`MachineInstr *MI = SU->getInstr();`
			`unsigned Opcode = MI->getOpcode();`
			`if (ITBlockSize) {`
			`--ITBlockSize;`
			`} else if (Opcode == ARM::t2IT) {`
			`unsigned Mask = MI->getOperand(1).getImm();`
			`unsigned NumTZ = CountTrailingZeros_32(Mask);`
			`assert(NumTZ <= 3 && "Invalid IT mask!");`
			`ITBlockSize = 4 - NumTZ;`
			`MachineBasicBlock::iterator I = MI;`
			`for (unsigned i = 0; i < ITBlockSize; ++i) {`
			`// Advance to the next instruction, skipping any dbg_value instructions.`
			`do {`
			`++I;`
			`} while (I->isDebugValue());`
			`ITBlockMIs[ITBlockSize-1-i] = &*I;`
			`}`
			`}`

			`if (!MI->isDebugValue()) {`
			`LastMI = MI;`
			`Stalls = 0;`
			`}`

Generalize PostRAHazardRecognizer so it can be used in any pass for both forward and backward scheduling. Rename it to ScoreboardHazardRecognizer (Scoreboard is one word). Remove integer division from the scoreboard's critical path. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@121274 91177308-0d34-0410-b5e6-96231b3b80d8 2010-12-08 20:04:29 +00:00			`ScoreboardHazardRecognizer::EmitInstruction(SU);`
Making use of VFP / NEON floating point multiply-accumulate / subtraction is difficult on current ARM implementations for a few reasons. 1. Even though a single vmla has latency that is one cycle shorter than a pair of vmul + vadd, a RAW hazard during the first (4? on Cortex-a8) can cause additional pipeline stall. So it's frequently better to single codegen vmul + vadd. 2. A vmla folowed by a vmul, vmadd, or vsub causes the second fp instruction to stall for 4 cycles. We need to schedule them apart. 3. A vmla followed vmla is a special case. Obvious issuing back to back RAW vmla + vmla is very bad. But this isn't ideal either: vmul vadd vmla Instead, we want to expand the second vmla: vmla vmul vadd Even with the 4 cycle vmul stall, the second sequence is still 2 cycles faster. Up to now, isel simply avoid codegen'ing fp vmla / vmls. This works well enough but it isn't the optimial solution. This patch attempts to make it possible to use vmla / vmls in cases where it is profitable. A. Add missing isel predicates which cause vmla to be codegen'ed. B. Make sure the fmul in (fadd (fmul)) has a single use. We don't want to compute a fmul and a fmla. C. Add additional isel checks for vmla, avoid cases where vmla is feeding into fp instructions (except for the #3 exceptional case). D. Add ARM hazard recognizer to model the vmla / vmls hazards. E. Add a special pre-regalloc case to expand vmla / vmls when it's likely the vmla / vmls will trigger one of the special hazards. Work in progress, only A+B are enabled. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120960 91177308-0d34-0410-b5e6-96231b3b80d8 2010-12-05 22:04:16 +00:00			`}`

			`void ARMHazardRecognizer::AdvanceCycle() {`
			`if (Stalls && --Stalls == 0)`
			`// Stalled for 4 cycles but still can't schedule any other instructions.`
			`LastMI = 0;`
Generalize PostRAHazardRecognizer so it can be used in any pass for both forward and backward scheduling. Rename it to ScoreboardHazardRecognizer (Scoreboard is one word). Remove integer division from the scoreboard's critical path. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@121274 91177308-0d34-0410-b5e6-96231b3b80d8 2010-12-08 20:04:29 +00:00			`ScoreboardHazardRecognizer::AdvanceCycle();`
			`}`

			`void ARMHazardRecognizer::RecedeCycle() {`
			`llvm_unreachable("reverse ARM hazard checking unsupported");`
Making use of VFP / NEON floating point multiply-accumulate / subtraction is difficult on current ARM implementations for a few reasons. 1. Even though a single vmla has latency that is one cycle shorter than a pair of vmul + vadd, a RAW hazard during the first (4? on Cortex-a8) can cause additional pipeline stall. So it's frequently better to single codegen vmul + vadd. 2. A vmla folowed by a vmul, vmadd, or vsub causes the second fp instruction to stall for 4 cycles. We need to schedule them apart. 3. A vmla followed vmla is a special case. Obvious issuing back to back RAW vmla + vmla is very bad. But this isn't ideal either: vmul vadd vmla Instead, we want to expand the second vmla: vmla vmul vadd Even with the 4 cycle vmul stall, the second sequence is still 2 cycles faster. Up to now, isel simply avoid codegen'ing fp vmla / vmls. This works well enough but it isn't the optimial solution. This patch attempts to make it possible to use vmla / vmls in cases where it is profitable. A. Add missing isel predicates which cause vmla to be codegen'ed. B. Make sure the fmul in (fadd (fmul)) has a single use. We don't want to compute a fmul and a fmla. C. Add additional isel checks for vmla, avoid cases where vmla is feeding into fp instructions (except for the #3 exceptional case). D. Add ARM hazard recognizer to model the vmla / vmls hazards. E. Add a special pre-regalloc case to expand vmla / vmls when it's likely the vmla / vmls will trigger one of the special hazards. Work in progress, only A+B are enabled. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120960 91177308-0d34-0410-b5e6-96231b3b80d8 2010-12-05 22:04:16 +00:00			`}`