diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index 6615ee4aacd..936345c5046 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -24,6 +24,7 @@ class InstrItineraryData; class LiveVariables; class MCAsmInfo; class MachineMemOperand; +class MachineRegisterInfo; class MDNode; class MCInst; class SDNode; @@ -625,6 +626,19 @@ public: int getOperandLatency(const InstrItineraryData *ItinData, SDNode *DefNode, unsigned DefIdx, SDNode *UseNode, unsigned UseIdx) const; + + /// hasHighOperandLatency - Compute operand latency between a def of 'Reg' + /// and an use in the current loop, return true if the target considered + /// it 'high'. This is used by optimization passes such as machine LICM to + /// determine whether it makes sense to hoist an instruction out even in + /// high register pressure situation. + virtual + bool hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + return false; + } }; /// TargetInstrInfoImpl - This is the default implementation of diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 607e8f15bc0..829fae6edbd 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -43,11 +43,6 @@ using namespace llvm; -static cl::opt -TrackRegPressure("rp-aware-machine-licm", - cl::desc("Register pressure aware machine LICM"), - cl::init(false), cl::Hidden); - STATISTIC(NumHoisted, "Number of machine instructions hoisted out of loops"); STATISTIC(NumLowRP, @@ -124,6 +119,7 @@ namespace { RegSeen.clear(); RegPressure.clear(); RegLimit.clear(); + BackTrace.clear(); for (DenseMap >::iterator CI = CSEMap.begin(), CE = CSEMap.end(); CI != CE; ++CI) CI->second.clear(); @@ -171,9 +167,10 @@ namespace { /// bool IsLoopInvariantInst(MachineInstr &I); - /// ComputeOperandLatency - Compute operand latency between a def of 'Reg' - /// and an use in the current loop. - int ComputeOperandLatency(MachineInstr &MI, unsigned DefIdx, unsigned Reg); + /// HasHighOperandLatency - Compute operand latency between a def of 'Reg' + /// and an use in the current loop, return true if the target considered + /// it 'high'. + bool HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, unsigned Reg); /// IncreaseHighRegPressure - Visit BBs from preheader to current BB, check /// if hoisting an instruction of the given cost matrix can cause high @@ -556,28 +553,24 @@ void MachineLICM::HoistRegion(MachineDomTreeNode *N, bool IsHeader) { if (!Preheader) return; - if (TrackRegPressure) { - if (IsHeader) { - // Compute registers which are liveout of preheader. - RegSeen.clear(); - BackTrace.clear(); - InitRegPressure(Preheader); - } - - // Remember livein register pressure. - BackTrace.push_back(RegPressure); + if (IsHeader) { + // Compute registers which are liveout of preheader. + RegSeen.clear(); + BackTrace.clear(); + InitRegPressure(Preheader); } + // Remember livein register pressure. + BackTrace.push_back(RegPressure); + for (MachineBasicBlock::iterator MII = BB->begin(), E = BB->end(); MII != E; ) { MachineBasicBlock::iterator NextMII = MII; ++NextMII; MachineInstr *MI = &*MII; - if (TrackRegPressure) - UpdateRegPressureBefore(MI); + UpdateRegPressureBefore(MI); Hoist(MI, Preheader); - if (TrackRegPressure) - UpdateRegPressureAfter(MI); + UpdateRegPressureAfter(MI); MII = NextMII; } @@ -591,8 +584,7 @@ void MachineLICM::HoistRegion(MachineDomTreeNode *N, bool IsHeader) { HoistRegion(Children[I]); } - if (TrackRegPressure) - BackTrace.pop_back(); + BackTrace.pop_back(); } /// InitRegPressure - Find all virtual register references that are liveout of @@ -788,15 +780,14 @@ bool MachineLICM::isLoadFromConstantMemory(MachineInstr *MI) { } } -/// ComputeOperandLatency - Compute operand latency between a def of 'Reg' -/// and an use in the current loop. -int MachineLICM::ComputeOperandLatency(MachineInstr &MI, - unsigned DefIdx, unsigned Reg) { +/// HasHighOperandLatency - Compute operand latency between a def of 'Reg' +/// and an use in the current loop, return true if the target considered +/// it 'high'. +bool MachineLICM::HasHighOperandLatency(MachineInstr &MI, + unsigned DefIdx, unsigned Reg) { if (MRI->use_nodbg_empty(Reg)) - // No use? Return arbitrary large number! - return 300; + return false; - int Latency = -1; for (MachineRegisterInfo::use_nodbg_iterator I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E; ++I) { MachineInstr *UseMI = &*I; @@ -810,18 +801,15 @@ int MachineLICM::ComputeOperandLatency(MachineInstr &MI, if (MOReg != Reg) continue; - int UseCycle = TII->getOperandLatency(InstrItins, &MI, DefIdx, UseMI, i); - Latency = std::max(Latency, UseCycle); + if (TII->hasHighOperandLatency(InstrItins, MRI, &MI, DefIdx, UseMI, i)) + return true; } - if (Latency != -1) - break; + // Only look at the first in loop use. + break; } - if (Latency == -1) - Latency = InstrItins->getOperandCycle(MI.getDesc().getSchedClass(), DefIdx); - - return Latency; + return false; } /// IncreaseHighRegPressure - Visit BBs from preheader to current BB, check @@ -855,19 +843,19 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { if (MI.isImplicitDef()) return true; - // FIXME: For now, only hoist re-materilizable instructions. LICM will - // increase register pressure. We want to make sure it doesn't increase - // spilling. + // If the instruction is cheap, only hoist if it is re-materilizable. LICM + // will increase register pressure. It's probably not worth it if the + // instruction is cheap. // Also hoist loads from constant memory, e.g. load from stubs, GOT. Hoisting // these tend to help performance in low register pressure situation. The // trade off is it may cause spill in high pressure situation. It will end up // adding a store in the loop preheader. But the reload is no more expensive. // The side benefit is these loads are frequently CSE'ed. - if (!TrackRegPressure || MI.getDesc().isAsCheapAsAMove()) { - if (!TII->isTriviallyReMaterializable(&MI, AA) && - !isLoadFromConstantMemory(&MI)) + if (MI.getDesc().isAsCheapAsAMove()) { + if (!TII->isTriviallyReMaterializable(&MI, AA)) return false; } else { + // Estimate register pressure to determine whether to LICM the instruction. // In low register pressure situation, we can be more aggressive about // hoisting. Also, favors hoisting long latency instructions even in // moderately high pressure situation. @@ -880,13 +868,9 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue; if (MO.isDef()) { - if (InstrItins && !InstrItins->isEmpty()) { - int Cycle = ComputeOperandLatency(MI, i, Reg); - if (Cycle > 3) { - // FIXME: Target specific high latency limit? - ++NumHighLatency; - return true; - } + if (HasHighOperandLatency(MI, i, Reg)) { + ++NumHighLatency; + return true; } const TargetRegisterClass *RC = MRI->getRegClass(Reg); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index aca292abea5..0b5b2437abc 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1925,3 +1925,23 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return getOperandLatency(ItinData, DefTID, DefIdx, DefAlign, UseTID, UseIdx, UseAlign); } + +bool ARMBaseInstrInfo:: +hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask; + unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask; + if (Subtarget.isCortexA8() && + (DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP)) + // CortexA8 VFP instructions are not pipelined. + return true; + + // Hoist VFP / NEON instructions with 4 or higher latency. + int Latency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); + if (Latency <= 3) + return false; + return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON || + UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON; +} diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 36be3366d29..b3a832948f0 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -377,6 +377,11 @@ private: unsigned DefIdx, unsigned DefAlign, const TargetInstrDesc &UseTID, unsigned UseIdx, unsigned UseAlign) const; + + bool hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const; }; static inline diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 40ef3dbd0d7..79d9872a1b2 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -3152,6 +3152,41 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } +bool X86InstrInfo:: +hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + switch (DefMI->getOpcode()) { + default: return false; + case X86::DIVSDrm: + case X86::DIVSDrm_Int: + case X86::DIVSDrr: + case X86::DIVSDrr_Int: + case X86::DIVSSrm: + case X86::DIVSSrm_Int: + case X86::DIVSSrr: + case X86::DIVSSrr_Int: + case X86::SQRTPDm: + case X86::SQRTPDm_Int: + case X86::SQRTPDr: + case X86::SQRTPDr_Int: + case X86::SQRTPSm: + case X86::SQRTPSm_Int: + case X86::SQRTPSr: + case X86::SQRTPSr_Int: + case X86::SQRTSDm: + case X86::SQRTSDm_Int: + case X86::SQRTSDr: + case X86::SQRTSDr_Int: + case X86::SQRTSSm: + case X86::SQRTSSm_Int: + case X86::SQRTSSr: + case X86::SQRTSSr_Int: + return true; + } +} + namespace { /// CGBR - Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index e43cfacae50..5060ad836af 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -864,6 +864,11 @@ public: unsigned OpNum, const SmallVectorImpl &MOs, unsigned Size, unsigned Alignment) const; + + bool hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const; private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, diff --git a/test/CodeGen/ARM/remat.ll b/test/CodeGen/ARM/remat.ll deleted file mode 100644 index 6b86f1a9f36..00000000000 --- a/test/CodeGen/ARM/remat.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 -o /dev/null -stats -info-output-file - | grep "Number of re-materialization" - -define i32 @main(i32 %argc, i8** nocapture %argv, double %d1, double %d2) nounwind { -entry: - br i1 undef, label %smvp.exit, label %bb.i3 - -bb.i3: ; preds = %bb.i3, %bb134 - br i1 undef, label %smvp.exit, label %bb.i3 - -smvp.exit: ; preds = %bb.i3 - %0 = fmul double %d1, 2.400000e-03 ; [#uses=2] - br i1 undef, label %bb138.preheader, label %bb159 - -bb138.preheader: ; preds = %smvp.exit - br label %bb138 - -bb138: ; preds = %bb138, %bb138.preheader - br i1 undef, label %bb138, label %bb145.loopexit - -bb142: ; preds = %bb.nph218.bb.nph218.split_crit_edge, %phi0.exit - %1 = fmul double %d1, -1.200000e-03 ; [#uses=1] - %2 = fadd double %d2, %1 ; [#uses=1] - %3 = fmul double %2, %d2 ; [#uses=1] - %4 = fsub double 0.000000e+00, %3 ; [#uses=1] - br i1 %14, label %phi1.exit, label %bb.i35 - -bb.i35: ; preds = %bb142 - %5 = call double @sin(double %15) nounwind readonly ; [#uses=1] - %6 = fmul double %5, 0x4031740AFA84AD8A ; [#uses=1] - %7 = fsub double 1.000000e+00, undef ; [#uses=1] - %8 = fdiv double %7, 6.000000e-01 ; [#uses=1] - br label %phi1.exit - -phi1.exit: ; preds = %bb.i35, %bb142 - %.pn = phi double [ %6, %bb.i35 ], [ 0.000000e+00, %bb142 ] ; [#uses=1] - %9 = phi double [ %8, %bb.i35 ], [ 0.000000e+00, %bb142 ] ; [#uses=1] - %10 = fmul double %.pn, %9 ; [#uses=1] - br i1 %14, label %phi0.exit, label %bb.i - -bb.i: ; preds = %phi1.exit - unreachable - -phi0.exit: ; preds = %phi1.exit - %11 = fsub double %4, %10 ; [#uses=1] - %12 = fadd double 0.000000e+00, %11 ; [#uses=1] - store double %12, double* undef, align 4 - br label %bb142 - -bb145.loopexit: ; preds = %bb138 - br i1 undef, label %bb.nph218.bb.nph218.split_crit_edge, label %bb159 - -bb.nph218.bb.nph218.split_crit_edge: ; preds = %bb145.loopexit - %13 = fmul double %0, 0x401921FB54442D18 ; [#uses=1] - %14 = fcmp ugt double %0, 6.000000e-01 ; [#uses=2] - %15 = fdiv double %13, 6.000000e-01 ; [#uses=1] - br label %bb142 - -bb159: ; preds = %bb145.loopexit, %smvp.exit, %bb134 - unreachable - -bb166: ; preds = %bb127 - unreachable -} - -declare double @sin(double) nounwind readonly diff --git a/test/CodeGen/Thumb2/machine-licm-vdup.ll b/test/CodeGen/Thumb2/machine-licm-vdup.ll index f7494ec7d25..2f17e27ab4e 100644 --- a/test/CodeGen/Thumb2/machine-licm-vdup.ll +++ b/test/CodeGen/Thumb2/machine-licm-vdup.ll @@ -2,17 +2,16 @@ ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim -arm-vdup-splat | FileCheck %s ; Modified version of machine-licm.ll with -arm-vdup-splat turned on, 8003375. ; Eventually this should become the default and be moved into machine-licm.ll. -; FIXME: the vdup should be hoisted out of the loop, 8248029. define void @t2(i8* %ptr1, i8* %ptr2) nounwind { entry: ; CHECK: t2: ; CHECK: mov.w r3, #1065353216 +; CHECK: vdup.32 q{{.*}}, r3 br i1 undef, label %bb1, label %bb2 bb1: ; CHECK-NEXT: %bb1 -; CHECK: vdup.32 q{{.*}}, r3 %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ] %tmp1 = shl i32 %indvar, 2 %gep1 = getelementptr i8* %ptr1, i32 %tmp1 diff --git a/test/CodeGen/X86/2008-10-27-CoalescerBug.ll b/test/CodeGen/X86/2008-10-27-CoalescerBug.ll index afeb358da57..9d144a4be0e 100644 --- a/test/CodeGen/X86/2008-10-27-CoalescerBug.ll +++ b/test/CodeGen/X86/2008-10-27-CoalescerBug.ll @@ -1,6 +1,9 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -stats |& not grep {Number of register spills} +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -stats |& FileCheck %s +; Now this test spills one register. But a reload in the loop is cheaper than +; the divsd so it's a win. define fastcc void @fourn(double* %data, i32 %isign) nounwind { +; CHECK: fourn entry: br label %bb @@ -11,6 +14,11 @@ bb: ; preds = %bb, %entry %1 = icmp sgt i32 %0, 2 ; [#uses=1] br i1 %1, label %bb30.loopexit, label %bb +; CHECK: %bb30.loopexit +; CHECK: divsd %xmm0 +; CHECK: movsd %xmm0, 16(%esp) +; CHECK: .align +; CHECK-NEXT: %bb3 bb3: ; preds = %bb30.loopexit, %bb25, %bb3 %2 = load i32* null, align 4 ; [#uses=1] %3 = mul i32 %2, 0 ; [#uses=1] diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll index bb01e5afcef..4a97ac35afc 100644 --- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll +++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -stats |& grep {6 machine-licm} +; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -stats |& grep {7 machine-licm} ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 | FileCheck %s ; rdar://6627786 ; rdar://7792037