diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index 18430d47a46..a8bb52d90dc 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -254,7 +254,7 @@ def : ProcessorModel<"pwr6x", G5Model, FeatureSTFIWX, FeatureLFIWAX, FeatureFPRND, Feature64Bit, DeprecatedMFTB, DeprecatedDST]>; -def : ProcessorModel<"pwr7", G5Model, +def : ProcessorModel<"pwr7", P7Model, [DirectivePwr7, FeatureAltivec, FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td index ce048f9d224..1221d414999 100644 --- a/lib/Target/PowerPC/PPCSchedule.td +++ b/lib/Target/PowerPC/PPCSchedule.td @@ -117,6 +117,7 @@ include "PPCSchedule440.td" include "PPCScheduleG4.td" include "PPCScheduleG4Plus.td" include "PPCScheduleG5.td" +include "PPCScheduleP7.td" include "PPCScheduleA2.td" include "PPCScheduleE500mc.td" include "PPCScheduleE5500.td" diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td new file mode 100644 index 00000000000..95b5a8b2c65 --- /dev/null +++ b/lib/Target/PowerPC/PPCScheduleP7.td @@ -0,0 +1,386 @@ +//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the POWER7 processor. +// +//===----------------------------------------------------------------------===// + +// Primary reference: +// IBM POWER7 multicore server processor +// B. Sinharoy, et al. +// IBM J. Res. & Dev. (55) 3. May/June 2011. + +// Scheduling for the P7 involves tracking two types of resources: +// 1. The dispatch bundle slots +// 2. 
The functional unit resources + +// Dispatch units: +def P7_DU1 : FuncUnit; +def P7_DU2 : FuncUnit; +def P7_DU3 : FuncUnit; +def P7_DU4 : FuncUnit; +def P7_DU5 : FuncUnit; +def P7_DU6 : FuncUnit; + +def P7_LS1 : FuncUnit; // Load/Store pipeline 1 +def P7_LS2 : FuncUnit; // Load/Store pipeline 2 + +def P7_FX1 : FuncUnit; // FX pipeline 1 +def P7_FX2 : FuncUnit; // FX pipeline 2 + +// VS pipeline 1 (vector integer ops. always here) +def P7_VS1 : FuncUnit; // VS pipeline 1 +// VS pipeline 2 (128-bit stores and perms. here) +def P7_VS2 : FuncUnit; // VS pipeline 2 + +def P7_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs) +def P7_BRU : FuncUnit; // BR unit + +// Notes: +// Each LSU pipeline can also execute FX add and logical instructions. +// Each LSU pipeline can complete a load or store in one cycle. +// +// Each store is broken into two parts, AGEN goes to the LSU while a +// "data steering" op. goes to the FXU or VSU. +// +// FX loads have a two cycle load-to-use latency (so one "bubble" cycle). +// VSU loads have a three cycle load-to-use latency (so two "bubble" cycles). +// +// Frequent FX ops. take only one cycle and results can be used again in the +// next cycle (there is a self-bypass). Getting results from the other FX +// pipeline takes an additional cycle. +// +// The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles +// (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops. +// Dispatch of an instruction to VS1 that uses four single prec. inputs +// (either to a float or XC op.) prevents dispatch in that cycle to VS2 of any +// floating point instruction. +// +// The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles +// (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline +// (unlike on the POWER6). +// +// FMA from the VSUs can forward results in 6 cycles.
VS1 XS and vector FP +// share the same write-back, and have a 5-cycle latency difference, so the +// IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP +// op. has been dispatched to VS1. +// +// Three cycles after an L1 cache hit, a dependent VSU instruction can issue. +// +// Instruction dispatch groups have (at most) four non-branch instructions, and +// two branches. Unlike on the POWER4/5, a branch does not automatically +// end the dispatch group, but a second branch must be the last in the group. + +def P7Itineraries : ProcessorItineraries< + [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6, + P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [ + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2, + P7_LS1, P7_LS2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<36, [P7_FX1, P7_FX2]>], + [36, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<68, [P7_FX1, P7_FX2]>], + [68, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1]>, + InstrItinData, + InstrStage<1, [P7_BRU]>], + [3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_BRU]>], + [3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_BRU]>], + [3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_BRU]>], + [3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_LS1, P7_LS2]>], + [2, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + 
InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 2, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_LS1, P7_LS2]>], + [2, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 2, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 4, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 4, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, 
[P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_LS1, P7_LS2]>], + [2, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [2, 1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_VS2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_CRU]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 1]>, // mtcr + InstrItinData, + InstrStage<1, [P7_CRU]>], + [6, 1]>, + InstrItinData, + InstrStage<1, [P7_CRU]>], + [3, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1, P7_VS2]>], + [8, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1, P7_VS2]>], + [33, 1, 1]>, + InstrItinData, + 
InstrStage<1, [P7_VS1, P7_VS2]>], + [27, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1, P7_VS2]>], + [44, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1, P7_VS2]>], + [32, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1]>], + [2, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1]>], + [2, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1]>], + [2, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1, P7_VS2]>], + [6, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1, P7_VS2]>], + [6, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1, P7_VS2]>], + [6, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS1]>], + [7, 1, 1]>, + InstrItinData, + InstrStage<1, [P7_VS2]>], + [2, 1, 1]> +]>; + +// ===---------------------------------------------------------------------===// +// P7 machine model for scheduling and other instruction cost heuristics. + +def P7Model : SchedMachineModel { + let IssueWidth = 6; // 4 (non-branch) instructions are dispatched per cycle. + // Note that the dispatch bundle size is 6 (including + // branches), but the total internal issue bandwidth per + // cycle (from all queues) is 8. + + let MinLatency = 0; // Out-of-order dispatch. + let LoadLatency = 3; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 16; + + let Itineraries = P7Itineraries; +} + diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index 7231ab101a2..4de558377b6 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -179,7 +179,7 @@ bool PPCSubtarget::enablePostRAScheduler( return OptLevel >= CodeGenOpt::Default; } -// Embedded cores need aggressive scheduling. +// Embedded cores need aggressive scheduling (and some others also benefit). 
static bool needsAggressiveScheduling(unsigned Directive) { switch (Directive) { default: return false; @@ -187,6 +187,7 @@ static bool needsAggressiveScheduling(unsigned Directive) { case PPC::DIR_A2: case PPC::DIR_E500mc: case PPC::DIR_E5500: + case PPC::DIR_PWR7: return true; } }