R600: Use a refined heuristic to choose when to switch clauses

This is using a hint from AMD APP OpenCL Programming Guide with
empirically tweaked parameters.
I used the Unigine Heaven 3.0 benchmark to determine the best parameters on my
system (i7 2600/Radeon 6950/Kernel 3.9.4):
it went from 38.8 average fps to 39.6, which is a ~3% gain.
(Lightmark 2008.2 gain is much more marginal: from 537 to 539)

There is no lit test provided, as the parameters were determined
empirically and it would be nearly impossible to find a test
program that checks for optimal behavior.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183593 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Vincent Lejeune 2013-06-07 23:30:34 +00:00
parent b01bdf87ff
commit 843c6c2d0e
2 changed files with 47 additions and 10 deletions

View File

@ -38,7 +38,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
AluInstCount = 0;
FetchInstCount = 0;
}
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
@ -48,6 +49,12 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
QSrc.clear();
}
/// Returns the wavefront count permitted by a given GPR requirement.
///
/// The result is the integer quotient of a fixed register budget (248) by
/// \p GPRCount, so any remainder is discarded.
/// NOTE(review): 248 is presumably the number of GPRs that can be shared
/// between wavefronts on this hardware -- confirm against the ISA docs.
///
/// \param GPRCount number of GPRs needed; must be non-zero (checked by an
///        assert, which is only active in builds with assertions enabled).
static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  const unsigned RegisterBudget = 248;
  const unsigned WavefrontLimit = RegisterBudget / GPRCount;
  return WavefrontLimit;
}
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
SUnit *SU = 0;
NextInstKind = IDOther;
@ -60,6 +67,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
(!Available[IDFetch].empty() || !Available[IDOther].empty());
if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
// We use the heuristic provided by AMD Accelerated Parallel Processing
// OpenCL Programming Guide :
// The approx. number of WF that allows TEX inst to hide ALU inst is :
// 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
float ALUFetchRationEstimate =
(AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
(FetchInstCount + Available[IDFetch].size());
unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
// We assume the local GPR requirements to be "dominated" by the requirement
// of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
// after TEX are indeed likely to consume or generate values from/for the
// TEX clause.
// Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
// We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
// one GPR) or TmXYZW = TnXYZW (need 2 GPR).
// (TODO : use RegisterPressure)
// If we are going to use too many GPRs, we flush Fetch instructions to lower
// register pressure on 128 bits regs.
unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
AllowSwitchFromAlu = true;
}
// We want to schedule AR defs as soon as possible to make sure they aren't
// put in a different ALU clause from their uses.
if (!SU && !UnscheduledARDefs.empty()) {
@ -133,6 +166,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
}
if (CurInstKind == IDAlu) {
AluInstCount ++;
switch (getAluKind(SU)) {
case AluT_XYZW:
CurEmitted += 4;
@ -158,7 +192,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (CurInstKind != IDFetch) {
MoveUnits(Pending[IDFetch], Available[IDFetch]);
}
} else
FetchInstCount++;
}
static bool
@ -370,16 +405,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
return UnslotedSU;
}
bool R600SchedStrategy::isAvailablesAluEmpty() const {
return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
AvailableAlus[AluPredX].empty();
unsigned R600SchedStrategy::AvailablesAluCount() const {
return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
}
SUnit* R600SchedStrategy::pickAlu() {
while (!isAvailablesAluEmpty()) {
while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
if (!OccupedSlotsMask) {
// Bottom up scheduling : predX must come first
if (!AvailableAlus[AluPredX].empty()) {

View File

@ -60,6 +60,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
int CurEmitted;
InstKind NextInstKind;
unsigned AluInstCount;
unsigned FetchInstCount;
int InstKindLimit[IDLast];
int OccupedSlotsMask;
@ -85,7 +88,7 @@ private:
bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
AluKind getAluKind(SUnit *SU) const;
void LoadAlu();
bool isAvailablesAluEmpty() const;
unsigned AvailablesAluCount() const;
SUnit *AttemptFillSlot (unsigned Slot);
void PrepareNextSlot();
SUnit *PopInst(std::vector<SUnit*> &Q);