mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-11-01 15:11:24 +00:00
R600: Use a refined heuristic to choose when switching clause
This uses a hint from the AMD APP OpenCL Programming Guide with empirically tweaked parameters. I used the Unigine Heaven 3.0 benchmark to determine the best parameters on my system (i7 2600/Radeon 6950/Kernel 3.9.4): it went from 38.8 average fps to 39.6, which is a ~3% gain. (The Lightmark 2008.2 gain is much more marginal: from 537 to 539.) There is no lit test provided, as the parameters were determined empirically and it would be nearly impossible to find a test program that checks for optimal behavior. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183593 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
b01bdf87ff
commit
843c6c2d0e
@ -38,7 +38,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
|
||||
|
||||
const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
|
||||
InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
|
||||
|
||||
AluInstCount = 0;
|
||||
FetchInstCount = 0;
|
||||
}
|
||||
|
||||
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
|
||||
@ -48,6 +49,12 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
|
||||
QSrc.clear();
|
||||
}
|
||||
|
||||
/// Returns the number of wavefronts allowed when each one needs \p GPRCount
/// general purpose registers, computed as 248 / GPRCount (integer division).
///
/// NOTE(review): 248 is presumably the GPR budget shared between wavefronts
/// on the targeted hardware -- confirm against the hardware documentation.
///
/// \param GPRCount registers required per wavefront; callers must pass a
///        non-zero value (checked by the assert in asserts-enabled builds).
/// \returns 248 / GPRCount, or ~0U ("not limited by GPRs") if GPRCount is 0
///          in a build where the assert is compiled out.
static
unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert (GPRCount && "GPRCount cannot be 0");
  // The assert vanishes under NDEBUG; without this guard a zero count would
  // be an integer division by zero (undefined behavior). Needing no register
  // means the register file imposes no limit at all.
  if (GPRCount == 0)
    return ~0U;
  return 248 / GPRCount;
}
|
||||
|
||||
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
|
||||
SUnit *SU = 0;
|
||||
NextInstKind = IDOther;
|
||||
@ -60,6 +67,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
|
||||
bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
|
||||
(!Available[IDFetch].empty() || !Available[IDOther].empty());
|
||||
|
||||
if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
|
||||
// We use the heuristic provided by AMD Accelerated Parallel Processing
|
||||
// OpenCL Programming Guide :
|
||||
// The approx. number of WF that allows TEX inst to hide ALU inst is :
|
||||
// 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
|
||||
float ALUFetchRationEstimate =
|
||||
(AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
|
||||
(FetchInstCount + Available[IDFetch].size());
|
||||
unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
|
||||
DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
|
||||
// We assume the local GPR requirements to be "dominated" by the requirement
|
||||
// of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
|
||||
// after TEX are indeed likely to consume or generate values from/for the
|
||||
// TEX clause.
|
||||
// Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
|
||||
// We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
|
||||
// one GPR) or TmXYZW = TnXYZW (need 2 GPR).
|
||||
// (TODO : use RegisterPressure)
|
||||
// If we are going to use too many GPRs, we flush Fetch instructions to lower
|
||||
// register pressure on 128 bits regs.
|
||||
unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
|
||||
if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
|
||||
AllowSwitchFromAlu = true;
|
||||
}
|
||||
|
||||
|
||||
// We want to schedule AR defs as soon as possible to make sure they aren't
|
||||
// put in a different ALU clause from their uses.
|
||||
if (!SU && !UnscheduledARDefs.empty()) {
|
||||
@ -133,6 +166,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
|
||||
}
|
||||
|
||||
if (CurInstKind == IDAlu) {
|
||||
AluInstCount ++;
|
||||
switch (getAluKind(SU)) {
|
||||
case AluT_XYZW:
|
||||
CurEmitted += 4;
|
||||
@ -158,7 +192,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
|
||||
|
||||
if (CurInstKind != IDFetch) {
|
||||
MoveUnits(Pending[IDFetch], Available[IDFetch]);
|
||||
}
|
||||
} else
|
||||
FetchInstCount++;
|
||||
}
|
||||
|
||||
static bool
|
||||
@ -370,16 +405,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
|
||||
return UnslotedSU;
|
||||
}
|
||||
|
||||
bool R600SchedStrategy::isAvailablesAluEmpty() const {
|
||||
return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
|
||||
AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
|
||||
AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
|
||||
AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
|
||||
AvailableAlus[AluPredX].empty();
|
||||
unsigned R600SchedStrategy::AvailablesAluCount() const {
|
||||
return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
|
||||
AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
|
||||
AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
|
||||
AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
|
||||
}
|
||||
|
||||
SUnit* R600SchedStrategy::pickAlu() {
|
||||
while (!isAvailablesAluEmpty()) {
|
||||
while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
|
||||
if (!OccupedSlotsMask) {
|
||||
// Bottom up scheduling : predX must comes first
|
||||
if (!AvailableAlus[AluPredX].empty()) {
|
||||
|
@ -60,6 +60,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
|
||||
int CurEmitted;
|
||||
InstKind NextInstKind;
|
||||
|
||||
unsigned AluInstCount;
|
||||
unsigned FetchInstCount;
|
||||
|
||||
int InstKindLimit[IDLast];
|
||||
|
||||
int OccupedSlotsMask;
|
||||
@ -85,7 +88,7 @@ private:
|
||||
bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
|
||||
AluKind getAluKind(SUnit *SU) const;
|
||||
void LoadAlu();
|
||||
bool isAvailablesAluEmpty() const;
|
||||
unsigned AvailablesAluCount() const;
|
||||
SUnit *AttemptFillSlot (unsigned Slot);
|
||||
void PrepareNextSlot();
|
||||
SUnit *PopInst(std::vector<SUnit*> &Q);
|
||||
|
Loading…
Reference in New Issue
Block a user