Mirror of https://github.com/c64scene-ar/llvm-6502.git, synced 2025-11-05 13:16:32 +00:00
R600/SI: Implement VGPR register spilling for compute at -O0 v3
VGPRs are spilled to LDS. This still needs more testing, but
we need to at least enable it at -O0, because the fast register
allocator spills all registers that are live at the end of blocks
and without this some future commits will break the
flat-address-space.ll test.
v2: Only calculate thread id once
v3: Move insertion of spill instructions to
SIRegisterInfo::eliminateFrameIndex()
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218348 91177308-0d34-0410-b5e6-96231b3b80d8
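Every lane of the workgroup gets its own dword slot in LDS, so consecutive spill slots are strided by the workgroup size. Below is a rough C++ sketch of the address this patch computes with machine instructions in calculateLDSSpillAddress(); the helper name and parameters are illustrative, not part of the commit:

    // Byte address in LDS of one spilled dword.
    //   LDSSize       - bytes of LDS already used statically by the kernel
    //   FrameOffset   - byte offset of the spill slot's frame index
    //   WorkGroupSize - maximum lanes per workgroup (256 for compute here)
    //   TID           - linearized thread id within the workgroup
    unsigned ldsSpillAddress(unsigned LDSSize, unsigned FrameOffset,
                             unsigned WorkGroupSize, unsigned TID) {
      return LDSSize + FrameOffset * WorkGroupSize + TID * 4;
    }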
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -377,8 +377,12 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
     LDSAlignShift = 9;
   }
 
+  unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
+                          MFI->getMaximumWorkGroupSize(MF);
+
   unsigned LDSBlocks =
-      RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+      RoundUpToAlignment(MFI->LDSSize + LDSSpillSize,
+                         1 << LDSAlignShift) >> LDSAlignShift;
 
   // Scratch is allocated in 256 dword blocks.
   unsigned ScratchAlignShift = 10;
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCInstrDesc.h"
 
 using namespace llvm;
@@ -433,6 +434,19 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
   return Opcode;
 }
 
+static bool shouldTryToSpillVGPRs(MachineFunction *MF) {
+
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const TargetMachine &TM = MF->getTarget();
+
+  // FIXME: Even though it can cause problems, we need to enable
+  // spilling at -O0, since the fast register allocator always
+  // spills registers that are live at the end of blocks.
+  return MFI->getShaderType() == ShaderType::COMPUTE &&
+         TM.getOptLevel() == CodeGenOpt::None;
+
+}
+
 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned SrcReg, bool isKill,
@@ -442,32 +456,41 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo *FrameInfo = MF->getFrameInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
+  int Opcode = -1;
 
-  if (RI.hasVGPRs(RC)) {
-    LLVMContext &Ctx = MF->getFunction()->getContext();
-    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!");
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
-            .addReg(SrcReg);
-  } else if (RI.isSGPRClass(RC)) {
+  if (RI.isSGPRClass(RC)) {
     // We are only allowed to create one new instruction when spilling
     // registers, so we need to use pseudo instruction for spilling
     // SGPRs.
-    unsigned Opcode;
     switch (RC->getSize() * 8) {
     case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break;
     case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break;
     case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
     case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
     case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
-    default: llvm_unreachable("Cannot spill register class");
     }
+  } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+    switch(RC->getSize() * 8) {
+    case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
+    case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
+    case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
+    case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
+    case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
+    case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
+    }
+  }
 
+  if (Opcode != -1) {
     FrameInfo->setObjectAlignment(FrameIndex, 4);
     BuildMI(MBB, MI, DL, get(Opcode))
             .addReg(SrcReg)
             .addFrameIndex(FrameIndex);
   } else {
-    llvm_unreachable("VGPR spilling not supported");
+    LLVMContext &Ctx = MF->getFunction()->getContext();
+    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
+                  " spill register");
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
+            .addReg(SrcReg);
   }
 }
 
@@ -479,31 +502,138 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo *FrameInfo = MF->getFrameInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
+  int Opcode = -1;
 
-  if (RI.hasVGPRs(RC)) {
-    LLVMContext &Ctx = MF->getFunction()->getContext();
-    Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!");
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
-            .addImm(0);
-  } else if (RI.isSGPRClass(RC)){
-    unsigned Opcode;
+  if (RI.isSGPRClass(RC)){
     switch(RC->getSize() * 8) {
     case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
     case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
     case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
     case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
-    default: llvm_unreachable("Cannot spill register class");
     }
+  } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+    switch(RC->getSize() * 8) {
+    case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
+    case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
+    case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
+    case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
+    case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
+    case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
+    }
+  }
 
+  if (Opcode != -1) {
     FrameInfo->setObjectAlignment(FrameIndex, 4);
     BuildMI(MBB, MI, DL, get(Opcode), DestReg)
             .addFrameIndex(FrameIndex);
   } else {
-    llvm_unreachable("VGPR spilling not supported");
+    LLVMContext &Ctx = MF->getFunction()->getContext();
+    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
+                  " restore register");
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+            .addReg(AMDGPU::VGPR0);
   }
 }
 
+/// \param @Offset Offset in bytes of the FrameIndex being spilled
+unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MI,
+                                               RegScavenger *RS, unsigned TmpReg,
+                                               unsigned FrameOffset,
+                                               unsigned Size) const {
+  MachineFunction *MF = MBB.getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &ST = MF->getTarget().getSubtarget<AMDGPUSubtarget>();
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
+  unsigned WavefrontSize = ST.getWavefrontSize();
+
+  unsigned TIDReg = MFI->getTIDReg();
+  if (!MFI->hasCalculatedTID()) {
+    MachineBasicBlock &Entry = MBB.getParent()->front();
+    MachineBasicBlock::iterator Insert = Entry.front();
+    DebugLoc DL = Insert->getDebugLoc();
+
+    TIDReg = RI.findUnusedVGPR(MF->getRegInfo());
+    if (TIDReg == AMDGPU::NoRegister)
+      return TIDReg;
+
+    if (MFI->getShaderType() == ShaderType::COMPUTE &&
+        WorkGroupSize > WavefrontSize) {
+
+      unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
+      unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
+      unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
+      unsigned InputPtrReg =
+          TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
+      static const unsigned TIDIGRegs[3] = {
+        TIDIGXReg, TIDIGYReg, TIDIGZReg
+      };
+      for (unsigned Reg : TIDIGRegs) {
+        if (!Entry.isLiveIn(Reg))
+          Entry.addLiveIn(Reg);
+      }
+
+      RS->enterBasicBlock(&Entry);
+      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
+              .addReg(InputPtrReg)
+              .addImm(SI::KernelInputOffsets::NGROUPS_Z);
+      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
+              .addReg(InputPtrReg)
+              .addImm(SI::KernelInputOffsets::NGROUPS_Y);
+
+      // NGROUPS.X * NGROUPS.Y
+      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
+              .addReg(STmp1)
+              .addReg(STmp0);
+      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
+              .addReg(STmp1)
+              .addReg(TIDIGXReg);
+      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
+              .addReg(STmp0)
+              .addReg(TIDIGYReg)
+              .addReg(TIDReg);
+      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
+              .addReg(TIDReg)
+              .addReg(TIDIGZReg);
+    } else {
+      // Get the wave id
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
+              TIDReg)
+              .addImm(-1)
+              .addImm(0);
+
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32),
+              TIDReg)
+              .addImm(-1)
+              .addReg(TIDReg);
+    }
+
+    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
+            TIDReg)
+            .addImm(2)
+            .addReg(TIDReg);
+    MFI->setTIDReg(TIDReg);
+  }
+
+  // Add FrameIndex to LDS offset
+  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
+  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
+          .addImm(LDSOffset)
+          .addReg(TIDReg);
+
+  return TmpReg;
+}
+
 void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
                              int Count) const {
   while (Count > 0) {
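The linear thread id above is computed once in the entry block and cached in SIMachineFunctionInfo (hasCalculatedTID()/setTIDReg()), which is the "Only calculate thread id once" item from v2. For the single-wavefront case, here is a plain C++ sketch of the V_MBCNT/V_LSHLREV sequence, assuming mbcnt with an all-ones mask yields the lane index; the helper is illustrative, not part of the commit:

    // Illustrative only: mbcnt(~0) counts the active lanes below the
    // current lane, i.e. the lane's index within the wavefront.
    unsigned tidByteOffset(unsigned LaneIndex) {
      unsigned TID = LaneIndex; // V_MBCNT_LO_U32_B32 / V_MBCNT_HI_U32_B32 pair
      return TID << 2;          // V_LSHLREV_B32: dword index -> byte offset
    }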
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -79,6 +79,13 @@ public:
                unsigned DestReg, unsigned SrcReg,
                bool KillSrc) const override;
 
+  unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    RegScavenger *RS,
+                                    unsigned TmpReg,
+                                    unsigned Offset,
+                                    unsigned Size) const;
+
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
                            unsigned SrcReg, bool isKill, int FrameIndex,
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1826,6 +1826,27 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
 
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
+  def _SAVE : InstSI <
+    (outs),
+    (ins vgpr_class:$src, i32imm:$frame_idx),
+    "", []
+  >;
+
+  def _RESTORE : InstSI <
+    (outs vgpr_class:$dst),
+    (ins i32imm:$frame_idx),
+    "", []
+  >;
+}
+
+defm SI_SPILL_V32 : SI_SPILL_VGPR <VReg_32>;
+defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
+defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
+defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+
 let Defs = [SCC] in {
 
 def SI_CONSTDATA_PTR : InstSI <
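These _SAVE/_RESTORE pseudos have no encoding or assembly string; they exist only to carry a VGPR and a frame index from SIInstrInfo::storeRegToStackSlot() and loadRegFromStackSlot() to SIRegisterInfo::eliminateFrameIndex(), which expands them into one DS_WRITE_B32 or DS_READ_B32 per 32-bit subregister (see the SIRegisterInfo.cpp hunks below).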
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -10,8 +10,9 @@
 
 
 #include "SIMachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
@@ -27,29 +28,18 @@ void SIMachineFunctionInfo::anchor() {}
 
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
+    TIDReg(AMDGPU::NoRegister),
     PSInputAddr(0),
-    NumUserSGPRs(0) { }
+    NumUserSGPRs(0),
+    LDSWaveSpillSize(0) { }
 
-/// \brief Returns a register that is not used at any point in the function.
-///        If all registers are used, then this function will return
-//         AMDGPU::NoRegister.
-static unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) {
-
-  const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
-
-  for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
-       I != E; ++I) {
-    if (!MRI.isPhysRegUsed(*I))
-      return *I;
-  }
-  return AMDGPU::NoRegister;
-}
-
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
                                                            MachineFunction *MF,
                                                            unsigned FrameIndex,
                                                            unsigned SubIdx) {
   const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(
+      MF->getTarget().getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
   MachineRegisterInfo &MRI = MF->getRegInfo();
   int64_t Offset = FrameInfo->getObjectOffset(FrameIndex);
   Offset += SubIdx * 4;
@@ -60,7 +50,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
   struct SpilledReg Spill;
 
   if (!LaneVGPRs.count(LaneVGPRIdx)) {
-    unsigned LaneVGPR = findUnusedVGPR(MRI);
+    unsigned LaneVGPR = TRI->findUnusedVGPR(MRI);
     LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
     MRI.setPhysRegUsed(LaneVGPR);
 
@@ -76,3 +66,11 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
   Spill.Lane = Lane;
   return Spill;
 }
+
+unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
+                                              const MachineFunction &MF) const {
+  const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>();
+  // FIXME: We should get this information from kernel attributes if it
+  // is available.
+  return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize();
+}
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -16,6 +16,7 @@
 #define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
 
 #include "AMDGPUMachineFunction.h"
+#include "SIRegisterInfo.h"
 #include <map>
 
 namespace llvm {
@@ -26,6 +27,9 @@ class MachineRegisterInfo;
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo : public AMDGPUMachineFunction {
   void anchor() override;
+
+  unsigned TIDReg;
+
 public:
 
   struct SpilledReg {
@@ -44,6 +48,12 @@ public:
   unsigned PSInputAddr;
   unsigned NumUserSGPRs;
   std::map<unsigned, unsigned> LaneVGPRs;
+  unsigned LDSWaveSpillSize;
+  bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
+  unsigned getTIDReg() const { return TIDReg; };
+  void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+
+  unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
 };
 
 } // End namespace llvm
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -34,6 +34,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(AMDGPU::EXEC);
   Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
   Reserved.set(AMDGPU::FLAT_SCR);
+
+  // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs
+  Reserved.set(AMDGPU::VGPR255);
+  Reserved.set(AMDGPU::VGPR254);
+
   return Reserved;
 }
 
@@ -51,18 +56,31 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
   switch (Op) {
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S512_RESTORE:
+  case AMDGPU::SI_SPILL_V512_SAVE:
+  case AMDGPU::SI_SPILL_V512_RESTORE:
     return 16;
   case AMDGPU::SI_SPILL_S256_SAVE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
+  case AMDGPU::SI_SPILL_V256_SAVE:
+  case AMDGPU::SI_SPILL_V256_RESTORE:
     return 8;
   case AMDGPU::SI_SPILL_S128_SAVE:
   case AMDGPU::SI_SPILL_S128_RESTORE:
+  case AMDGPU::SI_SPILL_V128_SAVE:
+  case AMDGPU::SI_SPILL_V128_RESTORE:
     return 4;
+  case AMDGPU::SI_SPILL_V96_SAVE:
+  case AMDGPU::SI_SPILL_V96_RESTORE:
+    return 3;
   case AMDGPU::SI_SPILL_S64_SAVE:
   case AMDGPU::SI_SPILL_S64_RESTORE:
+  case AMDGPU::SI_SPILL_V64_SAVE:
+  case AMDGPU::SI_SPILL_V64_RESTORE:
     return 2;
   case AMDGPU::SI_SPILL_S32_SAVE:
   case AMDGPU::SI_SPILL_S32_RESTORE:
+  case AMDGPU::SI_SPILL_V32_SAVE:
+  case AMDGPU::SI_SPILL_V32_RESTORE:
     return 1;
   default: llvm_unreachable("Invalid spill opcode");
   }
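getNumSubRegsForSpillOp() simply maps each spill pseudo to its register class's bit width divided by 32, i.e. the number of 32-bit subregisters the expansion loop below has to touch; VReg_96 is the only non-power-of-two class, hence the explicit return 3.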
@@ -139,6 +157,81 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     break;
   }
 
+  // VGPR register spill
+  case AMDGPU::SI_SPILL_V512_SAVE:
+  case AMDGPU::SI_SPILL_V256_SAVE:
+  case AMDGPU::SI_SPILL_V128_SAVE:
+  case AMDGPU::SI_SPILL_V96_SAVE:
+  case AMDGPU::SI_SPILL_V64_SAVE:
+  case AMDGPU::SI_SPILL_V32_SAVE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    int64_t Offset = FrameInfo->getObjectOffset(Index);
+    unsigned Size = NumSubRegs * 4;
+    unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+
+    for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
+      unsigned SubReg = NumSubRegs > 1 ?
+          getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) :
+          SrcReg;
+      Offset += (i * 4);
+      MFI->LDSWaveSpillSize = std::max((unsigned)Offset + 4, (unsigned)MFI->LDSWaveSpillSize);
+
+      unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg,
+                                                       Offset, Size);
+
+      if (AddrReg == AMDGPU::NoRegister) {
+        LLVMContext &Ctx = MF->getFunction()->getContext();
+        Ctx.emitError("Ran out of VGPRs for spilling VGPRS");
+        AddrReg = AMDGPU::VGPR0;
+      }
+
+      // Store the value in LDS
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_WRITE_B32))
+              .addImm(0) // gds
+              .addReg(AddrReg, RegState::Kill) // addr
+              .addReg(SubReg) // data0
+              .addImm(0); // offset
+    }
+
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::SI_SPILL_V32_RESTORE:
+  case AMDGPU::SI_SPILL_V64_RESTORE:
+  case AMDGPU::SI_SPILL_V128_RESTORE:
+  case AMDGPU::SI_SPILL_V256_RESTORE:
+  case AMDGPU::SI_SPILL_V512_RESTORE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+    unsigned DstReg = MI->getOperand(0).getReg();
+    int64_t Offset = FrameInfo->getObjectOffset(Index);
+    unsigned Size = NumSubRegs * 4;
+    unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+
+    // FIXME: We could use DS_READ_B64 here to optimize for larger registers.
+    for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
+      unsigned SubReg = NumSubRegs > 1 ?
+          getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) :
+          DstReg;
+
+      Offset += (i * 4);
+      unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg,
+                                                       Offset, Size);
+      if (AddrReg == AMDGPU::NoRegister) {
+        LLVMContext &Ctx = MF->getFunction()->getContext();
+        Ctx.emitError("Ran out of VGPRs for spilling VGPRs");
+        AddrReg = AMDGPU::VGPR0;
+      }
+
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_READ_B32), SubReg)
+              .addImm(0) // gds
+              .addReg(AddrReg, RegState::Kill) // addr
+              .addImm(0); //offset
+    }
+    MI->eraseFromParent();
+    break;
+  }
+
   default: {
     int64_t Offset = FrameInfo->getObjectOffset(Index);
     FIOp.ChangeToImmediate(Offset);
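To connect this back to the AMDGPUAsmPrinter hunk at the top: each DS_WRITE_B32 consumes one dword per lane, and LDSWaveSpillSize tracks the spill high-water mark in bytes per lane, so the total LDS budget works out roughly as follows (a sketch, with an invented helper name):

    // Mirrors the accounting in EmitProgramInfoSI():
    //   total = static LDS + per-lane spill bytes * workgroup size
    unsigned totalLDSBytes(unsigned LDSSize, unsigned LDSWaveSpillSize,
                           unsigned WorkGroupSize) {
      return LDSSize + LDSWaveSpillSize * WorkGroupSize;
    }

    // e.g. one spilled VReg_128 at -O0: 16 bytes/lane * 256 lanes
    // = 4096 bytes of LDS.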
@@ -173,8 +266,12 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
     &AMDGPU::SReg_32RegClass,
     &AMDGPU::VReg_64RegClass,
     &AMDGPU::SReg_64RegClass,
+    &AMDGPU::VReg_96RegClass,
+    &AMDGPU::VReg_128RegClass,
     &AMDGPU::SReg_128RegClass,
-    &AMDGPU::SReg_256RegClass
+    &AMDGPU::VReg_256RegClass,
+    &AMDGPU::SReg_256RegClass,
+    &AMDGPU::VReg_512RegClass
   };
 
   for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -335,3 +432,19 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
   }
   llvm_unreachable("unexpected preloaded value type");
 }
+
+/// \brief Returns a register that is not used at any point in the function.
+///        If all registers are used, then this function will return
+//         AMDGPU::NoRegister.
+unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const {
+
+  const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
+
+  for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+       I != E; ++I) {
+    if (!MRI.isPhysRegUsed(*I))
+      return *I;
+  }
+  return AMDGPU::NoRegister;
+}
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -100,6 +100,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   unsigned getPreloadedValue(const MachineFunction &MF,
                              enum PreloadedValue Value) const;
 
+  unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) const;
 };
 
 } // End namespace llvm