R600/SI: Use scratch memory for large private arrays

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213551 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Tom Stellard
2014-07-21 15:45:01 +00:00
parent c912b101d2
commit 3280804237
23 changed files with 507 additions and 104 deletions

View File

@@ -391,10 +391,15 @@ SDValue SITargetLowering::LowerFormalArguments(
}
// The pointer to the list of arguments is stored in SGPR0, SGPR1
// The pointer to the scratch buffer is stored in SGPR2, SGPR3
if (Info->getShaderType() == ShaderType::COMPUTE) {
Info->NumUserSGPRs = 4;
CCInfo.AllocateReg(AMDGPU::SGPR0);
CCInfo.AllocateReg(AMDGPU::SGPR1);
CCInfo.AllocateReg(AMDGPU::SGPR2);
CCInfo.AllocateReg(AMDGPU::SGPR3);
MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
}
if (Info->getShaderType() == ShaderType::COMPUTE) {
@@ -509,6 +514,36 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MI->eraseFromParent();
break;
}
case AMDGPU::SI_BUFFER_RSRC: {
unsigned SuperReg = MI->getOperand(0).getReg();
unsigned Args[4];
for (unsigned i = 0, e = 4; i < e; ++i) {
MachineOperand &Arg = MI->getOperand(i + 1);
if (Arg.isReg()) {
Args[i] = Arg.getReg();
continue;
}
assert(Arg.isImm());
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
.addImm(Arg.getImm());
Args[i] = Reg;
}
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
SuperReg)
.addReg(Args[0])
.addImm(AMDGPU::sub0)
.addReg(Args[1])
.addImm(AMDGPU::sub1)
.addReg(Args[2])
.addImm(AMDGPU::sub2)
.addReg(Args[3])
.addImm(AMDGPU::sub3);
MI->eraseFromParent();
break;
}
case AMDGPU::V_SUB_F64: {
unsigned DestReg = MI->getOperand(0).getReg();
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
@@ -620,6 +655,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
@@ -658,8 +694,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
//XXX: Hardcoded we only use two to store the pointer to the parameters.
unsigned NumUserSGPRs = 2;
switch (IntrinsicID) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case Intrinsic::r600_read_ngroups_x:
@@ -682,13 +716,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
AMDGPU::VGPR0, VT);
@@ -782,6 +816,21 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
return nullptr;
}
SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const SIInstrInfo *TII =
static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
const SIRegisterInfo &TRI = TII->getRegisterInfo();
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32);
return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
}
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -891,6 +940,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
// Vector private memory loads have already been split, and
// all the rest of private memory loads are legal.
if (Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
return SDValue();
}
SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
if (Lowered.getNode())
return Lowered;
@@ -1081,6 +1135,12 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
VT.getVectorElementType() == MVT::i32)
return SDValue();
if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
if (VT.isVector() && VT.getVectorNumElements() > 4)
return SplitVectorStore(Op, DAG);
return SDValue();
}
SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
if (Ret.getNode())
return Ret;
@@ -1495,9 +1555,19 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
// This is a conservative aproach. It is possible that we can't determine the
// correct register class and copy too often, but better safe than sorry.
SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
Operand.getValueType(), Operand, RC);
SDNode *Node;
// We can't use COPY_TO_REGCLASS with FrameIndex arguments.
if (isa<FrameIndexSDNode>(Operand)) {
unsigned Opcode = Operand.getValueType() == MVT::i32 ?
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
Operand);
} else {
SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
Operand.getValueType(), Operand, RC);
}
Operand = SDValue(Node, 0);
}
@@ -1591,6 +1661,14 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
}
continue;
} else {
// If it's not a VSrc or SSrc operand check if we have a GlobalAddress.
// These will be lowered to immediates, so we will need to insert a MOV.
if (isa<GlobalAddressSDNode>(Ops[i])) {
SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
Operand.getValueType(), Operand);
Ops[i] = SDValue(Node, 0);
}
}
if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {