mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-19 04:32:19 +00:00
R600/SI: Use scratch memory for large private arrays
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213551 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
c912b101d2
commit
3280804237
@ -25,6 +25,7 @@
|
||||
#include "SIDefines.h"
|
||||
#include "SIMachineFunctionInfo.h"
|
||||
#include "SIRegisterInfo.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/MCSectionELF.h"
|
||||
#include "llvm/MC/MCStreamer.h"
|
||||
@ -141,6 +142,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
|
||||
false);
|
||||
OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
|
||||
false);
|
||||
OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
|
||||
false);
|
||||
} else {
|
||||
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
|
||||
OutStreamer.emitRawComment(
|
||||
@ -332,6 +335,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
// Do not clamp NAN to 0.
|
||||
ProgInfo.DX10Clamp = 0;
|
||||
|
||||
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
|
||||
ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
|
||||
|
||||
ProgInfo.CodeLen = CodeSize;
|
||||
}
|
||||
|
||||
@ -361,6 +367,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
|
||||
unsigned LDSBlocks =
|
||||
RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
|
||||
|
||||
// Scratch is allocated in 256 dword blocks.
|
||||
unsigned ScratchAlignShift = 10;
|
||||
// We need to program the hardware with the amount of scratch memory that
|
||||
// is used by the entire wave. KernelInfo.ScratchSize is the amount of
|
||||
// scratch memory used per thread.
|
||||
unsigned ScratchBlocks =
|
||||
RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
|
||||
1 << ScratchAlignShift) >> ScratchAlignShift;
|
||||
|
||||
if (MFI->getShaderType() == ShaderType::COMPUTE) {
|
||||
OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
|
||||
|
||||
@ -377,7 +392,14 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
|
||||
OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
|
||||
|
||||
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
|
||||
OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
|
||||
const uint32_t ComputePGMRSrc2 =
|
||||
S_00B84C_LDS_SIZE(LDSBlocks) |
|
||||
S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
|
||||
|
||||
OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
|
||||
|
||||
OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
|
||||
OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
|
||||
} else {
|
||||
OutStreamer.EmitIntValue(RsrcReg, 4);
|
||||
OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
|
||||
|
@ -32,6 +32,7 @@ private:
|
||||
DX10Clamp(0),
|
||||
DebugMode(0),
|
||||
IEEEMode(0),
|
||||
ScratchSize(0),
|
||||
CodeLen(0) {}
|
||||
|
||||
// Fields set in PGM_RSRC1 pm4 packet.
|
||||
@ -43,6 +44,7 @@ private:
|
||||
uint32_t DX10Clamp;
|
||||
uint32_t DebugMode;
|
||||
uint32_t IEEEMode;
|
||||
uint32_t ScratchSize;
|
||||
|
||||
// Bonus information for debugging.
|
||||
uint64_t CodeLen;
|
||||
|
@ -16,9 +16,13 @@
|
||||
#include "AMDGPURegisterInfo.h"
|
||||
#include "AMDGPUSubtarget.h"
|
||||
#include "R600InstrInfo.h"
|
||||
#include "SIDefines.h"
|
||||
#include "SIISelLowering.h"
|
||||
#include "SIMachineFunctionInfo.h"
|
||||
#include "llvm/CodeGen/FunctionLoweringInfo.h"
|
||||
#include "llvm/CodeGen/PseudoSourceValue.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/CodeGen/SelectionDAG.h"
|
||||
#include "llvm/CodeGen/SelectionDAGISel.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
@ -85,7 +89,13 @@ private:
|
||||
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
|
||||
bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
|
||||
bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
|
||||
SDValue &ImmOffset) const;
|
||||
SDValue &ImmOffset) const;
|
||||
bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
|
||||
SDValue &SOffset, SDValue &ImmOffset) const;
|
||||
bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
|
||||
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
|
||||
SDValue &Idxen, SDValue &GLC, SDValue &SLC,
|
||||
SDValue &TFE) const;
|
||||
|
||||
SDNode *SelectADD_SUB_I64(SDNode *N);
|
||||
SDNode *SelectDIV_SCALE(SDNode *N);
|
||||
@ -730,6 +740,10 @@ static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
|
||||
Ptr), 0);
|
||||
}
|
||||
|
||||
static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
|
||||
return isUInt<12>(Imm->getZExtValue());
|
||||
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
|
||||
SDValue &Offset,
|
||||
SDValue &ImmOffset) const {
|
||||
@ -740,7 +754,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
|
||||
SDValue N1 = Addr.getOperand(1);
|
||||
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
|
||||
|
||||
if (isUInt<12>(C1->getZExtValue())) {
|
||||
if (isLegalMUBUFImmOffset(C1)) {
|
||||
|
||||
if (N0.getOpcode() == ISD::ADD) {
|
||||
// (add (add N2, N3), C1)
|
||||
@ -776,6 +790,95 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
|
||||
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
|
||||
/// of the resource descriptor) to create an offset, which is added to the
|
||||
/// resource pointer.
|
||||
static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
  // Constant half of the descriptor: the RSRC_DATA_FORMAT and RSRC_TID_ENABLE
  // bits, with every bit of the low dword set.
  const uint64_t DescriptorBits =
      AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | 0xffffffff;
  const uint64_t LowDwordMask = APInt::getAllOnesValue(32).getZExtValue();

  // The two halves of the 64-bit pointer become the first two operands.
  SDValue BaseLo =
      DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue BaseHi =
      DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);

  // The constant bits, split into 32-bit pieces, become the last two.
  SDValue WordLo =
      DAG->getTargetConstant(DescriptorBits & LowDwordMask, MVT::i32);
  SDValue WordHi = DAG->getTargetConstant(DescriptorBits >> 32, MVT::i32);

  const SDValue Operands[] = { BaseLo, BaseHi, WordLo, WordHi };
  return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL, MVT::v4i32,
                                     Operands),
                 0);
}
|
||||
|
||||
/// \brief Select the operands of a MUBUF access that goes through the scratch
/// buffer.
///
/// \p Rsrc is always set to a resource descriptor built from the preloaded
/// SCRATCH_PTR register, and \p SOffset to a copy of the preloaded
/// SCRATCH_WAVE_OFFSET register. \p VAddr and \p ImmOffset are then chosen
/// from the shape of \p Addr. This function always returns true.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
                                            SDValue &VAddr, SDValue &SOffset,
                                            SDValue &ImmOffset) const {
  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
  MachineRegisterInfo &MRI = MF.getRegInfo();

  const unsigned PtrPhysReg =
      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
  const unsigned WaveOffsetPhysReg =
      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);

  // Both preloaded values are read through their live-in virtual registers.
  SDValue ScratchPtr =
      CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
                             MRI.getLiveInVirtReg(PtrPhysReg), MVT::i64);
  Rsrc = buildScratchRSRC(CurDAG, DL, ScratchPtr);
  SOffset =
      CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
                             MRI.getLiveInVirtReg(WaveOffsetPhysReg), MVT::i32);

  // (add n0, c1): fold c1 into the immediate offset when it is legal.
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    ConstantSDNode *ConstOffset = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isLegalMUBUFImmOffset(ConstOffset)) {
      VAddr = Addr.getOperand(0);
      ImmOffset =
          CurDAG->getTargetConstant(ConstOffset->getZExtValue(), MVT::i16);
      return true;
    }
  }

  // (add FI, n0): the frame index takes the immediate-offset slot.
  const unsigned Opc = Addr.getOpcode();
  const bool IsAddOrOr = Opc == ISD::ADD || Opc == ISD::OR;
  if (IsAddOrOr && isa<FrameIndexSDNode>(Addr.getOperand(0))) {
    VAddr = Addr.getOperand(1);
    ImmOffset = Addr.getOperand(0);
    return true;
  }

  // (FI): a bare frame index is paired with a zero moved into VAddr.
  if (isa<FrameIndexSDNode>(Addr)) {
    SDValue Zero = CurDAG->getConstant(0, MVT::i32);
    VAddr = SDValue(
        CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero), 0);
    ImmOffset = Addr;
    return true;
  }

  // (node): anything else is used as VAddr with a zero immediate offset.
  VAddr = Addr;
  ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
  return true;
}
|
||||
|
||||
/// \brief Select the full nine-operand MUBUF form for a scratch access.
///
/// \p GLC, \p SLC, \p TFE and \p Idxen are set to the i1 constant 0 and
/// \p Offen to the i1 constant 1. The remaining operands, and the return
/// value, come from SelectMUBUFScratch.
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset, SDValue &Offen,
                                           SDValue &Idxen, SDValue &GLC,
                                           SDValue &SLC, SDValue &TFE) const {
  // Builds a one-bit target constant for a flag operand.
  auto MakeFlag = [this](uint64_t Bit) {
    return CurDAG->getTargetConstant(Bit, MVT::i1);
  };

  GLC = MakeFlag(0);
  SLC = MakeFlag(0);
  TFE = MakeFlag(0);

  Idxen = MakeFlag(0);
  Offen = MakeFlag(1);

  return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset);
}
|
||||
|
||||
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
|
||||
const AMDGPUTargetLowering& Lowering =
|
||||
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
|
||||
|
@ -71,13 +71,6 @@ protected:
|
||||
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
|
||||
static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
|
||||
|
||||
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
|
||||
/// MachineFunction.
|
||||
///
|
||||
/// \returns a RegisterSDNode representing Reg.
|
||||
virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
|
||||
const TargetRegisterClass *RC,
|
||||
unsigned Reg, EVT VT) const;
|
||||
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
|
||||
SelectionDAG &DAG) const;
|
||||
/// \brief Split a vector load into multiple scalar loads.
|
||||
@ -160,6 +153,14 @@ public:
|
||||
SDValue Op,
|
||||
const SelectionDAG &DAG,
|
||||
unsigned Depth = 0) const override;
|
||||
|
||||
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
|
||||
/// MachineFunction.
|
||||
///
|
||||
/// \returns a RegisterSDNode representing Reg.
|
||||
virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
|
||||
const TargetRegisterClass *RC,
|
||||
unsigned Reg, EVT VT) const;
|
||||
};
|
||||
|
||||
namespace AMDGPUISD {
|
||||
|
@ -41,6 +41,8 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
|
||||
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
|
||||
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
|
||||
|
||||
let OperandType = "OPERAND_IMMEDIATE" in {
|
||||
|
||||
def u32imm : Operand<i32> {
|
||||
let PrintMethod = "printU32ImmOperand";
|
||||
}
|
||||
@ -53,6 +55,8 @@ def u8imm : Operand<i8> {
|
||||
let PrintMethod = "printU8ImmOperand";
|
||||
}
|
||||
|
||||
} // End OperandType = "OPERAND_IMMEDIATE"
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// Custom Operands
|
||||
//===--------------------------------------------------------------------===//
|
||||
@ -136,6 +140,28 @@ def COND_NULL : PatLeaf <
|
||||
// Load/Store Pattern Fragments
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Base pattern fragment for memory operations on private memory: the
// predicate accepts a node only when its address space is
// AMDGPUAS::PRIVATE_ADDRESS.
class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
|
||||
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
|
||||
}]>;
|
||||
|
||||
class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
|
||||
(ops node:$ptr), (op node:$ptr)
|
||||
>;
|
||||
|
||||
class PrivateStore <SDPatternOperator op> : PrivateMemOp <
|
||||
(ops node:$value, node:$ptr), (op node:$value, node:$ptr)
|
||||
>;
|
||||
|
||||
def extloadi8_private : PrivateLoad <extloadi8>;
|
||||
def sextloadi8_private : PrivateLoad <sextloadi8>;
|
||||
def extloadi16_private : PrivateLoad <extloadi16>;
|
||||
def sextloadi16_private : PrivateLoad <sextloadi16>;
|
||||
def load_private : PrivateLoad <load>;
|
||||
|
||||
def truncstorei8_private : PrivateStore <truncstorei8>;
|
||||
def truncstorei16_private : PrivateStore <truncstorei16>;
|
||||
def store_private : PrivateStore <store>;
|
||||
|
||||
def global_store : PatFrag<(ops node:$val, node:$ptr),
|
||||
(store node:$val, node:$ptr), [{
|
||||
return isGlobalStore(dyn_cast<StoreSDNode>(N));
|
||||
|
@ -51,7 +51,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
|
||||
unsigned getSubRegFromChannel(unsigned Channel) const;
|
||||
|
||||
const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
|
||||
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
|
||||
virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
|
||||
unsigned FIOperandNum,
|
||||
RegScavenger *RS) const override;
|
||||
unsigned getFrameRegister(const MachineFunction &MF) const override;
|
||||
|
@ -52,7 +52,7 @@ static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
|
||||
std::string Ret = "e-p:32:32";
|
||||
|
||||
if (ST.is64bit()) {
|
||||
// 32-bit private, local, and region pointers. 64-bit global and constant.
|
||||
// 32-bit local, and region pointers. 64-bit private, global, and constant.
|
||||
Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
|
||||
}
|
||||
|
||||
|
@ -32,6 +32,7 @@ enum {
|
||||
#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
|
||||
#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
|
||||
#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C
|
||||
#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0)
|
||||
#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
|
||||
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
|
||||
|
||||
@ -85,4 +86,7 @@ enum {
|
||||
#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
|
||||
#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
|
||||
|
||||
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
|
||||
#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
|
||||
|
||||
#endif // SIDEFINES_H_
|
||||
|
@ -391,10 +391,15 @@ SDValue SITargetLowering::LowerFormalArguments(
|
||||
}
|
||||
|
||||
// The pointer to the list of arguments is stored in SGPR0, SGPR1
|
||||
// The pointer to the scratch buffer is stored in SGPR2, SGPR3
|
||||
if (Info->getShaderType() == ShaderType::COMPUTE) {
|
||||
Info->NumUserSGPRs = 4;
|
||||
CCInfo.AllocateReg(AMDGPU::SGPR0);
|
||||
CCInfo.AllocateReg(AMDGPU::SGPR1);
|
||||
CCInfo.AllocateReg(AMDGPU::SGPR2);
|
||||
CCInfo.AllocateReg(AMDGPU::SGPR3);
|
||||
MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
|
||||
MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
|
||||
}
|
||||
|
||||
if (Info->getShaderType() == ShaderType::COMPUTE) {
|
||||
@ -509,6 +514,36 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
|
||||
MI->eraseFromParent();
|
||||
break;
|
||||
}
|
||||
case AMDGPU::SI_BUFFER_RSRC: {
|
||||
unsigned SuperReg = MI->getOperand(0).getReg();
|
||||
unsigned Args[4];
|
||||
for (unsigned i = 0, e = 4; i < e; ++i) {
|
||||
MachineOperand &Arg = MI->getOperand(i + 1);
|
||||
|
||||
if (Arg.isReg()) {
|
||||
Args[i] = Arg.getReg();
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(Arg.isImm());
|
||||
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
|
||||
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
|
||||
.addImm(Arg.getImm());
|
||||
Args[i] = Reg;
|
||||
}
|
||||
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
|
||||
SuperReg)
|
||||
.addReg(Args[0])
|
||||
.addImm(AMDGPU::sub0)
|
||||
.addReg(Args[1])
|
||||
.addImm(AMDGPU::sub1)
|
||||
.addReg(Args[2])
|
||||
.addImm(AMDGPU::sub2)
|
||||
.addReg(Args[3])
|
||||
.addImm(AMDGPU::sub3);
|
||||
MI->eraseFromParent();
|
||||
break;
|
||||
}
|
||||
case AMDGPU::V_SUB_F64: {
|
||||
unsigned DestReg = MI->getOperand(0).getReg();
|
||||
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
|
||||
@ -620,6 +655,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
switch (Op.getOpcode()) {
|
||||
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
|
||||
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
|
||||
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
|
||||
case ISD::LOAD: {
|
||||
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
|
||||
@ -658,8 +694,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
|
||||
EVT VT = Op.getValueType();
|
||||
SDLoc DL(Op);
|
||||
//XXX: Hardcoded we only use two to store the pointer to the parameters.
|
||||
unsigned NumUserSGPRs = 2;
|
||||
switch (IntrinsicID) {
|
||||
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
|
||||
case Intrinsic::r600_read_ngroups_x:
|
||||
@ -682,13 +716,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
|
||||
case Intrinsic::r600_read_tgid_x:
|
||||
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
|
||||
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
|
||||
AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
|
||||
case Intrinsic::r600_read_tgid_y:
|
||||
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
|
||||
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
|
||||
AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
|
||||
case Intrinsic::r600_read_tgid_z:
|
||||
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
|
||||
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
|
||||
AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
|
||||
case Intrinsic::r600_read_tidig_x:
|
||||
return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
|
||||
AMDGPU::VGPR0, VT);
|
||||
@ -782,6 +816,21 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// \brief Lower a FrameIndex node to an i32 TargetFrameIndex.
///
/// As a side effect, CreateLiveInRegister is called for the preloaded
/// SCRATCH_WAVE_OFFSET register; the node it returns is discarded.
SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  const unsigned FrameIndex = cast<FrameIndexSDNode>(Op)->getIndex();

  const unsigned WaveOffsetReg =
      TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
  CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, WaveOffsetReg, MVT::i32);

  return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
}
|
||||
|
||||
/// This transforms the control flow intrinsics to get the branch destination as
|
||||
/// last parameter, also switches branch target with BR if the need arises
|
||||
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
|
||||
@ -891,6 +940,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
|
||||
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDLoc DL(Op);
|
||||
LoadSDNode *Load = cast<LoadSDNode>(Op);
|
||||
// Vector private memory loads have already been split, and
|
||||
// all the rest of private memory loads are legal.
|
||||
if (Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
|
||||
return SDValue();
|
||||
}
|
||||
SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
|
||||
if (Lowered.getNode())
|
||||
return Lowered;
|
||||
@ -1081,6 +1135,12 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
||||
VT.getVectorElementType() == MVT::i32)
|
||||
return SDValue();
|
||||
|
||||
if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
|
||||
if (VT.isVector() && VT.getVectorNumElements() > 4)
|
||||
return SplitVectorStore(Op, DAG);
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
|
||||
if (Ret.getNode())
|
||||
return Ret;
|
||||
@ -1495,9 +1555,19 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
|
||||
|
||||
// This is a conservative approach. It is possible that we can't determine the
|
||||
// correct register class and copy too often, but better safe than sorry.
|
||||
SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
|
||||
SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
|
||||
Operand.getValueType(), Operand, RC);
|
||||
|
||||
SDNode *Node;
|
||||
// We can't use COPY_TO_REGCLASS with FrameIndex arguments.
|
||||
if (isa<FrameIndexSDNode>(Operand)) {
|
||||
unsigned Opcode = Operand.getValueType() == MVT::i32 ?
|
||||
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
|
||||
Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
|
||||
Operand);
|
||||
} else {
|
||||
SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
|
||||
Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
|
||||
Operand.getValueType(), Operand, RC);
|
||||
}
|
||||
Operand = SDValue(Node, 0);
|
||||
}
|
||||
|
||||
@ -1591,6 +1661,14 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
|
||||
ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
// If it's not a VSrc or SSrc operand check if we have a GlobalAddress.
|
||||
// These will be lowered to immediates, so we will need to insert a MOV.
|
||||
if (isa<GlobalAddressSDNode>(Ops[i])) {
|
||||
SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
|
||||
Operand.getValueType(), Operand);
|
||||
Ops[i] = SDValue(Node, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
|
||||
|
@ -27,6 +27,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
|
||||
SelectionDAG &DAG) const;
|
||||
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
|
||||
SelectionDAG &DAG) const override;
|
||||
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
@ -561,6 +561,21 @@ static bool compareMachineOp(const MachineOperand &Op0,
|
||||
}
|
||||
}
|
||||
|
||||
bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
|
||||
const MachineOperand &MO) const {
|
||||
const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
|
||||
|
||||
assert(MO.isImm() || MO.isFPImm());
|
||||
|
||||
if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
|
||||
return true;
|
||||
|
||||
if (OpInfo.RegClass < 0)
|
||||
return false;
|
||||
|
||||
return RI.regClassCanUseImmediate(OpInfo.RegClass);
|
||||
}
|
||||
|
||||
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
|
||||
StringRef &ErrInfo) const {
|
||||
uint16_t Opcode = MI->getOpcode();
|
||||
@ -589,7 +604,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
|
||||
}
|
||||
break;
|
||||
case MCOI::OPERAND_IMMEDIATE:
|
||||
if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) {
|
||||
// Check if this operand is an immediate.
|
||||
// FrameIndex operands will be replaced by immediates, so they are
|
||||
// allowed.
|
||||
if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() &&
|
||||
!MI->getOperand(i).isFI()) {
|
||||
ErrInfo = "Expected immediate, but got non-immediate";
|
||||
return false;
|
||||
}
|
||||
|
@ -106,6 +106,9 @@ public:
|
||||
bool isInlineConstant(const MachineOperand &MO) const;
|
||||
bool isLiteralConstant(const MachineOperand &MO) const;
|
||||
|
||||
bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
|
||||
const MachineOperand &MO) const;
|
||||
|
||||
bool verifyInstruction(const MachineInstr *MI,
|
||||
StringRef &ErrInfo) const override;
|
||||
|
||||
@ -181,7 +184,7 @@ namespace AMDGPU {
|
||||
int getMCOpcode(uint16_t Opcode, unsigned Gen);
|
||||
|
||||
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
|
||||
|
||||
const uint64_t RSRC_TID_ENABLE = 1LL << 55;
|
||||
|
||||
} // End namespace AMDGPU
|
||||
|
||||
|
@ -163,7 +163,9 @@ def sopp_brtarget : Operand<OtherVT> {
|
||||
// Complex patterns
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
|
||||
def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
|
||||
def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SI assembler operands
|
||||
@ -605,12 +607,12 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
|
||||
asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
|
||||
}
|
||||
|
||||
let offen = 1, idxen = 0, offset = 0 in {
|
||||
let offen = 1, idxen = 0 in {
|
||||
def _OFFEN : MUBUF <op, (outs regClass:$vdata),
|
||||
(ins SReg_128:$srsrc, VReg_32:$vaddr,
|
||||
SSrc_32:$soffset, i1imm:$glc, i1imm:$slc,
|
||||
SSrc_32:$soffset, u16imm:$offset, i1imm:$glc, i1imm:$slc,
|
||||
i1imm:$tfe),
|
||||
asm#" $vdata, $srsrc + $vaddr + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
|
||||
asm#" $vdata, $srsrc + $vaddr + $soffset + $offset, glc=$glc, slc=$slc, tfe=$tfe", []>;
|
||||
}
|
||||
|
||||
let offen = 0, idxen = 1 in {
|
||||
@ -640,25 +642,40 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
|
||||
}
|
||||
}
|
||||
|
||||
class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
|
||||
ValueType store_vt, SDPatternOperator st> :
|
||||
MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
|
||||
u16imm:$offset),
|
||||
name#" $vdata, $srsrc + $vaddr + $offset",
|
||||
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
|
||||
multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
|
||||
ValueType store_vt, SDPatternOperator st> {
|
||||
|
||||
let mayLoad = 0;
|
||||
let mayStore = 1;
|
||||
def "" : MUBUF <
|
||||
op, (outs),
|
||||
(ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
|
||||
u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$slc,
|
||||
i1imm:$tfe),
|
||||
name#" $vdata, $srsrc, $vaddr, $soffset, $offset $offen $idxen $glc $slc $tfe",
|
||||
[]
|
||||
> {
|
||||
let addr64 = 0;
|
||||
}
|
||||
|
||||
// Encoding
|
||||
let offen = 0;
|
||||
let idxen = 0;
|
||||
let glc = 0;
|
||||
let addr64 = 1;
|
||||
let lds = 0;
|
||||
let slc = 0;
|
||||
let tfe = 0;
|
||||
let soffset = 128; // ZERO
|
||||
def _ADDR64 : MUBUF <
|
||||
op, (outs),
|
||||
(ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
|
||||
name#" $vdata, $srsrc + $vaddr + $offset",
|
||||
[(st store_vt:$vdata,
|
||||
(MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
|
||||
|
||||
let mayLoad = 0;
|
||||
let mayStore = 1;
|
||||
|
||||
// Encoding
|
||||
let offen = 0;
|
||||
let idxen = 0;
|
||||
let glc = 0;
|
||||
let addr64 = 1;
|
||||
let lds = 0;
|
||||
let slc = 0;
|
||||
let tfe = 0;
|
||||
let soffset = 128; // ZERO
|
||||
}
|
||||
}
|
||||
|
||||
class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
|
||||
|
@ -872,23 +872,23 @@ defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
|
||||
0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load
|
||||
>;
|
||||
|
||||
def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
|
||||
defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
|
||||
0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global
|
||||
>;
|
||||
|
||||
def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
|
||||
defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
|
||||
0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global
|
||||
>;
|
||||
|
||||
def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
|
||||
defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
|
||||
0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store
|
||||
>;
|
||||
|
||||
def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
|
||||
defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
|
||||
0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store
|
||||
>;
|
||||
|
||||
def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
|
||||
defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
|
||||
0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store
|
||||
>;
|
||||
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
|
||||
@ -1667,6 +1667,12 @@ def SI_ADDR64_RSRC : InstSI <
|
||||
"", []
|
||||
>;
|
||||
|
||||
def SI_BUFFER_RSRC : InstSI <
|
||||
(outs SReg_128:$srsrc),
|
||||
(ins SReg_32:$ptr_lo, SReg_32:$ptr_hi, SSrc_32:$data_lo, SSrc_32:$data_hi),
|
||||
"", []
|
||||
>;
|
||||
|
||||
def V_SUB_F64 : InstSI <
|
||||
(outs VReg_64:$dst),
|
||||
(ins VReg_64:$src0, VReg_64:$src1),
|
||||
@ -2410,7 +2416,7 @@ def : Ext32Pat <anyext>;
|
||||
// Offset in a 32-bit VGPR
|
||||
def : Pat <
|
||||
(SIload_constant v4i32:$sbase, i32:$voff),
|
||||
(BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0)
|
||||
(BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
|
||||
>;
|
||||
|
||||
// The multiplication scales from [0,1] to the unsigned integer range
|
||||
@ -2599,22 +2605,30 @@ multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
|
||||
(vt (constant_ld (add i64:$ptr, i64:$offset))),
|
||||
(Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
|
||||
>;
|
||||
|
||||
}
|
||||
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32,
|
||||
sextloadi8_constant>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
|
||||
az_extloadi8_constant>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32,
|
||||
sextloadi16_constant>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32,
|
||||
az_extloadi16_constant>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
|
||||
constant_load>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
|
||||
constant_load>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
|
||||
constant_load>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
|
||||
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
|
||||
|
||||
// Selects a load `ld` of type `vt` whose address matches the MUBUFScratch
// complex pattern to `Instr`, passing the four selected operands (srsrc,
// vaddr, soffset, offset) followed by three literal 0 operands.
// NOTE(review): the trailing zeros presumably fill glc/slc/tfe of the _OFFEN
// instruction variants -- confirm against the instruction definitions.
class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
|
||||
(vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
|
||||
i32:$soffset, u16imm:$offset))),
|
||||
(Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
|
||||
>;
|
||||
|
||||
def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
|
||||
def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
|
||||
def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
|
||||
def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
|
||||
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
|
||||
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
|
||||
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
|
||||
|
||||
// BUFFER_LOAD_DWORD*, addr64=0
|
||||
multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
|
||||
@ -2630,9 +2644,9 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
|
||||
|
||||
def : Pat <
|
||||
(vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
|
||||
imm, 1, 0, imm:$glc, imm:$slc,
|
||||
imm:$offset, 1, 0, imm:$glc, imm:$slc,
|
||||
imm:$tfe)),
|
||||
(offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
|
||||
(offen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
|
||||
(as_i1imm $tfe))
|
||||
>;
|
||||
|
||||
@ -2660,6 +2674,34 @@ defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_
|
||||
defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
|
||||
BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
|
||||
|
||||
// Selects a store `st` of a `vt` value into the MUBUF instruction `Instr`.
// The address is matched by MUBUFAddr32, which binds nine operands
// ($srsrc, $vaddr, $soffset, $offset, $offen, $idxen, $glc, $slc, $tfe);
// all of them are forwarded to Instr after the stored $value.
// NOTE(review): despite the "Scratch" name this pattern matches the address
// with MUBUFAddr32, whereas the scratch load pattern uses MUBUFScratch --
// confirm that the asymmetry is intended.
class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
|
||||
(st vt:$value, (MUBUFAddr32 v4i32:$srsrc, i32:$vaddr, i32:$soffset,
|
||||
u16imm:$offset, i1imm:$offen, i1imm:$idxen,
|
||||
i1imm:$glc, i1imm:$slc, i1imm:$tfe)),
|
||||
(Instr $value, $srsrc, $vaddr, $soffset, $offset, $offen, $idxen,
|
||||
$glc, $slc, $tfe)
|
||||
>;
|
||||
|
||||
// Instantiations of MUBUFScratchStorePat: each *_private store fragment is
// paired with the BUFFER_STORE_* instruction of matching width. Truncating
// byte and short stores take an i32 source value.
def : MUBUFScratchStorePat <BUFFER_STORE_BYTE, i32, truncstorei8_private>;
|
||||
def : MUBUFScratchStorePat <BUFFER_STORE_SHORT, i32, truncstorei16_private>;
|
||||
def : MUBUFScratchStorePat <BUFFER_STORE_DWORD, i32, store_private>;
|
||||
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2, v2i32, store_private>;
|
||||
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4, v4i32, store_private>;
|
||||
|
||||
/*
|
||||
class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
|
||||
(st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)),
|
||||
(Instr $value, $srsrc, $vaddr, $offset)
|
||||
>;
|
||||
|
||||
def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>;
|
||||
def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>;
|
||||
def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
|
||||
def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
|
||||
def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
|
||||
|
||||
*/
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// MTBUF Patterns
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -27,7 +27,8 @@ void SIMachineFunctionInfo::anchor() {}
|
||||
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
||||
: AMDGPUMachineFunction(MF),
|
||||
PSInputAddr(0),
|
||||
SpillTracker() { }
|
||||
SpillTracker(),
|
||||
NumUserSGPRs(0) { }
|
||||
|
||||
static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
|
||||
unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
|
||||
|
@ -59,6 +59,7 @@ public:
|
||||
SIMachineFunctionInfo(const MachineFunction &MF);
|
||||
unsigned PSInputAddr;
|
||||
struct RegSpillTracker SpillTracker;
|
||||
unsigned NumUserSGPRs;
|
||||
};
|
||||
|
||||
} // End namespace llvm
|
||||
|
@ -16,6 +16,10 @@
|
||||
#include "SIRegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Support/ErrorHandling.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
@ -27,8 +31,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
|
||||
BitVector Reserved(getNumRegs());
|
||||
Reserved.set(AMDGPU::EXEC);
|
||||
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
|
||||
const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
|
||||
TII->reserveIndirectRegisters(Reserved, MF);
|
||||
return Reserved;
|
||||
}
|
||||
|
||||
@ -37,6 +39,30 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
|
||||
return RC->getNumRegs();
|
||||
}
|
||||
|
||||
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
|
||||
return Fn.getFrameInfo()->hasStackObjects();
|
||||
}
|
||||
|
||||
/// Rewrites the frame-index operand of \p MI into a concrete offset.
///
/// The operand at \p FIOperandNum is first turned into an immediate holding
/// the frame object's offset. If the instruction cannot take that immediate
/// in this operand slot, the offset is instead materialized into a scavenged
/// 32-bit VGPR with V_MOV_B32 (inserted immediately before \p MI) and the
/// operand is rewritten to use that register.
///
/// \param MI           Instruction that contains the frame-index operand.
/// \param SPAdj        Stack-pointer adjustment, forwarded to the scavenger.
/// \param FIOperandNum Index of the frame-index operand within \p MI.
/// \param RS           Scavenger used to obtain the temporary VGPR; it is
///                     dereferenced only on the non-legal-immediate path.
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  const int64_t Offset = MF->getFrameInfo()->getObjectOffset(FIOp.getIndex());

  // Optimistically encode the offset directly in the instruction.
  FIOp.ChangeToImmediate(Offset);
  if (TII->isImmOperandLegal(MI, FIOperandNum, FIOp))
    return;

  // The immediate is not encodable here: move it into a temporary VGPR
  // right before MI and reference that register instead.
  unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
      .addImm(Offset);
  FIOp.ChangeToRegister(TmpReg, false);
}
|
||||
|
||||
const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
|
||||
MVT VT) const {
|
||||
switch(VT.SimpleTy) {
|
||||
@ -141,3 +167,21 @@ bool SIRegisterInfo::regClassCanUseImmediate(
|
||||
const TargetRegisterClass *RC) const {
|
||||
return regClassCanUseImmediate(RC->getID());
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
|
||||
enum PreloadedValue Value) const {
|
||||
|
||||
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
switch (Value) {
|
||||
case SIRegisterInfo::TGID_X:
|
||||
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
|
||||
case SIRegisterInfo::TGID_Y:
|
||||
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
|
||||
case SIRegisterInfo::TGID_Z:
|
||||
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
|
||||
case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
|
||||
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
|
||||
case SIRegisterInfo::SCRATCH_PTR:
|
||||
return AMDGPU::SGPR2_SGPR3;
|
||||
}
|
||||
}
|
||||
|
@ -29,6 +29,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
|
||||
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
|
||||
MachineFunction &MF) const override;
|
||||
|
||||
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
|
||||
|
||||
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
|
||||
unsigned FIOperandNum,
|
||||
RegScavenger *RS) const override;
|
||||
|
||||
/// \brief get the register class of the specified type to use in the
|
||||
/// CFGStructurizer
|
||||
const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
|
||||
@ -68,6 +74,19 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
|
||||
/// \returns True if operands defined with this register class can accept
|
||||
/// inline immediates.
|
||||
bool regClassCanUseImmediate(const TargetRegisterClass *RC) const;
|
||||
|
||||
/// Identifies a value that getPreloadedValue() can map to a physical
/// register. The register assignments noted below are the ones made by
/// SIRegisterInfo::getPreloadedValue().
enum PreloadedValue {
|
||||
// 32-bit SGPR at index NumUserSGPRs + 0.
TGID_X,
|
||||
// 32-bit SGPR at index NumUserSGPRs + 1.
TGID_Y,
|
||||
// 32-bit SGPR at index NumUserSGPRs + 2.
TGID_Z,
|
||||
// 32-bit SGPR at index NumUserSGPRs + 4.
SCRATCH_WAVE_OFFSET,
|
||||
// Fixed register pair SGPR2_SGPR3.
SCRATCH_PTR
|
||||
};
|
||||
|
||||
/// \brief Returns the physical register that \p Value is stored in.
|
||||
unsigned getPreloadedValue(const MachineFunction &MF,
|
||||
enum PreloadedValue Value) const;
|
||||
|
||||
};
|
||||
|
||||
} // End namespace llvm
|
||||
|
@ -11,15 +11,18 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
|
||||
|
||||
; SI-LABEL: @test_private_array_ptr_calc:
|
||||
|
||||
; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
|
||||
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]]
|
||||
; FIXME: We end up with zero argument for ADD, because
|
||||
; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
|
||||
; with the appropriate offset. We should fold this into the store.
|
||||
; SI-ALLOCA: V_ADD_I32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}}
|
||||
; SI-ALLOCA: BUFFER_STORE_DWORD {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], [[PTRREG]]
|
||||
;
|
||||
; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
|
||||
; alloca to a vector. It currently fails because it does not know how
|
||||
; to interpret:
|
||||
; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
|
||||
|
||||
; SI-PROMOTE: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
|
||||
; SI-PROMOTE: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]]
|
||||
define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
|
||||
%alloca = alloca [4 x i32], i32 4, align 16
|
||||
|
@ -76,3 +76,22 @@ define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
|
||||
store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Branches on %a == 0. The taken arm loads element %index of the
; addrspace(2) global @float_gv and stores it to %out; the other arm stores
; the constant 1.0 to %out. The global is therefore addressed only inside a
; conditional block.
define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
|
||||
entry:
|
||||
%0 = icmp eq i32 0, %a
|
||||
br i1 %0, label %if, label %else
|
||||
|
|
||||
if:
|
||||
%1 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
|
||||
%2 = load float addrspace(2)* %1
|
||||
store float %2, float addrspace(1)* %out
|
||||
br label %endif
|
||||
|
|
||||
else:
|
||||
store float 1.0, float addrspace(1)* %out
|
||||
br label %endif
|
||||
|
|
||||
endif:
|
||||
ret void
|
||||
|
|
||||
}
|
||||
|
@ -6,10 +6,10 @@ declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
|
||||
|
||||
; SI-LABEL: @private_access_f64_alloca:
|
||||
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELS_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELS_B32_e32
|
||||
; SI-ALLOCA: BUFFER_STORE_DWORDX2
|
||||
; FIXME: We should be able to use BUFFER_LOAD_DWORDX2
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
|
||||
; SI-PROMOTE: DS_WRITE_B64
|
||||
; SI-PROMOTE: DS_READ_B64
|
||||
@ -26,10 +26,12 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
|
||||
|
||||
; SI-LABEL: @private_access_v2f64_alloca:
|
||||
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELS_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELS_B32_e32
|
||||
; SI-ALLOCA: BUFFER_STORE_DWORDX4
|
||||
; FIXME: We should be able to use BUFFER_LOAD_DWORDX4
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
|
||||
; SI-PROMOTE: DS_WRITE_B32
|
||||
; SI-PROMOTE: DS_WRITE_B32
|
||||
@ -52,10 +54,10 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
|
||||
|
||||
; SI-LABEL: @private_access_i64_alloca:
|
||||
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELS_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELS_B32_e32
|
||||
; SI-ALLOCA: BUFFER_STORE_DWORDX2
|
||||
; FIXME: We should be able to use BUFFER_LOAD_DWORDX2
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
|
||||
; SI-PROMOTE: DS_WRITE_B64
|
||||
; SI-PROMOTE: DS_READ_B64
|
||||
@ -72,14 +74,12 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
|
||||
|
||||
; SI-LABEL: @private_access_v2i64_alloca:
|
||||
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELD_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELS_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELS_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELS_B32_e32
|
||||
; SI-ALLOCA: V_MOVRELS_B32_e32
|
||||
; SI-ALLOCA: BUFFER_STORE_DWORDX4
|
||||
; FIXME: We should be able to use BUFFER_LOAD_DWORDX4
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
; SI-ALLOCA: BUFFER_LOAD_DWORD
|
||||
|
||||
; SI-PROMOTE: DS_WRITE_B32
|
||||
; SI-PROMOTE: DS_WRITE_B32
|
||||
|
@ -16,12 +16,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
|
||||
; SI-PROMOTE: DS_READ_B32
|
||||
; SI-PROMOTE: DS_READ_B32
|
||||
|
||||
; SI-ALLOCA: V_READFIRSTLANE_B32 vcc_lo
|
||||
; SI-ALLOCA: V_MOVRELD
|
||||
; SI-ALLOCA: S_CBRANCH
|
||||
; SI-ALLOCA: V_READFIRSTLANE_B32 vcc_lo
|
||||
; SI-ALLOCA: V_MOVRELD
|
||||
; SI-ALLOCA: S_CBRANCH
|
||||
; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
|
||||
; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
|
||||
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4
|
||||
@ -120,7 +116,9 @@ for.end:
|
||||
|
||||
; R600: MOVA_INT
|
||||
|
||||
; SI-PROMOTE: V_MOVRELS_B32_e32
|
||||
; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
|
||||
; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
|
||||
; SI_PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] + v{{[0-9]+}}, s{{[0-9]+}}
|
||||
define void @short_array(i32 addrspace(1)* %out, i32 %index) {
|
||||
entry:
|
||||
%0 = alloca [2 x i16]
|
||||
@ -139,8 +137,8 @@ entry:
|
||||
|
||||
; R600: MOVA_INT
|
||||
|
||||
; SI: V_OR_B32_e32 v{{[0-9]}}, 0x100
|
||||
; SI: V_MOVRELS_B32_e32
|
||||
; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x0
|
||||
; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x1
|
||||
define void @char_array(i32 addrspace(1)* %out, i32 %index) {
|
||||
entry:
|
||||
%0 = alloca [2 x i8]
|
||||
|
@ -127,12 +127,12 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; The tgid values are stored in ss offset by the number of user ss.
|
||||
; Currently we always use exactly 2 user ss for the pointer to the
|
||||
; The tgid values are stored in sgprs offset by the number of user sgprs.
|
||||
; Currently we always use exactly 2 user sgprs for the pointer to the
|
||||
; kernel arguments, but this may change in the future.
|
||||
|
||||
; SI-CHECK: @tgid_x
|
||||
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s2
|
||||
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
|
||||
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
|
||||
define void @tgid_x (i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
@ -142,7 +142,7 @@ entry:
|
||||
}
|
||||
|
||||
; SI-CHECK: @tgid_y
|
||||
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s3
|
||||
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s5
|
||||
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
|
||||
define void @tgid_y (i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
@ -152,7 +152,7 @@ entry:
|
||||
}
|
||||
|
||||
; SI-CHECK: @tgid_z
|
||||
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
|
||||
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s6
|
||||
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
|
||||
define void @tgid_z (i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
|
Loading…
x
Reference in New Issue
Block a user