R600/SI: Use scratch memory for large private arrays

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213551 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Tom Stellard 2014-07-21 15:45:01 +00:00
parent c912b101d2
commit 3280804237
23 changed files with 507 additions and 104 deletions

View File

@ -25,6 +25,7 @@
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
@ -141,6 +142,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
false);
OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
false);
} else {
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer.emitRawComment(
@ -332,6 +335,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Do not clamp NAN to 0.
ProgInfo.DX10Clamp = 0;
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
ProgInfo.CodeLen = CodeSize;
}
@ -361,6 +367,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
unsigned LDSBlocks =
RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
// Scratch is allocated in 256 dword blocks.
unsigned ScratchAlignShift = 10;
// We need to program the hardware with the amount of scratch memory that
// is used by the entire wave. KernelInfo.ScratchSize is the amount of
// scratch memory used per thread.
unsigned ScratchBlocks =
RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
1 << ScratchAlignShift) >> ScratchAlignShift;
if (MFI->getShaderType() == ShaderType::COMPUTE) {
OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
@ -377,7 +392,14 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
const uint32_t ComputePGMRSrc2 =
S_00B84C_LDS_SIZE(LDSBlocks) |
S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
} else {
OutStreamer.EmitIntValue(RsrcReg, 4);
OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |

View File

@ -32,6 +32,7 @@ private:
DX10Clamp(0),
DebugMode(0),
IEEEMode(0),
ScratchSize(0),
CodeLen(0) {}
// Fields set in PGM_RSRC1 pm4 packet.
@ -43,6 +44,7 @@ private:
uint32_t DX10Clamp;
uint32_t DebugMode;
uint32_t IEEEMode;
uint32_t ScratchSize;
// Bonus information for debugging.
uint64_t CodeLen;

View File

@ -16,9 +16,13 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Function.h"
@ -85,7 +89,13 @@ private:
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
SDValue &ImmOffset) const;
SDValue &ImmOffset) const;
bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
SDValue &SOffset, SDValue &ImmOffset) const;
bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
SDNode *SelectADD_SUB_I64(SDNode *N);
SDNode *SelectDIV_SCALE(SDNode *N);
@ -730,6 +740,10 @@ static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
Ptr), 0);
}
static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
return isUInt<12>(Imm->getZExtValue());
}
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
SDValue &Offset,
SDValue &ImmOffset) const {
@ -740,7 +754,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
SDValue N1 = Addr.getOperand(1);
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
if (isUInt<12>(C1->getZExtValue())) {
if (isLegalMUBUFImmOffset(C1)) {
if (N0.getOpcode() == ISD::ADD) {
// (add (add N2, N3), C1)
@ -776,6 +790,95 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
return true;
}
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multipled by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to the
/// resource ponter.
static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
0xffffffff;
SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
SDValue DataLo = DAG->getTargetConstant(
Rsrc & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32);
SDValue DataHi = DAG->getTargetConstant(Rsrc >> 32, MVT::i32);
const SDValue Ops[] = { PtrLo, PtrHi, DataLo, DataHi };
return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL,
MVT::v4i32, Ops), 0);
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &ImmOffset) const {
SDLoc DL(Addr);
MachineFunction &MF = CurDAG->getMachineFunction();
const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned ScratchPtrReg =
TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
unsigned ScratchOffsetReg =
TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
Rsrc = buildScratchRSRC(CurDAG, DL, CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64));
SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
// (add n0, c1)
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N1 = Addr.getOperand(1);
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
if (isLegalMUBUFImmOffset(C1)) {
VAddr = Addr.getOperand(0);
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
return true;
}
}
// (add FI, n0)
if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
isa<FrameIndexSDNode>(Addr.getOperand(0))) {
VAddr = Addr.getOperand(1);
ImmOffset = Addr.getOperand(0);
return true;
}
// (FI)
if (isa<FrameIndexSDNode>(Addr)) {
VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
CurDAG->getConstant(0, MVT::i32)), 0);
ImmOffset = Addr;
return true;
}
// (node)
VAddr = Addr;
ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &GLC,
SDValue &SLC, SDValue &TFE) const {
GLC = CurDAG->getTargetConstant(0, MVT::i1);
SLC = CurDAG->getTargetConstant(0, MVT::i1);
TFE = CurDAG->getTargetConstant(0, MVT::i1);
Idxen = CurDAG->getTargetConstant(0, MVT::i1);
Offen = CurDAG->getTargetConstant(1, MVT::i1);
return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset);
}
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());

View File

@ -71,13 +71,6 @@ protected:
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
/// \returns a RegisterSDNode representing Reg.
virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const;
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const;
/// \brief Split a vector load into multiple scalar loads.
@ -160,6 +153,14 @@ public:
SDValue Op,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
/// \returns a RegisterSDNode representing Reg.
virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const;
};
namespace AMDGPUISD {

View File

@ -41,6 +41,8 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
let OperandType = "OPERAND_IMMEDIATE" in {
def u32imm : Operand<i32> {
let PrintMethod = "printU32ImmOperand";
}
@ -53,6 +55,8 @@ def u8imm : Operand<i8> {
let PrintMethod = "printU8ImmOperand";
}
} // End OperandType = "OPERAND_IMMEDIATE"
//===--------------------------------------------------------------------===//
// Custom Operands
//===--------------------------------------------------------------------===//
@ -136,6 +140,28 @@ def COND_NULL : PatLeaf <
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
}]>;
class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
(ops node:$ptr), (op node:$ptr)
>;
class PrivateStore <SDPatternOperator op> : PrivateMemOp <
(ops node:$value, node:$ptr), (op node:$value, node:$ptr)
>;
def extloadi8_private : PrivateLoad <extloadi8>;
def sextloadi8_private : PrivateLoad <sextloadi8>;
def extloadi16_private : PrivateLoad <extloadi16>;
def sextloadi16_private : PrivateLoad <sextloadi16>;
def load_private : PrivateLoad <load>;
def truncstorei8_private : PrivateStore <truncstorei8>;
def truncstorei16_private : PrivateStore <truncstorei16>;
def store_private : PrivateStore <store>;
def global_store : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return isGlobalStore(dyn_cast<StoreSDNode>(N));

View File

@ -51,7 +51,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
unsigned getSubRegFromChannel(unsigned Channel) const;
const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;

View File

@ -52,7 +52,7 @@ static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
std::string Ret = "e-p:32:32";
if (ST.is64bit()) {
// 32-bit private, local, and region pointers. 64-bit global and constant.
// 32-bit local, and region pointers. 64-bit private, global, and constant.
Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
}

View File

@ -32,6 +32,7 @@ enum {
#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C
#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0)
#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
@ -85,4 +86,7 @@ enum {
#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
#endif // SIDEFINES_H_

View File

@ -391,10 +391,15 @@ SDValue SITargetLowering::LowerFormalArguments(
}
// The pointer to the list of arguments is stored in SGPR0, SGPR1
// The pointer to the scratch buffer is stored in SGPR2, SGPR3
if (Info->getShaderType() == ShaderType::COMPUTE) {
Info->NumUserSGPRs = 4;
CCInfo.AllocateReg(AMDGPU::SGPR0);
CCInfo.AllocateReg(AMDGPU::SGPR1);
CCInfo.AllocateReg(AMDGPU::SGPR2);
CCInfo.AllocateReg(AMDGPU::SGPR3);
MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
}
if (Info->getShaderType() == ShaderType::COMPUTE) {
@ -509,6 +514,36 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MI->eraseFromParent();
break;
}
case AMDGPU::SI_BUFFER_RSRC: {
unsigned SuperReg = MI->getOperand(0).getReg();
unsigned Args[4];
for (unsigned i = 0, e = 4; i < e; ++i) {
MachineOperand &Arg = MI->getOperand(i + 1);
if (Arg.isReg()) {
Args[i] = Arg.getReg();
continue;
}
assert(Arg.isImm());
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
.addImm(Arg.getImm());
Args[i] = Reg;
}
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
SuperReg)
.addReg(Args[0])
.addImm(AMDGPU::sub0)
.addReg(Args[1])
.addImm(AMDGPU::sub1)
.addReg(Args[2])
.addImm(AMDGPU::sub2)
.addReg(Args[3])
.addImm(AMDGPU::sub3);
MI->eraseFromParent();
break;
}
case AMDGPU::V_SUB_F64: {
unsigned DestReg = MI->getOperand(0).getReg();
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
@ -620,6 +655,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
@ -658,8 +694,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
//XXX: Hardcoded we only use two to store the pointer to the parameters.
unsigned NumUserSGPRs = 2;
switch (IntrinsicID) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case Intrinsic::r600_read_ngroups_x:
@ -682,13 +716,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
AMDGPU::VGPR0, VT);
@ -782,6 +816,21 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
return nullptr;
}
SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const SIInstrInfo *TII =
static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
const SIRegisterInfo &TRI = TII->getRegisterInfo();
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32);
return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
}
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@ -891,6 +940,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
// Vector private memory loads have already been split, and
// all the rest of private memory loads are legal.
if (Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
return SDValue();
}
SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
if (Lowered.getNode())
return Lowered;
@ -1081,6 +1135,12 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
VT.getVectorElementType() == MVT::i32)
return SDValue();
if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
if (VT.isVector() && VT.getVectorNumElements() > 4)
return SplitVectorStore(Op, DAG);
return SDValue();
}
SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
if (Ret.getNode())
return Ret;
@ -1495,9 +1555,19 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
// This is a conservative aproach. It is possible that we can't determine the
// correct register class and copy too often, but better safe than sorry.
SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
Operand.getValueType(), Operand, RC);
SDNode *Node;
// We can't use COPY_TO_REGCLASS with FrameIndex arguments.
if (isa<FrameIndexSDNode>(Operand)) {
unsigned Opcode = Operand.getValueType() == MVT::i32 ?
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
Operand);
} else {
SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
Operand.getValueType(), Operand, RC);
}
Operand = SDValue(Node, 0);
}
@ -1591,6 +1661,14 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
}
continue;
} else {
// If it's not a VSrc or SSrc operand check if we have a GlobalAddress.
// These will be lowered to immediates, so we will need to insert a MOV.
if (isa<GlobalAddressSDNode>(Ops[i])) {
SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
Operand.getValueType(), Operand);
Ops[i] = SDValue(Node, 0);
}
}
if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {

View File

@ -27,6 +27,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;

View File

@ -561,6 +561,21 @@ static bool compareMachineOp(const MachineOperand &Op0,
}
}
bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
const MachineOperand &MO) const {
const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
assert(MO.isImm() || MO.isFPImm());
if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
return true;
if (OpInfo.RegClass < 0)
return false;
return RI.regClassCanUseImmediate(OpInfo.RegClass);
}
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI->getOpcode();
@ -589,7 +604,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
}
break;
case MCOI::OPERAND_IMMEDIATE:
if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) {
// Check if this operand is an immediate.
// FrameIndex operands will be replaced by immediates, so they are
// allowed.
if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() &&
!MI->getOperand(i).isFI()) {
ErrInfo = "Expected immediate, but got non-immediate";
return false;
}

View File

@ -106,6 +106,9 @@ public:
bool isInlineConstant(const MachineOperand &MO) const;
bool isLiteralConstant(const MachineOperand &MO) const;
bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
const MachineOperand &MO) const;
bool verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const override;
@ -181,7 +184,7 @@ namespace AMDGPU {
int getMCOpcode(uint16_t Opcode, unsigned Gen);
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_TID_ENABLE = 1LL << 55;
} // End namespace AMDGPU

View File

@ -163,7 +163,9 @@ def sopp_brtarget : Operand<OtherVT> {
// Complex patterns
//===----------------------------------------------------------------------===//
def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
//===----------------------------------------------------------------------===//
// SI assembler operands
@ -605,12 +607,12 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
}
let offen = 1, idxen = 0, offset = 0 in {
let offen = 1, idxen = 0 in {
def _OFFEN : MUBUF <op, (outs regClass:$vdata),
(ins SReg_128:$srsrc, VReg_32:$vaddr,
SSrc_32:$soffset, i1imm:$glc, i1imm:$slc,
SSrc_32:$soffset, u16imm:$offset, i1imm:$glc, i1imm:$slc,
i1imm:$tfe),
asm#" $vdata, $srsrc + $vaddr + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
asm#" $vdata, $srsrc + $vaddr + $soffset + $offset, glc=$glc, slc=$slc, tfe=$tfe", []>;
}
let offen = 0, idxen = 1 in {
@ -640,25 +642,40 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
}
}
class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
ValueType store_vt, SDPatternOperator st> :
MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
u16imm:$offset),
name#" $vdata, $srsrc + $vaddr + $offset",
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
ValueType store_vt, SDPatternOperator st> {
let mayLoad = 0;
let mayStore = 1;
def "" : MUBUF <
op, (outs),
(ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$slc,
i1imm:$tfe),
name#" $vdata, $srsrc, $vaddr, $soffset, $offset $offen $idxen $glc $slc $tfe",
[]
> {
let addr64 = 0;
}
// Encoding
let offen = 0;
let idxen = 0;
let glc = 0;
let addr64 = 1;
let lds = 0;
let slc = 0;
let tfe = 0;
let soffset = 128; // ZERO
def _ADDR64 : MUBUF <
op, (outs),
(ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
name#" $vdata, $srsrc + $vaddr + $offset",
[(st store_vt:$vdata,
(MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
let mayLoad = 0;
let mayStore = 1;
// Encoding
let offen = 0;
let idxen = 0;
let glc = 0;
let addr64 = 1;
let lds = 0;
let slc = 0;
let tfe = 0;
let soffset = 128; // ZERO
}
}
class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <

View File

@ -872,23 +872,23 @@ defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load
>;
def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global
>;
def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global
>;
def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store
>;
def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store
>;
def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store
>;
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
@ -1667,6 +1667,12 @@ def SI_ADDR64_RSRC : InstSI <
"", []
>;
def SI_BUFFER_RSRC : InstSI <
(outs SReg_128:$srsrc),
(ins SReg_32:$ptr_lo, SReg_32:$ptr_hi, SSrc_32:$data_lo, SSrc_32:$data_hi),
"", []
>;
def V_SUB_F64 : InstSI <
(outs VReg_64:$dst),
(ins VReg_64:$src0, VReg_64:$src1),
@ -2410,7 +2416,7 @@ def : Ext32Pat <anyext>;
// Offset in an 32Bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
(BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0)
(BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
>;
// The multiplication scales from [0,1] to the unsigned integer range
@ -2599,22 +2605,30 @@ multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
(vt (constant_ld (add i64:$ptr, i64:$offset))),
(Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
>;
}
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32,
sextloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
az_extloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32,
sextloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32,
az_extloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
(vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))),
(Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
// BUFFER_LOAD_DWORD*, addr64=0
multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
@ -2630,9 +2644,9 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
def : Pat <
(vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
imm, 1, 0, imm:$glc, imm:$slc,
imm:$offset, 1, 0, imm:$glc, imm:$slc,
imm:$tfe)),
(offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
(offen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
(as_i1imm $tfe))
>;
@ -2660,6 +2674,34 @@ defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_
defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
(st vt:$value, (MUBUFAddr32 v4i32:$srsrc, i32:$vaddr, i32:$soffset,
u16imm:$offset, i1imm:$offen, i1imm:$idxen,
i1imm:$glc, i1imm:$slc, i1imm:$tfe)),
(Instr $value, $srsrc, $vaddr, $soffset, $offset, $offen, $idxen,
$glc, $slc, $tfe)
>;
def : MUBUFScratchStorePat <BUFFER_STORE_BYTE, i32, truncstorei8_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_SHORT, i32, truncstorei16_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORD, i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2, v2i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4, v4i32, store_private>;
/*
class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
(st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)),
(Instr $value, $srsrc, $vaddr, $offset)
>;
def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
*/
//===----------------------------------------------------------------------===//
// MTBUF Patterns
//===----------------------------------------------------------------------===//

View File

@ -27,7 +27,8 @@ void SIMachineFunctionInfo::anchor() {}
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
PSInputAddr(0),
SpillTracker() { }
SpillTracker(),
NumUserSGPRs(0) { }
static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);

View File

@ -59,6 +59,7 @@ public:
SIMachineFunctionInfo(const MachineFunction &MF);
unsigned PSInputAddr;
struct RegSpillTracker SpillTracker;
unsigned NumUserSGPRs;
};
} // End namespace llvm

View File

@ -16,6 +16,10 @@
#include "SIRegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
using namespace llvm;
@ -27,8 +31,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::EXEC);
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
TII->reserveIndirectRegisters(Reserved, MF);
return Reserved;
}
@ -37,6 +39,30 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return RC->getNumRegs();
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
return Fn.getFrameInfo()->hasStackObjects();
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineFunction *MF = MI->getParent()->getParent();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
int Index = MI->getOperand(FIOperandNum).getIndex();
int64_t Offset = FrameInfo->getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj);
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
FIOp.ChangeToRegister(TmpReg, false);
}
}
const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {
@ -141,3 +167,21 @@ bool SIRegisterInfo::regClassCanUseImmediate(
const TargetRegisterClass *RC) const {
return regClassCanUseImmediate(RC->getID());
}
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
switch (Value) {
case SIRegisterInfo::TGID_X:
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
case SIRegisterInfo::TGID_Y:
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
case SIRegisterInfo::TGID_Z:
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
case SIRegisterInfo::SCRATCH_PTR:
return AMDGPU::SGPR2_SGPR3;
}
}

View File

@ -29,6 +29,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
@ -68,6 +74,19 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
/// \returns True if operands defined with this register class can accept
/// inline immediates.
bool regClassCanUseImmediate(const TargetRegisterClass *RC) const;
enum PreloadedValue {
TGID_X,
TGID_Y,
TGID_Z,
SCRATCH_WAVE_OFFSET,
SCRATCH_PTR
};
/// \brief Returns the physical register that \p Value is stored in.
unsigned getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const;
};
} // End namespace llvm

View File

@ -11,15 +11,18 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
; SI-LABEL: @test_private_array_ptr_calc:
; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
; SI-ALLOCA: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]]
; FIXME: We end up with zero argument for ADD, because
; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
; with the appropriate offset. We should fold this into the store.
; SI-ALLOCA: V_ADD_I32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}}
; SI-ALLOCA: BUFFER_STORE_DWORD {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], [[PTRREG]]
;
; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
; alloca to a vector. It currently fails because it does not know how
; to interpret:
; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
; SI-PROMOTE: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
; SI-PROMOTE: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]]
define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
%alloca = alloca [4 x i32], i32 4, align 16

View File

@ -76,3 +76,22 @@ define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
ret void
}
define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
entry:
%0 = icmp eq i32 0, %a
br i1 %0, label %if, label %else
if:
%1 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
%2 = load float addrspace(2)* %1
store float %2, float addrspace(1)* %out
br label %endif
else:
store float 1.0, float addrspace(1)* %out
br label %endif
endif:
ret void
}

View File

@ -6,10 +6,10 @@ declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
; SI-LABEL: @private_access_f64_alloca:
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: BUFFER_STORE_DWORDX2
; FIXME: We should be able to use BUFFER_LOAD_DWORDX2
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-PROMOTE: DS_WRITE_B64
; SI-PROMOTE: DS_READ_B64
@ -26,10 +26,12 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
; SI-LABEL: @private_access_v2f64_alloca:
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: BUFFER_STORE_DWORDX4
; FIXME: We should be able to use BUFFER_LOAD_DWORDX4
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
@ -52,10 +54,10 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
; SI-LABEL: @private_access_i64_alloca:
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: BUFFER_STORE_DWORDX2
; FIXME: We should be able to use BUFFER_LOAD_DWORDX2
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-PROMOTE: DS_WRITE_B64
; SI-PROMOTE: DS_READ_B64
@ -72,14 +74,12 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
; SI-LABEL: @private_access_v2i64_alloca:
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: BUFFER_STORE_DWORDX4
; FIXME: We should be able to use BUFFER_LOAD_DWORDX4
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32

View File

@ -16,12 +16,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
; SI-ALLOCA: V_READFIRSTLANE_B32 vcc_lo
; SI-ALLOCA: V_MOVRELD
; SI-ALLOCA: S_CBRANCH
; SI-ALLOCA: V_READFIRSTLANE_B32 vcc_lo
; SI-ALLOCA: V_MOVRELD
; SI-ALLOCA: S_CBRANCH
; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
%stack = alloca [5 x i32], align 4
@ -120,7 +116,9 @@ for.end:
; R600: MOVA_INT
; SI-PROMOTE: V_MOVRELS_B32_e32
; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
; SI_PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] + v{{[0-9]+}}, s{{[0-9]+}}
define void @short_array(i32 addrspace(1)* %out, i32 %index) {
entry:
%0 = alloca [2 x i16]
@ -139,8 +137,8 @@ entry:
; R600: MOVA_INT
; SI: V_OR_B32_e32 v{{[0-9]}}, 0x100
; SI: V_MOVRELS_B32_e32
; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x0
; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x1
define void @char_array(i32 addrspace(1)* %out, i32 %index) {
entry:
%0 = alloca [2 x i8]

View File

@ -127,12 +127,12 @@ entry:
ret void
}
; The tgid values are stored in ss offset by the number of user ss.
; Currently we always use exactly 2 user ss for the pointer to the
; The tgid values are stored in sgprs offset by the number of user sgprs.
; Currently we always use exactly 2 user sgprs for the pointer to the
; kernel arguments, but this may change in the future.
; SI-CHECK: @tgid_x
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s2
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @tgid_x (i32 addrspace(1)* %out) {
entry:
@ -142,7 +142,7 @@ entry:
}
; SI-CHECK: @tgid_y
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s3
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s5
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @tgid_y (i32 addrspace(1)* %out) {
entry:
@ -152,7 +152,7 @@ entry:
}
; SI-CHECK: @tgid_z
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s6
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @tgid_z (i32 addrspace(1)* %out) {
entry: