R600/SI: Add support for private address space load/store

Private address space is emulated using the register file with
MOVRELS and MOVRELD instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194626 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Tom Stellard 2013-11-13 23:36:50 +00:00
parent b52bf6a3b3
commit a2b4eb6d15
28 changed files with 429 additions and 154 deletions

View File

@ -309,6 +309,40 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
SDLoc(N), N->getValueType(0), Ops);
}
case AMDGPUISD::REGISTER_LOAD: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
SelectADDRIndirect(N->getOperand(1), Addr, Offset);
const SDValue Ops[] = {
Addr,
Offset,
CurDAG->getTargetConstant(0, MVT::i32),
N->getOperand(0),
};
return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, SDLoc(N),
CurDAG->getVTList(MVT::i32, MVT::i64, MVT::Other),
Ops);
}
case AMDGPUISD::REGISTER_STORE: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
SelectADDRIndirect(N->getOperand(2), Addr, Offset);
const SDValue Ops[] = {
N->getOperand(1),
Addr,
Offset,
CurDAG->getTargetConstant(0, MVT::i32),
N->getOperand(0),
};
return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, SDLoc(N),
CurDAG->getVTList(MVT::Other),
Ops);
}
}
return SelectCode(N);
}

View File

@ -15,6 +15,7 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDILIntrinsicInfo.h"
@ -250,8 +251,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
// AMDGPU DAG lowering
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
}
@ -326,6 +327,21 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
&Args[0], Args.size());
}
SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const AMDGPUFrameLowering *TFL =
static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
assert(FIN);
unsigned FrameIndex = FIN->getIndex();
unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
Op.getValueType());
}
SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
@ -563,7 +579,8 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
StoreSDNode *Store = cast<StoreSDNode>(Op);
if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
Store->getValue().getValueType().isVector()) {
return SplitVectorStore(Op, DAG);
}

View File

@ -28,6 +28,7 @@ private:
void ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Args,
unsigned Start, unsigned Count) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;

View File

@ -120,31 +120,43 @@ AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const {
MachineBasicBlock *MBB = MI->getParent();
int OffsetOpIdx =
AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::addr);
// addr is a custom operand with multiple MI operands, and only the
// first MI operand is given a name.
int RegOpIdx = OffsetOpIdx + 1;
int ChanOpIdx =
AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::chan);
if (isRegisterLoad(*MI)) {
unsigned RegIndex = MI->getOperand(2).getImm();
unsigned Channel = MI->getOperand(3).getImm();
int DstOpIdx =
AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
unsigned OffsetReg = MI->getOperand(1).getReg();
unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, MI->getOperand(0).getReg(),
buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
getIndirectAddrRegClass()->getRegister(Address));
} else {
buildIndirectRead(MBB, MI, MI->getOperand(0).getReg(),
buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
Address, OffsetReg);
}
} else if (isRegisterStore(*MI)) {
unsigned RegIndex = MI->getOperand(2).getImm();
unsigned Channel = MI->getOperand(3).getImm();
int ValOpIdx =
AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::val);
AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
unsigned OffsetReg = MI->getOperand(1).getReg();
unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
MI->getOperand(0).getReg());
MI->getOperand(ValOpIdx).getReg());
} else {
buildIndirectWrite(MBB, MI, MI->getOperand(0).getReg(),
calculateIndirectAddress(RegIndex, Channel),
OffsetReg);
buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(),
calculateIndirectAddress(RegIndex, Channel),
OffsetReg);
}
} else {
return false;
@ -260,6 +272,57 @@ bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
}
int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
const MachineFrameInfo *MFI = MF.getFrameInfo();
int Offset = -1;
if (MFI->getNumObjects() == 0) {
return -1;
}
if (MRI.livein_empty()) {
return 0;
}
const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
LE = MRI.livein_end();
LI != LE; ++LI) {
unsigned Reg = LI->first;
if (TargetRegisterInfo::isVirtualRegister(Reg) ||
!IndirectRC->contains(Reg))
continue;
unsigned RegIndex;
unsigned RegEnd;
for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
++RegIndex) {
if (IndirectRC->getRegister(RegIndex) == Reg)
break;
}
Offset = std::max(Offset, (int)RegIndex);
}
return Offset + 1;
}
int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
int Offset = 0;
const MachineFrameInfo *MFI = MF.getFrameInfo();
// Variable sized objects are not supported
assert(!MFI->hasVarSizedObjects());
if (MFI->getNumObjects() == 0) {
return -1;
}
Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
return getIndirectIndexBegin(MF) + Offset;
}
void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
DebugLoc DL) const {

View File

@ -99,6 +99,14 @@ protected:
MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
MachineInstr *LoadMI) const;
/// \returns the smallest register index that will be accessed by an indirect
/// read or write or -1 if indirect addressing is not used by this program.
virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
/// \returns the largest register index that will be accessed by an indirect
/// read or write or -1 if indirect addressing is not used by this program.
virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
public:
bool canFoldMemoryOperand(const MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops) const;
@ -144,14 +152,6 @@ public:
virtual unsigned getIEQOpcode() const = 0;
virtual bool isMov(unsigned opcode) const = 0;
/// \returns the smallest register index that will be accessed by an indirect
/// read or write or -1 if indirect addressing is not used by this program.
virtual int getIndirectIndexBegin(const MachineFunction &MF) const = 0;
/// \returns the largest register index that will be accessed by an indirect
/// read or write or -1 if indirect addressing is not used by this program.
virtual int getIndirectIndexEnd(const MachineFunction &MF) const = 0;
/// \brief Calculate the "Indirect Address" for the given \p RegIndex and
/// \p Channel
///

View File

@ -35,6 +35,7 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
}
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
def COND_EQ : PatLeaf <
(cond),
@ -277,6 +278,8 @@ class FNEG <RegisterClass rc> : AMDGPUShaderInst <
multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
ComplexPattern addrPat> {
let UseNamedOperandTable = 1 in {
def RegisterLoad : AMDGPUShaderInst <
(outs dstClass:$dst),
(ins addrClass:$addr, i32imm:$chan),
@ -295,6 +298,7 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
let isRegisterStore = 1;
}
}
}
} // End isCodeGenOnly = 1, isPseudo = 1

View File

@ -50,6 +50,10 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
assert(!"Unimplemented"); return NULL;
}
virtual unsigned getHWRegIndex(unsigned Reg) const {
assert(!"Unimplemented"); return 0;
}
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
unsigned getSubRegFromChannel(unsigned Channel) const;

View File

@ -519,7 +519,6 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_VOID: {
SDValue Chain = Op.getOperand(0);
@ -843,20 +842,6 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
false, false, false, 0);
}
SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const AMDGPUFrameLowering *TFL =
static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
assert(FIN);
unsigned FrameIndex = FIN->getIndex();
unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}
bool R600TargetLowering::isZero(SDValue Op) const {
if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
return Cst->isNullValue();

View File

@ -59,7 +59,6 @@ private:
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,

View File

@ -1024,67 +1024,25 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return 2;
}
int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
const MachineFrameInfo *MFI = MF.getFrameInfo();
int Offset = 0;
if (MFI->getNumObjects() == 0) {
return -1;
}
if (MRI.livein_empty()) {
return 0;
}
for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
LE = MRI.livein_end();
LI != LE; ++LI) {
Offset = std::max(Offset,
GET_REG_INDEX(RI.getEncodingValue(LI->first)));
}
return Offset + 1;
}
int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
int Offset = 0;
const MachineFrameInfo *MFI = MF.getFrameInfo();
// Variable sized objects are not supported
assert(!MFI->hasVarSizedObjects());
if (MFI->getNumObjects() == 0) {
return -1;
}
Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
return getIndirectIndexBegin(MF) + Offset;
}
std::vector<unsigned> R600InstrInfo::getIndirectReservedRegs(
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const {
const AMDGPUFrameLowering *TFL =
static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering());
std::vector<unsigned> Regs;
unsigned StackWidth = TFL->getStackWidth(MF);
int End = getIndirectIndexEnd(MF);
if (End == -1) {
return Regs;
}
if (End == -1)
return;
for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
Regs.push_back(SuperReg);
Reserved.set(SuperReg);
for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
Regs.push_back(Reg);
Reserved.set(Reg);
}
}
return Regs;
}
unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,

View File

@ -193,14 +193,9 @@ namespace llvm {
virtual int getInstrLatency(const InstrItineraryData *ItinData,
SDNode *Node) const { return 1;}
/// \returns a list of all the registers that may be accesed using indirect
/// addressing.
std::vector<unsigned> getIndirectReservedRegs(const MachineFunction &MF) const;
virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
/// \brief Reserve the registers that may be accesed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const;
virtual unsigned calculateIndirectAddress(unsigned RegIndex,
unsigned Channel) const;

View File

@ -75,7 +75,6 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),

View File

@ -28,6 +28,8 @@ R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm)
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
Reserved.set(AMDGPU::ZERO);
Reserved.set(AMDGPU::HALF);
Reserved.set(AMDGPU::ONE);
@ -48,14 +50,8 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(*I);
}
const R600InstrInfo *RII =
static_cast<const R600InstrInfo*>(TM.getInstrInfo());
std::vector<unsigned> IndirectRegs = RII->getIndirectReservedRegs(MF);
for (std::vector<unsigned>::iterator I = IndirectRegs.begin(),
E = IndirectRegs.end();
I != E; ++I) {
Reserved.set(*I);
}
TII->reserveIndirectRegisters(Reserved, MF);
return Reserved;
}
@ -73,6 +69,10 @@ unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
}
unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const {
return GET_REG_INDEX(getEncodingValue(Reg));
}
const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {

View File

@ -39,6 +39,8 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
virtual unsigned getHWRegIndex(unsigned Reg) const;
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;

View File

@ -75,6 +75,19 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
// We need to custom lower loads/stores from private memory
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::LOAD, MVT::i64, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::i64, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@ -95,6 +108,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
@ -106,6 +120,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::FrameIndex, MVT::i64, Custom);
setTargetDAGCombine(ISD::SELECT_CC);
@ -122,6 +137,8 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
bool *IsFast) const {
// XXX: This depends on the address space and also we may want to revist
// the alignment values we specify in the DataLayout.
if (!VT.isSimple() || VT == MVT::Other)
return false;
return VT.bitsGT(MVT::i32);
}
@ -350,6 +367,19 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MI->eraseFromParent();
break;
}
case AMDGPU::SI_RegisterStorePseudo: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
const SIInstrInfo *TII =
static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
MachineInstrBuilder MIB =
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
Reg);
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
MIB.addOperand(MI->getOperand(i));
MI->eraseFromParent();
}
}
return BB;
}
@ -395,7 +425,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
Op.getValueType().isVector()) {
SDValue MergedValues[2] = {
SplitVectorLoad(Op, DAG),
@ -403,20 +434,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
};
return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
} else {
return SDValue();
return LowerLOAD(Op, DAG);
}
}
case ISD::STORE: {
StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
if (Store->getValue().getValueType().isVector() &&
Store->getValue().getValueType().getVectorNumElements() >= 8)
return SplitVectorStore(Op, DAG);
else
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::ANY_EXTEND: // Fall-through
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
@ -628,6 +652,30 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
return SDValue();
SDValue TruncPtr = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
Load->getBasePtr(), DAG.getConstant(0, MVT::i32));
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
DAG.getConstant(2, MVT::i32));
SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
Load->getChain(), Ptr,
DAG.getTargetConstant(0, MVT::i32),
Op.getOperand(2));
SDValue MergedValues[2] = {
Ret,
Load->getChain()
};
return DAG.getMergeValues(MergedValues, 2, DL);
}
SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
SelectionDAG &DAG) const {
@ -685,6 +733,56 @@ SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
}
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT VT = Store->getMemoryVT();
SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
if (Ret.getNode())
return Ret;
if (VT.isVector() && VT.getVectorNumElements() >= 8)
return SplitVectorStore(Op, DAG);
if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
return SDValue();
SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
DAG.getConstant(2, MVT::i32));
SDValue Chain = Store->getChain();
SmallVector<SDValue, 8> Values;
if (VT == MVT::i64) {
for (unsigned i = 0; i < 2; ++i) {
Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
Store->getValue(), DAG.getConstant(i, MVT::i32)));
}
} else if (VT == MVT::i128) {
for (unsigned i = 0; i < 2; ++i) {
for (unsigned j = 0; j < 2; ++j) {
Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
Store->getValue(), DAG.getConstant(i, MVT::i32)),
DAG.getConstant(j, MVT::i32)));
}
}
} else {
Values.push_back(Store->getValue());
}
for (unsigned i = 0; i < Values.size(); ++i) {
SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
Ptr, DAG.getConstant(i, MVT::i32));
Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
Chain, Values[i], PartPtr,
DAG.getTargetConstant(0, MVT::i32));
}
return Chain;
}
SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();

View File

@ -25,8 +25,10 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue Chain, unsigned Offset) const;
SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;

View File

@ -186,7 +186,7 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
if (!Op.isReg())
if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
return std::make_pair(0, 0);
unsigned Reg = Op.getReg();

View File

@ -230,7 +230,8 @@ MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned DstReg,
unsigned SrcReg) const {
llvm_unreachable("Not Implemented");
return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
DstReg) .addReg(SrcReg);
}
bool SIInstrInfo::isMov(unsigned Opcode) const {
@ -603,17 +604,8 @@ unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
return RegIndex;
}
int SIInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
llvm_unreachable("Unimplemented");
}
int SIInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
llvm_unreachable("Unimplemented");
}
const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
llvm_unreachable("Unimplemented");
return &AMDGPU::VReg_32RegClass;
}
MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
@ -621,7 +613,17 @@ MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
MachineBasicBlock::iterator I,
unsigned ValueReg,
unsigned Address, unsigned OffsetReg) const {
llvm_unreachable("Unimplemented");
const DebugLoc &DL = MBB->findDebugLoc(I);
unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
getIndirectIndexBegin(*MBB->getParent()));
return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
.addReg(IndirectBaseReg, RegState::Define)
.addOperand(I->getOperand(0))
.addReg(IndirectBaseReg)
.addReg(OffsetReg)
.addImm(0)
.addReg(ValueReg);
}
MachineInstrBuilder SIInstrInfo::buildIndirectRead(
@ -629,5 +631,43 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead(
MachineBasicBlock::iterator I,
unsigned ValueReg,
unsigned Address, unsigned OffsetReg) const {
llvm_unreachable("Unimplemented");
const DebugLoc &DL = MBB->findDebugLoc(I);
unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
getIndirectIndexBegin(*MBB->getParent()));
return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
.addOperand(I->getOperand(0))
.addOperand(I->getOperand(1))
.addReg(IndirectBaseReg)
.addReg(OffsetReg)
.addImm(0);
}
void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const {
int End = getIndirectIndexEnd(MF);
int Begin = getIndirectIndexBegin(MF);
if (End == -1)
return;
for (int Index = Begin; Index <= End; ++Index)
Reserved.set(AMDGPU::VReg_32RegClass.getRegister(Index));
for (int Index = std::max(0, Index - 1); Index <= End; ++Index)
Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
for (int Index = std::max(0, Index - 2); Index <= End; ++Index)
Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));
for (int Index = std::max(0, Index - 3); Index <= End; ++Index)
Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));
for (int Index = std::max(0, Index - 7); Index <= End; ++Index)
Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));
for (int Index = std::max(0, Index - 15); Index <= End; ++Index)
Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
}

View File

@ -25,6 +25,14 @@ class SIInstrInfo : public AMDGPUInstrInfo {
private:
const SIRegisterInfo RI;
MachineInstrBuilder buildIndirectIndexLoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned OffsetVGPR,
unsigned MovRelOp,
unsigned Dst,
unsigned Src0) const;
// If you add or remove instructions from this function, you will
public:
explicit SIInstrInfo(AMDGPUTargetMachine &tm);
@ -58,9 +66,6 @@ public:
virtual bool verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const;
virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
bool isSALUInstr(const MachineInstr &MI) const;
unsigned getVALUOp(const MachineInstr &MI) const;
@ -114,7 +119,12 @@ public:
unsigned ValueReg,
unsigned Address,
unsigned OffsetReg) const;
};
void reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const;
void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
unsigned SavReg, unsigned IndexReg) const;
};
namespace AMDGPU {

View File

@ -121,6 +121,10 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{
return false;
}]>;
def FRAMEri64 : Operand<iPTR> {
let MIOperandInfo = (ops SReg_32:$ptr, i32imm:$index);
}
//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//

View File

@ -1293,6 +1293,36 @@ def SI_KILL : InstSI <
let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri64, ADDRIndirect>;
let UseNamedOperandTable = 1 in {
def SI_RegisterLoad : AMDGPUShaderInst <
(outs VReg_32:$dst, SReg_64:$temp),
(ins FRAMEri64:$addr, i32imm:$chan),
"", []
> {
let isRegisterLoad = 1;
let mayLoad = 1;
}
class SIRegStore<dag outs> : AMDGPUShaderInst <
outs,
(ins VReg_32:$val, FRAMEri64:$addr, i32imm:$chan),
"", []
> {
let isRegisterStore = 1;
let mayStore = 1;
}
let usesCustomInserter = 1 in {
def SI_RegisterStorePseudo : SIRegStore<(outs)>;
} // End usesCustomInserter = 1
def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
} // End UseNamedOperandTable = 1
def SI_INDIRECT_SRC : InstSI <
(outs VReg_32:$dst, SReg_64:$temp),
(ins unknown:$src, VSrc_32:$idx, i32imm:$off),
@ -1309,6 +1339,7 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
let Constraints = "$src = $dst";
}
def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VReg_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
@ -1988,7 +2019,7 @@ def : Pat<
(V_CMP_U_F32_e64 $src0, $src1)
>;
//============================================================================//
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
@ -1999,6 +2030,11 @@ def : Pat <
(i32 (EXTRACT_SUBREG $x, sub1)), sub1)
>;
def : Pat <
(i32 (trunc i64:$a)),
(EXTRACT_SUBREG $a, sub0)
>;
def : Pat <
(or i64:$a, i64:$b),
(INSERT_SUBREG

View File

@ -377,10 +377,13 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
unsigned Dst = MI.getOperand(0).getReg();
unsigned Vec = MI.getOperand(2).getReg();
unsigned Off = MI.getOperand(4).getImm();
unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
if (!SubReg)
SubReg = Vec;
MachineInstr *MovRel =
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off)
.addReg(SubReg + Off)
.addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Vec, RegState::Implicit);
@ -395,10 +398,13 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
unsigned Dst = MI.getOperand(0).getReg();
unsigned Off = MI.getOperand(4).getImm();
unsigned Val = MI.getOperand(5).getReg();
unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
if (!SubReg)
SubReg = Dst;
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
.addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define)
.addReg(SubReg + Off, RegState::Define)
.addReg(Val)
.addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Dst, RegState::Implicit);
@ -477,6 +483,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
IndirectSrc(MI);
break;
case AMDGPU::SI_INDIRECT_DST_V1:
case AMDGPU::SI_INDIRECT_DST_V2:
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:

View File

@ -15,6 +15,7 @@
#include "SIRegisterInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIInstrInfo.h"
using namespace llvm;
@ -26,6 +27,9 @@ SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm)
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::EXEC);
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo());
TII->reserveIndirectRegisters(Reserved, MF);
return Reserved;
}
@ -51,6 +55,10 @@ const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
}
}
unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
return getEncodingValue(Reg);
}
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));

View File

@ -42,6 +42,8 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
/// CFGStructurizer
virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
virtual unsigned getHWRegIndex(unsigned Reg) const;
/// \brief Return the 'base' register class for this register.
/// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc.
const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;

View File

@ -299,8 +299,6 @@ entry:
; R600-CHECK: 31
; SI-CHECK-LABEL: @load_i64_sext
; SI-CHECK: BUFFER_LOAD_DWORDX2 [[VAL:v\[[0-9]:[0-9]\]]]
; SI-CHECK: V_LSHL_B64 [[LSHL:v\[[0-9]:[0-9]\]]], [[VAL]], 32
; SI-CHECK: V_ASHR_I64 v{{\[[0-9]:[0-9]\]}}, [[LSHL]], 32
define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:

View File

@ -1,16 +1,24 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
; This test checks that uses and defs of the AR register happen in the same
; instruction clause.
; CHECK: @mova_same_clause
; CHECK: MOVA_INT
; CHECK-NOT: ALU clause
; CHECK: 0 + AR.x
; CHECK: MOVA_INT
; CHECK-NOT: ALU clause
; CHECK: 0 + AR.x
; R600-CHECK-LABEL: @mova_same_clause
; R600-CHECK: MOVA_INT
; R600-CHECK-NOT: ALU clause
; R600-CHECK: 0 + AR.x
; R600-CHECK: MOVA_INT
; R600-CHECK-NOT: ALU clause
; R600-CHECK: 0 + AR.x
; SI-CHECK-LABEL: @mova_same_clause
; SI-CHECK: V_READFIRSTLANE
; SI-CHECK: V_MOVRELD
; SI-CHECK: S_CBRANCH
; SI-CHECK: V_READFIRSTLANE
; SI-CHECK: V_MOVRELD
; SI-CHECK: S_CBRANCH
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
%stack = alloca [5 x i32], align 4
@ -38,9 +46,10 @@ entry:
; XXX: This generated code has unnecessary MOVs, we should be able to optimize
; this.
; CHECK: @multiple_structs
; CHECK-NOT: MOVA_INT
; R600-CHECK-LABEL: @multiple_structs
; R600-CHECK-NOT: MOVA_INT
; SI-CHECK-LABEL: @multiple_structs
; SI-CHECK-NOT: V_MOVREL
%struct.point = type { i32, i32 }
define void @multiple_structs(i32 addrspace(1)* %out) {
@ -68,8 +77,10 @@ entry:
; loads and stores should be lowered to copies, so there shouldn't be any
; MOVA instructions.
; CHECK: @direct_loop
; CHECK-NOT: MOVA_INT
; R600-CHECK-LABLE: @direct_loop
; R600-CHECK-NOT: MOVA_INT
; SI-CHECK-LABEL: @direct_loop
; SI-CHECK-NOT: V_MOVREL
define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:

View File

@ -43,7 +43,7 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
;EG-CHECK: ASHR
;SI-CHECK-LABEL: @ashr_i64
;SI-CHECK: V_ASHR_I64
;SI-CHECK: S_ASHR_I64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) {
entry:
%0 = sext i32 %in to i64

View File

@ -1,8 +1,7 @@
; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: @unaligned_load_store_i32:
; SI: V_ADD_I32_e64 [[REG:v[0-9]+]]
; DS_READ_U8 {{v[0-9]+}}, 0, [[REG]]
; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
define void @unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
%v = load i32 addrspace(3)* %p, align 1
store i32 %v, i32 addrspace(3)* %r, align 1
@ -10,8 +9,7 @@ define void @unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r
}
; SI-LABEL: @unaligned_load_store_v4i32:
; SI: V_ADD_I32_e64 [[REG:v[0-9]+]]
; DS_READ_U8 {{v[0-9]+}}, 0, [[REG]]
; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
define void @unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
%v = load <4 x i32> addrspace(3)* %p, align 1
store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1