diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 1aa607f57ea..bac01a3f427 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -36,6 +36,7 @@ FunctionPass *createSIInsertWaits(TargetMachine &tm); // Passes common to R600 and SI Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); +FunctionPass* createAMDGPUIndirectAddressingPass(TargetMachine &tm); } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp new file mode 100644 index 00000000000..815d6f71c3b --- /dev/null +++ b/lib/Target/R600/AMDGPUFrameLowering.cpp @@ -0,0 +1,122 @@ +//===----------------------- AMDGPUFrameLowering.cpp ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// Interface to describe a layout of a stack frame on a AMDIL target machine +// +//===----------------------------------------------------------------------===// +#include "AMDGPUFrameLowering.h" +#include "AMDGPURegisterInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Instructions.h" + +using namespace llvm; +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, + int LAO, unsigned TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) { } + +AMDGPUFrameLowering::~AMDGPUFrameLowering() { } + +unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { + + // XXX: Hardcoding to 1 for now. + // + // I think the StackWidth should stored as metadata associated with the + // MachineFunction. This metadata can either be added by a frontend, or + // calculated by a R600 specific LLVM IR pass. + // + // The StackWidth determines how stack objects are laid out in memory. + // For a vector stack variable, like: int4 stack[2], the data will be stored + // in the following ways depending on the StackWidth. + // + // StackWidth = 1: + // + // T0.X = stack[0].x + // T1.X = stack[0].y + // T2.X = stack[0].z + // T3.X = stack[0].w + // T4.X = stack[1].x + // T5.X = stack[1].y + // T6.X = stack[1].z + // T7.X = stack[1].w + // + // StackWidth = 2: + // + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T1.X = stack[0].z + // T1.Y = stack[0].w + // T2.X = stack[1].x + // T2.Y = stack[1].y + // T3.X = stack[1].z + // T3.Y = stack[1].w + // + // StackWidth = 4: + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T0.Z = stack[0].z + // T0.W = stack[0].w + // T1.X = stack[1].x + // T1.Y = stack[1].y + // T1.Z = stack[1].z + // T1.W = stack[1].w + return 1; +} + +/// \returns The number of registers allocated for \p FI. +int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned Offset = 0; + int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; + + for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { + const AllocaInst *Alloca = MFI->getObjectAllocation(i); + unsigned ArrayElements; + const Type *AllocaType = Alloca->getAllocatedType(); + const Type *ElementType; + + if (AllocaType->isArrayTy()) { + ArrayElements = AllocaType->getArrayNumElements(); + ElementType = AllocaType->getArrayElementType(); + } else { + ArrayElements = 1; + ElementType = AllocaType; + } + + unsigned VectorElements; + if (ElementType->isVectorTy()) { + VectorElements = ElementType->getVectorNumElements(); + } else { + VectorElements = 1; + } + + Offset += (VectorElements / getStackWidth(MF)) * ArrayElements; + } + return Offset; +} + +const TargetFrameLowering::SpillSlot * +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { + NumEntries = 0; + return 0; +} +void +AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { +} +void +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { +} + +bool +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { + return false; +} diff --git a/lib/Target/R600/AMDILFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h similarity index 86% rename from lib/Target/R600/AMDILFrameLowering.h rename to lib/Target/R600/AMDGPUFrameLowering.h index 51337c3dd2e..cf5742ee095 100644 --- a/lib/Target/R600/AMDILFrameLowering.h +++ b/lib/Target/R600/AMDGPUFrameLowering.h @@ -1,4 +1,4 @@ -//===--------------------- AMDILFrameLowering.h -----------------*- C++ -*-===// +//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -30,6 +30,10 @@ public: AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned TransAl = 1); virtual ~AMDGPUFrameLowering(); + + /// \returns The number of 32-bit sub-registers that are used when storing + /// values to the stack. + virtual unsigned getStackWidth(const MachineFunction &MF) const; virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const; virtual void emitPrologue(MachineFunction &MF) const; diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index f3a047a135c..d0d23d692a3 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -412,5 +412,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(URECIP) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(CONST_ADDRESS) + NODE_NAME_CASE(REGISTER_LOAD) + NODE_NAME_CASE(REGISTER_STORE) } } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 0584d396824..927ed09b463 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -122,6 +122,8 @@ enum { URECIP, EXPORT, CONST_ADDRESS, + REGISTER_LOAD, + REGISTER_STORE, LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp new file mode 100644 index 00000000000..56aaf23cae9 --- /dev/null +++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp @@ -0,0 +1,326 @@ +//===-- AMDGPUIndirectAddressing.cpp - Indirect Adressing Support ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// Instructions can use indirect addressing to index the register file as if it +/// were memory. This pass lowers RegisterLoad and RegisterStore instructions +/// to either a COPY or a MOV that uses indirect addressing. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace { + +class AMDGPUIndirectAddressingPass : public MachineFunctionPass { + +private: + static char ID; + const AMDGPUInstrInfo *TII; + + bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const; + +public: + AMDGPUIndirectAddressingPass(TargetMachine &tm) : + MachineFunctionPass(ID), + TII(static_cast(tm.getInstrInfo())) + { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { return "R600 Handle indirect addressing"; } + +}; + +} // End anonymous namespace + +char AMDGPUIndirectAddressingPass::ID = 0; + +FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) { + return new AMDGPUIndirectAddressingPass(tm); +} + +bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + int IndirectBegin = TII->getIndirectIndexBegin(MF); + int IndirectEnd = TII->getIndirectIndexEnd(MF); + + if (IndirectBegin == -1) { + // No indirect addressing, we can skip this pass + assert(IndirectEnd == -1); + return false; + } + + // The map keeps track of the indirect address that is represented by + // each virtual register. The key is the register and the value is the + // indirect address it uses. + std::map RegisterAddressMap; + + // First pass - Lower all of the RegisterStore instructions and track which + // registers are live. + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + // This map keeps track of the current live indirect registers. + // The key is the address and the value is the register + std::map LiveAddressRegisterMap; + MachineBasicBlock &MBB = *BB; + + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next) { + Next = llvm::next(I); + MachineInstr &MI = *I; + + if (!TII->isRegisterStore(MI)) { + continue; + } + + // Lower RegisterStore + + unsigned RegIndex = MI.getOperand(2).getImm(); + unsigned Channel = MI.getOperand(3).getImm(); + unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel); + const TargetRegisterClass *IndirectStoreRegClass = + TII->getIndirectAddrStoreRegClass(MI.getOperand(0).getReg()); + + if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) { + // Direct register access. + unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass); + + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg) + .addOperand(MI.getOperand(0)); + + RegisterAddressMap[DstReg] = Address; + LiveAddressRegisterMap[Address] = DstReg; + } else { + // Indirect register access. + MachineInstrBuilder MOV = TII->buildIndirectWrite(BB, I, + MI.getOperand(0).getReg(), // Value + Address, + MI.getOperand(1).getReg()); // Offset + for (int i = IndirectBegin; i <= IndirectEnd; ++i) { + unsigned Addr = TII->calculateIndirectAddress(i, Channel); + unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass); + MOV.addReg(DstReg, RegState::Define | RegState::Implicit); + RegisterAddressMap[DstReg] = Addr; + LiveAddressRegisterMap[Addr] = DstReg; + } + } + MI.eraseFromParent(); + } + + // Update the live-ins of the succesor blocks + for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(), + SuccEnd = MBB.succ_end(); + SuccEnd != Succ; ++Succ) { + std::map::const_iterator Key, KeyEnd; + for (Key = LiveAddressRegisterMap.begin(), + KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) { + (*Succ)->addLiveIn(Key->second); + } + } + } + + // Second pass - Lower the RegisterLoad instructions + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + // Key is the address and the value is the register + std::map LiveAddressRegisterMap; + MachineBasicBlock &MBB = *BB; + + MachineBasicBlock::livein_iterator LI = MBB.livein_begin(); + while (LI != MBB.livein_end()) { + std::vector PhiRegisters; + + // Make sure this live in is used for indirect addressing + if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) { + ++LI; + continue; + } + + unsigned Address = RegisterAddressMap[*LI]; + LiveAddressRegisterMap[Address] = *LI; + PhiRegisters.push_back(*LI); + + // Check if there are other live in registers which map to the same + // indirect address. + for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI), + LE = MBB.livein_end(); + LJ != LE; ++LJ) { + unsigned Reg = *LJ; + if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) { + continue; + } + + if (RegisterAddressMap[Reg] == Address) { + if (!regHasExplicitDef(MRI, Reg)) { + continue; + } + PhiRegisters.push_back(Reg); + } + } + + if (PhiRegisters.size() == 1) { + // We don't need to insert a Phi instruction, so we can just add the + // registers to the live list for the block. + LiveAddressRegisterMap[Address] = *LI; + MBB.removeLiveIn(*LI); + } else { + // We need to insert a PHI, because we have the same address being + // written in multiple predecessor blocks. + const TargetRegisterClass *PhiDstClass = + TII->getIndirectAddrStoreRegClass(*(PhiRegisters.begin())); + unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass); + MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(), + MBB.findDebugLoc(MBB.begin()), + TII->get(AMDGPU::PHI), PhiDstReg); + + for (std::vector::const_iterator RI = PhiRegisters.begin(), + RE = PhiRegisters.end(); + RI != RE; ++RI) { + unsigned Reg = *RI; + MachineInstr *DefInst = MRI.getVRegDef(Reg); + assert(DefInst); + MachineBasicBlock *RegBlock = DefInst->getParent(); + Phi.addReg(Reg); + Phi.addMBB(RegBlock); + MBB.removeLiveIn(Reg); + } + RegisterAddressMap[PhiDstReg] = Address; + LiveAddressRegisterMap[Address] = PhiDstReg; + } + LI = MBB.livein_begin(); + } + + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next) { + Next = llvm::next(I); + MachineInstr &MI = *I; + + if (!TII->isRegisterLoad(MI)) { + if (MI.getOpcode() == AMDGPU::PHI) { + continue; + } + // Check for indirect register defs + for (unsigned OpIdx = 0, NumOperands = MI.getNumOperands(); + OpIdx < NumOperands; ++OpIdx) { + MachineOperand &MO = MI.getOperand(OpIdx); + if (MO.isReg() && MO.isDef() && + RegisterAddressMap.find(MO.getReg()) != RegisterAddressMap.end()) { + unsigned Reg = MO.getReg(); + unsigned LiveAddress = RegisterAddressMap[Reg]; + // Chain the live-ins + if (LiveAddressRegisterMap.find(LiveAddress) != + RegisterAddressMap.end()) { + MI.addOperand(MachineOperand::CreateReg( + LiveAddressRegisterMap[LiveAddress], + false, // isDef + true, // isImp + true)); // isKill + } + LiveAddressRegisterMap[LiveAddress] = Reg; + } + } + continue; + } + + const TargetRegisterClass *SuperIndirectRegClass = + TII->getSuperIndirectRegClass(); + const TargetRegisterClass *IndirectLoadRegClass = + TII->getIndirectAddrLoadRegClass(); + unsigned IndirectReg = MRI.createVirtualRegister(SuperIndirectRegClass); + + unsigned RegIndex = MI.getOperand(2).getImm(); + unsigned Channel = MI.getOperand(3).getImm(); + unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel); + + if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) { + // Direct register access + unsigned Reg = LiveAddressRegisterMap[Address]; + unsigned AddrReg = IndirectLoadRegClass->getRegister(Address); + + if (regHasExplicitDef(MRI, Reg)) { + // If the register we are reading from has an explicit def, then that + // means it was written via a direct register access (i.e. COPY + // or other instruction that doesn't use indirect addressing). In + // this case we know where the value has been stored, so we can just + // issue a copy. + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), + MI.getOperand(0).getReg()) + .addReg(Reg); + } else { + // If the register we are reading has an implicit def, then that + // means it was written by an indirect register access (i.e. An + // instruction that uses indirect addressing. + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), + MI.getOperand(0).getReg()) + .addReg(AddrReg); + } + } else { + // Indirect register access + + // Note on REQ_SEQUENCE instructons: You can't actually use the register + // it defines unless you have an instruction that takes the defined + // register class as an operand. + + MachineInstrBuilder Sequence = BuildMI(MBB, I, MBB.findDebugLoc(I), + TII->get(AMDGPU::REG_SEQUENCE), + IndirectReg); + for (int i = IndirectBegin; i <= IndirectEnd; ++i) { + unsigned Addr = TII->calculateIndirectAddress(i, Channel); + if (LiveAddressRegisterMap.find(Addr) == LiveAddressRegisterMap.end()) { + continue; + } + unsigned Reg = LiveAddressRegisterMap[Addr]; + + // We only need to use REG_SEQUENCE for explicit defs, since the + // register coalescer won't do anything with the implicit defs. + MachineInstr *DefInstr = MRI.getVRegDef(Reg); + if (!DefInstr->getOperand(0).isReg() || + DefInstr->getOperand(0).getReg() != Reg) { + continue; + } + + // Insert a REQ_SEQUENCE instruction to force the register allocator + // to allocate the virtual register to the correct physical register. + Sequence.addReg(LiveAddressRegisterMap[Addr]); + Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(Addr)); + } + MachineInstrBuilder Mov = TII->buildIndirectRead(BB, I, + MI.getOperand(0).getReg(), // Value + Address, + MI.getOperand(1).getReg()); // Offset + + + + Mov.addReg(IndirectReg, RegState::Implicit | RegState::Kill); + + } + MI.eraseFromParent(); + } + } + return false; +} + +bool AMDGPUIndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI, + unsigned Reg) const { + MachineInstr *DefInstr = MRI.getVRegDef(Reg); + return DefInstr && DefInstr->getOperand(0).isReg() && + DefInstr->getOperand(0).getReg() == Reg; +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index e42a46d839b..640707d7ca4 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -234,7 +234,16 @@ AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { // TODO: Implement this function return true; } - + +bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; +} + +bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; +} + + void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const { MachineRegisterInfo &MRI = MF.getRegInfo(); diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index cb97af98317..3909e4e105e 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -40,9 +40,10 @@ class MachineInstrBuilder; class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { private: const AMDGPURegisterInfo RI; - TargetMachine &TM; bool getNextBranchInstr(MachineBasicBlock::iterator &iter, MachineBasicBlock &MBB) const; +protected: + TargetMachine &TM; public: explicit AMDGPUInstrInfo(TargetMachine &tm); @@ -130,12 +131,66 @@ public: bool isAExtLoadInst(llvm::MachineInstr *MI) const; bool isStoreInst(llvm::MachineInstr *MI) const; bool isTruncStoreInst(llvm::MachineInstr *MI) const; + bool isRegisterStore(const MachineInstr &MI) const; + bool isRegisterLoad(const MachineInstr &MI) const; + +//===---------------------------------------------------------------------===// +// Pure virtual funtions to be implemented by sub-classes. +//===---------------------------------------------------------------------===// virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg, int64_t Imm) const = 0; virtual unsigned getIEQOpcode() const = 0; virtual bool isMov(unsigned opcode) const = 0; + /// \returns the smallest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + virtual int getIndirectIndexBegin(const MachineFunction &MF) const = 0; + + /// \returns the largest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + virtual int getIndirectIndexEnd(const MachineFunction &MF) const = 0; + + /// \brief Calculate the "Indirect Address" for the given \p RegIndex and + /// \p Channel + /// + /// We model indirect addressing using a virtual address space that can be + /// accesed with loads and stores. The "Indirect Address" is the memory + /// address in this virtual address space that maps to the given \p RegIndex + /// and \p Channel. + virtual unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const = 0; + + /// \returns The register class to be used for storing values to an + /// "Indirect Address" . + virtual const TargetRegisterClass *getIndirectAddrStoreRegClass( + unsigned SourceReg) const = 0; + + /// \returns The register class to be used for loading values from + /// an "Indirect Address" . + virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const = 0; + + /// \brief Build instruction(s) for an indirect register write. + /// + /// \returns The instruction that performs the indirect register write + virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \brief Build instruction(s) for an indirect register read. + /// + /// \returns The instruction that performs the indirect register read + virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \returns the register class whose sub registers are the set of all + /// possible registers that can be used for indirect addressing. + virtual const TargetRegisterClass *getSuperIndirectRegClass() const = 0; + + /// \brief Convert the AMDIL MachineInstr to a supported ISA /// MachineInstr virtual void convertToISA(MachineInstr & MI, MachineFunction &MF, @@ -145,4 +200,7 @@ public: } // End llvm namespace +#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) +#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) + #endif // AMDGPUINSTRINFO_H diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 96368e8541c..b66ae879dc2 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -72,3 +72,11 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>; + +def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", + SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayLoad]>; + +def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", + SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayStore]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index e634d20b614..3dee004270d 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// class AMDGPUInst pattern> : Instruction { - field bits<16> AMDILOp = 0; - field bits<3> Gen = 0; + field bit isRegisterLoad = 0; + field bit isRegisterStore = 0; let Namespace = "AMDGPU"; let OutOperandList = outs; @@ -22,8 +22,9 @@ class AMDGPUInst pattern> : Instructio let AsmString = asm; let Pattern = pattern; let Itinerary = NullALU; - let TSFlags{42-40} = Gen; - let TSFlags{63-48} = AMDILOp; + + let TSFlags{63} = isRegisterLoad; + let TSFlags{62} = isRegisterStore; } class AMDGPUShaderInst pattern> @@ -101,7 +102,9 @@ def FP_ONE : PatLeaf < [{return N->isExactlyValue(1.0);}] >; -let isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1 in { +let isCodeGenOnly = 1, isPseudo = 1 in { + +let usesCustomInserter = 1 in { class CLAMP : AMDGPUShaderInst < (outs rc:$dst), @@ -131,7 +134,31 @@ def SHADER_TYPE : AMDGPUShaderInst < [(int_AMDGPU_shader_type imm:$type)] >; -} // End isCodeGenOnly = 1, isPseudo = 1, hasCustomInserter = 1 +} // usesCustomInserter = 1 + +multiclass RegisterLoadStore { + def RegisterLoad : AMDGPUShaderInst < + (outs dstClass:$dst), + (ins addrClass:$addr, i32imm:$chan), + "RegisterLoad $dst, $addr", + [(set (i32 dstClass:$dst), (AMDGPUregister_load addrPat:$addr, + (i32 timm:$chan)))] + > { + let isRegisterLoad = 1; + } + + def RegisterStore : AMDGPUShaderInst < + (outs), + (ins dstClass:$val, addrClass:$addr, i32imm:$chan), + "RegisterStore $val, $addr", + [(AMDGPUregister_store (i32 dstClass:$val), addrPat:$addr, (i32 timm:$chan))] + > { + let isRegisterStore = 1; + } +} + +} // End isCodeGenOnly = 1, isPseudo = 1 /* Generic helper patterns for intrinsics */ /* -------------------------------------- */ diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index b33290523df..7878d6000d7 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -48,5 +48,28 @@ unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { return 0; } +unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { + + switch(IndirectIndex) { + case 0: return AMDGPU::indirect_0; + case 1: return AMDGPU::indirect_1; + case 2: return AMDGPU::indirect_2; + case 3: return AMDGPU::indirect_3; + case 4: return AMDGPU::indirect_4; + case 5: return AMDGPU::indirect_5; + case 6: return AMDGPU::indirect_6; + case 7: return AMDGPU::indirect_7; + case 8: return AMDGPU::indirect_8; + case 9: return AMDGPU::indirect_9; + case 10: return AMDGPU::indirect_10; + case 11: return AMDGPU::indirect_11; + case 12: return AMDGPU::indirect_12; + case 13: return AMDGPU::indirect_13; + case 14: return AMDGPU::indirect_14; + case 15: return AMDGPU::indirect_15; + default: llvm_unreachable("indirect index out of range"); + } +} + #define GET_REGINFO_TARGET_DESC #include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index 4a4beade02c..1fc88e7455b 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -57,6 +57,8 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { RegScavenger *RS) const; unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getIndirectSubReg(unsigned IndirectIndex) const; + }; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPURegisterInfo.td b/lib/Target/R600/AMDGPURegisterInfo.td index 8181e023aa3..0b4482cc51a 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.td +++ b/lib/Target/R600/AMDGPURegisterInfo.td @@ -16,6 +16,14 @@ let Namespace = "AMDGPU" in { def sel_y : SubRegIndex; def sel_z : SubRegIndex; def sel_w : SubRegIndex; + + +foreach Index = 0-15 in { + def indirect_#Index : SubRegIndex; +} + +def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">; + } include "R600RegisterInfo.td" diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 2185be3c203..821e86476f3 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -102,6 +102,12 @@ AMDGPUPassConfig::addPreISel() { bool AMDGPUPassConfig::addInstSelector() { addPass(createAMDGPUPeepholeOpt(*TM)); addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + + const AMDGPUSubtarget &ST = TM->getSubtarget(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + // This callbacks this pass uses are not implemented yet on SI. + addPass(createAMDGPUIndirectAddressingPass(*TM)); + } return false; } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index 91f9a83a29c..2afe7873a90 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -15,9 +15,9 @@ #ifndef AMDGPU_TARGET_MACHINE_H #define AMDGPU_TARGET_MACHINE_H +#include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILFrameLowering.h" #include "AMDILIntrinsicInfo.h" #include "R600ISelLowering.h" #include "llvm/ADT/OwningPtr.h" diff --git a/lib/Target/R600/AMDILFrameLowering.cpp b/lib/Target/R600/AMDILFrameLowering.cpp deleted file mode 100644 index 9ad495ab48b..00000000000 --- a/lib/Target/R600/AMDILFrameLowering.cpp +++ /dev/null @@ -1,47 +0,0 @@ -//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface to describe a layout of a stack frame on a AMDGPU target -/// machine. -// -//===----------------------------------------------------------------------===// -#include "AMDILFrameLowering.h" -#include "llvm/CodeGen/MachineFrameInfo.h" - -using namespace llvm; -AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, - int LAO, unsigned TransAl) - : TargetFrameLowering(D, StackAl, LAO, TransAl) { -} - -AMDGPUFrameLowering::~AMDGPUFrameLowering() { -} - -int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->getObjectOffset(FI); -} - -const TargetFrameLowering::SpillSlot * -AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { - NumEntries = 0; - return 0; -} -void -AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { -} -void -AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { -} -bool -AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { - return false; -} diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index 84223f62e17..26994090643 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -75,6 +75,7 @@ private: bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset); bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" @@ -161,16 +162,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } switch (Opc) { default: break; - case ISD::FrameIndex: { - if (FrameIndexSDNode *FIN = dyn_cast(N)) { - unsigned int FI = FIN->getIndex(); - EVT OpVT = N->getValueType(0); - unsigned int NewOpc = AMDGPU::COPY; - SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32); - return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI); - } - break; - } case ISD::ConstantFP: case ISD::Constant: { const AMDGPUSubtarget &ST = TM.getSubtarget(); @@ -613,3 +604,22 @@ bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base, return true; } + +bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; + + if ((C = dyn_cast(Addr))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32); + } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + (C = dyn_cast(Addr.getOperand(1)))) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32); + } else { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + } + + return true; +} diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index a8be7ed975c..4f74b0477b3 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -17,7 +17,6 @@ add_llvm_target(R600CodeGen AMDILDevice.cpp AMDILDeviceInfo.cpp AMDILEvergreenDevice.cpp - AMDILFrameLowering.cpp AMDILIntrinsicInfo.cpp AMDILISelDAGToDAG.cpp AMDILISelLowering.cpp @@ -25,6 +24,8 @@ add_llvm_target(R600CodeGen AMDILPeepholeOptimizer.cpp AMDILSIDevice.cpp AMDGPUAsmPrinter.cpp + AMDGPUFrameLowering.cpp + AMDGPUIndirectAddressing.cpp AMDGPUMCInstLower.cpp AMDGPUSubtarget.cpp AMDGPUStructurizeCFG.cpp diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index e76c6c86757..fb17ab7ddf4 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -105,10 +105,7 @@ void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.getImm() != 0) { - O << " + " << Op.getImm(); - } + printIfSet(MI, OpNo, O, "+"); } void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index e19eea38e49..16cfcf59eb3 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -49,6 +49,9 @@ namespace R600_InstFlag { #define HW_REG_MASK 0x1ff #define HW_CHAN_SHIFT 9 +#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) +#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) + namespace R600Operands { enum Ops { DST, diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 110dcc18876..85187f8fc51 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -16,6 +16,7 @@ #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -71,11 +72,23 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); + // Legalize loads and stores to the private address space. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom); + setOperationAction(ISD::STORE, MVT::i8, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); @@ -350,6 +363,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::FPOW: return LowerFPOW(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = @@ -485,6 +499,10 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); return; } + case ISD::STORE: + SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode(); + Results.push_back(SDValue(Node, 0)); + return; } } @@ -552,6 +570,20 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, false, false, false, 0); } +SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast(getTargetMachine().getFrameLowering()); + + FrameIndexSDNode *FIN = dyn_cast(Op); + assert(FIN); + + unsigned FrameIndex = FIN->getIndex(); + unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32); +} + SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -766,6 +798,61 @@ SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return Cond; } +/// LLVM generates byte-addresed pointers. For indirect addressing, we need to +/// convert these pointers to a register index. Each register holds +/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the +/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used +/// for indirect addressing. +SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, + unsigned StackWidth, + SelectionDAG &DAG) const { + unsigned SRLPad; + switch(StackWidth) { + case 1: + SRLPad = 2; + break; + case 2: + SRLPad = 3; + break; + case 4: + SRLPad = 4; + break; + default: llvm_unreachable("Invalid stack width"); + } + + return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr, + DAG.getConstant(SRLPad, MVT::i32)); +} + +void R600TargetLowering::getStackAddress(unsigned StackWidth, + unsigned ElemIdx, + unsigned &Channel, + unsigned &PtrIncr) const { + switch (StackWidth) { + default: + case 1: + Channel = 0; + if (ElemIdx > 0) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 2: + Channel = ElemIdx % 2; + if (ElemIdx == 2) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 4: + Channel = ElemIdx; + PtrIncr = 0; + break; + } +} + SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); StoreSDNode *StoreNode = cast(Op); @@ -787,7 +874,52 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } return Chain; } - return SDValue(); + + EVT ValueVT = Value.getValueType(); + + if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = static_cast( + getTargetMachine().getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (ValueVT.isVector()) { + unsigned NumElemVT = ValueVT.getVectorNumElements(); + EVT ElemVT = ValueVT.getVectorElementType(); + SDValue Stores[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, MVT::i32)); + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, + Value, DAG.getConstant(i, MVT::i32)); + + Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Elem, Ptr, + DAG.getTargetConstant(Channel, MVT::i32)); + } + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT); + } else { + if (ValueVT == MVT::i8) { + Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); + } + Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, + DAG.getTargetConstant(0, MVT::i32)); // Channel + } + + return Chain; } // return (512 + (kc_bank << 12) @@ -876,7 +1008,53 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const return DAG.getMergeValues(MergedValues, 2, DL); } - return SDValue(); + if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = static_cast( + getTargetMachine().getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (VT.isVector()) { + unsigned NumElemVT = VT.getVectorNumElements(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Loads[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, MVT::i32)); + Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, + Chain, Ptr, + DAG.getTargetConstant(Channel, MVT::i32), + Op.getOperand(2)); + } + for (unsigned i = NumElemVT; i < 4; ++i) { + Loads[i] = DAG.getUNDEF(ElemVT); + } + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4); + } else { + LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, + Chain, Ptr, + DAG.getTargetConstant(0, MVT::i32), // Channel + Op.getOperand(2)); + } + + SDValue Ops[2]; + Ops[0] = LoweredLoad; + Ops[1] = Chain; + + return DAG.getMergeValues(Ops, 2, DL); } SDValue R600TargetLowering::LowerFPOW(SDValue Op, diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index c141d50210e..afa38971551 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -64,7 +64,12 @@ private: SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; - + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + + SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, + SelectionDAG &DAG) const; + void getStackAddress(unsigned StackWidth, unsigned ElemIdx, + unsigned &Channel, unsigned &PtrIncr) const; bool isZero(SDValue Op) const; }; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 85859eb6202..7e3f0057293 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -16,8 +16,11 @@ #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "R600Defines.h" +#include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #define GET_INSTRINFO_CTOR #include "AMDGPUGenDFAPacketizer.inc" @@ -465,6 +468,124 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return 2; } +int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = 0; + + if (MFI->getNumObjects() == 0) { + return -1; + } + + if (MRI.livein_empty()) { + return 0; + } + + for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), + LE = MRI.livein_end(); + LI != LE; ++LI) { + Offset = std::max(Offset, + GET_REG_INDEX(RI.getEncodingValue(LI->first))); + } + + return Offset + 1; +} + +int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { + int Offset = 0; + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Variable sized objects are not supported + assert(!MFI->hasVarSizedObjects()); + + if (MFI->getNumObjects() == 0) { + return -1; + } + + Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1); + + return getIndirectIndexBegin(MF) + Offset; +} + +std::vector R600InstrInfo::getIndirectReservedRegs( + const MachineFunction &MF) const { + const AMDGPUFrameLowering *TFL = + static_cast(TM.getFrameLowering()); + std::vector Regs; + + unsigned StackWidth = TFL->getStackWidth(MF); + int End = getIndirectIndexEnd(MF); + + if (End == -1) { + return Regs; + } + + for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { + unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index); + Regs.push_back(SuperReg); + for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + Regs.push_back(Reg); + } + } + return Regs; +} + +unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + // XXX: Remove when we support a stack width > 2 + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass * R600InstrInfo::getIndirectAddrStoreRegClass( + unsigned SourceReg) const { + return &AMDGPU::R600_TReg32RegClass; +} + +const TargetRegisterClass *R600InstrInfo::getIndirectAddrLoadRegClass() const { + return &AMDGPU::TRegMemRegClass; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, OffsetReg); + setImmOperand(MOVA, R600Operands::WRITE, 0); + + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + AddrReg, ValueReg) + .addReg(AMDGPU::AR_X, RegState::Implicit); + setImmOperand(Mov, R600Operands::DST_REL, 1); + return Mov; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, + OffsetReg); + setImmOperand(MOVA, R600Operands::WRITE, 0); + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + ValueReg, + AddrReg) + .addReg(AMDGPU::AR_X, RegState::Implicit); + setImmOperand(Mov, R600Operands::SRC0_REL, 1); + + return Mov; +} + +const TargetRegisterClass *R600InstrInfo::getSuperIndirectRegClass() const { + return &AMDGPU::IndirectRegRegClass; +} + + MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Opcode, diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index 11685af5b02..efe721c00cf 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -113,6 +113,38 @@ namespace llvm { virtual int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const { return 1;} + /// \returns a list of all the registers that may be accesed using indirect + /// addressing. + std::vector getIndirectReservedRegs(const MachineFunction &MF) const; + + virtual int getIndirectIndexBegin(const MachineFunction &MF) const; + + virtual int getIndirectIndexEnd(const MachineFunction &MF) const; + + + virtual unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const; + + virtual const TargetRegisterClass *getIndirectAddrStoreRegClass( + unsigned SourceReg) const; + + virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const; + + virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const; + + virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const; + + virtual const TargetRegisterClass *getSuperIndirectRegClass() const; + + + ///buildDefaultInstruction - This function returns a MachineInstr with + /// all the instruction modifiers initialized to their default values. /// You can use this function to avoid manually specifying each instruction /// modifier operand when building a new instruction. /// diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index f935313fe9b..afb30ec0d4c 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -91,11 +91,16 @@ def UP : InstFlag <"printUpdatePred">; // default to 0. def LAST : InstFlag<"printLast", 1>; +def FRAMEri : Operand { + let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); +} + def ADDRParam : ComplexPattern; def ADDRDWord : ComplexPattern; def ADDRVTX_READ : ComplexPattern; def ADDRGA_CONST_OFFSET : ComplexPattern; def ADDRGA_VAR_OFFSET : ComplexPattern; +def ADDRIndirect : ComplexPattern; class R600ALU_Word0 { field bits<32> Word0; @@ -1220,6 +1225,10 @@ let Predicates = [isEGorCayman] in { defm DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; +let hasSideEffects = 1 in { + def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", []>; +} + def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common; def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { @@ -1470,6 +1479,12 @@ def CONSTANT_LOAD_eg : VTX_READ_32_eg <1, } +//===----------------------------------------------------------------------===// +// Regist loads and stores - for indirect addressing +//===----------------------------------------------------------------------===// + +defm R600_ : RegisterLoadStore ; + let Predicates = [isCayman] in { let isVector = 1 in { diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h index ad7b4da45ae..4b901f4bbc3 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -13,6 +13,7 @@ #ifndef R600MACHINEFUNCTIONINFO_H #define R600MACHINEFUNCTIONINFO_H +#include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" #include @@ -24,6 +25,7 @@ class R600MachineFunctionInfo : public MachineFunctionInfo { public: R600MachineFunctionInfo(const MachineFunction &MF); SmallVector LiveOuts; + std::vector IndirectRegs; SDNode *Outputs[16]; }; diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp index d46b3a35435..cd3fc4a45ef 100644 --- a/lib/Target/R600/R600RegisterInfo.cpp +++ b/lib/Target/R600/R600RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "R600RegisterInfo.h" #include "AMDGPUTargetMachine.h" #include "R600Defines.h" +#include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" using namespace llvm; @@ -43,6 +44,18 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::PRED_SEL_ZERO); Reserved.set(AMDGPU::PRED_SEL_ONE); + for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), + E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { + Reserved.set(*I); + } + + const R600InstrInfo *RII = static_cast(&TII); + std::vector IndirectRegs = RII->getIndirectReservedRegs(MF); + for (std::vector::iterator I = IndirectRegs.begin(), + E = IndirectRegs.end(); + I != E; ++I) { + Reserved.set(*I); + } return Reserved; } @@ -77,3 +90,4 @@ unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const { case 3: return AMDGPU::sel_w; } } + diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index 993fefc2ab3..9a8b85950cd 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -27,6 +27,12 @@ foreach Index = 0-127 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { // 32-bit Temporary Registers def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; + + // Indirect addressing offset registers + def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan, + Index, Chan>; + def TRegMem#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, + Chan>; } // 128-bit Temporary Registers def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW", @@ -57,6 +63,7 @@ def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; +def AR_X : R600Reg<"AR.x", 0>; def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "ArrayBase%u", 448, 464))>; @@ -66,6 +73,13 @@ def ALU_CONST : R600Reg<"CBuf", 0>; // interpolation param reference, SRCx_SEL contains index def ALU_PARAM : R600Reg<"Param", 0>; +let isAllocatable = 0 in { + +// XXX: Only use the X channel, until we support wider stack widths +def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>; + +} // End isAllocatable = 0 + def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "T%u_X", 0, 127))>; @@ -85,6 +99,7 @@ def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_TReg32, R600_ArrayBase, + R600_Addr, ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, ALU_CONST, ALU_PARAM )>; @@ -99,3 +114,34 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add (sequence "T%u_XYZW", 0, 127))> { let CopyCost = -1; } + +//===----------------------------------------------------------------------===// +// Register classes for indirect addressing +//===----------------------------------------------------------------------===// + +// Super register for all the Indirect Registers. This register class is used +// by the REG_SEQUENCE instruction to specify the registers to use for direct +// reads / writes which may be written / read by an indirect address. +class IndirectSuper subregs> : + RegisterWithSubRegs { + let Namespace = "AMDGPU"; + let SubRegIndices = + [indirect_0,indirect_1,indirect_2,indirect_3,indirect_4,indirect_5,indirect_6, + indirect_7,indirect_8,indirect_9,indirect_10,indirect_11,indirect_12, + indirect_13,indirect_14,indirect_15]; +} + +def IndirectSuperReg : IndirectSuper<"Indirect", + [TRegMem0_X, TRegMem1_X, TRegMem2_X, TRegMem3_X, TRegMem4_X, TRegMem5_X, + TRegMem6_X, TRegMem7_X, TRegMem8_X, TRegMem9_X, TRegMem10_X, TRegMem11_X, + TRegMem12_X, TRegMem13_X, TRegMem14_X, TRegMem15_X] +>; + +def IndirectReg : RegisterClass<"AMDGPU", [f32, i32], 32, (add IndirectSuperReg)>; + +// This register class defines the registers that are the storage units for +// the "Indirect Addressing" pseudo memory space. +// XXX: Only use the X channel, until we support wider stack widths +def TRegMem : RegisterClass<"AMDGPU", [f32, i32], 32, + (add (sequence "TRegMem%u_X", 0, 16)) +>; diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index c6ad4d531dc..b40337d5d24 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -87,3 +87,51 @@ bool SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { return RC != &AMDGPU::EXECRegRegClass; } + +//===----------------------------------------------------------------------===// +// Indirect addressing callbacks +//===----------------------------------------------------------------------===// + +unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + assert(Channel == 0); + return RegIndex; +} + + +int SIInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { + llvm_unreachable("Unimplemented"); +} + +int SIInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { + llvm_unreachable("Unimplemented"); +} + +const TargetRegisterClass *SIInstrInfo::getIndirectAddrStoreRegClass( + unsigned SourceReg) const { + llvm_unreachable("Unimplemented"); +} + +const TargetRegisterClass *SIInstrInfo::getIndirectAddrLoadRegClass() const { + llvm_unreachable("Unimplemented"); +} + +MachineInstrBuilder SIInstrInfo::buildIndirectWrite( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + llvm_unreachable("Unimplemented"); +} + +MachineInstrBuilder SIInstrInfo::buildIndirectRead( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + llvm_unreachable("Unimplemented"); +} + +const TargetRegisterClass *SIInstrInfo::getSuperIndirectRegClass() const { + llvm_unreachable("Unimplemented"); +} diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 783cd9f4cb4..e4de4b85f26 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -48,6 +48,32 @@ public: virtual bool isMov(unsigned Opcode) const; virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + + virtual int getIndirectIndexBegin(const MachineFunction &MF) const; + + virtual int getIndirectIndexEnd(const MachineFunction &MF) const; + + virtual unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const; + + virtual const TargetRegisterClass *getIndirectAddrStoreRegClass( + unsigned SourceReg) const; + + virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const; + + virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const; + + virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const; + + virtual const TargetRegisterClass *getSuperIndirectRegClass() const; }; } // End namespace llvm