R600: Add local memory support via LDS
Reviewed-by: Vincent Lejeune <vljn at ovi.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185162 91177308-0d34-0410-b5e6-96231b3b80d8
parent cedcfee405
commit e3d4cbc7d2
@@ -29,6 +29,7 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"

@@ -130,6 +131,11 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
S_STACK_SIZE(MFI->StackSize), 4);
OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);

if (MFI->ShaderType == ShaderType::COMPUTE) {
OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
}
}

void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {

@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"

using namespace llvm;

@@ -71,6 +72,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
setOperationAction(ISD::UREM, MVT::i32, Expand);

setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

int types[] = {
(int)MVT::v2i32,
(int)MVT::v4i32

@@ -138,6 +141,26 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
return Op;
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {

const DataLayout *TD = getTargetMachine().getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");

unsigned Offset = MFI->LDSSize;
const GlobalValue *GV = G->getGlobal();
uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());

// XXX: Account for alignment?
MFI->LDSSize += Size;

return DAG.getConstant(Offset, MVT::i32);
}

SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

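The LowerGlobalAddress hunk above lays local (addrspace(3)) globals out in LDS by bump allocation: each global is handed the current value of MFI->LDSSize as its byte offset, and LDSSize then grows by the global's allocated size (alignment is not yet accounted for, per the XXX note). A minimal C++ sketch of that scheme, with hypothetical names (LDSAllocator, allocate) standing in for the real AMDGPUMachineFunction plumbing:

#include <cstdint>
#include <iostream>

// Sketch only: mirrors the bump allocation in LowerGlobalAddress, where the
// offset returned for a local global is the LDS space used so far and the
// running total then grows by the global's size.
struct LDSAllocator {
  uint64_t LDSSize = 0; // bytes of LDS reserved so far for this kernel

  uint64_t allocate(uint64_t Size) {
    uint64_t Offset = LDSSize; // this global lives at the current watermark
    LDSSize += Size;           // reserve its bytes (no alignment handling yet)
    return Offset;
  }
};

int main() {
  LDSAllocator MFI;
  // The two [4 x i32] arrays from the local_memory_two_objects test would get:
  std::cout << MFI.allocate(16) << "\n"; // 0
  std::cout << MFI.allocate(16) << "\n"; // 16
  std::cout << MFI.LDSSize << "\n";      // 32 bytes of LDS in total
  return 0;
}

The running total is what EmitProgramInfoR600 later reports through SQ_LDS_ALLOC, in dwords.
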
@@ -20,6 +20,7 @@

namespace llvm {

class AMDGPUMachineFunction;
class MachineRegisterInfo;

class AMDGPUTargetLowering : public TargetLowering {

@@ -36,6 +37,8 @@ protected:
virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const;

bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;

@@ -94,6 +94,15 @@ def zextloadi8_constant : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;

def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;

def local_store : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return isLocalStore(dyn_cast<StoreSDNode>(N));
}]>;

class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;

@@ -10,6 +10,7 @@ const char *AMDGPUMachineFunction::ShaderTypeAttribute = "ShaderType";
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo() {
ShaderType = ShaderType::COMPUTE;
LDSSize = 0;
AttributeSet Set = MF.getFunction()->getAttributes();
Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
ShaderTypeAttribute);

@@ -23,6 +23,8 @@ private:
public:
AMDGPUMachineFunction(const MachineFunction &MF);
unsigned ShaderType;
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
};

}

@@ -282,11 +282,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {

int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
AMDGPU::OpName::literal);
assert(ImmIdx != -1);
if (ImmIdx == -1) {
continue;
}

// subtract one from ImmIdx, because the DST operand is usually index
// 0 for MachineInstrs, but we have no DST in the Ops vector.
ImmIdx--;
if (TII->getOperandIdx(Use->getMachineOpcode(),
AMDGPU::OpName::dst) != -1) {
// subtract one from ImmIdx, because the DST operand is usually index
// 0 for MachineInstrs, but we have no DST in the Ops vector.
ImmIdx--;
}

// Check that we aren't already using an immediate.
// XXX: It's possible for an instruction to have more than one

@@ -336,7 +341,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
if (Result && Result->isMachineOpcode() &&
!(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
&& TII->isALUInstr(Result->getMachineOpcode())) {
&& TII->hasInstrModifiers(Result->getMachineOpcode())) {
// Fold FNEG/FABS/CONST_ADDRESS
// TODO: Isel can generate multiple MachineInst, we need to recursively
// parse Result

@@ -42,7 +42,9 @@ namespace R600_InstFlag {
OP2 = (1 << 11),
VTX_INST = (1 << 12),
TEX_INST = (1 << 13),
ALU_INST = (1 << 14)
ALU_INST = (1 << 14),
LDS_1A = (1 << 15),
LDS_1A1D = (1 << 16)
};
}

@@ -162,4 +164,6 @@ namespace OpName {
#define R_028878_SQ_PGM_RESOURCES_GS 0x028878
#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4

#define R_0288E8_SQ_LDS_ALLOC 0x0288E8

#endif // R600DEFINES_H_

@@ -138,6 +138,19 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
break;
}

case AMDGPU::LDS_READ_RET: {
MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
TII->get(MI->getOpcode()),
AMDGPU::OQAP);
for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
NewMI.addOperand(MI->getOperand(i));
}
TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
MI->getOperand(0).getReg(),
AMDGPU::OQAP);
break;
}

case AMDGPU::MOV_IMM_F32:
TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
MI->getOperand(1).getFPImm()->getValueAPF()

@@ -456,6 +469,8 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);

@@ -463,14 +478,13 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_VOID: {
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntrinsicID) {
case AMDGPUIntrinsic::AMDGPU_store_output: {
MachineFunction &MF = DAG.getMachineFunction();
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
MFI->LiveOuts.push_back(Reg);

@@ -23,6 +23,8 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
bits<2> FlagOperandIdx = 0;
bit Op1 = 0;
bit Op2 = 0;
bit LDS_1A = 0;
bit LDS_1A1D = 0;
bit HasNativeOperands = 0;
bit VTXInst = 0;
bit TEXInst = 0;

@@ -49,21 +51,21 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
let TSFlags{12} = VTXInst;
let TSFlags{13} = TEXInst;
let TSFlags{14} = ALUInst;
let TSFlags{15} = LDS_1A;
let TSFlags{16} = LDS_1A1D;
}

//===----------------------------------------------------------------------===//
// ALU instructions
//===----------------------------------------------------------------------===//

class R600ALU_Word0 {
class R600_ALU_LDS_Word0 {
field bits<32> Word0;

bits<11> src0;
bits<1> src0_neg;
bits<1> src0_rel;
bits<11> src1;
bits<1> src1_rel;
bits<1> src1_neg;
bits<3> index_mode = 0;
bits<2> pred_sel;
bits<1> last;

@@ -76,16 +78,23 @@ class R600ALU_Word0 {
let Word0{8-0} = src0_sel;
let Word0{9} = src0_rel;
let Word0{11-10} = src0_chan;
let Word0{12} = src0_neg;
let Word0{21-13} = src1_sel;
let Word0{22} = src1_rel;
let Word0{24-23} = src1_chan;
let Word0{25} = src1_neg;
let Word0{28-26} = index_mode;
let Word0{30-29} = pred_sel;
let Word0{31} = last;
}

class R600ALU_Word0 : R600_ALU_LDS_Word0 {

bits<1> src0_neg;
bits<1> src1_neg;

let Word0{12} = src0_neg;
let Word0{25} = src1_neg;
}

class R600ALU_Word1 {
field bits<32> Word1;

@@ -138,6 +147,30 @@ class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{
let Word1{17-13} = alu_inst;
}

class R600LDS_Word1 {
field bits<32> Word1;

bits<11> src2;
bits<9> src2_sel = src2{8-0};
bits<2> src2_chan = src2{10-9};
bits<1> src2_rel;
// offset specifies the stride offset to the second set of data to be read
// from. This is a dword offset.
bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP
bits<3> bank_swizzle;
bits<6> lds_op;
bits<2> dst_chan = 0;

let Word1{8-0} = src2_sel;
let Word1{9} = src2_rel;
let Word1{11-10} = src2_chan;
let Word1{17-13} = alu_inst;
let Word1{20-18} = bank_swizzle;
let Word1{26-21} = lds_op;
let Word1{30-29} = dst_chan;
}


/*
XXX: R600 subtarget uses a slightly different encoding than the other
subtargets. We currently handle this in R600MCCodeEmitter, but we may

@@ -136,6 +136,21 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
return (TargetFlags & R600_InstFlag::ALU_INST);
}

bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const {
unsigned TargetFlags = get(Opcode).TSFlags;

return ((TargetFlags & R600_InstFlag::OP1) |
(TargetFlags & R600_InstFlag::OP2) |
(TargetFlags & R600_InstFlag::OP3));
}

bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
unsigned TargetFlags = get(Opcode).TSFlags;

return ((TargetFlags & R600_InstFlag::LDS_1A) |
(TargetFlags & R600_InstFlag::LDS_1A1D));
}

bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
return (get(Opcode).TSFlags & R600_InstFlag::TRANS_ONLY);
}

@@ -245,6 +260,9 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
unsigned Reg = Srcs[i].first->getReg();
unsigned Index = RI.getEncodingValue(Reg) & 0xff;
unsigned Chan = RI.getHWRegChan(Reg);
if (Reg == AMDGPU::OQAP) {
Result.push_back(std::pair<int, unsigned>(Index, 0));
}
if (Index > 127) {
Result.push_back(DummyPair);
continue;

@@ -287,10 +305,11 @@ Swizzle(std::vector<std::pair<int, unsigned> > Src,
return Src;
}

static bool
isLegal(const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
const std::vector<R600InstrInfo::BankSwizzle> &Swz,
unsigned CheckedSize) {
bool
R600InstrInfo::isLegal(
const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
const std::vector<R600InstrInfo::BankSwizzle> &Swz,
unsigned CheckedSize) const {
int Vector[4][3];
memset(Vector, -1, sizeof(Vector));
for (unsigned i = 0; i < CheckedSize; i++) {

@@ -300,6 +319,16 @@ isLegal(const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
const std::pair<int, unsigned> &Src = Srcs[j];
if (Src.first < 0)
continue;
if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
if (Swz[i] != R600InstrInfo::ALU_VEC_012 &&
Swz[i] != R600InstrInfo::ALU_VEC_021) {
// The value from output queue A (denoted by register OQAP) can
// only be fetched during the first cycle.
return false;
}
// OQAP does not count towards the normal read port restrictions
continue;
}
if (Vector[Src.second][j] < 0)
Vector[Src.second][j] = Src.first;
if (Vector[Src.second][j] != Src.first)

@@ -309,10 +338,11 @@ isLegal(const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
return true;
}

static bool recursiveFitsFPLimitation(
const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
unsigned Depth = 0) {
bool
R600InstrInfo::recursiveFitsFPLimitation(
const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
unsigned Depth) const {
if (!isLegal(IGSrcs, SwzCandidate, Depth))
return false;
if (IGSrcs.size() == Depth)

@@ -63,6 +63,8 @@ namespace llvm {

/// \returns true if this \p Opcode represents an ALU instruction.
bool isALUInstr(unsigned Opcode) const;
bool hasInstrModifiers(unsigned Opcode) const;
bool isLDSInstr(unsigned Opcode) const;

bool isTransOnly(unsigned Opcode) const;
bool isTransOnly(const MachineInstr *MI) const;

@@ -82,6 +84,15 @@ namespace llvm {
SmallVector<std::pair<MachineOperand *, int64_t>, 3>
getSrcs(MachineInstr *MI) const;

bool isLegal(
const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
const std::vector<R600InstrInfo::BankSwizzle> &Swz,
unsigned CheckedSize) const;
bool recursiveFitsFPLimitation(
const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
unsigned Depth = 0) const;

/// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
/// returns true and the first (in lexical order) BankSwizzle affectation
/// starting from the one already provided in the Instruction Group MIs that

@@ -1529,6 +1529,81 @@ def GROUP_BARRIER : InstR600 <
let ALUInst = 1;
}

//===----------------------------------------------------------------------===//
// LDS Instructions
//===----------------------------------------------------------------------===//
class R600_LDS <bits<6> op, dag outs, dag ins, string asm,
list<dag> pattern = []> :

InstR600 <outs, ins, asm, pattern, XALU>,
R600_ALU_LDS_Word0,
R600LDS_Word1 {

bits<6> offset = 0;
let lds_op = op;

let Word1{27} = offset{0};
let Word1{12} = offset{1};
let Word1{28} = offset{2};
let Word1{31} = offset{3};
let Word0{12} = offset{4};
let Word0{25} = offset{5};


let Inst{31-0} = Word0;
let Inst{63-32} = Word1;

let ALUInst = 1;
let HasNativeOperands = 1;
let UseNamedOperandTable = 1;
}

class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
lds_op,
(outs R600_Reg32:$dst),
(ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
LAST:$last, R600_Pred:$pred_sel,
BANK_SWIZZLE:$bank_swizzle),
" "#name#" $last OQAP, $src0$src0_rel $pred_sel",
pattern
> {

let src1 = 0;
let src1_rel = 0;
let src2 = 0;
let src2_rel = 0;

let Defs = [OQAP];
let usesCustomInserter = 1;
let LDS_1A = 1;
let DisableEncoding = "$dst";
}

class R600_LDS_1A1D <bits<6> lds_op, string name, list<dag> pattern> :
R600_LDS <
lds_op,
(outs),
(ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
LAST:$last, R600_Pred:$pred_sel,
BANK_SWIZZLE:$bank_swizzle),
" "#name#" $last $src0$src0_rel, $src1$src1_rel, $pred_sel",
pattern
> {

let src2 = 0;
let src2_rel = 0;
let LDS_1A1D = 1;
}

def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
[(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
>;

def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE",
[(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
>;

// TRUNC is used for the FLT_TO_INT instructions to work around a
// perceived problem where the rounding modes are applied differently
// depending on the instruction and the slot they are in.

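In the R600_LDS class above, the 6-bit offset field is scattered across both encoding words rather than stored contiguously; the Word0 bits it uses (12 and 25) are the positions that R600ALU_Word0 binds to src0_neg and src1_neg, which matches why the shared base class R600_ALU_LDS_Word0 leaves those bits unassigned. A small C++ sketch of that packing, purely illustrative (encodeLDSOffset is a hypothetical helper, not something the patch adds):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Scatter a 6-bit LDS offset into the two 32-bit encoding words, following
// the `let` assignments in class R600_LDS:
//   Word1{27}=offset{0}  Word1{12}=offset{1}  Word1{28}=offset{2}
//   Word1{31}=offset{3}  Word0{12}=offset{4}  Word0{25}=offset{5}
static void encodeLDSOffset(unsigned Offset, uint32_t &Word0, uint32_t &Word1) {
  assert(Offset < 64 && "the offset field is only 6 bits wide");
  Word1 |= ((Offset >> 0) & 1u) << 27;
  Word1 |= ((Offset >> 1) & 1u) << 12;
  Word1 |= ((Offset >> 2) & 1u) << 28;
  Word1 |= ((Offset >> 3) & 1u) << 31;
  Word0 |= ((Offset >> 4) & 1u) << 12;
  Word0 |= ((Offset >> 5) & 1u) << 25;
}

int main() {
  uint32_t Word0 = 0, Word1 = 0;
  encodeLDSOffset(0x3F, Word0, Word1); // all six offset bits set
  std::printf("Word0=0x%08x Word1=0x%08x\n", Word0, Word1);
  return 0;
}
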
@@ -278,6 +278,10 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
return AluT_XYZW;
}

if (TII->isLDSInstr(MI->getOpcode())) {
return AluT_X;
}

// Is the result already assigned to a channel ?
unsigned DestSubReg = MI->getOperand(0).getSubReg();
switch (DestSubReg) {

@@ -371,14 +375,18 @@ void R600SchedStrategy::PrepareNextSlot() {
}

void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
unsigned DestReg = MI->getOperand(0).getReg();
int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
if (DstIndex == -1) {
return;
}
unsigned DestReg = MI->getOperand(DstIndex).getReg();
// PressureRegister crashes if an operand is def and used in the same inst
// and we try to constraint its regclass
for (MachineInstr::mop_iterator It = MI->operands_begin(),
E = MI->operands_end(); It != E; ++It) {
MachineOperand &MO = *It;
if (MO.isReg() && !MO.isDef() &&
MO.getReg() == MI->getOperand(0).getReg())
MO.getReg() == DestReg)
return;
}
// Constrains the regclass of DestReg to assign it to Slot

@@ -92,6 +92,9 @@ private:
Result[Dst] = AMDGPU::PV_X;
continue;
}
if (Dst == AMDGPU::OQAP) {
continue;
}
unsigned PVReg = 0;
switch (TRI.getHWRegChan(Dst)) {
case 0:

@@ -101,6 +101,7 @@ def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
def AR_X : R600Reg<"AR.x", 0>;
def OQAP : R600Reg<"OQAP", 221>;

def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
(add (sequence "ArrayBase%u", 448, 480))>;

@@ -170,7 +171,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
R600_ArrayBase,
R600_Addr,
ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
ALU_CONST, ALU_PARAM
ALU_CONST, ALU_PARAM, OQAP
)>;

def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add

@@ -23,6 +23,7 @@ def TRANS : FuncUnit;
def AnyALU : InstrItinClass;
def VecALU : InstrItinClass;
def TransALU : InstrItinClass;
def XALU : InstrItinClass;

def R600_VLIW5_Itin : ProcessorItineraries <
[ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],

@@ -31,6 +32,7 @@ def R600_VLIW5_Itin : ProcessorItineraries <
InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
InstrItinData<XALU, [InstrStage<1, [ALU_X]>]>,
InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
]
>;

test/CodeGen/R600/local-memory.ll (new file, 82 lines)
@@ -0,0 +1,82 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s


@local_memory.local_mem = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 4

; CHECK: @local_memory

; Check that the LDS size emitted correctly
; CHECK: .long 166120
; CHECK-NEXT: .long 16

; CHECK: LDS_WRITE

; GROUP_BARRIER must be the last instruction in a clause
; CHECK: GROUP_BARRIER
; CHECK-NEXT: ALU clause

; CHECK: LDS_READ_RET

define void @local_memory(i32 addrspace(1)* %out) {
entry:
%y.i = call i32 @llvm.r600.read.tidig.x() #0
%arrayidx = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
%add = add nsw i32 %y.i, 1
%cmp = icmp eq i32 %add, 16
%.add = select i1 %cmp, i32 0, i32 %add
call void @llvm.AMDGPU.barrier.local()
%arrayidx1 = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
%0 = load i32 addrspace(3)* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i32 %y.i
store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
ret void
}

@local_memory_two_objects.local_mem0 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
@local_memory_two_objects.local_mem1 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4

; CHECK: @local_memory_two_objects

; Check that the LDS size emitted correctly
; CHECK: .long 166120
; CHECK-NEXT: .long 8

; Make sure the lds writes are using different addresses.
; CHECK: LDS_WRITE {{[*]*}} {{PV|T}}[[ADDRW:[0-9]*\.[XYZW]]]
; CHECK-NOT: LDS_WRITE {{[*]*}} T[[ADDRW]]

; GROUP_BARRIER must be the last instruction in a clause
; CHECK: GROUP_BARRIER
; CHECK-NEXT: ALU clause

; Make sure the lds reads are using different addresses.
; CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
; CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]

define void @local_memory_two_objects(i32 addrspace(1)* %out) {
entry:
%x.i = call i32 @llvm.r600.read.tidig.x() #0
%arrayidx = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
%mul = shl nsw i32 %x.i, 1
%arrayidx1 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
%sub = sub nsw i32 3, %x.i
call void @llvm.AMDGPU.barrier.local()
%arrayidx2 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
%0 = load i32 addrspace(3)* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds i32 addrspace(1)* %out, i32 %x.i
store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
%arrayidx4 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
%1 = load i32 addrspace(3)* %arrayidx4, align 4
%add = add nsw i32 %x.i, 4
%arrayidx5 = getelementptr inbounds i32 addrspace(1)* %out, i32 %add
store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
ret void
}

declare i32 @llvm.r600.read.tidig.x() #0
declare void @llvm.AMDGPU.barrier.local()

attributes #0 = { readnone }
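
The two `.long` values the test checks for line up with the new EmitProgramInfoR600 code: 166120 is decimal for 0x0288E8, i.e. the R_0288E8_SQ_LDS_ALLOC register, and the value on the next line is the kernel's LDS footprint in dwords (RoundUpToAlignment(LDSSize, 4) >> 2). A small sketch of that arithmetic; roundUpToAlignment here is a local stand-in for the LLVM helper, not the real one:

#include <cassert>
#include <cstdint>

// Local stand-in that rounds Value up to the next multiple of Align.
static uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // First .long: the register address being written.
  assert(0x0288E8 == 166120); // R_0288E8_SQ_LDS_ALLOC

  // @local_memory: one [16 x i32] array = 64 bytes -> ".long 16" (dwords).
  assert((roundUpToAlignment(16 * 4, 4) >> 2) == 16);

  // @local_memory_two_objects: two [4 x i32] arrays = 32 bytes -> ".long 8".
  assert((roundUpToAlignment(2 * 4 * 4, 4) >> 2) == 8);
  return 0;
}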