From fdbf61d00d95d1fe27fbc7fb4e0b7b71e3aa929e Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 21 Aug 2014 20:41:00 +0000 Subject: [PATCH] R600/SI: Teach moveToVALU how to handle more S_LOAD_* instructions git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@216220 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/SIInstrInfo.cpp | 130 +++++++++++++++++++++++++++--- lib/Target/R600/SIInstrInfo.h | 6 ++ test/CodeGen/R600/salu-to-valu.ll | 28 +++++++ 3 files changed, 155 insertions(+), 9 deletions(-) diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 3be1b25b363..43efb71a2a2 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -1367,6 +1367,88 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } +void SIInstrInfo::splitSMRD(MachineInstr *MI, + const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const { + + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RegLo = MRI.createVirtualRegister(HalfRC); + unsigned RegHi = MRI.createVirtualRegister(HalfRC); + unsigned HalfSize = HalfRC->getSize(); + const MachineOperand *OffOp = + getNamedOperand(*MI, AMDGPU::OpName::offset); + const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); + + if (OffOp) { + // Handle the _IMM variant + unsigned LoOffset = OffOp->getImm(); + unsigned HiOffset = LoOffset + (HalfSize / 4); + Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) + .addOperand(*SBase) + .addImm(LoOffset); + + if (!isUInt<8>(HiOffset)) { + unsigned OffsetSGPR = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) + .addImm(HiOffset << 2); // The immediate offset is in dwords, + // but offset in register is in bytes. + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) + .addOperand(*SBase) + .addReg(OffsetSGPR); + } else { + Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) + .addOperand(*SBase) + .addImm(HiOffset); + } + } else { + // Handle the _SGPR variant + MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); + Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) + .addOperand(*SBase) + .addOperand(*SOff); + unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) + .addOperand(*SOff) + .addImm(HalfSize); + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) + .addOperand(*SBase) + .addReg(OffsetSGPR); + } + + unsigned SubLo, SubHi; + switch (HalfSize) { + case 4: + SubLo = AMDGPU::sub0; + SubHi = AMDGPU::sub1; + break; + case 8: + SubLo = AMDGPU::sub0_sub1; + SubHi = AMDGPU::sub2_sub3; + break; + case 16: + SubLo = AMDGPU::sub0_sub1_sub2_sub3; + SubHi = AMDGPU::sub4_sub5_sub6_sub7; + break; + case 32: + SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; + SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + break; + default: + llvm_unreachable("Unhandled HalfSize"); + } + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) + .addOperand(MI->getOperand(0)) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); +} + void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { MachineBasicBlock *MBB = MI->getParent(); switch (MI->getOpcode()) { @@ -1375,7 +1457,7 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con case AMDGPU::S_LOAD_DWORDX2_IMM: case AMDGPU::S_LOAD_DWORDX2_SGPR: case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: + case AMDGPU::S_LOAD_DWORDX4_SGPR: { unsigned NewOpcode = getVALUOp(*MI); unsigned RegOffset; unsigned ImmOffset; @@ -1422,14 +1504,44 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con .addImm(AMDGPU::sub2) .addReg(DWord3) .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(MI->getOperand(1).getReg()); - } else { - MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); - } - MI->getOperand(1).setReg(SRsrc); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + MI->setDesc(get(NewOpcode)); + if (MI->getOperand(2).isReg()) { + MI->getOperand(2).setReg(MI->getOperand(1).getReg()); + } else { + MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); + } + MI->getOperand(1).setReg(SRsrc); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + + const TargetRegisterClass *NewDstRC = + RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + break; + } + case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORDX8_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, + AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } + + case AMDGPU::S_LOAD_DWORDX16_IMM: + case AMDGPU::S_LOAD_DWORDX16_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, + AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } } } diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index cab448ac9f5..b7aeb95142e 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -170,6 +170,12 @@ public: /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr *MI) const; + /// \brief Split an SMRD instruction into two smaller loads of half the + // size storing the results in \p Lo and \p Hi. + void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const; + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; /// \brief Replace this instruction's opcode with the equivalent VALU diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll index 49ec62803ae..98ccff68599 100644 --- a/test/CodeGen/R600/salu-to-valu.ll +++ b/test/CodeGen/R600/salu-to-valu.ll @@ -88,3 +88,31 @@ entry: store i32 %3, i32 addrspace(1)* %out ret void } + +; CHECK-LABEL: @s_load_imm_v8i32 +; CHECK: BUFFER_LOAD_DWORDX4 +; CHECK: BUFFER_LOAD_DWORDX4 +define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { +entry: + %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 + %tmp1 = getelementptr inbounds i32 addrspace(2)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* + %tmp3 = load <8 x i32> addrspace(2)* %tmp2, align 4 + store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; CHECK-LABEL: @s_load_imm_v16i32 +; CHECK: BUFFER_LOAD_DWORDX4 +; CHECK: BUFFER_LOAD_DWORDX4 +; CHECK: BUFFER_LOAD_DWORDX4 +; CHECK: BUFFER_LOAD_DWORDX4 +define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { +entry: + %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 + %tmp1 = getelementptr inbounds i32 addrspace(2)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)* + %tmp3 = load <16 x i32> addrspace(2)* %tmp2, align 4 + store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32 + ret void +}