R600/SI: Teach SIFoldOperands to split 64-bit constants when folding

This allows folding of sequences like:

s[0:1] = s_mov_b64 4
v_add_i32 v0, s0, v0
v_addc_u32 v1, s1, v1

into

v_add_i32 v0, 4, v0
v_add_i32 v1, 0, v1

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225369 91177308-0d34-0410-b5e6-96231b3b80d8
commit 546520a727
parent bc2572cac5
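Note (not part of the commit): the lo/hi split the new SIFoldOperands code performs for sub0/sub1 uses can be illustrated with a small standalone sketch using llvm::APInt. The 0x100000004 value and the main() wrapper below are made up for illustration only; they are not taken from the change itself.

// Standalone illustration only; assumes the LLVM headers are available.
#include "llvm/ADT/APInt.h"
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Val = 0x100000004ULL;   // hypothetical 64-bit immediate (s_mov_b64 source)
  llvm::APInt Imm(64, Val);

  // sub0 use: keep only the low 32 bits of the constant.
  uint64_t Lo = Imm.getLoBits(32).getZExtValue();
  // sub1 use: shift the high 32 bits down into the low positions.
  uint64_t Hi = Imm.getHiBits(32).getZExtValue();

  std::printf("sub0 -> 0x%llx, sub1 -> 0x%llx\n",
              (unsigned long long)Lo, (unsigned long long)Hi);  // prints 0x4 and 0x1
  return 0;
}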
@@ -153,27 +153,44 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());

       // FIXME: Fold operands with subregs.
-      if (UseOp.isReg() && UseOp.getSubReg()) {
+      if (UseOp.isReg() && UseOp.getSubReg() && OpToFold.isReg()) {
         continue;
       }

       bool FoldingImm = OpToFold.isImm() || OpToFold.isFPImm();
+      APInt Imm;

-      // In order to fold immediates into copies, we need to change the
-      // copy to a MOV.
-      if (FoldingImm && UseMI->getOpcode() == AMDGPU::COPY) {
-        const TargetRegisterClass *TRC =
-            MRI.getRegClass(UseMI->getOperand(0).getReg());
+      if (FoldingImm) {
+        const TargetRegisterClass *UseRC = MRI.getRegClass(UseOp.getReg());

-        if (TRC->getSize() == 4) {
-          if (TRI.isSGPRClass(TRC))
-            UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
-          else
-            UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
-        } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
-          UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
+        if (OpToFold.isFPImm()) {
+          Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt();
         } else {
-          continue;
+          Imm = APInt(64, OpToFold.getImm());
         }
+
+        // Split 64-bit constants into 32-bits for folding.
+        if (UseOp.getSubReg()) {
+          if (UseRC->getSize() != 8)
+            continue;
+
+          if (UseOp.getSubReg() == AMDGPU::sub0) {
+            Imm = Imm.getLoBits(32);
+          } else {
+            assert(UseOp.getSubReg() == AMDGPU::sub1);
+            Imm = Imm.getHiBits(32);
+          }
+        }
+
+        // In order to fold immediates into copies, we need to change the
+        // copy to a MOV.
+        if (UseMI->getOpcode() == AMDGPU::COPY) {
+          unsigned MovOp = TII->getMovOpcode(
+              MRI.getRegClass(UseMI->getOperand(0).getReg()));
+          if (MovOp == AMDGPU::COPY)
+            continue;
+
+          UseMI->setDesc(TII->get(MovOp));
+        }
       }

@@ -185,19 +202,14 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
           UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
         continue;

-      if (FoldingImm) {
-        uint64_t Imm;
-        if (OpToFold.isFPImm()) {
-          Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
-        } else {
-          Imm = OpToFold.getImm();
-        }
-
-        const MachineOperand ImmOp = MachineOperand::CreateImm(Imm);
+      if (FoldingImm) {
+        const MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
         if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &ImmOp)) {
-          FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), Imm));
-          continue;
+          FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(),
+                                           Imm.getSExtValue()));
         }
+        continue;
       }

       // Normal substitution with registers
@@ -418,6 +418,16 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
   return Opcode;
 }

+unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+
+  if (DstRC->getSize() == 4) {
+    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
+    return AMDGPU::S_MOV_B64;
+  }
+  return AMDGPU::COPY;
+}
+
 static bool shouldTryToSpillVGPRs(MachineFunction *MF) {

   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
@@ -110,6 +110,10 @@ public:

   bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;

+  // \brief Returns an opcode that can be used to move a value to a \p DstRC
+  // register. If there is no hardware instruction that can store to \p
+  // DstRC, then AMDGPU::COPY is returned.
+  unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
   unsigned commuteOpcode(unsigned Opcode) const;

   MachineInstr *commuteInstruction(MachineInstr *MI,
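Note (not part of the commit): a sketch of the call pattern the new hook is meant for, mirroring the SIFoldOperands hunk above. UseMI, MRI and TII are assumed to come from the caller's context, so this fragment is not compilable on its own.

const TargetRegisterClass *DstRC =
    MRI.getRegClass(UseMI->getOperand(0).getReg());
unsigned MovOp = TII->getMovOpcode(DstRC);   // S_MOV_B32, V_MOV_B32_e32, S_MOV_B64 or COPY
if (MovOp != AMDGPU::COPY) {
  // A real hardware mov exists for this class; rewrite the generic COPY so an
  // immediate can later be folded into it.
  UseMI->setDesc(TII->get(MovOp));
}
// If COPY comes back, no single mov instruction can write DstRC and the COPY is left alone.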
@@ -36,5 +36,22 @@ endif:
   ret void
 }

+; CHECK-LABEL: {{^}}fold_64bit_constant_add:
+; CHECK-NOT: s_mov_b64
+; FIXME: It would be better if we could use v_add here and drop the extra
+; v_mov_b32 instructions.
+; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
+; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
+; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
+
+define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
+entry:
+  %tmp0 = add i64 %val, 1
+  store i64 %tmp0, i64 addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.r600.read.tidig.x() #0
 attributes #0 = { readnone }
@@ -12,10 +12,10 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {

 ; SI-LABEL: {{^}}sint_to_fp_i1_f64:
 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
@@ -72,10 +72,10 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i

 ; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {