llvm-6502/lib/Target/R600/SILowerControlFlow.cpp
Tom Stellard d90e5063ca R600/SI: Add pattern for bitcasting fp immediates to integers
The backend now assumes that all immediates are integers.  This allows
us to simplify immediate handling code, becasue we no longer need to
handle fp and integer immediates differently.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225844 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-13 22:59:41 +00:00

573 lines
17 KiB
C++

//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writting to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
/// Vector ALU) and then the ScalarALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0 // This instruction is an optional
/// // optimization which allows us to
/// // branch if all the bits of
/// // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
/// S_BRANCH_EXECZ label1 // Use our branch optimization
/// // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
using namespace llvm;
namespace {
class SILowerControlFlowPass : public MachineFunctionPass {
private:
static const unsigned SkipThreshold = 12;
static char ID;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
void Skip(MachineInstr &From, MachineOperand &To);
void SkipIfDead(MachineInstr &MI);
void If(MachineInstr &MI);
void Else(MachineInstr &MI);
void Break(MachineInstr &MI);
void IfBreak(MachineInstr &MI);
void ElseBreak(MachineInstr &MI);
void Loop(MachineInstr &MI);
void EndCf(MachineInstr &MI);
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
void IndirectSrc(MachineInstr &MI);
void IndirectDst(MachineInstr &MI);
public:
SILowerControlFlowPass(TargetMachine &tm) :
MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
return "SI Lower control flow instructions";
}
};
} // End anonymous namespace
char SILowerControlFlowPass::ID = 0;
FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
return new SILowerControlFlowPass(tm);
}
bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
MachineBasicBlock *To) {
unsigned NumInstr = 0;
for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
MBB = *MBB->succ_begin()) {
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
NumInstr < SkipThreshold && I != E; ++I) {
if (I->isBundle() || !I->isBundled())
if (++NumInstr >= SkipThreshold)
return true;
}
}
return false;
}
void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
return;
DebugLoc DL = From.getDebugLoc();
BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
.addOperand(To)
.addReg(AMDGPU::EXEC);
}
void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
ShaderType::PIXEL ||
!shouldSkip(&MBB, &MBB.getParent()->back()))
return;
MachineBasicBlock::iterator Insert = &MI;
++Insert;
// If the exec mask is non-zero, skip the next two instructions
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addImm(3)
.addReg(AMDGPU::EXEC);
// Exec mask is zero: Export to NULL target...
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
.addImm(0)
.addImm(0x09) // V_008DFC_SQ_EXP_NULL
.addImm(0)
.addImm(1)
.addImm(1)
.addReg(AMDGPU::VGPR0)
.addReg(AMDGPU::VGPR0)
.addReg(AMDGPU::VGPR0)
.addReg(AMDGPU::VGPR0);
// ... and terminate wavefront
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}
void SILowerControlFlowPass::If(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
unsigned Vcc = MI.getOperand(1).getReg();
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
.addReg(Vcc);
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
.addReg(AMDGPU::EXEC)
.addReg(Reg);
Skip(MI, MI.getOperand(2));
MI.eraseFromParent();
}
void SILowerControlFlowPass::Else(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Src = MI.getOperand(1).getReg();
BuildMI(MBB, MBB.getFirstNonPHI(), DL,
TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
.addReg(Src); // Saved EXEC
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
Skip(MI, MI.getOperand(2));
MI.eraseFromParent();
}
void SILowerControlFlowPass::Break(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Src = MI.getOperand(1).getReg();
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(AMDGPU::EXEC)
.addReg(Src);
MI.eraseFromParent();
}
void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Vcc = MI.getOperand(1).getReg();
unsigned Src = MI.getOperand(2).getReg();
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(Vcc)
.addReg(Src);
MI.eraseFromParent();
}
void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Saved = MI.getOperand(1).getReg();
unsigned Src = MI.getOperand(2).getReg();
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(Saved)
.addReg(Src);
MI.eraseFromParent();
}
void SILowerControlFlowPass::Loop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Src = MI.getOperand(0).getReg();
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Src);
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addOperand(MI.getOperand(1))
.addReg(AMDGPU::EXEC);
MI.eraseFromParent();
}
void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
BuildMI(MBB, MBB.getFirstNonPHI(), DL,
TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Reg);
MI.eraseFromParent();
}
void SILowerControlFlowPass::Branch(MachineInstr &MI) {
if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
MI.eraseFromParent();
// If these aren't equal, this is probably an infinite loop.
}
void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Op = MI.getOperand(0);
#ifndef NDEBUG
const SIMachineFunctionInfo *MFI
= MBB.getParent()->getInfo<SIMachineFunctionInfo>();
// Kill is only allowed in pixel / geometry shaders.
assert(MFI->getShaderType() == ShaderType::PIXEL ||
MFI->getShaderType() == ShaderType::GEOMETRY);
#endif
// Clear this thread from the exec mask if the operand is negative
if ((Op.isImm())) {
// Constant operand: Set exec mask to 0 or do nothing
if (Op.getImm() & 0x80000000) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
.addImm(0);
}
} else {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
.addImm(0)
.addOperand(Op);
}
MI.eraseFromParent();
}
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
MachineBasicBlock::iterator I = MI;
unsigned Save = MI.getOperand(1).getReg();
unsigned Idx = MI.getOperand(3).getReg();
if (AMDGPU::SReg_32RegClass.contains(Idx)) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(Idx);
MBB.insert(I, MovRel);
} else {
assert(AMDGPU::SReg_64RegClass.contains(Save));
assert(AMDGPU::VGPR_32RegClass.contains(Idx));
// Save the EXEC mask
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
.addReg(AMDGPU::EXEC);
// Read the next variant into VCC (lower 32 bits) <- also loop target
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
AMDGPU::VCC_LO)
.addReg(Idx);
// Move index from VCC into M0
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(AMDGPU::VCC_LO);
// Compare the just read M0 value to all possible Idx values
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
.addReg(AMDGPU::M0)
.addReg(Idx);
// Update EXEC, save the original EXEC value to VCC
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
.addReg(AMDGPU::VCC);
// Do the actual move
MBB.insert(I, MovRel);
// Update EXEC, switch all done bits to 0 and all todo bits to 1
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(AMDGPU::VCC);
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addImm(-7)
.addReg(AMDGPU::EXEC);
// Restore EXEC
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
.addReg(Save);
}
MI.eraseFromParent();
}
void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Vec = MI.getOperand(2).getReg();
unsigned Off = MI.getOperand(4).getImm();
unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
if (!SubReg)
SubReg = Vec;
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(SubReg + Off)
.addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Vec, RegState::Implicit);
LoadM0(MI, MovRel);
}
void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Off = MI.getOperand(4).getImm();
unsigned Val = MI.getOperand(5).getReg();
unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
if (!SubReg)
SubReg = Dst;
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
.addReg(SubReg + Off, RegState::Define)
.addReg(Val)
.addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Dst, RegState::Implicit);
LoadM0(MI, MovRel);
}
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI =
static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
bool HaveKill = false;
bool NeedWQM = false;
bool NeedFlat = false;
unsigned Depth = 0;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
if (TII->isDS(MI.getOpcode()))
NeedWQM = true;
// Flat uses m0 in case it needs to access LDS.
if (TII->isFLAT(MI.getOpcode()))
NeedFlat = true;
switch (MI.getOpcode()) {
default: break;
case AMDGPU::SI_IF:
++Depth;
If(MI);
break;
case AMDGPU::SI_ELSE:
Else(MI);
break;
case AMDGPU::SI_BREAK:
Break(MI);
break;
case AMDGPU::SI_IF_BREAK:
IfBreak(MI);
break;
case AMDGPU::SI_ELSE_BREAK:
ElseBreak(MI);
break;
case AMDGPU::SI_LOOP:
++Depth;
Loop(MI);
break;
case AMDGPU::SI_END_CF:
if (--Depth == 0 && HaveKill) {
SkipIfDead(MI);
HaveKill = false;
}
EndCf(MI);
break;
case AMDGPU::SI_KILL:
if (Depth == 0)
SkipIfDead(MI);
else
HaveKill = true;
Kill(MI);
break;
case AMDGPU::S_BRANCH:
Branch(MI);
break;
case AMDGPU::SI_INDIRECT_SRC:
IndirectSrc(MI);
break;
case AMDGPU::SI_INDIRECT_DST_V1:
case AMDGPU::SI_INDIRECT_DST_V2:
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
IndirectDst(MI);
break;
case AMDGPU::V_INTERP_P1_F32:
case AMDGPU::V_INTERP_P2_F32:
case AMDGPU::V_INTERP_MOV_F32:
NeedWQM = true;
break;
}
}
}
if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
MachineBasicBlock &MBB = MF.front();
BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
}
// FIXME: This seems inappropriate to do here.
if (NeedFlat && MFI->IsKernel) {
// Insert the prologue initializing the SGPRs pointing to the scratch space
// for flat accesses.
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
// TODO: What to use with function calls?
// FIXME: This is reporting stack size that is used in a scratch buffer
// rather than registers as well.
uint64_t StackSizeBytes = FrameInfo->getStackSize();
int IndirectBegin
= static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
// Convert register index to 256-byte unit.
uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
"Stack limits should be smaller than 16-bits");
// Initialize the flat scratch register pair.
// TODO: Can we use one s_mov_b64 here?
// Offset is in units of 256-bytes.
MachineBasicBlock &MBB = MF.front();
DebugLoc NoDL;
MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
.addImm(StackOffset);
// Documentation says size is "per-thread scratch size in bytes"
BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
.addImm(StackSizeBytes);
}
return true;
}