R600: Add a pass that merge Vector Register

Previously commited @183279 but tests were failing, reverted @183286
It was broken because @183336 was missing, now it's there.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183343 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Vincent Lejeune 2013-06-05 21:38:04 +00:00
parent f41d317054
commit f3d6e32c09
5 changed files with 400 additions and 0 deletions

View File

@ -23,6 +23,7 @@ class AMDGPUTargetMachine;
// R600 Passes
FunctionPass* createR600TextureIntrinsicsReplacer();
FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
FunctionPass *createR600Packetizer(TargetMachine &tm);

View File

@ -130,6 +130,11 @@ bool AMDGPUPassConfig::addInstSelector() {
bool AMDGPUPassConfig::addPreRegAlloc() {
addPass(createAMDGPUConvertToISAPass(*TM));
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
addPass(createR600VectorRegMerger(*TM));
}
return false;
}

View File

@ -41,6 +41,7 @@ add_llvm_target(R600CodeGen
R600ISelLowering.cpp
R600MachineFunctionInfo.cpp
R600MachineScheduler.cpp
R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
R600TextureIntrinsicsReplacer.cpp

View File

@ -0,0 +1,363 @@
//===--------------------- R600MergeVectorRegisters.cpp -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass merges inputs of swizzeable instructions into vector sharing
/// common data and/or have enough undef subreg using swizzle abilities.
///
/// For instance let's consider the following pseudo code :
/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
/// ...
/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
///
/// is turned into :
/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
/// ...
/// vreg7<def> = INSERT_SUBREG vreg4, sub3
/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
///
/// This allow regalloc to reduce register pressure for vector registers and
/// to reduce MOV count.
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "vec-merger"
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "R600InstrInfo.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
namespace {
static bool
isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
for (MachineRegisterInfo::def_iterator It = MRI.def_begin(Reg),
E = MRI.def_end(); It != E; ++It) {
return (*It).isImplicitDef();
}
llvm_unreachable("Reg without a def");
return false;
}
class RegSeqInfo {
public:
MachineInstr *Instr;
DenseMap<unsigned, unsigned> RegToChan;
std::vector<unsigned> UndefReg;
RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
assert (MI->getOpcode() == AMDGPU::REG_SEQUENCE);
for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
MachineOperand &MO = Instr->getOperand(i);
unsigned Chan = Instr->getOperand(i + 1).getImm();
if (isImplicitlyDef(MRI, MO.getReg()))
UndefReg.push_back(Chan);
else
RegToChan[MO.getReg()] = Chan;
}
}
RegSeqInfo() {}
bool operator==(const RegSeqInfo &RSI) const {
return RSI.Instr == Instr;
}
};
class R600VectorRegMerger : public MachineFunctionPass {
private:
MachineRegisterInfo *MRI;
const R600InstrInfo *TII;
bool canSwizzle(const MachineInstr &) const;
bool areAllUsesSwizzeable(unsigned Reg) const;
void SwizzleInput(MachineInstr &,
const std::vector<std::pair<unsigned, unsigned> > &) const;
bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *,
std::vector<std::pair<unsigned, unsigned> > &Remap) const;
bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
std::vector<std::pair<unsigned, unsigned> > &RemapChan);
bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
std::vector<std::pair<unsigned, unsigned> > &RemapChan);
MachineInstr *RebuildVector(RegSeqInfo *MI,
const RegSeqInfo *BaseVec,
const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const;
void RemoveMI(MachineInstr *);
void trackRSI(const RegSeqInfo &RSI);
typedef DenseMap<unsigned, std::vector<MachineInstr *> > InstructionSetMap;
DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
InstructionSetMap PreviousRegSeqByReg;
InstructionSetMap PreviousRegSeqByUndefCount;
public:
static char ID;
R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
const char *getPassName() const {
return "R600 Vector Registers Merge Pass";
}
bool runOnMachineFunction(MachineFunction &Fn);
};
char R600VectorRegMerger::ID = 0;
bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
const {
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
return true;
switch (MI.getOpcode()) {
case AMDGPU::R600_ExportSwz:
case AMDGPU::EG_ExportSwz:
return true;
default:
return false;
}
}
bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned> > &Remap)
const {
unsigned CurrentUndexIdx = 0;
for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(),
E = ToMerge->RegToChan.end(); It != E; ++It) {
DenseMap<unsigned, unsigned>::const_iterator PosInUntouched =
Untouched->RegToChan.find((*It).first);
if (PosInUntouched != Untouched->RegToChan.end()) {
Remap.push_back(std::pair<unsigned, unsigned>
((*It).second, (*PosInUntouched).second));
continue;
}
if (CurrentUndexIdx >= Untouched->UndefReg.size())
return false;
Remap.push_back(std::pair<unsigned, unsigned>
((*It).second, Untouched->UndefReg[CurrentUndexIdx++]));
}
return true;
}
MachineInstr *R600VectorRegMerger::RebuildVector(
RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
unsigned Reg = RSI->Instr->getOperand(0).getReg();
MachineBasicBlock::iterator Pos = RSI->Instr;
MachineBasicBlock &MBB = *Pos->getParent();
DebugLoc DL = Pos->getDebugLoc();
unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
E = RSI->RegToChan.end(); It != E; ++It) {
if (BaseRSI->RegToChan.find((*It).first) != BaseRSI->RegToChan.end()) {
UpdatedRegToChan[(*It).first] = (*It).second;
continue;
}
unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
unsigned SubReg = (*It).first;
unsigned Swizzle = (*It).second;
unsigned Chan;
for (unsigned j = 0, je = RemapChan.size(); j < je; j++) {
if (RemapChan[j].first == Swizzle) {
Chan = RemapChan[j].second;
break;
}
}
MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
DstReg)
.addReg(SrcVec)
.addReg(SubReg)
.addImm(Chan);
UpdatedRegToChan[SubReg] = Chan;
for (std::vector<unsigned>::iterator RemoveIt = UpdatedUndef.begin(),
RemoveE = UpdatedUndef.end(); RemoveIt != RemoveE; ++ RemoveIt) {
if (*RemoveIt == Chan)
UpdatedUndef.erase(RemoveIt);
}
DEBUG(dbgs() << " ->"; Tmp->dump(););
SrcVec = DstReg;
}
Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg)
.addReg(SrcVec);
DEBUG(dbgs() << " ->"; Pos->dump(););
DEBUG(dbgs() << " Updating Swizzle:\n");
for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
E = MRI->use_end(); It != E; ++It) {
DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->");
SwizzleInput(*It, RemapChan);
DEBUG((*It).dump());
}
RSI->Instr->eraseFromParent();
// Update RSI
RSI->Instr = Pos;
RSI->RegToChan = UpdatedRegToChan;
RSI->UndefReg = UpdatedUndef;
return Pos;
}
void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
E = PreviousRegSeqByReg.end(); It != E; ++It) {
std::vector<MachineInstr *> &MIs = (*It).second;
MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
}
for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(),
E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
std::vector<MachineInstr *> &MIs = (*It).second;
MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
}
}
void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
unsigned Offset;
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
Offset = 2;
else
Offset = 3;
for (unsigned i = 0; i < 4; i++) {
unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1;
for (unsigned j = 0, e = RemapChan.size(); j < e; j++) {
if (RemapChan[j].first == Swizzle) {
MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1);
break;
}
}
}
}
bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
E = MRI->use_end(); It != E; ++It) {
if (!canSwizzle(*It))
return false;
}
return true;
}
bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
RegSeqInfo &CompatibleRSI,
std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(),
MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) {
if (!MOp->isReg())
continue;
if (PreviousRegSeqByReg[MOp->getReg()].empty())
continue;
std::vector<MachineInstr *> MIs = PreviousRegSeqByReg[MOp->getReg()];
for (unsigned i = 0, e = MIs.size(); i < e; i++) {
CompatibleRSI = PreviousRegSeq[MIs[i]];
if (RSI == CompatibleRSI)
continue;
if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan))
return true;
}
}
return false;
}
bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
RegSeqInfo &CompatibleRSI,
std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
unsigned NeededUndefs = 4 - RSI.UndefReg.size();
if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
return false;
std::vector<MachineInstr *> &MIs =
PreviousRegSeqByUndefCount[NeededUndefs];
CompatibleRSI = PreviousRegSeq[MIs.back()];
tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
return true;
}
void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
for (DenseMap<unsigned, unsigned>::const_iterator
It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) {
PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr);
}
PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr);
PreviousRegSeq[RSI.Instr] = RSI;
}
bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
MRI = &(Fn.getRegInfo());
for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
MBB != MBBe; ++MBB) {
MachineBasicBlock *MB = MBB;
PreviousRegSeq.clear();
PreviousRegSeqByReg.clear();
PreviousRegSeqByUndefCount.clear();
for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
MII != MIIE; ++MII) {
MachineInstr *MI = MII;
if (MI->getOpcode() != AMDGPU::REG_SEQUENCE)
continue;
RegSeqInfo RSI(*MRI, MI);
// All uses of MI are swizzeable ?
unsigned Reg = MI->getOperand(0).getReg();
if (!areAllUsesSwizzeable(Reg))
continue;
DEBUG (dbgs() << "Trying to optimize ";
MI->dump();
);
RegSeqInfo CandidateRSI;
std::vector<std::pair<unsigned, unsigned> > RemapChan;
DEBUG(dbgs() << "Using common slots...\n";);
if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
// Remove CandidateRSI mapping
RemoveMI(CandidateRSI.Instr);
MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
trackRSI(RSI);
continue;
}
DEBUG(dbgs() << "Using free slots...\n";);
RemapChan.clear();
if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
RemoveMI(CandidateRSI.Instr);
MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
trackRSI(RSI);
continue;
}
//Failed to merge
trackRSI(RSI);
}
}
return false;
}
}
llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
return new R600VectorRegMerger(tm);
}

View File

@ -0,0 +1,30 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
;CHECK-NOT: MOV
define void @test() {
%1 = call float @llvm.R600.load.input(i32 0)
%2 = call float @llvm.R600.load.input(i32 1)
%3 = call float @llvm.R600.load.input(i32 2)
%4 = call float @llvm.R600.load.input(i32 3)
%5 = fmul float %1, 3.0
%6 = fmul float %2, 3.0
%7 = fmul float %3, 3.0
%8 = fmul float %4, 3.0
%9 = insertelement <4 x float> undef, float %5, i32 0
%10 = insertelement <4 x float> %9, float %6, i32 1
%11 = insertelement <4 x float> undef, float %7, i32 0
%12 = insertelement <4 x float> %11, float %5, i32 1
%13 = insertelement <4 x float> undef, float %8, i32 0
%14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%17 = fadd <4 x float> %14, %15
%18 = fadd <4 x float> %17, %16
call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0)
ret void
}
declare float @llvm.R600.load.input(i32) readnone
declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)