[SystemZ] Use BRCT and BRCTG to eliminate add-&-compare sequences

This patch just uses a peephole test for "add; compare; branch" sequences
within a single block.  The IR optimizers already convert loops to
decrement-and-branch-on-nonzero form in some cases, so even this
simplistic test triggers many times during a clang bootstrap and
projects/test-suite run.  It looks like there are still cases where we
need to more strongly prefer branches on nonzero though.  E.g. I saw a
case where a loop that started out with a check for 0 ended up with a
check for -1.  I'll try to look at that sometime.

I ended up adding the Reference class because MachineInstr::readsRegister()
doesn't check for subregisters (by design, as far as I could tell).


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187723 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Richard Sandiford 2013-08-05 11:23:46 +00:00
parent f8e16c6f5a
commit 9379557478
8 changed files with 419 additions and 18 deletions

View File

@ -28,10 +28,38 @@
using namespace llvm;
STATISTIC(BranchOnCounts, "Number of branch-on-count instructions");
STATISTIC(EliminatedComparisons, "Number of eliminated comparisons");
STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions");
namespace {
// Records how a particular register is referenced by one or more
// instructions.
struct Reference {
  // Start out with no references of any kind recorded.
  Reference()
    : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {}

  // Merge the references recorded in Other into this object.  Each flag
  // ends up set if it was set in either operand.
  Reference &operator|=(const Reference &Other) {
    Use = Use || Other.Use;
    Def = Def || Other.Def;
    IndirectUse = IndirectUse || Other.IndirectUse;
    IndirectDef = IndirectDef || Other.IndirectDef;
    return *this;
  }

  // True if a definition or a use has been recorded.
  operator bool() const { return Use || Def; }

  // Set if the register is defined or used in any form, whether directly
  // or through a sub- or super-register.
  bool Def;
  bool Use;

  // Set if the register is defined or used indirectly, through a sub- or
  // super-register.
  bool IndirectDef;
  bool IndirectUse;
};
class SystemZElimCompare : public MachineFunctionPass {
public:
static char ID;
@ -46,6 +74,9 @@ namespace {
bool runOnMachineFunction(MachineFunction &F);
private:
Reference getRegReferences(MachineInstr *MI, unsigned Reg);
bool convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
bool convertToLoadAndTest(MachineInstr *MI);
bool adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
@ -99,6 +130,80 @@ static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) {
return false;
}
// Summarize how MI refers to Reg.  Operands that name a different register
// which overlaps Reg (such as a sub- or super-register) are counted too and
// are additionally flagged as indirect references.
Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) {
  Reference Result;
  const unsigned NumOperands = MI->getNumOperands();
  for (unsigned OpNo = 0; OpNo < NumOperands; ++OpNo) {
    const MachineOperand &Op = MI->getOperand(OpNo);
    // Only register operands that name an actual register are of interest.
    if (!Op.isReg())
      continue;
    unsigned OpReg = Op.getReg();
    if (!OpReg)
      continue;
    // A register other than Reg itself only matters if it overlaps Reg,
    // in which case the reference is an indirect one.
    bool IsIndirect = (OpReg != Reg);
    if (IsIndirect && !TRI->regsOverlap(OpReg, Reg))
      continue;
    if (Op.isUse()) {
      Result.Use = true;
      Result.IndirectUse |= IsIndirect;
    }
    if (Op.isDef()) {
      Result.Def = true;
      Result.IndirectDef |= IsIndirect;
    }
  }
  return Result;
}
// Compare tests the result of MI against zero.  If MI adds -1 to a register
// and the sole entry in CCUsers is a branch on nonzero, rewrite that branch
// as a BRCT(G) and unlink MI from its block.  Return true on success; on
// failure nothing is changed.
bool
SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
                                  SmallVectorImpl<MachineInstr *> &CCUsers) {
  // Pick the branch-on-count opcode that matches the kind of addition.
  unsigned BRCTOpcode;
  switch (MI->getOpcode()) {
  case SystemZ::AHI:
    BRCTOpcode = SystemZ::BRCT;
    break;
  case SystemZ::AGHI:
    BRCTOpcode = SystemZ::BRCTG;
    break;
  default:
    return false;
  }

  // The value being added must be -1.
  if (MI->getOperand(2).getImm() != -1)
    return false;

  // The comparison result must feed exactly one instruction, a JLH.
  if (CCUsers.size() != 1)
    return false;
  MachineInstr *Branch = CCUsers[0];
  if (Branch->getOpcode() != SystemZ::BRC)
    return false;
  if (Branch->getOperand(0).getImm() != SystemZ::CCMASK_ICMP)
    return false;
  if (Branch->getOperand(1).getImm() != SystemZ::CCMASK_CMP_NE)
    return false;

  // The caller has already established that nothing between MI and Compare
  // refers to the register.  Check the instructions that lie strictly
  // between Compare and Branch as well.
  unsigned SrcReg = Compare->getOperand(0).getReg();
  MachineBasicBlock::iterator Iter = Compare, End = Branch;
  ++Iter;
  while (Iter != End) {
    if (getRegReferences(Iter, SrcReg))
      return false;
    ++Iter;
  }

  // Everything checks out.  Save the branch target, strip the three explicit
  // BRC operands (highest index first so the remaining indices stay valid)
  // and rebuild Branch as a BRCT(G).
  MachineOperand Target(Branch->getOperand(2));
  Branch->RemoveOperand(2);
  Branch->RemoveOperand(1);
  Branch->RemoveOperand(0);
  Branch->setDesc(TII->get(BRCTOpcode));

  // The new branch takes over MI's first two operands, then the target,
  // and is marked as implicitly defining CC.
  MachineInstrBuilder Rebuilt(*Branch->getParent()->getParent(), Branch);
  Rebuilt.addOperand(MI->getOperand(0));
  Rebuilt.addOperand(MI->getOperand(1));
  Rebuilt.addOperand(Target);
  Rebuilt.addReg(SystemZ::CC, RegState::ImplicitDefine);

  MI->removeFromParent();
  return true;
}
// If MI is a load instruction, try to convert it into a LOAD AND TEST.
// Return true on success.
bool SystemZElimCompare::convertToLoadAndTest(MachineInstr *MI) {
@ -210,21 +315,32 @@ optimizeCompareZero(MachineInstr *Compare,
unsigned SrcSubReg = Compare->getOperand(0).getSubReg();
MachineBasicBlock *MBB = Compare->getParent();
MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB->begin();
bool SeenUseOfCC = false;
Reference CCRefs;
Reference SrcRefs;
while (MBBI != MBBE) {
--MBBI;
MachineInstr *MI = MBBI;
if (resultTests(MI, SrcReg, SrcSubReg) &&
((!SeenUseOfCC && convertToLoadAndTest(MI)) ||
adjustCCMasksForInstr(MI, Compare, CCUsers))) {
EliminatedComparisons += 1;
return true;
if (resultTests(MI, SrcReg, SrcSubReg)) {
// Try to remove both MI and Compare by converting a branch to BRCT(G).
// We don't care in this case whether CC is modified between MI and
// Compare.
if (!CCRefs.Use && !SrcRefs && convertToBRCT(MI, Compare, CCUsers)) {
BranchOnCounts += 1;
return true;
}
// Try to eliminate Compare by reusing a CC result from MI.
if ((!CCRefs && convertToLoadAndTest(MI)) ||
(!CCRefs.Def && adjustCCMasksForInstr(MI, Compare, CCUsers))) {
EliminatedComparisons += 1;
return true;
}
}
if (MI->modifiesRegister(SrcReg, TRI) ||
MI->modifiesRegister(SystemZ::CC, TRI))
SrcRefs |= getRegReferences(MI, SrcReg);
if (SrcRefs.Def)
return false;
CCRefs |= getRegReferences(MI, SystemZ::CC);
if (CCRefs.Use && CCRefs.Def)
return false;
if (MI->readsRegister(SystemZ::CC, TRI))
SeenUseOfCC = true;
}
return false;
}
@ -316,13 +432,12 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock *MBB) {
continue;
}
if (MI->definesRegister(SystemZ::CC, TRI)) {
Reference CCRefs(getRegReferences(MI, SystemZ::CC));
if (CCRefs.Def) {
CCUsers.clear();
CompleteCCUsers = true;
} else if (MI->modifiesRegister(SystemZ::CC, TRI))
CompleteCCUsers = false;
if (CompleteCCUsers && MI->readsRegister(SystemZ::CC, TRI))
CompleteCCUsers = !CCRefs.IndirectDef;
}
if (CompleteCCUsers && CCRefs.Use)
CCUsers.push_back(MI);
}
return Changed;

View File

@ -684,6 +684,14 @@ SystemZInstrInfo::getBranchInfo(const MachineInstr *MI) const {
MI->getOperand(0).getImm(),
MI->getOperand(1).getImm(), &MI->getOperand(2));
case SystemZ::BRCT:
return SystemZII::Branch(SystemZII::BranchCT, SystemZ::CCMASK_ICMP,
SystemZ::CCMASK_CMP_NE, &MI->getOperand(2));
case SystemZ::BRCTG:
return SystemZII::Branch(SystemZII::BranchCTG, SystemZ::CCMASK_ICMP,
SystemZ::CCMASK_CMP_NE, &MI->getOperand(2));
case SystemZ::CIJ:
case SystemZ::CRJ:
return SystemZII::Branch(SystemZII::BranchC, SystemZ::CCMASK_ICMP,

View File

@ -69,7 +69,15 @@ namespace SystemZII {
// An instruction that performs a 64-bit signed comparison and branches
// on the result.
BranchCG
BranchCG,
// An instruction that decrements a 32-bit register and branches if
// the result is nonzero.
BranchCT,
// An instruction that decrements a 64-bit register and branches if
// the result is nonzero.
BranchCTG
};
// Information about a branch instruction.
struct Branch {

View File

@ -148,6 +148,7 @@ namespace {
bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address);
bool mustRelaxABranch();
void setWorstCaseAddresses();
void splitBranchOnCount(MachineInstr *MI, unsigned AddOpcode);
void splitCompareBranch(MachineInstr *MI, unsigned CompareOpcode);
void relaxBranch(TerminatorInfo &Terminator);
void relaxBranches();
@ -218,6 +219,11 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
// Relaxes to BRCL, which is 2 bytes longer.
Terminator.ExtraRelaxSize = 2;
break;
case SystemZ::BRCT:
case SystemZ::BRCTG:
// Relaxes to A(G)HI and BRCL, which is 6 bytes longer.
Terminator.ExtraRelaxSize = 6;
break;
case SystemZ::CRJ:
// Relaxes to a CR/BRCL sequence, which is 2 bytes longer.
Terminator.ExtraRelaxSize = 2;
@ -330,6 +336,25 @@ void SystemZLongBranch::setWorstCaseAddresses() {
}
}
// Expand the BRANCH ON COUNT instruction MI into two instructions inserted
// before it: an addition of -1 using AddOpcode, followed by a BRCL that
// branches on the result.  MI itself is then erased.
void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI,
                                           unsigned AddOpcode) {
  MachineBasicBlock *Block = MI->getParent();
  DebugLoc Loc = MI->getDebugLoc();

  // Emit the decrement, reusing the first two operands of MI.
  MachineInstrBuilder Add = BuildMI(*Block, MI, Loc, TII->get(AddOpcode));
  Add.addOperand(MI->getOperand(0));
  Add.addOperand(MI->getOperand(1));
  Add.addImm(-1);

  // Emit the long branch, which tests for CCMASK_CMP_NE and goes to the
  // target held in operand 2 of MI.
  MachineInstrBuilder LongBranch =
    BuildMI(*Block, MI, Loc, TII->get(SystemZ::BRCL));
  LongBranch.addImm(SystemZ::CCMASK_ICMP);
  LongBranch.addImm(SystemZ::CCMASK_CMP_NE);
  LongBranch.addOperand(MI->getOperand(2));
  MachineInstr *BRCL = LongBranch;

  // The implicit use of CC is a killing use.
  BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
  MI->eraseFromParent();
}
// Split MI into the comparison given by CompareOpcode followed
// by a BRCL on the result.
void SystemZLongBranch::splitCompareBranch(MachineInstr *MI,
@ -358,6 +383,12 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
case SystemZ::BRC:
Branch->setDesc(TII->get(SystemZ::BRCL));
break;
case SystemZ::BRCT:
splitBranchOnCount(Branch, SystemZ::AHI);
break;
case SystemZ::BRCTG:
splitBranchOnCount(Branch, SystemZ::AGHI);
break;
case SystemZ::CRJ:
splitCompareBranch(Branch, SystemZ::CR);
break;

View File

@ -82,6 +82,9 @@ bool SystemZPassConfig::addPreEmitPass() {
// CC values (while still being worthwhile) and others that happen to make
// the CC result more useful than it was originally.
//
// Another reason is that we only want to use BRANCH ON COUNT in cases
// where we know that the count register is not going to be spilled.
//
// Doing it so late makes it more likely that a register will be reused
// between the comparison and the branch, but it isn't clear whether
// preventing that would be a win or not.

View File

@ -0,0 +1,68 @@
# Test 32-bit BRANCH RELATIVE ON COUNT in cases where some branches are out
# of range.
# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
# Construct:
#
# loopN:
# load of countN
# ...
# loop0:
# 0xffd8 bytes, from MVIY instructions
# conditional branch to loop0
# after0:
# ...
# decrement of countN
# conditional branch to loopN
# afterN:
#
# Each load occupies 4 bytes. Each decrement and branch occupies 4
# bytes if BRCT can be used, otherwise it occupies 10 bytes (AHI + BRCL).
# This means that loop 6 contains 5 * 4 + 0xffd8 + 5 * 4 == 0x10000 bytes
# and is therefore (just) in range. Loop 7 is out of range.
#
# CHECK: brct {{%r[0-9]+}}
# CHECK: brct {{%r[0-9]+}}
# CHECK: brct {{%r[0-9]+}}
# CHECK: brct {{%r[0-9]+}}
# CHECK: brct {{%r[0-9]+}}
# CHECK: brct {{%r[0-9]+}}
# CHECK: ahi {{%r[0-9]+}}, -1
# CHECK: jglh
# CHECK: ahi {{%r[0-9]+}}, -1
# CHECK: jglh
# Number of counted loops to generate; see the layout sketch in the header.
branch_blocks = 8
# Loop bound for the filler stores in loop0.  The header describes this as
# 0xffd8 bytes of MVIY instructions; the loop below steps by 6 per store.
main_size = 0xffd8

# Every print() call below takes exactly one string argument, so the script
# produces the same output under both Python 2 and Python 3.
print('define void @f1(i8 *%base, i32 *%counts) {')
print('entry:')

# Emit the loop headers from the highest-numbered loop down to loop0.  Each
# header loads its initial count before the loop label and merges it with
# the decremented count in a phi.
for i in range(branch_blocks - 1, -1, -1):
    print(' %%countptr%d = getelementptr i32 *%%counts, i64 %d' % (i, i))
    print(' %%initcount%d = load i32 *%%countptr%d' % (i, i))
    print(' br label %%loop%d' % i)

    print('loop%d:' % i)
    # block1 is the predecessor that enters the loop, block2 the block that
    # holds the back edge.
    block1 = 'entry' if i == branch_blocks - 1 else 'loop%d' % (i + 1)
    block2 = 'loop0' if i == 0 else 'after%d' % (i - 1)
    print(' %%count%d = phi i32 [ %%initcount%d, %%%s ],'
          ' [ %%nextcount%d, %%%s ]' % (i, i, block1, i, block2))

# Fill loop0 with volatile byte stores.  The offsets and values are derived
# from a Fibonacci sequence, so they vary but are fully deterministic.
a, b = 1, 1
for i in range(0, main_size, 6):
    a, b = b, a + b
    offset = 4096 + b % 500000
    value = a % 256
    print(' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset))
    print(' store volatile i8 %d, i8 *%%ptr%d' % (value, i))

# Close the loops from loop0 outwards: decrement the count, branch back to
# the loop header while it is nonzero, otherwise fall into the after block.
for i in range(branch_blocks):
    print(' %%nextcount%d = add i32 %%count%d, -1' % (i, i))
    print(' %%test%d = icmp ne i32 %%nextcount%d, 0' % (i, i))
    print(' br i1 %%test%d, label %%loop%d, label %%after%d' % (i, i, i))
    print('')
    print('after%d:' % i)

print(' ret void')
print('}')

View File

@ -0,0 +1,69 @@
# Test 64-bit BRANCH RELATIVE ON COUNT in cases where some branches are out
# of range.
# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
# Construct:
#
# loopN:
# load of countN
# ...
# loop0:
# 0xffd8 bytes, from MVIY instructions
# conditional branch to loop0
# after0:
# ...
# decrement of countN
# conditional branch to loopN
# afterN:
#
# Each load occupies 6 bytes. Each decrement and branch occupies 4
# bytes if BRCTG can be used, otherwise it occupies 10 bytes (AGHI + BRCL).
# This means that loop 5 contains 4 * 6 + 0xffd8 + 4 * 4 == 0x10000 bytes
# and is therefore (just) in range. Loop 6 is out of range.
#
# CHECK: brctg {{%r[0-9]+}}
# CHECK: brctg {{%r[0-9]+}}
# CHECK: brctg {{%r[0-9]+}}
# CHECK: brctg {{%r[0-9]+}}
# CHECK: brctg {{%r[0-9]+}}
# CHECK: aghi {{%r[0-9]+}}, -1
# CHECK: jglh
# CHECK: aghi {{%r[0-9]+}}, -1
# CHECK: jglh
# CHECK: aghi {{%r[0-9]+}}, -1
# CHECK: jglh
# Number of counted loops to generate; see the layout sketch in the header.
branch_blocks = 8
# Loop bound for the filler stores in loop0.  The header describes this as
# 0xffd8 bytes of MVIY instructions; the loop below steps by 6 per store.
main_size = 0xffd8

# Every print() call below takes exactly one string argument, so the script
# produces the same output under both Python 2 and Python 3.
print('define void @f1(i8 *%base, i64 *%counts) {')
print('entry:')

# Emit the loop headers from the highest-numbered loop down to loop0.  Each
# header loads its initial count before the loop label and merges it with
# the decremented count in a phi.
for i in range(branch_blocks - 1, -1, -1):
    print(' %%countptr%d = getelementptr i64 *%%counts, i64 %d' % (i, i))
    print(' %%initcount%d = load i64 *%%countptr%d' % (i, i))
    print(' br label %%loop%d' % i)

    print('loop%d:' % i)
    # block1 is the predecessor that enters the loop, block2 the block that
    # holds the back edge.
    block1 = 'entry' if i == branch_blocks - 1 else 'loop%d' % (i + 1)
    block2 = 'loop0' if i == 0 else 'after%d' % (i - 1)
    print(' %%count%d = phi i64 [ %%initcount%d, %%%s ],'
          ' [ %%nextcount%d, %%%s ]' % (i, i, block1, i, block2))

# Fill loop0 with volatile byte stores.  The offsets and values are derived
# from a Fibonacci sequence, so they vary but are fully deterministic.
a, b = 1, 1
for i in range(0, main_size, 6):
    a, b = b, a + b
    offset = 4096 + b % 500000
    value = a % 256
    print(' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset))
    print(' store volatile i8 %d, i8 *%%ptr%d' % (value, i))

# Close the loops from loop0 outwards: decrement the count, branch back to
# the loop header while it is nonzero, otherwise fall into the after block.
for i in range(branch_blocks):
    print(' %%nextcount%d = add i64 %%count%d, -1' % (i, i))
    print(' %%test%d = icmp ne i64 %%nextcount%d, 0' % (i, i))
    print(' br i1 %%test%d, label %%loop%d, label %%after%d' % (i, i, i))
    print('')
    print('after%d:' % i)

print(' ret void')
print('}')

View File

@ -5,7 +5,7 @@
; Test that strength reduction is applied to addresses with a scale factor,
; but that indexed addressing can still be used.
define void @f1(i32 *%dest, i32 %a) {
; CHECK-LABEL: f1
; CHECK-LABEL: f1:
; CHECK-NOT: sllg
; CHECK: st %r3, 0({{%r[1-5],%r[1-5]}})
; CHECK: br %r14
@ -23,3 +23,102 @@ loop:
exit:
ret void
}
; Test a loop that should be converted into dbr form and then use BRCT.
; The IR below counts %count upwards from 0 and leaves the loop once %next
; reaches 100.  The expected output instead loads 100 into a register and
; closes the loop with a single brct on that register.
define void @f2(i32 *%src, i32 *%dest) {
; CHECK-LABEL: f2:
; CHECK: lhi [[REG:%r[0-5]]], 100
; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop
; CHECK: brct [[REG]], [[LABEL]]
; CHECK: br %r14
entry:
br label %loop
loop:
; %count is the induction variable: 0 on entry, %next on the back edge.
%count = phi i32 [ 0, %entry ], [ %next, %loop.next ]
%next = add i32 %count, 1
%val = load volatile i32 *%src
; Skip the store when the loaded value is zero.
%cmp = icmp eq i32 %val, 0
br i1 %cmp, label %loop.next, label %loop.store
loop.store:
%add = add i32 %val, 1
store volatile i32 %add, i32 *%dest
br label %loop.next
loop.next:
; Go round again until %next reaches 100.
%cont = icmp ne i32 %next, 100
br i1 %cont, label %loop, label %exit
exit:
ret void
}
; Like f2, but for BRCTG.
; The counter and the memory accesses are i64 here, and the expected output
; uses lghi and brctg.
define void @f3(i64 *%src, i64 *%dest) {
; CHECK-LABEL: f3:
; CHECK: lghi [[REG:%r[0-5]]], 100
; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop
; CHECK: brctg [[REG]], [[LABEL]]
; CHECK: br %r14
entry:
br label %loop
loop:
; %count is the induction variable: 0 on entry, %next on the back edge.
%count = phi i64 [ 0, %entry ], [ %next, %loop.next ]
%next = add i64 %count, 1
%val = load volatile i64 *%src
; Skip the store when the loaded value is zero.
%cmp = icmp eq i64 %val, 0
br i1 %cmp, label %loop.next, label %loop.store
loop.store:
%add = add i64 %val, 1
store volatile i64 %add, i64 *%dest
br label %loop.next
loop.next:
; Go round again until %next reaches 100.
%cont = icmp ne i64 %next, 100
br i1 %cont, label %loop, label %exit
exit:
ret void
}
; Test a loop with a 64-bit decremented counter in which the 32-bit
; low part of the counter is used after the decrement. This is an example
; of a subregister use being the only thing that blocks a conversion to BRCTG.
; The expected output therefore keeps a separate aghi and jne.
define void @f4(i32 *%src, i32 *%dest, i64 *%dest2, i64 %count) {
; CHECK-LABEL: f4:
; CHECK: aghi [[REG:%r[0-5]]], -1
; CHECK: lr [[REG2:%r[0-5]]], [[REG]]
; CHECK: stg [[REG2]],
; CHECK: jne {{\..*}}
; CHECK: br %r14
entry:
br label %loop
loop:
; %left is the remaining count: %count on entry, %next on the back edge.
%left = phi i64 [ %count, %entry ], [ %next, %loop.next ]
store volatile i64 %left, i64 *%dest2
%val = load volatile i32 *%src
; Skip the store when the loaded value is zero.
%cmp = icmp eq i32 %val, 0
br i1 %cmp, label %loop.next, label %loop.store
loop.store:
%add = add i32 %val, 1
store volatile i32 %add, i32 *%dest
br label %loop.next
loop.next:
%next = add i64 %left, -1
; Build a 64-bit value with %val in the high 32 bits and the low 32 bits of
; the decremented counter in the low 32 bits, then store it.
%ext = zext i32 %val to i64
%shl = shl i64 %ext, 32
%and = and i64 %next, 4294967295
%or = or i64 %shl, %and
store volatile i64 %or, i64 *%dest2
; Go round again while the decremented counter is nonzero.
%cont = icmp ne i64 %next, 0
br i1 %cont, label %loop, label %exit
exit:
ret void
}