From 93795574785de252703591e7fcc8f052c762f25e Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Mon, 5 Aug 2013 11:23:46 +0000 Subject: [PATCH] [SystemZ] Use BRCT and BRCTG to eliminate add-&-compare sequences This patch just uses a peephole test for "add; compare; branch" sequences within a single block. The IR optimizers already convert loops to decrement-and-branch-on-nonzero form in some cases, so even this simplistic test triggers many times during a clang bootstrap and projects/test-suite run. It looks like there are still cases where we need to more strongly prefer branches on nonzero though. E.g. I saw a case where a loop that started out with a check for 0 ended up with a check for -1. I'll try to look at that sometime. I ended up adding the Reference class because MachineInstr::readsRegister() doesn't check for subregisters (by design, as far as I could tell). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187723 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/SystemZ/SystemZElimCompare.cpp | 147 ++++++++++++++++-- lib/Target/SystemZ/SystemZInstrInfo.cpp | 8 + lib/Target/SystemZ/SystemZInstrInfo.h | 10 +- lib/Target/SystemZ/SystemZLongBranch.cpp | 31 ++++ lib/Target/SystemZ/SystemZTargetMachine.cpp | 3 + test/CodeGen/SystemZ/Large/branch-range-07.py | 68 ++++++++ test/CodeGen/SystemZ/Large/branch-range-08.py | 69 ++++++++ test/CodeGen/SystemZ/loop-01.ll | 101 +++++++++++- 8 files changed, 419 insertions(+), 18 deletions(-) create mode 100644 test/CodeGen/SystemZ/Large/branch-range-07.py create mode 100644 test/CodeGen/SystemZ/Large/branch-range-08.py diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp index bcdc5b728f0..07afc86acba 100644 --- a/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -28,10 +28,38 @@ using namespace llvm; +STATISTIC(BranchOnCounts, "Number of branch-on-count instructions"); STATISTIC(EliminatedComparisons, "Number of eliminated comparisons"); STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions"); namespace { + // Represents the references to a particular register in one or more + // instructions. + struct Reference { + Reference() + : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {} + + Reference &operator|=(const Reference &Other) { + Def |= Other.Def; + IndirectDef |= Other.IndirectDef; + Use |= Other.Use; + IndirectUse |= Other.IndirectUse; + return *this; + } + + operator bool() const { return Def || Use; } + + // True if the register is defined or used in some form, either directly or + // via a sub- or super-register. + bool Def; + bool Use; + + // True if the register is defined or used indirectly, by a sub- or + // super-register. + bool IndirectDef; + bool IndirectUse; + }; + class SystemZElimCompare : public MachineFunctionPass { public: static char ID; @@ -46,6 +74,9 @@ namespace { bool runOnMachineFunction(MachineFunction &F); private: + Reference getRegReferences(MachineInstr *MI, unsigned Reg); + bool convertToBRCT(MachineInstr *MI, MachineInstr *Compare, + SmallVectorImpl &CCUsers); bool convertToLoadAndTest(MachineInstr *MI); bool adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare, SmallVectorImpl &CCUsers); @@ -99,6 +130,80 @@ static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) { return false; } +// Describe the references to Reg in MI, including sub- and super-registers. +Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) { + Reference Ref; + for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { + const MachineOperand &MO = MI->getOperand(I); + if (MO.isReg()) { + if (unsigned MOReg = MO.getReg()) { + if (MOReg == Reg || TRI->regsOverlap(MOReg, Reg)) { + if (MO.isUse()) { + Ref.Use = true; + Ref.IndirectUse |= (MOReg != Reg); + } + if (MO.isDef()) { + Ref.Def = true; + Ref.IndirectDef |= (MOReg != Reg); + } + } + } + } + } + return Ref; +} + +// Compare compares the result of MI against zero. If MI is an addition +// of -1 and if CCUsers is a single branch on nonzero, eliminate the addition +// and convert the branch to a BRCT(G). Return true on success. +bool +SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare, + SmallVectorImpl &CCUsers) { + // Check whether we have an addition of -1. + unsigned Opcode = MI->getOpcode(); + unsigned BRCT; + if (Opcode == SystemZ::AHI) + BRCT = SystemZ::BRCT; + else if (Opcode == SystemZ::AGHI) + BRCT = SystemZ::BRCTG; + else + return false; + if (MI->getOperand(2).getImm() != -1) + return false; + + // Check whether we have a single JLH. + if (CCUsers.size() != 1) + return false; + MachineInstr *Branch = CCUsers[0]; + if (Branch->getOpcode() != SystemZ::BRC || + Branch->getOperand(0).getImm() != SystemZ::CCMASK_ICMP || + Branch->getOperand(1).getImm() != SystemZ::CCMASK_CMP_NE) + return false; + + // We already know that there are no references to the register between + // MI and Compare. Make sure that there are also no references between + // Compare and Branch. + unsigned SrcReg = Compare->getOperand(0).getReg(); + MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch; + for (++MBBI; MBBI != MBBE; ++MBBI) + if (getRegReferences(MBBI, SrcReg)) + return false; + + // The transformation is OK. Rebuild Branch as a BRCT(G). + MachineOperand Target(Branch->getOperand(2)); + Branch->RemoveOperand(2); + Branch->RemoveOperand(1); + Branch->RemoveOperand(0); + Branch->setDesc(TII->get(BRCT)); + MachineInstrBuilder(*Branch->getParent()->getParent(), Branch) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(Target) + .addReg(SystemZ::CC, RegState::ImplicitDefine); + MI->removeFromParent(); + return true; +} + // If MI is a load instruction, try to convert it into a LOAD AND TEST. // Return true on success. bool SystemZElimCompare::convertToLoadAndTest(MachineInstr *MI) { @@ -210,21 +315,32 @@ optimizeCompareZero(MachineInstr *Compare, unsigned SrcSubReg = Compare->getOperand(0).getSubReg(); MachineBasicBlock *MBB = Compare->getParent(); MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB->begin(); - bool SeenUseOfCC = false; + Reference CCRefs; + Reference SrcRefs; while (MBBI != MBBE) { --MBBI; MachineInstr *MI = MBBI; - if (resultTests(MI, SrcReg, SrcSubReg) && - ((!SeenUseOfCC && convertToLoadAndTest(MI)) || - adjustCCMasksForInstr(MI, Compare, CCUsers))) { - EliminatedComparisons += 1; - return true; + if (resultTests(MI, SrcReg, SrcSubReg)) { + // Try to remove both MI and Compare by converting a branch to BRCT(G). + // We don't care in this case whether CC is modified between MI and + // Compare. + if (!CCRefs.Use && !SrcRefs && convertToBRCT(MI, Compare, CCUsers)) { + BranchOnCounts += 1; + return true; + } + // Try to eliminate Compare by reusing a CC result from MI. + if ((!CCRefs && convertToLoadAndTest(MI)) || + (!CCRefs.Def && adjustCCMasksForInstr(MI, Compare, CCUsers))) { + EliminatedComparisons += 1; + return true; + } } - if (MI->modifiesRegister(SrcReg, TRI) || - MI->modifiesRegister(SystemZ::CC, TRI)) + SrcRefs |= getRegReferences(MI, SrcReg); + if (SrcRefs.Def) + return false; + CCRefs |= getRegReferences(MI, SystemZ::CC); + if (CCRefs.Use && CCRefs.Def) return false; - if (MI->readsRegister(SystemZ::CC, TRI)) - SeenUseOfCC = true; } return false; } @@ -316,13 +432,12 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock *MBB) { continue; } - if (MI->definesRegister(SystemZ::CC, TRI)) { + Reference CCRefs(getRegReferences(MI, SystemZ::CC)); + if (CCRefs.Def) { CCUsers.clear(); - CompleteCCUsers = true; - } else if (MI->modifiesRegister(SystemZ::CC, TRI)) - CompleteCCUsers = false; - - if (CompleteCCUsers && MI->readsRegister(SystemZ::CC, TRI)) + CompleteCCUsers = !CCRefs.IndirectDef; + } + if (CompleteCCUsers && CCRefs.Use) CCUsers.push_back(MI); } return Changed; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 5dd8d98d27d..c2a6a7f1018 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -684,6 +684,14 @@ SystemZInstrInfo::getBranchInfo(const MachineInstr *MI) const { MI->getOperand(0).getImm(), MI->getOperand(1).getImm(), &MI->getOperand(2)); + case SystemZ::BRCT: + return SystemZII::Branch(SystemZII::BranchCT, SystemZ::CCMASK_ICMP, + SystemZ::CCMASK_CMP_NE, &MI->getOperand(2)); + + case SystemZ::BRCTG: + return SystemZII::Branch(SystemZII::BranchCTG, SystemZ::CCMASK_ICMP, + SystemZ::CCMASK_CMP_NE, &MI->getOperand(2)); + case SystemZ::CIJ: case SystemZ::CRJ: return SystemZII::Branch(SystemZII::BranchC, SystemZ::CCMASK_ICMP, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index 1392745672f..b12b471a4da 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -69,7 +69,15 @@ namespace SystemZII { // An instruction that peforms a 64-bit signed comparison and branches // on the result. - BranchCG + BranchCG, + + // An instruction that decrements a 32-bit register and branches if + // the result is nonzero. + BranchCT, + + // An instruction that decrements a 64-bit register and branches if + // the result is nonzero. + BranchCTG }; // Information about a branch instruction. struct Branch { diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp index c5c4cab6afd..114f74e14ac 100644 --- a/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -148,6 +148,7 @@ namespace { bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address); bool mustRelaxABranch(); void setWorstCaseAddresses(); + void splitBranchOnCount(MachineInstr *MI, unsigned AddOpcode); void splitCompareBranch(MachineInstr *MI, unsigned CompareOpcode); void relaxBranch(TerminatorInfo &Terminator); void relaxBranches(); @@ -218,6 +219,11 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) { // Relaxes to BRCL, which is 2 bytes longer. Terminator.ExtraRelaxSize = 2; break; + case SystemZ::BRCT: + case SystemZ::BRCTG: + // Relaxes to A(G)HI and BRCL, which is 6 bytes longer. + Terminator.ExtraRelaxSize = 6; + break; case SystemZ::CRJ: // Relaxes to a CR/BRCL sequence, which is 2 bytes longer. Terminator.ExtraRelaxSize = 2; @@ -330,6 +336,25 @@ void SystemZLongBranch::setWorstCaseAddresses() { } } +// Split BRANCH ON COUNT MI into the addition given by AddOpcode followed +// by a BRCL on the result. +void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI, + unsigned AddOpcode) { + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*MBB, MI, DL, TII->get(AddOpcode)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(-1); + MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL)) + .addImm(SystemZ::CCMASK_ICMP) + .addImm(SystemZ::CCMASK_CMP_NE) + .addOperand(MI->getOperand(2)); + // The implicit use of CC is a killing use. + BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo()); + MI->eraseFromParent(); +} + // Split MI into the comparison given by CompareOpcode followed // a BRCL on the result. void SystemZLongBranch::splitCompareBranch(MachineInstr *MI, @@ -358,6 +383,12 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) { case SystemZ::BRC: Branch->setDesc(TII->get(SystemZ::BRCL)); break; + case SystemZ::BRCT: + splitBranchOnCount(Branch, SystemZ::AHI); + break; + case SystemZ::BRCTG: + splitBranchOnCount(Branch, SystemZ::AGHI); + break; case SystemZ::CRJ: splitCompareBranch(Branch, SystemZ::CR); break; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index 2bacc2bc24f..856183c6f49 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -82,6 +82,9 @@ bool SystemZPassConfig::addPreEmitPass() { // CC values (while still being worthwhile) and others that happen to make // the CC result more useful than it was originally. // + // Another reason is that we only want to use BRANCH ON COUNT in cases + // where we know that the count register is not going to be spilled. + // // Doing it so late makes it more likely that a register will be reused // between the comparison and the branch, but it isn't clear whether // preventing that would be a win or not. diff --git a/test/CodeGen/SystemZ/Large/branch-range-07.py b/test/CodeGen/SystemZ/Large/branch-range-07.py new file mode 100644 index 00000000000..90c442092e8 --- /dev/null +++ b/test/CodeGen/SystemZ/Large/branch-range-07.py @@ -0,0 +1,68 @@ +# Test 32-bit BRANCH RELATIVE ON COUNT in cases where some branches are out +# of range. +# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s + +# Construct: +# +# loopN: +# load of countN +# ... +# loop0: +# 0xffd8 bytes, from MVIY instructions +# conditional branch to main +# after0: +# ... +# decrement of countN +# conditional branch to loopN +# afterN: +# +# Each load occupies 4 bytes. Each decrement and branch occupies 4 +# bytes if BRCT can be used, otherwise it occupies 10 bytes (AHI + BRCL). +# This means that loop 6 contains 5 * 4 + 0xffd8 + 5 * 4 == 0x10000 bytes +# and is therefore (just) in range. Loop 7 is out of range. +# +# CHECK: brct {{%r[0-9]+}} +# CHECK: brct {{%r[0-9]+}} +# CHECK: brct {{%r[0-9]+}} +# CHECK: brct {{%r[0-9]+}} +# CHECK: brct {{%r[0-9]+}} +# CHECK: brct {{%r[0-9]+}} +# CHECK: ahi {{%r[0-9]+}}, -1 +# CHECK: jglh +# CHECK: ahi {{%r[0-9]+}}, -1 +# CHECK: jglh + +branch_blocks = 8 +main_size = 0xffd8 + +print 'define void @f1(i8 *%base, i32 *%counts) {' +print 'entry:' + +for i in xrange(branch_blocks - 1, -1, -1): + print ' %%countptr%d = getelementptr i32 *%%counts, i64 %d' % (i, i) + print ' %%initcount%d = load i32 *%%countptr%d' % (i, i) + print ' br label %%loop%d' % i + + print 'loop%d:' % i + block1 = 'entry' if i == branch_blocks - 1 else 'loop%d' % (i + 1) + block2 = 'loop0' if i == 0 else 'after%d' % (i - 1) + print (' %%count%d = phi i32 [ %%initcount%d, %%%s ],' + ' [ %%nextcount%d, %%%s ]' % (i, i, block1, i, block2)) + +a, b = 1, 1 +for i in xrange(0, main_size, 6): + a, b = b, a + b + offset = 4096 + b % 500000 + value = a % 256 + print ' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset) + print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i) + +for i in xrange(branch_blocks): + print ' %%nextcount%d = add i32 %%count%d, -1' % (i, i) + print ' %%test%d = icmp ne i32 %%nextcount%d, 0' % (i, i) + print ' br i1 %%test%d, label %%loop%d, label %%after%d' % (i, i, i) + print '' + print 'after%d:' % i + +print ' ret void' +print '}' diff --git a/test/CodeGen/SystemZ/Large/branch-range-08.py b/test/CodeGen/SystemZ/Large/branch-range-08.py new file mode 100644 index 00000000000..ac1b1370a3e --- /dev/null +++ b/test/CodeGen/SystemZ/Large/branch-range-08.py @@ -0,0 +1,69 @@ +# Test 64-bit BRANCH RELATIVE ON COUNT in cases where some branches are out +# of range. +# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s + +# Construct: +# +# loopN: +# load of countN +# ... +# loop0: +# 0xffd8 bytes, from MVIY instructions +# conditional branch to main +# after0: +# ... +# decrement of countN +# conditional branch to loopN +# afterN: +# +# Each load occupies 6 bytes. Each decrement and branch occupies 4 +# bytes if BRCTG can be used, otherwise it occupies 10 bytes (AGHI + BRCL). +# This means that loop 5 contains 4 * 6 + 0xffd8 + 4 * 4 == 0x10000 bytes +# and is therefore (just) in range. Loop 6 is out of range. +# +# CHECK: brctg {{%r[0-9]+}} +# CHECK: brctg {{%r[0-9]+}} +# CHECK: brctg {{%r[0-9]+}} +# CHECK: brctg {{%r[0-9]+}} +# CHECK: brctg {{%r[0-9]+}} +# CHECK: aghi {{%r[0-9]+}}, -1 +# CHECK: jglh +# CHECK: aghi {{%r[0-9]+}}, -1 +# CHECK: jglh +# CHECK: aghi {{%r[0-9]+}}, -1 +# CHECK: jglh + +branch_blocks = 8 +main_size = 0xffd8 + +print 'define void @f1(i8 *%base, i64 *%counts) {' +print 'entry:' + +for i in xrange(branch_blocks - 1, -1, -1): + print ' %%countptr%d = getelementptr i64 *%%counts, i64 %d' % (i, i) + print ' %%initcount%d = load i64 *%%countptr%d' % (i, i) + print ' br label %%loop%d' % i + + print 'loop%d:' % i + block1 = 'entry' if i == branch_blocks - 1 else 'loop%d' % (i + 1) + block2 = 'loop0' if i == 0 else 'after%d' % (i - 1) + print (' %%count%d = phi i64 [ %%initcount%d, %%%s ],' + ' [ %%nextcount%d, %%%s ]' % (i, i, block1, i, block2)) + +a, b = 1, 1 +for i in xrange(0, main_size, 6): + a, b = b, a + b + offset = 4096 + b % 500000 + value = a % 256 + print ' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset) + print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i) + +for i in xrange(branch_blocks): + print ' %%nextcount%d = add i64 %%count%d, -1' % (i, i) + print ' %%test%d = icmp ne i64 %%nextcount%d, 0' % (i, i) + print ' br i1 %%test%d, label %%loop%d, label %%after%d' % (i, i, i) + print '' + print 'after%d:' % i + +print ' ret void' +print '}' diff --git a/test/CodeGen/SystemZ/loop-01.ll b/test/CodeGen/SystemZ/loop-01.ll index 025a34eaf5c..58008017356 100644 --- a/test/CodeGen/SystemZ/loop-01.ll +++ b/test/CodeGen/SystemZ/loop-01.ll @@ -5,7 +5,7 @@ ; Test that strength reduction is applied to addresses with a scale factor, ; but that indexed addressing can still be used. define void @f1(i32 *%dest, i32 %a) { -; CHECK-LABEL: f1 +; CHECK-LABEL: f1: ; CHECK-NOT: sllg ; CHECK: st %r3, 0({{%r[1-5],%r[1-5]}}) ; CHECK: br %r14 @@ -23,3 +23,102 @@ loop: exit: ret void } + +; Test a loop that should be converted into dbr form and then use BRCT. +define void @f2(i32 *%src, i32 *%dest) { +; CHECK-LABEL: f2: +; CHECK: lhi [[REG:%r[0-5]]], 100 +; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop +; CHECK: brct [[REG]], [[LABEL]] +; CHECK: br %r14 +entry: + br label %loop + +loop: + %count = phi i32 [ 0, %entry ], [ %next, %loop.next ] + %next = add i32 %count, 1 + %val = load volatile i32 *%src + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %loop.next, label %loop.store + +loop.store: + %add = add i32 %val, 1 + store volatile i32 %add, i32 *%dest + br label %loop.next + +loop.next: + %cont = icmp ne i32 %next, 100 + br i1 %cont, label %loop, label %exit + +exit: + ret void +} + +; Like f2, but for BRCTG. +define void @f3(i64 *%src, i64 *%dest) { +; CHECK-LABEL: f3: +; CHECK: lghi [[REG:%r[0-5]]], 100 +; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop +; CHECK: brctg [[REG]], [[LABEL]] +; CHECK: br %r14 +entry: + br label %loop + +loop: + %count = phi i64 [ 0, %entry ], [ %next, %loop.next ] + %next = add i64 %count, 1 + %val = load volatile i64 *%src + %cmp = icmp eq i64 %val, 0 + br i1 %cmp, label %loop.next, label %loop.store + +loop.store: + %add = add i64 %val, 1 + store volatile i64 %add, i64 *%dest + br label %loop.next + +loop.next: + %cont = icmp ne i64 %next, 100 + br i1 %cont, label %loop, label %exit + +exit: + ret void +} + +; Test a loop with a 64-bit decremented counter in which the 32-bit +; low part of the counter is used after the decrement. This is an example +; of a subregister use being the only thing that blocks a conversion to BRCTG. +define void @f4(i32 *%src, i32 *%dest, i64 *%dest2, i64 %count) { +; CHECK-LABEL: f4: +; CHECK: aghi [[REG:%r[0-5]]], -1 +; CHECK: lr [[REG2:%r[0-5]]], [[REG]] +; CHECK: stg [[REG2]], +; CHECK: jne {{\..*}} +; CHECK: br %r14 +entry: + br label %loop + +loop: + %left = phi i64 [ %count, %entry ], [ %next, %loop.next ] + store volatile i64 %left, i64 *%dest2 + %val = load volatile i32 *%src + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %loop.next, label %loop.store + +loop.store: + %add = add i32 %val, 1 + store volatile i32 %add, i32 *%dest + br label %loop.next + +loop.next: + %next = add i64 %left, -1 + %ext = zext i32 %val to i64 + %shl = shl i64 %ext, 32 + %and = and i64 %next, 4294967295 + %or = or i64 %shl, %and + store volatile i64 %or, i64 *%dest2 + %cont = icmp ne i64 %next, 0 + br i1 %cont, label %loop, label %exit + +exit: + ret void +}