From 45e0010e14d419ebeca9266b9b705867fd251b83 Mon Sep 17 00:00:00 2001
From: Evan Cheng
Date: Fri, 8 May 2009 06:34:09 +0000
Subject: [PATCH] Optimize code placement in loops to eliminate unconditional
 branches, or move unconditional branches to the outside of the loop. e.g.

///        A:
///        ...
///        <fallthrough to B>
///
///        B:  --> loop header
///        ...
///        jcc C, [exit]
///
///        C:
///        ...
///        jmp B
///
/// ==>
///
///        A:
///        ...
///        jmp B
///
///        C:  --> new loop header
///        ...
///        <fallthrough to B>
///
///        B:
///        ...
///        jcc C, [exit]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@71209 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CodePlacementOpt.cpp   | 252 +++++++++++++++++++++++++++--
 lib/Target/X86/X86InstrInfo.cpp    |   2 +-
 test/CodeGen/X86/code_placement.ll | 134 +++++++++++++++
 3 files changed, 371 insertions(+), 17 deletions(-)
 create mode 100644 test/CodeGen/X86/code_placement.ll

diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp
index 924a96930b5..d297d5aceb6 100644
--- a/lib/CodeGen/CodePlacementOpt.cpp
+++ b/lib/CodeGen/CodePlacementOpt.cpp
@@ -16,15 +16,40 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
+static cl::opt<bool>
+OptLoopBBPlacement("opt-loop-bb-placement",
+                   cl::init(false), cl::Hidden,
+                   cl::desc("Optimize block placements in loops"));
+
+STATISTIC(NumHeaderAligned, "Number of loop headers aligned");
+STATISTIC(NumIntraElim,     "Number of intra-loop branches eliminated");
+STATISTIC(NumIntraMoved,    "Number of intra-loop branches moved");
+
 namespace {
   class CodePlacementOpt : public MachineFunctionPass {
     const MachineLoopInfo *MLI;
+    const TargetInstrInfo *TII;
+    const TargetLowering  *TLI;
+
+    /// ChangedMBBs - BBs which are modified by OptimizeIntraLoopEdges.
+    SmallPtrSet<MachineBasicBlock*, 8> ChangedMBBs;
+
+    /// UncondJmpMBBs - A list of BBs which are in loops and end with
+    /// unconditional branches.
+    SmallVector<std::pair<MachineBasicBlock*,MachineBasicBlock*>, 4>
+    UncondJmpMBBs;
+
+    /// LoopHeaders - A list of BBs which are loop headers.
+    SmallVector<MachineBasicBlock*, 4> LoopHeaders;
 
   public:
     static char ID;
@@ -37,12 +62,12 @@
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addRequired<MachineLoopInfo>();
-      AU.addPreserved<MachineLoopInfo>();
       AU.addPreservedID(MachineDominatorsID);
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
   private:
+    bool OptimizeIntraLoopEdges();
     bool AlignLoops(MachineFunction &MF);
   };
 
@@ -53,32 +78,199 @@ FunctionPass *llvm::createCodePlacementOptPass() {
   return new CodePlacementOpt();
 }
 
+/// OptimizeIntraLoopEdges - Rearrange the blocks of a loop so that
+/// unconditional branches inside the loop are either eliminated or moved to
+/// the outside of the loop.
+///
+///        A:
+///        ...
+///        <fallthrough to B>
+///
+///        B:  --> loop header
+///        ...
+///        jcc C, [exit]
+///
+///        C:
+///        ...
+///        jmp B
+///
+/// ==>
+///
+///        A:
+///        ...
+///        jmp B
+///
+///        C:  --> new loop header
+///        ...
+///        <fallthrough to B>
+///
+///        B:
+///        ...
+///        jcc C, [exit]
+///
+bool CodePlacementOpt::OptimizeIntraLoopEdges() {
+  if (!OptLoopBBPlacement)
+    return false;
+
+  bool Changed = false;
+  for (unsigned i = 0, e = UncondJmpMBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = UncondJmpMBBs[i].first;
+    MachineBasicBlock *SuccMBB = UncondJmpMBBs[i].second;
+    MachineLoop *L = MLI->getLoopFor(MBB);
+    assert(L && "BB is expected to be in a loop!");
+
+    if (ChangedMBBs.count(MBB)) {
+      // BB has been modified, re-analyze.
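+      // Re-run AnalyzeBranch on the rewritten terminators and give up on this
+      // BB unless it still ends in a lone unconditional branch whose target is
+      // a non-landing-pad block inside the same loop.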
+      MachineBasicBlock *TBB = 0, *FBB = 0;
+      SmallVector<MachineOperand, 4> Cond;
+      if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond) || !Cond.empty())
+        continue;
+      if (MLI->getLoopFor(TBB) != L || TBB->isLandingPad())
+        continue;
+      SuccMBB = TBB;
+    } else {
+      assert(MLI->getLoopFor(SuccMBB) == L &&
+             "Successor is not in the same loop!");
+    }
+
+    if (MBB->isLayoutSuccessor(SuccMBB)) {
+      // Successor is right after MBB, just eliminate the unconditional jmp.
+      // Can this happen?
+      TII->RemoveBranch(*MBB);
+      ChangedMBBs.insert(MBB);
+      ++NumIntraElim;
+      Changed = true;
+      continue;
+    }
+
+    // Now check whether SuccMBB is reached by fallthrough from some
+    // predecessor. If it is, that predecessor should be outside the loop,
+    // since the fallthrough edge will be turned into a jmp.
+    bool OkToMove = true;
+    MachineBasicBlock *FtMBB = 0, *FtTBB = 0, *FtFBB = 0;
+    SmallVector<MachineOperand, 4> FtCond;
+    for (MachineBasicBlock::pred_iterator PI = SuccMBB->pred_begin(),
+           PE = SuccMBB->pred_end(); PI != PE; ++PI) {
+      MachineBasicBlock *PredMBB = *PI;
+      if (PredMBB->isLayoutSuccessor(SuccMBB)) {
+        if (TII->AnalyzeBranch(*PredMBB, FtTBB, FtFBB, FtCond)) {
+          OkToMove = false;
+          break;
+        }
+        if (!FtTBB)
+          FtTBB = SuccMBB;
+        else if (!FtFBB) {
+          assert(FtFBB != SuccMBB && "Unexpected control flow!");
+          FtFBB = SuccMBB;
+        }
+
+        // A fallthrough.
+        FtMBB = PredMBB;
+        MachineLoop *PL = MLI->getLoopFor(PredMBB);
+        if (PL && (PL == L || PL->getLoopDepth() >= L->getLoopDepth())) {
+          OkToMove = false;
+          break;
+        }
+      }
+    }
+
+    if (!OkToMove)
+      continue;
+
+    // Is it profitable? If SuccMBB currently reaches its layout successor by
+    // fallthrough, that edge would have to be turned into a jmp after the
+    // move.
+    MachineBasicBlock *TBB = 0, *FBB = 0;
+    SmallVector<MachineOperand, 4> Cond;
+    if (TII->AnalyzeBranch(*SuccMBB, TBB, FBB, Cond))
+      continue;
+    if (!TBB && Cond.empty())
+      TBB = next(MachineFunction::iterator(SuccMBB));
+    else if (!FBB && !Cond.empty())
+      FBB = next(MachineFunction::iterator(SuccMBB));
+
+    // This calculates the cost of the transformation. It also finds the
+    // *only* intra-loop successor if there is one.
+    int Cost = 0;
+    bool HasOneIntraSucc = true;
+    MachineBasicBlock *IntraSucc = 0;
+    for (MachineBasicBlock::succ_iterator SI = SuccMBB->succ_begin(),
+           SE = SuccMBB->succ_end(); SI != SE; ++SI) {
+      MachineBasicBlock *SSMBB = *SI;
+      if (MLI->getLoopFor(SSMBB) == L) {
+        if (!IntraSucc)
+          IntraSucc = SSMBB;
+        else
+          HasOneIntraSucc = false;
+      }
+
+      if (SuccMBB->isLayoutSuccessor(SSMBB))
+        // This will become a jmp.
+        ++Cost;
+      else if (MBB->isLayoutSuccessor(SSMBB))
+        // One of the successors will become the new fallthrough.
+        if (SSMBB == FBB) {
+          FBB = 0;
+          --Cost;
+        } else if (!FBB && SSMBB == TBB && Cond.empty()) {
+          TBB = 0;
+          --Cost;
+        } else if (!TII->ReverseBranchCondition(Cond)) {
+          TBB = FBB;
+          FBB = 0;
+          --Cost;
+        }
+    }
+    if (Cost)
+      continue;
+
+    // Now, let's move the successor to below the BB to eliminate the jmp.
+    SuccMBB->moveAfter(MBB);
+    TII->RemoveBranch(*MBB);
+    TII->RemoveBranch(*SuccMBB);
+    if (TBB)
+      TII->InsertBranch(*SuccMBB, TBB, FBB, Cond);
+    ChangedMBBs.insert(MBB);
+    ChangedMBBs.insert(SuccMBB);
+    if (FtMBB) {
+      TII->RemoveBranch(*FtMBB);
+      TII->InsertBranch(*FtMBB, FtTBB, FtFBB, FtCond);
+      ChangedMBBs.insert(FtMBB);
+    }
+    Changed = true;
+    ++NumIntraMoved;
+
+    // If BB is the loop latch, we may have a new loop header.
+    if (MBB == L->getLoopLatch()) {
+      assert(MLI->isLoopHeader(SuccMBB) &&
+             "Only succ of loop latch is not the header?");
+      if (HasOneIntraSucc && IntraSucc)
+        std::replace(LoopHeaders.begin(), LoopHeaders.end(), SuccMBB, IntraSucc);
+    }
+  }
+
+  return Changed;
+}
+
 /// AlignLoops - Align loop headers to target preferred alignments.
 ///
 bool CodePlacementOpt::AlignLoops(MachineFunction &MF) {
-  const TargetLowering *TLI = MF.getTarget().getTargetLowering();
-  if (!TLI)
+  const Function *F = MF.getFunction();
+  if (F->hasFnAttr(Attribute::OptimizeForSize))
     return false;
 
   unsigned Align = TLI->getPrefLoopAlignment();
   if (!Align)
     return false;  // Don't care about loop alignment.
 
-  const Function *F = MF.getFunction();
-  if (F->hasFnAttr(Attribute::OptimizeForSize))
-    return false;
+  // Make sure blocks are numbered in order.
+  MF.RenumberBlocks();
 
   bool Changed = false;
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
-    MachineBasicBlock *MBB = I;
-    if (MLI->isLoopHeader(MBB)) {
-      MachineBasicBlock *PredBB = prior(I);
-      if (MLI->getLoopFor(MBB) == MLI->getLoopFor(PredBB))
-        // If previously BB is in the same loop, don't align this BB. We want
-        // to prevent adding noop's inside a loop.
-        continue;
-      MBB->setAlignment(Align);
+  for (unsigned i = 0, e = LoopHeaders.size(); i != e; ++i) {
+    MachineBasicBlock *HeaderMBB = LoopHeaders[i];
+    MachineBasicBlock *PredMBB = prior(MachineFunction::iterator(HeaderMBB));
+    if (MLI->getLoopFor(HeaderMBB) != MLI->getLoopFor(PredMBB)) {
+      // If the previous BB is in the same loop, don't align this BB; we want
+      // to avoid adding nops inside a loop.
+      HeaderMBB->setAlignment(Align);
       Changed = true;
+      ++NumHeaderAligned;
     }
   }
 
@@ -90,8 +282,36 @@ bool CodePlacementOpt::runOnMachineFunction(MachineFunction &MF) {
   if (MLI->empty())
     return false;  // No loops.
 
-  bool Changed = false;
+  TLI = MF.getTarget().getTargetLowering();
+  TII = MF.getTarget().getInstrInfo();
+
+  // Analyze the BBs first and keep track of loop headers and BBs that
+  // end with an unconditional jmp to another block in the same loop.
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I;
+    if (MBB->isLandingPad())
+      continue;
+    MachineLoop *L = MLI->getLoopFor(MBB);
+    if (!L)
+      continue;
+    if (MLI->isLoopHeader(MBB))
+      LoopHeaders.push_back(MBB);
+
+    MachineBasicBlock *TBB = 0, *FBB = 0;
+    SmallVector<MachineOperand, 4> Cond;
+    if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond) || !Cond.empty())
+      continue;
+    if (MLI->getLoopFor(TBB) == L && !TBB->isLandingPad())
+      UncondJmpMBBs.push_back(std::make_pair(MBB, TBB));
+  }
+
+  bool Changed = OptimizeIntraLoopEdges();
+
   Changed |= AlignLoops(MF);
 
+  ChangedMBBs.clear();
+  UncondJmpMBBs.clear();
+  LoopHeaders.clear();
+
   return Changed;
 }
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 50115c29ca3..369c4c6e7e0 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1508,7 +1508,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
     if (I->getOpcode() == X86::JMP) {
       if (!AllowModify) {
         TBB = I->getOperand(0).getMBB();
-        return false;
+        continue;
       }
 
       // If the block has any instructions after a JMP, delete them.
diff --git a/test/CodeGen/X86/code_placement.ll b/test/CodeGen/X86/code_placement.ll
new file mode 100644
index 00000000000..3ffb6ba3d82
--- /dev/null
+++ b/test/CodeGen/X86/code_placement.ll
@@ -0,0 +1,134 @@
+; RUN: llvm-as < %s | llc -march=x86 -opt-loop-bb-placement | %prcontext jmp 1 | grep align
+
+@Te0 = external global [256 x i32]	; <[256 x i32]*> [#uses=5]
+@Te1 = external global [256 x i32]	; <[256 x i32]*> [#uses=4]
+@Te3 = external global [256 x i32]	; <[256 x i32]*> [#uses=2]
+
+define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r) nounwind ssp {
+entry:
+  %0 = load i32* %rk, align 4	; [#uses=1]
+  %1 = getelementptr i32* %rk, i64 1	; [#uses=1]
+  %2 = load i32* %1, align 4	; [#uses=1]
+  %tmp15 = add i32 %r, -1	; [#uses=1]
+  %tmp.16 = zext i32 %tmp15 to i64	; [#uses=2]
+  br label %bb
+
+bb:	; preds = %bb1, %entry
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %bb1 ]	; [#uses=3]
+  %s1.0 = phi i32 [ %2, %entry ], [ %56, %bb1 ]	; [#uses=2]
+  %s0.0 = phi i32 [ %0, %entry ], [ %43, %bb1 ]	; [#uses=2]
+  %tmp18 = shl i64 %indvar, 4	; [#uses=4]
+  %rk26 = bitcast i32* %rk to i8*	; [#uses=6]
+  %3 = lshr i32 %s0.0, 24	; [#uses=1]
+  %4 = zext i32 %3 to i64	; [#uses=1]
+  %5 = getelementptr [256 x i32]* @Te0, i64 0, i64 %4	; [#uses=1]
+  %6 = load i32* %5, align 4	; [#uses=1]
+  %7 = lshr i32 %s1.0, 16	; [#uses=1]
+  %8 = and i32 %7, 255	; [#uses=1]
+  %9 = zext i32 %8 to i64	; [#uses=1]
+  %10 = getelementptr [256 x i32]* @Te1, i64 0, i64 %9	; [#uses=1]
+  %11 = load i32* %10, align 4	; [#uses=1]
+  %ctg2.sum2728 = or i64 %tmp18, 8	; [#uses=1]
+  %12 = getelementptr i8* %rk26, i64 %ctg2.sum2728	; [#uses=1]
+  %13 = bitcast i8* %12 to i32*	; [#uses=1]
+  %14 = load i32* %13, align 4	; [#uses=1]
+  %15 = xor i32 %11, %6	; [#uses=1]
+  %16 = xor i32 %15, %14	; [#uses=3]
+  %17 = lshr i32 %s1.0, 24	; [#uses=1]
+  %18 = zext i32 %17 to i64	; [#uses=1]
+  %19 = getelementptr [256 x i32]* @Te0, i64 0, i64 %18	; [#uses=1]
+  %20 = load i32* %19, align 4	; [#uses=1]
+  %21 = and i32 %s0.0, 255	; [#uses=1]
+  %22 = zext i32 %21 to i64	; [#uses=1]
+  %23 = getelementptr [256 x i32]* @Te3, i64 0, i64 %22	; [#uses=1]
+  %24 = load i32* %23, align 4	; [#uses=1]
+  %ctg2.sum2930 = or i64 %tmp18, 12	; [#uses=1]
+  %25 = getelementptr i8* %rk26, i64 %ctg2.sum2930	; [#uses=1]
+  %26 = bitcast i8* %25 to i32*	; [#uses=1]
+  %27 = load i32* %26, align 4	; [#uses=1]
+  %28 = xor i32 %24, %20	; [#uses=1]
+  %29 = xor i32 %28, %27	; [#uses=4]
+  %30 = lshr i32 %16, 24	; [#uses=1]
+  %31 = zext i32 %30 to i64	; [#uses=1]
+  %32 = getelementptr [256 x i32]* @Te0, i64 0, i64 %31	; [#uses=1]
+  %33 = load i32* %32, align 4	; [#uses=2]
+  %exitcond = icmp eq i64 %indvar, %tmp.16	; [#uses=1]
+  br i1 %exitcond, label %bb2, label %bb1
+
+bb1:	; preds = %bb
+  %ctg2.sum31 = add i64 %tmp18, 16	; [#uses=1]
+  %34 = getelementptr i8* %rk26, i64 %ctg2.sum31	; [#uses=1]
+  %35 = bitcast i8* %34 to i32*	; [#uses=1]
+  %36 = lshr i32 %29, 16	; [#uses=1]
+  %37 = and i32 %36, 255	; [#uses=1]
+  %38 = zext i32 %37 to i64	; [#uses=1]
+  %39 = getelementptr [256 x i32]* @Te1, i64 0, i64 %38	; [#uses=1]
+  %40 = load i32* %39, align 4	; [#uses=1]
+  %41 = load i32* %35, align 4	; [#uses=1]
+  %42 = xor i32 %40, %33	; [#uses=1]
+  %43 = xor i32 %42, %41	; [#uses=1]
+  %44 = lshr i32 %29, 24	; [#uses=1]
+  %45 = zext i32 %44 to i64	; [#uses=1]
+  %46 = getelementptr [256 x i32]* @Te0, i64 0, i64 %45	; [#uses=1]
+  %47 = load i32* %46, align 4	; [#uses=1]
+  %48 = and i32 %16, 255	; [#uses=1]
+  %49 = zext i32 %48 to i64	; [#uses=1]
+  %50 = getelementptr [256 x i32]* @Te3, i64 0, i64 %49	; [#uses=1]
+  %51 = load i32* %50, align 4	; [#uses=1]
+  %ctg2.sum32 = add i64 %tmp18, 20	; [#uses=1]
+  %52 = getelementptr i8* %rk26, i64 %ctg2.sum32	; [#uses=1]
+  %53 = bitcast i8* %52 to i32*	; [#uses=1]
+  %54 = load i32* %53, align 4	; [#uses=1]
+  %55 = xor i32 %51, %47	; [#uses=1]
+  %56 = xor i32 %55, %54	; [#uses=1]
+  %indvar.next = add i64 %indvar, 1	; [#uses=1]
+  br label %bb
+
+bb2:	; preds = %bb
+  %tmp10 = shl i64 %tmp.16, 4	; [#uses=2]
+  %ctg2.sum = add i64 %tmp10, 16	; [#uses=1]
+  %tmp1213 = getelementptr i8* %rk26, i64 %ctg2.sum	; [#uses=1]
+  %57 = bitcast i8* %tmp1213 to i32*	; [#uses=1]
+  %58 = and i32 %33, -16777216	; [#uses=1]
+  %59 = lshr i32 %29, 16	; [#uses=1]
+  %60 = and i32 %59, 255	; [#uses=1]
+  %61 = zext i32 %60 to i64	; [#uses=1]
+  %62 = getelementptr [256 x i32]* @Te1, i64 0, i64 %61	; [#uses=1]
+  %63 = load i32* %62, align 4	; [#uses=1]
+  %64 = and i32 %63, 16711680	; [#uses=1]
+  %65 = or i32 %64, %58	; [#uses=1]
+  %66 = load i32* %57, align 4	; [#uses=1]
+  %67 = xor i32 %65, %66	; [#uses=2]
+  %68 = lshr i32 %29, 8	; [#uses=1]
+  %69 = zext i32 %68 to i64	; [#uses=1]
+  %70 = getelementptr [256 x i32]* @Te0, i64 0, i64 %69	; [#uses=1]
+  %71 = load i32* %70, align 4	; [#uses=1]
+  %72 = and i32 %71, -16777216	; [#uses=1]
+  %73 = and i32 %16, 255	; [#uses=1]
+  %74 = zext i32 %73 to i64	; [#uses=1]
+  %75 = getelementptr [256 x i32]* @Te1, i64 0, i64 %74	; [#uses=1]
+  %76 = load i32* %75, align 4	; [#uses=1]
+  %77 = and i32 %76, 16711680	; [#uses=1]
+  %78 = or i32 %77, %72	; [#uses=1]
+  %ctg2.sum25 = add i64 %tmp10, 20	; [#uses=1]
+  %79 = getelementptr i8* %rk26, i64 %ctg2.sum25	; [#uses=1]
+  %80 = bitcast i8* %79 to i32*	; [#uses=1]
+  %81 = load i32* %80, align 4	; [#uses=1]
+  %82 = xor i32 %78, %81	; [#uses=2]
+  %83 = lshr i32 %67, 24	; [#uses=1]
+  %84 = trunc i32 %83 to i8	; [#uses=1]
+  store i8 %84, i8* %out, align 1
+  %85 = lshr i32 %67, 16	; [#uses=1]
+  %86 = trunc i32 %85 to i8	; [#uses=1]
+  %87 = getelementptr i8* %out, i64 1	; [#uses=1]
+  store i8 %86, i8* %87, align 1
+  %88 = getelementptr i8* %out, i64 4	; [#uses=1]
+  %89 = lshr i32 %82, 24	; [#uses=1]
+  %90 = trunc i32 %89 to i8	; [#uses=1]
+  store i8 %90, i8* %88, align 1
+  %91 = lshr i32 %82, 16	; [#uses=1]
+  %92 = trunc i32 %91 to i8	; [#uses=1]
+  %93 = getelementptr i8* %out, i64 5	; [#uses=1]
+  store i8 %92, i8* %93, align 1
+  ret void
+}
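
The profitability check in OptimizeIntraLoopEdges above only performs the move
when it is branch-neutral: every fallthrough edge that SuccMBB loses by being
relocated must be paid for by an edge that becomes the new fallthrough after
MBB. The sketch below restates that accounting over a plain successor list,
outside of the LLVM APIs; it is an illustration only, the SuccEdge and
rotationCost names are invented for this note, and the real pass additionally
requires that SuccMBB's terminator can actually be rewritten (a matching
TBB/FBB or a reversible condition) before crediting a saving.

    #include <vector>

    // Hypothetical, simplified view of one successor edge of SuccMBB.
    struct SuccEdge {
      bool FallsThroughFromSucc; // SuccMBB currently falls through to this block.
      bool FollowsMBB;           // The block is laid out right after MBB, so it
                                 // could become the new fallthrough after the move.
    };

    // Net number of unconditional branches introduced by moving SuccMBB to just
    // after MBB.  The pass only rotates the loop when this comes out to zero.
    static int rotationCost(const std::vector<SuccEdge> &Succs) {
      int Cost = 0;
      for (unsigned i = 0, e = Succs.size(); i != e; ++i) {
        if (Succs[i].FallsThroughFromSucc)
          ++Cost;   // a former fallthrough edge now needs an explicit jmp
        else if (Succs[i].FollowsMBB)
          --Cost;   // an edge that previously needed a jmp becomes the fallthrough
      }
      return Cost;
    }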