From 05da4dd998a212a262aca58e49a1c5e04c8844f3 Mon Sep 17 00:00:00 2001
From: Nick Lewycky
Date: Mon, 5 May 2014 23:59:03 +0000
Subject: [PATCH] Improve 'tail' call marking in TRE.

A bootstrap of clang goes from 375k calls marked tail in the IR to 470k;
however, this improvement does not carry over into an improvement of the
call/jmp ratio on x86. The most common pattern is a tail call + br to a block
with nothing but a 'ret'.

The number of tail call to loop conversions remains the same (1618 by my
count).

The new algorithm does a local scan over the use-def chains to identify local
"alloca-derived" values, as well as points where the alloca could escape.
Then, a visit over the CFG marks blocks as being before or after the allocas
have escaped, and annotates the calls accordingly.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208017 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/CallSite.h                    |   5 +
 .../Scalar/TailRecursionElimination.cpp       | 326 +++++++++++++-----
 test/Transforms/TailCallElim/basic.ll         |  23 ++
 3 files changed, 275 insertions(+), 79 deletions(-)

diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h
index 9847e70050b..deea4151ddd 100644
--- a/include/llvm/IR/CallSite.h
+++ b/include/llvm/IR/CallSite.h
@@ -166,6 +166,11 @@ public:
     return isCall() && cast<CallInst>(getInstruction())->isMustTailCall();
   }
 
+  /// \brief Tests if this call site is marked as a tail call.
+  bool isTailCall() const {
+    return isCall() && cast<CallInst>(getInstruction())->isTailCall();
+  }
+
 #define CALLSITE_DELEGATE_GETTER(METHOD) \
   InstrTy *II = getInstruction();        \
   return isCall()                        \
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 8b2fb1b3f9b..64fdae1c580 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -55,6 +55,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
@@ -95,6 +96,9 @@ namespace {
     bool runOnFunction(Function &F) override;
 
   private:
+    bool runTRE(Function &F);
+    bool markTails(Function &F, bool &AllCallsAreTailCalls);
+
     CallInst *FindTRECandidate(Instruction *I,
                                bool CannotTailCallElimCallsMarkedTail);
     bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
@@ -146,35 +150,232 @@ static bool CanTRE(AllocaInst *AI) {
          isa<ConstantInt>(AI->getArraySize());
 }
 
-namespace {
-struct AllocaCaptureTracker : public CaptureTracker {
-  AllocaCaptureTracker() : Captured(false) {}
-
-  void tooManyUses() override { Captured = true; }
-
-  bool shouldExplore(const Use *U) override {
-    Value *V = U->getUser();
-    if (isa<CallInst>(V) || isa<InvokeInst>(V))
-      UsesAlloca.insert(V);
-    return true;
-  }
-
-  bool captured(const Use *U) override {
-    if (isa<ReturnInst>(U->getUser()))
-      return false;
-    Captured = true;
-    return true;
-  }
-
-  bool Captured;
-  SmallPtrSet<const Value *, 16> UsesAlloca;
-};
-} // end anonymous namespace
-
 bool TailCallElim::runOnFunction(Function &F) {
   if (skipOptnoneFunction(F))
     return false;
 
+  bool AllCallsAreTailCalls = false;
+  bool Modified = markTails(F, AllCallsAreTailCalls);
+  if (AllCallsAreTailCalls)
+    Modified |= runTRE(F);
+  return Modified;
+}
+
+namespace {
+struct AllocaDerivedValueTracker {
+  // Start at a root value and walk its use-def chain to mark calls that use the
+  // value or a derived value in AllocaUsers, and places where it may escape in
+  // EscapePoints.
+  void walk(Value *Root) {
+    SmallVector<Use *, 32> Worklist;
+    SmallPtrSet<Use *, 32> Visited;
+
+    auto AddUsesToWorklist = [&](Value *V) {
+      for (auto &U : V->uses()) {
+        if (!Visited.insert(&U))
+          continue;
+        Worklist.push_back(&U);
+      }
+    };
+
+    AddUsesToWorklist(Root);
+
+    while (!Worklist.empty()) {
+      Use *U = Worklist.pop_back_val();
+      Instruction *I = cast<Instruction>(U->getUser());
+
+      switch (I->getOpcode()) {
+      case Instruction::Call:
+      case Instruction::Invoke: {
+        CallSite CS(I);
+        bool IsNocapture = !CS.isCallee(U) &&
+                           CS.doesNotCapture(CS.getArgumentNo(U));
+        callUsesLocalStack(CS, IsNocapture);
+        if (IsNocapture) {
+          // If the alloca-derived argument is passed in as nocapture, then it
+          // can't propagate to the call's return. That would be capturing.
+          continue;
+        }
+        break;
+      }
+      case Instruction::Load: {
+        // The result of a load is not alloca-derived (unless an alloca has
+        // otherwise escaped, but this is a local analysis).
+        continue;
+      }
+      case Instruction::Store: {
+        if (U->getOperandNo() == 0)
+          EscapePoints.insert(I);
+        continue;  // Stores have no users to analyze.
+      }
+      case Instruction::BitCast:
+      case Instruction::GetElementPtr:
+      case Instruction::PHI:
+      case Instruction::Select:
+      case Instruction::AddrSpaceCast:
+        break;
+      default:
+        EscapePoints.insert(I);
+        break;
+      }
+
+      AddUsesToWorklist(I);
+    }
+  }
+
+  void callUsesLocalStack(CallSite CS, bool IsNocapture) {
+    // Add it to the list of alloca users. If it's already there, skip further
+    // processing.
+    if (!AllocaUsers.insert(CS.getInstruction()))
+      return;
+
+    // If it's nocapture then it can't capture the alloca.
+    if (IsNocapture)
+      return;
+
+    // If it can write to memory, it can leak the alloca value.
+    if (!CS.onlyReadsMemory())
+      EscapePoints.insert(CS.getInstruction());
+  }
+
+  SmallPtrSet<Instruction *, 32> AllocaUsers;
+  SmallPtrSet<Instruction *, 32> EscapePoints;
+};
+}
+
+bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) {
+  if (F.callsFunctionThatReturnsTwice())
+    return false;
+  AllCallsAreTailCalls = true;
+
+  // The local stack holds all alloca instructions and all byval arguments.
+  AllocaDerivedValueTracker Tracker;
+  for (Argument &Arg : F.args()) {
+    if (Arg.hasByValAttr())
+      Tracker.walk(&Arg);
+  }
+  for (auto &BB : F) {
+    for (auto &I : BB)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+        Tracker.walk(AI);
+  }
+
+  bool Modified = false;
+
+  // Track whether a block is reachable after an alloca has escaped. Blocks that
+  // contain the escaping instruction will be marked as being visited without an
+  // escaped alloca, since that is how the block began.
+  enum VisitType {
+    UNVISITED,
+    UNESCAPED,
+    ESCAPED
+  };
+  DenseMap<BasicBlock *, VisitType> Visited;
+
+  // We propagate the fact that an alloca has escaped from block to successor.
+  // Visit the blocks that are propagating the escapedness first. To do this, we
+  // maintain two worklists.
+  SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
+
+  // We may enter a block and visit it thinking that no alloca has escaped yet,
+  // then see an escape point and go back around a loop edge and come back to
+  // the same block twice. Because of this, we defer setting tail on calls when
+  // we first encounter them in a block. Every entry in this list does not
+  // statically use an alloca via use-def chain analysis, but may find an alloca
+  // through other means if the block turns out to be reachable after an escape
+  // point.
+  SmallVector<CallInst *, 32> DeferredTails;
+
+  BasicBlock *BB = &F.getEntryBlock();
+  VisitType Escaped = UNESCAPED;
+  do {
+    for (auto &I : *BB) {
+      if (Tracker.EscapePoints.count(&I))
+        Escaped = ESCAPED;
+
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      if (!CI || CI->isTailCall())
+        continue;
+
+      if (CI->doesNotAccessMemory()) {
+        // A call to a readnone function whose arguments are all things computed
+        // outside this function can be marked tail. Even if you stored the
+        // alloca address into a global, a readnone function can't load the
+        // global anyhow.
+        //
+        // Note that this runs whether we know an alloca has escaped or not. If
+        // it has, then we can't trust Tracker.AllocaUsers to be accurate.
+        bool SafeToTail = true;
+        for (auto &Arg : CI->arg_operands()) {
+          if (isa<Constant>(Arg.getUser()))
+            continue;
+          if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
+            if (!A->hasByValAttr())
+              continue;
+          SafeToTail = false;
+          break;
+        }
+        if (SafeToTail) {
+          F.getContext().emitOptimizationRemark(
+              "tailcallelim", F, CI->getDebugLoc(),
+              "found readnone tail call candidate");
+          CI->setTailCall();
+          Modified = true;
+          continue;
+        }
+      }
+
+      if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
+        DeferredTails.push_back(CI);
+      } else {
+        AllCallsAreTailCalls = false;
+      }
+    }
+
+    for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) {
+      auto &State = Visited[SuccBB];
+      if (State < Escaped) {
+        State = Escaped;
+        if (State == ESCAPED)
+          WorklistEscaped.push_back(SuccBB);
+        else
+          WorklistUnescaped.push_back(SuccBB);
+      }
+    }
+
+    if (!WorklistEscaped.empty()) {
+      BB = WorklistEscaped.pop_back_val();
+      Escaped = ESCAPED;
+    } else {
+      BB = nullptr;
+      while (!WorklistUnescaped.empty()) {
+        auto *NextBB = WorklistUnescaped.pop_back_val();
+        if (Visited[NextBB] == UNESCAPED) {
+          BB = NextBB;
+          Escaped = UNESCAPED;
+          break;
+        }
+      }
+    }
+  } while (BB);
+
+  for (CallInst *CI : DeferredTails) {
+    if (Visited[CI->getParent()] != ESCAPED) {
+      // If the escape point was part way through the block, calls after the
+      // escape point wouldn't have been put into DeferredTails.
+      F.getContext().emitOptimizationRemark(
+          "tailcallelim", F, CI->getDebugLoc(), "found tail call candidate");
+      CI->setTailCall();
+      Modified = true;
+    } else {
+      AllCallsAreTailCalls = false;
+    }
+  }
+
+  return Modified;
+}
+
+bool TailCallElim::runTRE(Function &F) {
   // If this function is a varargs function, we won't be able to PHI the args
   // right, so don't even try to convert it...
   if (F.getFunctionType()->isVarArg()) return false;
@@ -191,46 +392,30 @@ bool TailCallElim::runOnFunction(Function &F) {
   // doesn't).
   bool CanTRETailMarkedCall = true;
 
-  // Find calls that can be marked tail.
-  AllocaCaptureTracker ACT;
+  // Find dynamic allocas.
   for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) {
     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
       if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
         CanTRETailMarkedCall &= CanTRE(AI);
-        PointerMayBeCaptured(AI, &ACT);
-        // If any allocas are captured, exit.
-        if (ACT.Captured)
-          return false;
       }
     }
   }
 
-  // If any byval or inalloca args are captured, exit. They are also allocated
-  // in our stack frame.
-  for (Argument &Arg : F.args()) {
-    if (Arg.hasByValOrInAllocaAttr())
-      PointerMayBeCaptured(&Arg, &ACT);
-    if (ACT.Captured)
-      return false;
-  }
-
-  // Second pass, change any tail recursive calls to loops.
+  // Change any tail recursive calls to loops.
   //
   // FIXME: The code generator produces really bad code when an 'escaping
   // alloca' is changed from being a static alloca to being a dynamic alloca.
   // Until this is resolved, disable this transformation if that would ever
   // happen. This bug is PR962.
-  if (ACT.UsesAlloca.empty()) {
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
-        bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
-                                            ArgumentPHIs, !CanTRETailMarkedCall);
-        if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
-          Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
-                                            TailCallsAreMarkedTail, ArgumentPHIs,
-                                            !CanTRETailMarkedCall);
-        MadeChange |= Change;
-      }
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
+      bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+                                          ArgumentPHIs, !CanTRETailMarkedCall);
+      if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+        Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
+                                          TailCallsAreMarkedTail, ArgumentPHIs,
+                                          !CanTRETailMarkedCall);
+      MadeChange |= Change;
     }
   }
 
@@ -239,34 +424,13 @@ bool TailCallElim::runOnFunction(Function &F) {
   // with themselves. Check to see if we did and clean up our mess if so. This
   // occurs when a function passes an argument straight through to its tail
   // call.
-  if (!ArgumentPHIs.empty()) {
-    for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
-      PHINode *PN = ArgumentPHIs[i];
+  for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
+    PHINode *PN = ArgumentPHIs[i];
 
-      // If the PHI Node is a dynamic constant, replace it with the value it is.
-      if (Value *PNV = SimplifyInstruction(PN)) {
-        PN->replaceAllUsesWith(PNV);
-        PN->eraseFromParent();
-      }
-    }
-  }
-
-  // At this point, we know that the function does not have any captured
-  // allocas. If additionally the function does not call setjmp, mark all calls
-  // in the function that do not access stack memory with the tail keyword. This
-  // implies ensuring that there does not exist any path from a call that takes
-  // in an alloca but does not capture it and the call which we wish to mark
-  // with "tail".
-  if (!F.callsFunctionThatReturnsTwice()) {
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-        if (CallInst *CI = dyn_cast<CallInst>(I)) {
-          if (!ACT.UsesAlloca.count(CI)) {
-            CI->setTailCall();
-            MadeChange = true;
-          }
-        }
-      }
+    // If the PHI Node is a dynamic constant, replace it with the value it is.
+    if (Value *PNV = SimplifyInstruction(PN)) {
+      PN->replaceAllUsesWith(PNV);
+      PN->eraseFromParent();
     }
   }
 
@@ -520,6 +684,10 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
   BasicBlock *BB = Ret->getParent();
   Function *F = BB->getParent();
 
+  F->getContext().emitOptimizationRemark(
+      "tailcallelim", *F, CI->getDebugLoc(),
+      "transforming tail recursion to loop");
+
   // OK! We can transform this tail call. If this is the first one found,
   // create the new entry block, allowing us to branch back to the old entry.
   if (!OldEntry) {
diff --git a/test/Transforms/TailCallElim/basic.ll b/test/Transforms/TailCallElim/basic.ll
index 5582ee33edc..341736d48ef 100644
--- a/test/Transforms/TailCallElim/basic.ll
+++ b/test/Transforms/TailCallElim/basic.ll
@@ -151,3 +151,26 @@ define void @test9(i32* byval %a) {
   call void @use(i32* %a)
   ret void
 }
+
+%struct.X = type { i8* }
+
+declare void @ctor(%struct.X*)
+define void @test10(%struct.X* noalias sret %agg.result, i1 zeroext %b) {
+; CHECK-LABEL: @test10
+entry:
+  %x = alloca %struct.X, align 8
+  br i1 %b, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @ctor(%struct.X* %agg.result)
+; CHECK: tail call void @ctor
+  br label %return
+
+if.end:
+  call void @ctor(%struct.X* %x)
+; CHECK: call void @ctor
+  br label %return
+
+return:
+  ret void
+}
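
Note (not part of the patch): a minimal IR sketch of the before/after-escape
distinction that markTails implements, written in the same typed-pointer IR as
the tests above. The functions @maybe_capture and @use_int are hypothetical.
Run through this pass (e.g. opt -tailcallelim -S), the first call to @use_int
should be marked 'tail' because no local alloca has escaped on any path
reaching it, while the second call is reachable after %p is passed to
@maybe_capture (an escape point) and is left unmarked.

declare void @maybe_capture(i32*)    ; may stash the pointer; not readonly
declare void @use_int(i32)           ; never sees an alloca-derived value

define void @example(i32 %n) {
entry:
  %p = alloca i32
  call void @use_int(i32 %n)         ; expected to become: tail call void @use_int(i32 %n)
  call void @maybe_capture(i32* %p)  ; escape point for %p
  call void @use_int(i32 %n)         ; reachable after the escape, so not marked
  ret void
}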