From 6a35b40250735a50efe66c88414cdd3b79185019 Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Fri, 19 Jun 2009 04:22:16 +0000 Subject: [PATCH] Improve tail call elim to move loads above readonly calls when it allows forming a tail call. Patch by Frits van Bommel. This implements PR4323. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73752 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Scalar/TailRecursionElimination.cpp | 16 ++- .../TailCallElim/dont_reorder_load.ll | 64 +++++++++++ test/Transforms/TailCallElim/reorder_load.ll | 101 ++++++++++++++++++ 3 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 test/Transforms/TailCallElim/dont_reorder_load.ll create mode 100644 test/Transforms/TailCallElim/reorder_load.ll diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 682d069923e..34ee57c9b9d 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -52,6 +52,7 @@ #define DEBUG_TYPE "tailcallelim" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" @@ -201,8 +202,21 @@ bool TailCallElim::runOnFunction(Function &F) { bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { // FIXME: We can move load/store/call/free instructions above the call if the // call does not mod/ref the memory location being processed. - if (I->mayHaveSideEffects() || isa<LoadInst>(I)) + if (I->mayHaveSideEffects()) // This also handles volatile loads. return false; + + if (LoadInst* L = dyn_cast<LoadInst>(I)) { + // Loads may always be moved above calls without side effects. + if (CI->mayHaveSideEffects()) { + // Non-volatile loads may be moved above a call with side effects if it + // does not write to memory and the load provably won't trap. 
+ // FIXME: Writes to memory only matter if they may alias the pointer + // being loaded from. + if (CI->mayWriteToMemory() || + !isSafeToLoadUnconditionally(L->getPointerOperand(), L)) + return false; + } + } // Otherwise, if this is a side-effect free instruction, check to make sure // that it does not use the return value of the call. If it doesn't use the diff --git a/test/Transforms/TailCallElim/dont_reorder_load.ll b/test/Transforms/TailCallElim/dont_reorder_load.ll new file mode 100644 index 00000000000..8fbe0083876 --- /dev/null +++ b/test/Transforms/TailCallElim/dont_reorder_load.ll @@ -0,0 +1,64 @@ +; RUN: llvm-as <%s | opt -tailcallelim | llvm-dis | grep call | count 3 +; PR4323 + +; Several cases where tail call elimination should not move the load above the +; call, and thus can't eliminate the tail recursion. + + +@extern_weak_global = extern_weak global i32 ; [#uses=1] + + +; This load can't be safely moved above the call because the load is from an +; extern_weak global and may trap, but the call may unwind before that happens. +define fastcc i32 @no_tailrecelim_1(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) readonly { +entry: + %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1] + br i1 %tmp2, label %if, label %else + +if: ; preds = %entry + unwind + +else: ; preds = %entry + %tmp7 = add i32 %start_arg, 1 ; [#uses=1] + %tmp8 = call fastcc i32 @no_tailrecelim_1(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1] + %tmp9 = load i32* @extern_weak_global ; [#uses=1] + %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1] + ret i32 %tmp10 +} + + +; This load can't be safely moved above the call because the function may write to the pointer. 
+define fastcc i32 @no_tailrecelim_2(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) nounwind { +entry: + %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1] + br i1 %tmp2, label %if, label %else + +if: ; preds = %entry + store i32 1, i32* %a_arg; + ret i32 0; + +else: ; preds = %entry + %tmp7 = add i32 %start_arg, 1 ; [#uses=1] + %tmp8 = call fastcc i32 @no_tailrecelim_2(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1] + %tmp9 = load i32* %a_arg ; [#uses=1] + %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1] + ret i32 %tmp10 +} + +; This load can't be safely moved above the call because that would change the +; order in which the volatile loads are performed. +define fastcc i32 @no_tailrecelim_3(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) nounwind { +entry: + %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1] + br i1 %tmp2, label %if, label %else + +if: ; preds = %entry + ret i32 0; + +else: ; preds = %entry + %tmp7 = add i32 %start_arg, 1 ; [#uses=1] + %tmp8 = call fastcc i32 @no_tailrecelim_3(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1] + %tmp9 = volatile load i32* %a_arg ; [#uses=1] + %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1] + ret i32 %tmp10 +} diff --git a/test/Transforms/TailCallElim/reorder_load.ll b/test/Transforms/TailCallElim/reorder_load.ll new file mode 100644 index 00000000000..aeb9042bc79 --- /dev/null +++ b/test/Transforms/TailCallElim/reorder_load.ll @@ -0,0 +1,101 @@ +; RUN: llvm-as <%s | opt -tailcallelim | llvm-dis | not grep call +; PR4323 + +; Several cases where tail call elimination should move the load above the call, +; then eliminate the tail recursion. + + +@global = external global i32 ; [#uses=1] +@extern_weak_global = extern_weak global i32 ; [#uses=1] + + +; This load can be moved above the call because the function won't write to it +; and the call has no side effects. 
+define fastcc i32 @raise_load_1(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) nounwind readonly { +entry: + %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1] + br i1 %tmp2, label %if, label %else + +if: ; preds = %entry + ret i32 0 + +else: ; preds = %entry + %tmp7 = add i32 %start_arg, 1 ; [#uses=1] + %tmp8 = call fastcc i32 @raise_load_1(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1] + %tmp9 = load i32* %a_arg ; [#uses=1] + %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1] + ret i32 %tmp10 +} + + +; This load can be moved above the call because the function won't write to it +; and the load provably can't trap. +define fastcc i32 @raise_load_2(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) readonly { +entry: + %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1] + br i1 %tmp2, label %if, label %else + +if: ; preds = %entry + ret i32 0 + +else: ; preds = %entry + %nullcheck = icmp eq i32* %a_arg, null ; [#uses=1] + br i1 %nullcheck, label %unwind, label %recurse + +unwind: ; preds = %else + unwind + +recurse: ; preds = %else + %tmp7 = add i32 %start_arg, 1 ; [#uses=1] + %tmp8 = call fastcc i32 @raise_load_2(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1] + %tmp9 = load i32* @global ; [#uses=1] + %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1] + ret i32 %tmp10 +} + + +; This load can be safely moved above the call (even though it's from an +; extern_weak global) because the call has no side effects. 
+define fastcc i32 @raise_load_3(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) nounwind readonly { +entry: + %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1] + br i1 %tmp2, label %if, label %else + +if: ; preds = %entry + ret i32 0 + +else: ; preds = %entry + %tmp7 = add i32 %start_arg, 1 ; [#uses=1] + %tmp8 = call fastcc i32 @raise_load_3(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1] + %tmp9 = load i32* @extern_weak_global ; [#uses=1] + %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1] + ret i32 %tmp10 +} + + +; The second load can be safely moved above the call even though it's from an +; unknown pointer (which normally means it might trap) because the first load +; proves it doesn't trap. +define fastcc i32 @raise_load_4(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) readonly { +entry: + %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1] + br i1 %tmp2, label %if, label %else + +if: ; preds = %entry + ret i32 0 + +else: ; preds = %entry + %nullcheck = icmp eq i32* %a_arg, null ; [#uses=1] + br i1 %nullcheck, label %unwind, label %recurse + +unwind: ; preds = %else + unwind + +recurse: ; preds = %else + %tmp7 = add i32 %start_arg, 1 ; [#uses=1] + %first = load i32* %a_arg ; [#uses=1] + %tmp8 = call fastcc i32 @raise_load_4(i32* %a_arg, i32 %first, i32 %tmp7) ; [#uses=1] + %second = load i32* %a_arg ; [#uses=1] + %tmp10 = add i32 %second, %tmp8 ; [#uses=1] + ret i32 %tmp10 +}