From a16c1b55e2aac49c7336f3f54b50bbe85335712e Mon Sep 17 00:00:00 2001 From: Arnold Schwaighofer Date: Sun, 2 Feb 2014 03:12:34 +0000 Subject: [PATCH] LoopVectorizer: Enable unrolling of conditional stores and the load/store unrolling heuristic per default Benchmarking on x86_64 (thanks Chandler!) and ARM has shown those options speed up some benchmarks while not causing any interesting regressions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200621 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++--- test/Transforms/LoopVectorize/ARM/arm-unroll.ll | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1f494922b31..930cf7799a2 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -180,16 +180,16 @@ static cl::opt<bool> LoopVectorizeWithBlockFrequency( // Runtime unroll loops for load/store throughput. static cl::opt<bool> EnableLoadStoreRuntimeUnroll( - "enable-loadstore-runtime-unroll", cl::init(false), cl::Hidden, + "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden, cl::desc("Enable runtime unrolling until load/store ports are saturated")); /// The number of stores in a loop that are allowed to need predication. 
static cl::opt<unsigned> NumberOfStoresToPredicate( - "vectorize-num-stores-pred", cl::init(0), cl::Hidden, + "vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if.")); static cl::opt<bool> EnableIndVarRegisterHeur( - "enable-ind-var-reg-heur", cl::init(false), cl::Hidden, + "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when unrolling")); static cl::opt<bool> EnableCondStoresVectorization( diff --git a/test/Transforms/LoopVectorize/ARM/arm-unroll.ll b/test/Transforms/LoopVectorize/ARM/arm-unroll.ll index 0b87e0e7405..8843fc2d2b1 100644 --- a/test/Transforms/LoopVectorize/ARM/arm-unroll.ll +++ b/test/Transforms/LoopVectorize/ARM/arm-unroll.ll @@ -47,6 +47,7 @@ define i32 @register_limit(i32* nocapture %A, i32 %n) { %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ] %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ] %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] %2 = getelementptr inbounds i32* %A, i32 %i.02 %3 = load i32* %2, align 4 %4 = add nsw i32 %3, %sum.01 @@ -55,6 +56,7 @@ define i32 @register_limit(i32* nocapture %A, i32 %n) { %7 = add nsw i32 %3, %sum.03 %8 = add nsw i32 %3, %sum.04 %9 = add nsw i32 %3, %sum.05 + %10 = add nsw i32 %3, %sum.05 %exitcond = icmp eq i32 %5, %n br i1 %exitcond, label %._crit_edge, label %.lr.ph @@ -64,5 +66,6 @@ define i32 @register_limit(i32* nocapture %A, i32 %n) { %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ] %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ] %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ] ret i32 %sum.0.lcssa }