From 05d43d8b6fd520bcb828ea5025743b7eeb407bfbf Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Tue, 28 Jan 2014 09:10:41 +0000
Subject: [PATCH] [vectorizer] Completely disable the block frequency guidance
 of the loop vectorizer, placing it behind an off-by-default flag.

It turns out that block frequency isn't what we want at all, here or
elsewhere. This has been, I think, a nagging feeling for several of us
working with it, but Arnold has given some really nice simple examples
where the results are so comprehensively wrong that they aren't useful.

I'm planning to email the dev list with a summary of why it's not really
useful and a couple of ideas about how to better structure these types
of heuristics.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200294 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp      | 16 +++++++++++++---
 test/Transforms/LoopVectorize/X86/small-size.ll |  2 +-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 78674952847..c72c2dc41fc 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,12 @@ static cl::opt<unsigned> SmallLoopCost(
     "small-loop-cost", cl::init(20), cl::Hidden,
     cl::desc("The cost of a loop that is considered 'small' by the unroller."));
 
+static cl::opt<bool> LoopVectorizeWithBlockFrequency(
+    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+    cl::desc("Enable the use of the block frequency analysis to access PGO "
+             "heuristics minimizing code growth in cold regions and being more "
+             "aggressive in hot regions."));
+
 // Runtime unroll loops for load/store throughput.
 static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
     "enable-loadstore-runtime-unroll", cl::init(false), cl::Hidden,
@@ -1099,9 +1105,13 @@ struct LoopVectorize : public FunctionPass {
     // Compute the weighted frequency of this loop being executed and see if it
     // is less than 20% of the function entry baseline frequency. Note that we
     // always have a canonical loop here because we think we *can* vectoriez.
-    BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
-    if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
-      OptForSize = true;
+    // FIXME: This is hidden behind a flag due to pervasive problems with
+    // exactly what block frequency models.
+    if (LoopVectorizeWithBlockFrequency) {
+      BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
+      if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
+        OptForSize = true;
+    }
 
     // Check the function attributes to see if implicit floats are allowed.a
     // FIXME: This check doesn't seem possibly correct -- what if the loop is
diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll
index 1d46366369c..dfa4faaf09e 100644
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
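Usage sketch (an illustration added for context, not part of the commit): the new cl::opt defaults to false, so after this patch the block-frequency cold-loop check is skipped unless the flag is requested explicitly on the opt command line, which is why the test's RUN line gains -loop-vectorize-with-block-frequency. Assuming a build of opt that includes this patch and a hypothetical input file loop.ll, the old behavior can be re-enabled with:

  opt < loop.ll -loop-vectorize -force-vector-width=4 -loop-vectorize-with-block-frequency -S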