Move late partial-unrolling thresholds into the processor definitions

The old method used by X86TTI to determine partial-unrolling thresholds was messy (because it worked by testing target features), and also would not correctly identify the target CPU if certain target features were disabled. After some discussions on IRC with Chandler et al., it was decided that the processor scheduling models were the right containers for this information (because it is often tied to special uop dispatch-buffer sizes). This does represent a small functionality change: - For generic x86-64 (which uses the SB model and, thus, will get some unrolling). - For AMD cores (because they still currently use the SB scheduling model) - For Haswell (based on benchmarking by Louis Gerbarg, it was decided to bump the default threshold to 50; we're working on a test case for this). Otherwise, nothing has changed for any other targets. The logic, however, has been moved into BasicTTI, so other targets may now also opt-in to this functionality simply by setting LoopMicroOpBufferSize in their processor model definitions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208289 91177308-0d34-0410-b5e6-96231b3b80d8
2025-06-15 05:24:01 +00:00 · 2014-05-08 09:14:44 +00:00
parent f46646d87b
commit f35ce2376c
11 changed files with 102 additions and 91 deletions
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@ -16,11 +16,18 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/CodeGen/Passes.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <utility>
 using namespace llvm;

+static cl::opt<unsigned>
+PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
+  cl::desc("Threshold for partial unrolling"), cl::Hidden);
+
 #define DEBUG_TYPE "basictti"

 namespace {
@ -187,7 +194,61 @@ bool BasicTTI::haveFastSqrt(Type *Ty) const {
  return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
 }

-void BasicTTI::getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+void BasicTTI::getUnrollingPreferences(Loop *L,
+                                       UnrollingPreferences &UP) const {
+  // This unrolling functionality is target independent, but to provide some
+  // motivation for its indended use, for x86:
+
+  // According to the Intel 64 and IA-32 Architectures Optimization Reference
+  // Manual, Intel Core models and later have a loop stream detector
+  // (and associated uop queue) that can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
+  //    taken, and none of them may be calls.
+  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+  // According to the Software Optimization Guide for AMD Family 15h Processors,
+  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+  // buffer which can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have fewer than 16 branches
+  //  - The loop must have less than 40 uops in all executed loop branches
+
+  // The number of taken branches in a loop is hard to estimate here, and
+  // benchmarking has revealed that it is better not to be conservative when
+  // estimating the branch count. As a result, we'll ignore the branch limits
+  // until someone finds a case where it matters in practice.
+
+  unsigned MaxOps;
+  const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>();
+  if (PartialUnrollingThreshold.getNumOccurrences() > 0)
+    MaxOps = PartialUnrollingThreshold;
+  else if (ST->getSchedModel()->LoopMicroOpBufferSize > 0)
+    MaxOps = ST->getSchedModel()->LoopMicroOpBufferSize;
+  else
+    return;
+
+  // Scan the loop: don't unroll loops with calls.
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+
+    for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
+      if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
+        ImmutableCallSite CS(J);
+        if (const Function *F = CS.getCalledFunction()) {
+          if (!TopTTI->isLoweredToCall(F))
+            continue;
+        }
+
+        return;
+      }
+  }
+
+  // Enable runtime and partial unrolling up to the specified size.
+  UP.Partial = UP.Runtime = true;
+  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+}

 //===----------------------------------------------------------------------===//
 //