[SLPVectorizer] Try different vectorization factors for store chains

...and set max vector register size based on target This patch is based on discussion on the llvmdev mailing list: http://lists.cs.uiuc.edu/pipermail/llvmdev/2015-July/087405.html and also solves: https://llvm.org/bugs/show_bug.cgi?id=17170 Several FIXME/TODO items are noted in comments as potential improvements. Differential Revision: http://reviews.llvm.org/D10950 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@241760 91177308-0d34-0410-b5e6-96231b3b80d8
2025-07-02 19:24:25 +00:00 · 2015-07-08 23:40:55 +00:00
parent 6e999b0a30
commit c1c43c15cc
6 changed files with 76 additions and 31 deletions
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -69,8 +69,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+    cl::desc("Attempt to vectorize for this register size in bits"));
+
 namespace {

+// FIXME: Set this via cl::opt to allow overriding.
 static const unsigned MinVecRegSize = 128;

 static const unsigned RecursionMaxDepth = 12;
@ -3088,6 +3093,17 @@ struct SLPVectorizer : public FunctionPass {
    if (!TTI->getNumberOfRegisters(true))
      return false;

+    // Use the vector register size specified by the target unless overridden
+    // by a command-line option.
+    // TODO: It would be better to limit the vectorization factor based on
+    //       data type rather than just register size. For example, x86 AVX has
+    //       256-bit registers, but it does not support integer operations
+    //       at that width (that requires AVX2).
+    if (MaxVectorRegSizeOption.getNumOccurrences())
+      MaxVecRegSize = MaxVectorRegSizeOption;
+    else
+      MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
    // Don't vectorize when the attribute NoImplicitFloat is used.
    if (F.hasFnAttribute(Attribute::NoImplicitFloat))
      return false;
@ -3165,12 +3181,13 @@ private:
  bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);

  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
-                           BoUpSLP &R);
+                           BoUpSLP &R, unsigned VecRegSize);

  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
                       BoUpSLP &R);
 private:
  StoreListMap StoreRefs;
+  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
 };

 /// \brief Check that the Values in the slice in VL array are still existent in
@ -3185,14 +3202,15 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
 }

 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
-                                          int CostThreshold, BoUpSLP &R) {
+                                        int CostThreshold, BoUpSLP &R,
+                                        unsigned VecRegSize) {
  unsigned ChainLen = Chain.size();
  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
        << "\n");
  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
  auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
-  unsigned VF = MinVecRegSize / Sz;
+  unsigned VF = VecRegSize / Sz;

  if (!isPowerOf2_32(Sz) || VF < 2)
    return false;
@ -3276,10 +3294,15 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
      I = ConsecutiveChain[I];
    }

-    if (vectorizeStoreChain(Operands, costThreshold, R)) {
-      // Mark the vectorized stores so that we don't vectorize them again.
-      VectorizedStores.insert(Operands.begin(), Operands.end());
-      Changed = true;
+    // FIXME: Is division-by-2 the correct step? Should we assert that the
+    // register size is a power-of-2?
+    for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+      if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+        // Mark the vectorized stores so that we don't vectorize them again.
+        VectorizedStores.insert(Operands.begin(), Operands.end());
+        Changed = true;
+        break;
+      }
    }
  }

@ -3340,6 +3363,8 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,

  Type *Ty0 = I0->getType();
  unsigned Sz = DL.getTypeSizeInBits(Ty0);
+  // FIXME: Register size should be a parameter to this function, so we can
+  // try different vectorization factors.
  unsigned VF = MinVecRegSize / Sz;

  for (Value *V : VL) {
@ -3569,6 +3594,8 @@ public:
    const DataLayout &DL = B->getModule()->getDataLayout();
    ReductionOpcode = B->getOpcode();
    ReducedValueOpcode = 0;
+    // FIXME: Register size should be a parameter to this function, so we can
+    // try different vectorization factors.
    ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
    ReductionRoot = B;
    ReductionPHI = Phi;
@ -3995,6 +4022,9 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
          << it->second.size() << ".\n");

    // Process the stores in chunks of 16.
+    // TODO: The limit of 16 inhibits greater vectorization factors.
+    //       For example, AVX2 supports v32i8. Increasing this limit, however,
+    //       may cause a significant compile-time increase.
    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
      unsigned Len = std::min<unsigned>(CE - CI, 16);
      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),