From e503319874f57ab4a0354521b03a71cf8e07b866 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 4 Jan 2013 17:48:25 +0000
Subject: [PATCH] LoopVectorizer:

1. Add code to estimate register pressure.
2. Add code to select the unroll factor based on register pressure.
3. Add bits to TargetTransformInfo to provide the number of registers.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171469 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetTransformImpl.h     |   2 +
 include/llvm/TargetTransformInfo.h            |  25 ++-
 lib/Target/TargetTransformImpl.cpp            |   4 +
 lib/Target/X86/X86ISelLowering.cpp            |   7 +
 lib/Target/X86/X86ISelLowering.h              |   2 +
 lib/Transforms/Vectorize/LoopVectorize.cpp    | 166 +++++++++++++++++-
 lib/Transforms/Vectorize/LoopVectorize.h      |  32 +++-
 .../LoopVectorize/X86/gcc-examples.ll         |  21 ++-
 test/Transforms/LoopVectorize/gcc-examples.ll |  39 ++++
 9 files changed, 279 insertions(+), 19 deletions(-)

diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h
index bbdb4f1dc24..a285f5ba8f4 100644
--- a/include/llvm/Target/TargetTransformImpl.h
+++ b/include/llvm/Target/TargetTransformImpl.h
@@ -69,6 +69,8 @@ public:
 
   virtual ~VectorTargetTransformImpl() {}
 
+  virtual unsigned getNumberOfRegisters(bool Vector) const;
+
   virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
 
   virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
diff --git a/include/llvm/TargetTransformInfo.h b/include/llvm/TargetTransformInfo.h
index 97744c0f78d..7dd95a79c2d 100644
--- a/include/llvm/TargetTransformInfo.h
+++ b/include/llvm/TargetTransformInfo.h
@@ -164,12 +164,19 @@ public:
     ExtractSubvector // ExtractSubvector Index indicates start offset.
   };
 
-  /// Returns the expected cost of arithmetic ops, such as mul, xor, fsub, etc.
+  /// \return The number of scalar or vector registers that the target has.
+  /// If 'Vectors' is true, it returns the number of vector registers. If it is
+  /// set to false, it returns the number of scalar registers.
+  virtual unsigned getNumberOfRegisters(bool Vector) const {
+    return 8;
+  }
+
+  /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
   virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
     return 1;
   }
 
-  /// Returns the cost of a shuffle instruction of kind Kind and of type Tp.
+  /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
   /// The index and subtype parameters are used by the subvector insertion and
   /// extraction shuffle kinds.
   virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
@@ -177,47 +184,47 @@ public:
     return 1;
   }
 
-  /// Returns the expected cost of cast instructions, such as bitcast, trunc,
+  /// \return The expected cost of cast instructions, such as bitcast, trunc,
   /// zext, etc.
   virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                                     Type *Src) const {
     return 1;
   }
 
-  /// Returns the expected cost of control-flow related instrutctions such as
+  /// \return The expected cost of control-flow related instrutctions such as
   /// Phi, Ret, Br.
   virtual unsigned getCFInstrCost(unsigned Opcode) const {
     return 1;
   }
 
-  /// Returns the expected cost of compare and select instructions.
+  /// \returns The expected cost of compare and select instructions.
   virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                       Type *CondTy = 0) const {
     return 1;
   }
 
-  /// Returns the expected cost of vector Insert and Extract.
+  /// \return The expected cost of vector Insert and Extract.
   /// Use -1 to indicate that there is no information on the index value.
   virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                                       unsigned Index = -1) const {
     return 1;
   }
 
-  /// Returns the cost of Load and Store instructions.
+  /// \return The cost of Load and Store instructions.
   virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
                                    unsigned Alignment,
                                    unsigned AddressSpace) const {
     return 1;
   }
 
-  /// Returns the cost of Intrinsic instructions.
+  /// \returns The cost of Intrinsic instructions.
   virtual unsigned getIntrinsicInstrCost(Intrinsic::ID,
                                          Type *RetTy,
                                          ArrayRef<Type*> Tys) const {
     return 1;
   }
 
-  /// Returns the number of pieces into which the provided type must be
+  /// \returns The number of pieces into which the provided type must be
   /// split during legalization. Zero is returned when the answer is unknown.
   virtual unsigned getNumberOfParts(Type *Tp) const {
     return 0;
diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp
index f8c588934fe..7d663f57f98 100644
--- a/lib/Target/TargetTransformImpl.cpp
+++ b/lib/Target/TargetTransformImpl.cpp
@@ -171,6 +171,10 @@ VectorTargetTransformImpl::getScalarizationOverhead(Type *Ty,
   return Cost;
 }
 
+unsigned VectorTargetTransformImpl::getNumberOfRegisters(bool Vector) const {
+  return 8;
+}
+
 unsigned VectorTargetTransformImpl::getArithmeticInstrCost(unsigned Opcode,
                                                            Type *Ty) const {
   // Check if any of the operands are vector operands.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index eca63f80ae0..f482ac98462 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -18115,6 +18115,13 @@ X86ScalarTargetTransformImpl::getPopcntHwSupport(unsigned TyWidth) const {
   return ST.hasSSE41() ? Fast : None;
 }
 
+unsigned X86VectorTargetTransformInfo::getNumberOfRegisters(bool Vector) const {
+  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+  if (ST.is64Bit())
+    return 16;
+  return 8;
+}
+
 unsigned
 X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
                                                      Type *Ty) const {
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 2e2fc2a234f..86b7764c136 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -959,6 +959,8 @@ namespace llvm {
     explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
     VectorTargetTransformImpl(TL) {}
 
+    virtual unsigned getNumberOfRegisters(bool Vector) const;
+
     virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
 
     virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8feea9360a0..0f84fe05ef0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "LoopVectorize.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
@@ -43,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
 
 static cl::opt<unsigned>
-VectorizationUnroll("force-vector-unroll", cl::init(1), cl::Hidden,
+VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
                     cl::desc("Sets the vectorization unroll count. "
                              "Zero is autoselect."));
 
@@ -94,7 +95,7 @@ struct LoopVectorize : public LoopPass {
     if (TTI)
       VTTI = TTI->getVectorTargetTransformInfo();
     // Use the cost model.
-    LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
+    LoopVectorizationCostModel CM(L, SE, LI, &LVL, VTTI);
 
     // Check the function attribues to find out if this function should be
     // optimized for size.
@@ -112,6 +113,7 @@ struct LoopVectorize : public LoopPass {
     }
 
     unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
+    unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll);
 
     if (VF == 1) {
       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
@@ -120,9 +122,10 @@ struct LoopVectorize : public LoopPass {
 
     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
           F->getParent()->getModuleIdentifier()<<"\n");
+    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, VectorizationUnroll);
+    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -2082,7 +2085,7 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
 
 unsigned
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
-                                                        unsigned UserVF) {
+                                                      unsigned UserVF) {
   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
     DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
     return 1;
@@ -2148,6 +2151,161 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
   return Width;
 }
 
+unsigned
+LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
+                                               unsigned UserUF) {
+  // Use the user preference, unless 'auto' is selected.
+  if (UserUF != 0)
+    return UserUF;
+
+  // When we optimize for size we don't unroll.
+  if (OptForSize)
+    return 1;
+
+  unsigned TargetVectorRegisters = VTTI->getNumberOfRegisters(true);
+  DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
+        " vector registers\n");
+
+  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
+  // We divide by these constants so assume that we have at least one
+  // instruction that uses at least one register.
+  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
+  R.NumInstructions = std::max(R.NumInstructions, 1U);
+
+  // We calculate the unroll factor using the following formula.
+  // Subtract the number of loop invariants from the number of available
+  // registers. These registers are used by all of the unrolled instances.
+  // Next, divide the remaining registers by the number of registers that is
+  // required by the loop, in order to estimate how many parallel instances
+  // fit without causing spills.
+  unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
+
+  // We don't want to unroll the loops to the point where they do not fit into
+  // the decoded cache. Assume that we only allow 32 IR instructions.
+  UF = std::min(UF, (32 / R.NumInstructions));
+
+  // Clamp the unroll factor ranges to reasonable factors.
+  if (UF > MaxUnrollSize)
+    UF = MaxUnrollSize;
+  else if (UF < 1)
+    UF = 1;
+
+  return UF;
+}
+
+LoopVectorizationCostModel::RegisterUsage
+LoopVectorizationCostModel::calculateRegisterUsage() {
+  // This function calculates the register usage by measuring the highest number
+  // of values that are alive at a single location. Obviously, this is a very
+  // rough estimation. We scan the loop in a topological order in order and
+  // assign a number to each instruction. We use RPO to ensure that defs are
+  // met before their users. We assume that each instruction that has in-loop
+  // users starts an interval. We record every time that an in-loop value is
+  // used, so we have a list of the first and last occurrences of each
+  // instruction. Next, we transpose this data structure into a multi map that
+  // holds the list of intervals that *end* at a specific location. This multi
+  // map allows us to perform a linear search. We scan the instructions linearly
+  // and record each time that a new interval starts, by placing it in a set.
+  // If we find this value in the multi-map then we remove it from the set.
+  // The max register usage is the maximum size of the set.
+  // We also search for instructions that are defined outside the loop, but are
+  // used inside the loop. We need this number separately from the max-interval
+  // usage number because when we unroll, loop-invariant values do not take
+  // more register.
+  LoopBlocksDFS DFS(TheLoop);
+  DFS.perform(LI);
+
+  RegisterUsage R;
+  R.NumInstructions = 0;
+
+  // Each 'key' in the map opens a new interval. The values
+  // of the map are the index of the 'last seen' usage of the
+  // instruction that is the key.
+  typedef DenseMap<Instruction*, unsigned> IntervalMap;
+  // Maps instruction to its index.
+  DenseMap<unsigned, Instruction*> IdxToInstr;
+  // Marks the end of each interval.
+  IntervalMap EndPoint;
+  // Saves the list of instruction indices that are used in the loop.
+  SmallSet<Instruction*, 8> Ends;
+  // Saves the list of values that are used in the loop but are
+  // defined outside the loop, such as arguments and constants.
+  SmallPtrSet<Value*, 8> LoopInvariants;
+
+  unsigned Index = 0;
+  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+       be = DFS.endRPO(); bb != be; ++bb) {
+    R.NumInstructions += (*bb)->size();
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
+      Instruction *I = it;
+      IdxToInstr[Index++] = I;
+
+      // Save the end location of each USE.
+      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+        Value *U = I->getOperand(i);
+        Instruction *Instr = dyn_cast<Instruction>(U);
+
+        // Ignore non-instruction values such as arguments, constants, etc.
+        if (!Instr) continue;
+
+        // If this instruction is outside the loop then record it and continue.
+        if (!TheLoop->contains(Instr)) {
+          LoopInvariants.insert(Instr);
+          continue;
+        }
+
+        // Overwrite previous end points.
+        EndPoint[Instr] = Index;
+        Ends.insert(Instr);
+      }
+    }
+  }
+
+  // Saves the list of intervals that end with the index in 'key'.
+  typedef SmallVector<Instruction*, 2> InstrList;
+  DenseMap<unsigned, InstrList> TransposeEnds;
+
+  // Transpose the EndPoints to a list of values that end at each index.
+  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
+       it != e; ++it)
+    TransposeEnds[it->second].push_back(it->first);
+
+  SmallSet<Instruction*, 8> OpenIntervals;
+  unsigned MaxUsage = 0;
+
+
+  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+  for (unsigned int i = 0; i < Index; ++i) {
+    Instruction *I = IdxToInstr[i];
+    // Ignore instructions that are never used within the loop.
+    if (!Ends.count(I)) continue;
+
+    // Remove all of the instructions that end at this location.
+    InstrList &List = TransposeEnds[i];
+    for (unsigned int i=0, e = List.size(); i < e; ++i)
+      OpenIntervals.erase(List[i]);
+
+    // Count the number of live interals.
+    MaxUsage = std::max(MaxUsage, OpenIntervals.size());
+
+    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
+          OpenIntervals.size() <<"\n");
+
+    // Add the current instruction to the list of open intervals.
+    OpenIntervals.insert(I);
+  }
+
+  unsigned Invariant = LoopInvariants.size();
+  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
+  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
+  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
+
+  R.LoopInvariantRegs = Invariant;
+  R.MaxLocalUsers = MaxUsage;
+  return R;
+}
+
 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   unsigned Cost = 0;
 
diff --git a/lib/Transforms/Vectorize/LoopVectorize.h b/lib/Transforms/Vectorize/LoopVectorize.h
index 68d7ee70469..2333b2bcf94 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.h
+++ b/lib/Transforms/Vectorize/LoopVectorize.h
@@ -68,6 +68,9 @@ const unsigned RuntimeMemoryCheckThreshold = 4;
 /// This is the highest vector width that we try to generate.
 const unsigned MaxVectorSize = 8;
 
+/// This is the highest Unroll Factor.
+const unsigned MaxUnrollSize = 4;
+
 namespace llvm {
 
 // Forward declarations.
@@ -473,17 +476,37 @@ private:
 class LoopVectorizationCostModel {
 public:
   /// C'tor.
-  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
+  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, LoopInfo *Li,
                              LoopVectorizationLegality *Leg,
                              const VectorTargetTransformInfo *Vtti):
-  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
+  TheLoop(Lp), SE(Se), LI(Li), Legal(Leg), VTTI(Vtti) { }
 
-  /// Returns the most profitable vectorization factor in powers of two.
+  /// \return The most profitable vectorization factor.
   /// This method checks every power of two up to VF. If UserVF is not ZERO
   /// then this vectorization factor will be selected if vectorization is
   /// possible.
   unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF);
 
+
+  /// \return The most profitable unroll factor.
+  /// If UserUF is non-zero then this method finds the best unroll-factor
+  /// based on register pressure and other parameters.
+  unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF);
+
+  /// \brief A struct that represents some properties of the register usage
+  /// of a loop.
+  struct RegisterUsage {
+    /// Holds the number of loop invariant values that are used in the loop.
+    unsigned LoopInvariantRegs;
+    /// Holds the maximum number of concurrent live intervals in the loop.
+    unsigned MaxLocalUsers;
+    /// Holds the number of instructions in the loop.
+    unsigned NumInstructions;
+  };
+
+  /// \return  information about the register usage of the loop.
+  RegisterUsage calculateRegisterUsage();
+
 private:
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
@@ -504,7 +527,8 @@ private:
   Loop *TheLoop;
   /// Scev analysis.
   ScalarEvolution *SE;
-
+  /// Loop Info analysis.
+  LoopInfo *LI;
   /// Vectorization legality.
   LoopVectorizationLegality *Legal;
   /// Vector target information.
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
index 156e7454a4c..0f21ba678c3 100644
--- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-unroll=0 -dce -instcombine -licm -S | FileCheck %s -check-prefix=UNROLL
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -13,6 +14,15 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
+
+;UNROLL: @example1
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
 define void @example1() nounwind uwtable ssp {
   br label %1
 
@@ -34,13 +44,20 @@ define void @example1() nounwind uwtable ssp {
   ret void
 }
 
-
-; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive. 
+; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
 ;CHECK: @example10b
 ;CHECK: load <4 x i16>
 ;CHECK: sext <4 x i16>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example10b
+;UNROLL: load <4 x i16>
+;UNROLL: load <4 x i16>
+;UNROLL: load <4 x i16>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
 define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
   br label %1
 
diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll
index b0f5a80d1e6..652c2a0d025 100644
--- a/test/Transforms/LoopVectorize/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -licm -S | FileCheck %s -check-prefix=UNROLL
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -24,6 +25,20 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example1
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
 define void @example1() nounwind uwtable ssp {
   br label %1
 
@@ -48,6 +63,12 @@ define void @example1() nounwind uwtable ssp {
 ;CHECK: @example2
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example2
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
 define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph5, label %.preheader
@@ -92,6 +113,12 @@ define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
 ;CHECK: @example3
 ;CHECK: <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example3
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: ret void
 define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
   %1 = icmp eq i32 %n, 0
   br i1 %1, label %._crit_edge, label %.lr.ph
@@ -115,6 +142,12 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
 ;CHECK: @example4
 ;CHECK: load <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example4
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: ret void
 define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
   %1 = add nsw i32 %n, -1
   %2 = icmp eq i32 %n, 0
@@ -175,6 +208,12 @@ define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
 ;CHECK: @example8
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example8
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
 define void @example8(i32 %x) nounwind uwtable ssp {
   br label %.preheader