From e503319874f57ab4a0354521b03a71cf8e07b866 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 4 Jan 2013 17:48:25 +0000 Subject: [PATCH] LoopVectorizer: 1. Add code to estimate register pressure. 2. Add code to select the unroll factor based on register pressure. 3. Add bits to TargetTransformInfo to provide the number of registers. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171469 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetTransformImpl.h | 2 + include/llvm/TargetTransformInfo.h | 25 ++- lib/Target/TargetTransformImpl.cpp | 4 + lib/Target/X86/X86ISelLowering.cpp | 7 + lib/Target/X86/X86ISelLowering.h | 2 + lib/Transforms/Vectorize/LoopVectorize.cpp | 166 +++++++++++++++++- lib/Transforms/Vectorize/LoopVectorize.h | 32 +++- .../LoopVectorize/X86/gcc-examples.ll | 21 ++- test/Transforms/LoopVectorize/gcc-examples.ll | 39 ++++ 9 files changed, 279 insertions(+), 19 deletions(-) diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h index bbdb4f1dc24..a285f5ba8f4 100644 --- a/include/llvm/Target/TargetTransformImpl.h +++ b/include/llvm/Target/TargetTransformImpl.h @@ -69,6 +69,8 @@ public: virtual ~VectorTargetTransformImpl() {} + virtual unsigned getNumberOfRegisters(bool Vector) const; + virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, diff --git a/include/llvm/TargetTransformInfo.h b/include/llvm/TargetTransformInfo.h index 97744c0f78d..7dd95a79c2d 100644 --- a/include/llvm/TargetTransformInfo.h +++ b/include/llvm/TargetTransformInfo.h @@ -164,12 +164,19 @@ public: ExtractSubvector // ExtractSubvector Index indicates start offset. }; - /// Returns the expected cost of arithmetic ops, such as mul, xor, fsub, etc. + /// \return The number of scalar or vector registers that the target has. + /// If 'Vector' is true, it returns the number of vector registers.
If it is + /// set to false, it returns the number of scalar registers. + virtual unsigned getNumberOfRegisters(bool Vector) const { + return 8; + } + + /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc. virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { return 1; } - /// Returns the cost of a shuffle instruction of kind Kind and of type Tp. + /// \return The cost of a shuffle instruction of kind Kind and of type Tp. /// The index and subtype parameters are used by the subvector insertion and /// extraction shuffle kinds. virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, @@ -177,47 +184,47 @@ public: return 1; } - /// Returns the expected cost of cast instructions, such as bitcast, trunc, + /// \return The expected cost of cast instructions, such as bitcast, trunc, /// zext, etc. virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { return 1; } - /// Returns the expected cost of control-flow related instrutctions such as + /// \return The expected cost of control-flow related instructions such as /// Phi, Ret, Br. virtual unsigned getCFInstrCost(unsigned Opcode) const { return 1; } - /// Returns the expected cost of compare and select instructions. + /// \returns The expected cost of compare and select instructions. virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy = 0) const { return 1; } - /// Returns the expected cost of vector Insert and Extract. + /// \return The expected cost of vector Insert and Extract. /// Use -1 to indicate that there is no information on the index value. virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index = -1) const { return 1; } - /// Returns the cost of Load and Store instructions. + /// \return The cost of Load and Store instructions.
virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const { return 1; } - /// Returns the cost of Intrinsic instructions. + /// \returns The cost of Intrinsic instructions. virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy, ArrayRef Tys) const { return 1; } - /// Returns the number of pieces into which the provided type must be + /// \returns The number of pieces into which the provided type must be /// split during legalization. Zero is returned when the answer is unknown. virtual unsigned getNumberOfParts(Type *Tp) const { return 0; diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp index f8c588934fe..7d663f57f98 100644 --- a/lib/Target/TargetTransformImpl.cpp +++ b/lib/Target/TargetTransformImpl.cpp @@ -171,6 +171,10 @@ VectorTargetTransformImpl::getScalarizationOverhead(Type *Ty, return Cost; } +unsigned VectorTargetTransformImpl::getNumberOfRegisters(bool Vector) const { + return 8; +} + unsigned VectorTargetTransformImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { // Check if any of the operands are vector operands. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index eca63f80ae0..f482ac98462 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -18115,6 +18115,13 @@ X86ScalarTargetTransformImpl::getPopcntHwSupport(unsigned TyWidth) const { return ST.hasSSE41() ? 
Fast : None; } +unsigned X86VectorTargetTransformInfo::getNumberOfRegisters(bool Vector) const { + const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget(); + if (ST.is64Bit()) + return 16; + return 8; +} + unsigned X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 2e2fc2a234f..86b7764c136 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -959,6 +959,8 @@ namespace llvm { explicit X86VectorTargetTransformInfo(const TargetLowering *TL) : VectorTargetTransformImpl(TL) {} + virtual unsigned getNumberOfRegisters(bool Vector) const; + virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8feea9360a0..0f84fe05ef0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// #include "LoopVectorize.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" @@ -43,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect.")); static cl::opt -VectorizationUnroll("force-vector-unroll", cl::init(1), cl::Hidden, +VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden, cl::desc("Sets the vectorization unroll count. " "Zero is autoselect.")); @@ -94,7 +95,7 @@ struct LoopVectorize : public LoopPass { if (TTI) VTTI = TTI->getVectorTargetTransformInfo(); // Use the cost model. 
- LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, VTTI); // Check the function attribues to find out if this function should be // optimized for size. @@ -112,6 +113,7 @@ struct LoopVectorize : public LoopPass { } unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); + unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll); if (VF == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); @@ -120,9 +122,10 @@ struct LoopVectorize : public LoopPass { DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< F->getParent()->getModuleIdentifier()<<"\n"); + DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); // If we decided that it is *legal* to vectorizer the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, VectorizationUnroll); + InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -2082,7 +2085,7 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { unsigned LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, - unsigned UserVF) { + unsigned UserVF) { if (OptForSize && Legal->getRuntimePointerCheck()->Need) { DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); return 1; @@ -2148,6 +2151,161 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, return Width; } +unsigned +LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, + unsigned UserUF) { + // Use the user preference, unless 'auto' is selected. + if (UserUF != 0) + return UserUF; + + // When we optimize for size we don't unroll. 
+ if (OptForSize) + return 1; + + unsigned TargetVectorRegisters = VTTI->getNumberOfRegisters(true); + DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters << + " vector registers\n"); + + LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); + // We divide by these constants so assume that we have at least one + // instruction that uses at least one register. + R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); + R.NumInstructions = std::max(R.NumInstructions, 1U); + + // We calculate the unroll factor using the following formula. + // Subtract the number of loop invariants from the number of available + // registers. These registers are used by all of the unrolled instances. + // Next, divide the remaining registers by the number of registers that is + // required by the loop, in order to estimate how many parallel instances + // fit without causing spills. + unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers; + + // We don't want to unroll the loops to the point where they do not fit into + // the decoded cache. Assume that we only allow 32 IR instructions. + UF = std::min(UF, (32 / R.NumInstructions)); + + // Clamp the unroll factor ranges to reasonable factors. + if (UF > MaxUnrollSize) + UF = MaxUnrollSize; + else if (UF < 1) + UF = 1; + + return UF; +} + +LoopVectorizationCostModel::RegisterUsage +LoopVectorizationCostModel::calculateRegisterUsage() { + // This function calculates the register usage by measuring the highest number + // of values that are alive at a single location. Obviously, this is a very + // rough estimation. We scan the loop in a topological order in order and + // assign a number to each instruction. We use RPO to ensure that defs are + // met before their users. We assume that each instruction that has in-loop + // users starts an interval. We record every time that an in-loop value is + // used, so we have a list of the first and last occurrences of each + // instruction. 
Next, we transpose this data structure into a multi map that + // holds the list of intervals that *end* at a specific location. This multi + // map allows us to perform a linear search. We scan the instructions linearly + // and record each time that a new interval starts, by placing it in a set. + // If we find this value in the multi-map then we remove it from the set. + // The max register usage is the maximum size of the set. + // We also search for instructions that are defined outside the loop, but are + // used inside the loop. We need this number separately from the max-interval + // usage number because when we unroll, loop-invariant values do not take + // more registers. + LoopBlocksDFS DFS(TheLoop); + DFS.perform(LI); + + RegisterUsage R; + R.NumInstructions = 0; + + // Each 'key' in the map opens a new interval. The values + // of the map are the index of the 'last seen' usage of the + // instruction that is the key. + typedef DenseMap IntervalMap; + // Maps each index to its instruction. + DenseMap IdxToInstr; + // Marks the end of each interval. + IntervalMap EndPoint; + // Saves the list of instructions that are used in the loop. + SmallSet Ends; + // Saves the list of instructions that are used in the loop but are + // defined outside the loop (arguments and constants are not counted). + SmallPtrSet LoopInvariants; + + unsigned Index = 0; + for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), + be = DFS.endRPO(); bb != be; ++bb) { + R.NumInstructions += (*bb)->size(); + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + Instruction *I = it; + IdxToInstr[Index++] = I; + + // Save the end location of each USE. + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + Value *U = I->getOperand(i); + Instruction *Instr = dyn_cast(U); + + // Ignore non-instruction values such as arguments, constants, etc. + if (!Instr) continue; + + // If this instruction is outside the loop then record it and continue.
+ if (!TheLoop->contains(Instr)) { + LoopInvariants.insert(Instr); + continue; + } + + // Overwrite previous end points. + EndPoint[Instr] = Index; + Ends.insert(Instr); + } + } + } + + // Saves the list of intervals that end with the index in 'key'. + typedef SmallVector InstrList; + DenseMap TransposeEnds; + + // Transpose the EndPoints to a list of values that end at each index. + for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); + it != e; ++it) + TransposeEnds[it->second].push_back(it->first); + + SmallSet OpenIntervals; + unsigned MaxUsage = 0; + + + DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + for (unsigned int i = 0; i < Index; ++i) { + Instruction *I = IdxToInstr[i]; + // Ignore instructions that are never used within the loop. + if (!Ends.count(I)) continue; + + // Remove all of the intervals that end at this location. + InstrList &List = TransposeEnds[i]; + for (unsigned int i=0, e = List.size(); i < e; ++i) + OpenIntervals.erase(List[i]); + + // Count the number of live intervals. + MaxUsage = std::max(MaxUsage, OpenIntervals.size()); + + DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << + OpenIntervals.size() <<"\n"); + + // Add the current instruction to the list of open intervals.
+ OpenIntervals.insert(I); + } + + unsigned Invariant = LoopInvariants.size(); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n"); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n"); + DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n"); + + R.LoopInvariantRegs = Invariant; + R.MaxLocalUsers = MaxUsage; + return R; +} + unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { unsigned Cost = 0; diff --git a/lib/Transforms/Vectorize/LoopVectorize.h b/lib/Transforms/Vectorize/LoopVectorize.h index 68d7ee70469..2333b2bcf94 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.h +++ b/lib/Transforms/Vectorize/LoopVectorize.h @@ -68,6 +68,9 @@ const unsigned RuntimeMemoryCheckThreshold = 4; /// This is the highest vector width that we try to generate. const unsigned MaxVectorSize = 8; +/// This is the highest Unroll Factor. +const unsigned MaxUnrollSize = 4; + namespace llvm { // Forward declarations. @@ -473,17 +476,37 @@ private: class LoopVectorizationCostModel { public: /// C'tor. - LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, + LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, LoopInfo *Li, LoopVectorizationLegality *Leg, const VectorTargetTransformInfo *Vtti): - TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } + TheLoop(Lp), SE(Se), LI(Li), Legal(Leg), VTTI(Vtti) { } - /// Returns the most profitable vectorization factor in powers of two. + /// \return The most profitable vectorization factor. /// This method checks every power of two up to VF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF); + + /// \return The most profitable unroll factor. + /// If UserUF is non-zero it is returned as-is. Otherwise this method finds + /// the best unroll-factor based on register pressure and other parameters.
+ unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF); + + /// \brief A struct that represents some properties of the register usage + /// of a loop. + struct RegisterUsage { + /// Holds the number of loop invariant values that are used in the loop. + unsigned LoopInvariantRegs; + /// Holds the maximum number of concurrent live intervals in the loop. + unsigned MaxLocalUsers; + /// Holds the number of instructions in the loop. + unsigned NumInstructions; + }; + + /// \return information about the register usage of the loop. + RegisterUsage calculateRegisterUsage(); + private: /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different @@ -504,7 +527,8 @@ private: Loop *TheLoop; /// Scev analysis. ScalarEvolution *SE; - + /// Loop Info analysis. + LoopInfo *LI; /// Vectorization legality. LoopVectorizationLegality *Legal; /// Vector target information. diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll index 156e7454a4c..0f21ba678c3 100644 --- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll +++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-unroll=0 -dce -instcombine -licm -S | FileCheck %s -check-prefix=UNROLL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -13,6 +14,15 @@ target triple = "x86_64-apple-macosx10.8.0" ;CHECK: add nsw <4 x i32> ;CHECK: store <4 x i32> ;CHECK: ret void + +;UNROLL: @example1 +;UNROLL: load <4 x i32> +;UNROLL: load <4 x i32> +;UNROLL: add nsw <4 x i32> +;UNROLL: add nsw <4 x i32> +;UNROLL: 
store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: ret void define void @example1() nounwind uwtable ssp { br label %1 @@ -34,13 +44,20 @@ define void @example1() nounwind uwtable ssp { ret void } - -; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive. +; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive. ;CHECK: @example10b ;CHECK: load <4 x i16> ;CHECK: sext <4 x i16> ;CHECK: store <4 x i32> ;CHECK: ret void +;UNROLL: @example10b +;UNROLL: load <4 x i16> +;UNROLL: load <4 x i16> +;UNROLL: load <4 x i16> +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: ret void define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { br label %1 diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll index b0f5a80d1e6..652c2a0d025 100644 --- a/test/Transforms/LoopVectorize/gcc-examples.ll +++ b/test/Transforms/LoopVectorize/gcc-examples.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -licm -S | FileCheck %s -check-prefix=UNROLL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -24,6 +25,20 @@ target triple = "x86_64-apple-macosx10.8.0" ;CHECK: add nsw <4 x i32> ;CHECK: store <4 x i32> ;CHECK: ret void +;UNROLL: @example1 +;UNROLL: load <4 x i32> +;UNROLL: load <4 x i32> +;UNROLL: load <4 x i32> +;UNROLL: load <4 x i32> +;UNROLL: add nsw <4 x i32> +;UNROLL: add nsw <4 x i32> +;UNROLL: add nsw <4 x i32> +;UNROLL: add nsw <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: store 
<4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: ret void define void @example1() nounwind uwtable ssp { br label %1 @@ -48,6 +63,12 @@ define void @example1() nounwind uwtable ssp { ;CHECK: @example2 ;CHECK: store <4 x i32> ;CHECK: ret void +;UNROLL: @example2 +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: ret void define void @example2(i32 %n, i32 %x) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph5, label %.preheader @@ -92,6 +113,12 @@ define void @example2(i32 %n, i32 %x) nounwind uwtable ssp { ;CHECK: @example3 ;CHECK: <4 x i32> ;CHECK: ret void +;UNROLL: @example3 +;UNROLL: <4 x i32> +;UNROLL: <4 x i32> +;UNROLL: <4 x i32> +;UNROLL: <4 x i32> +;UNROLL: ret void define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp { %1 = icmp eq i32 %n, 0 br i1 %1, label %._crit_edge, label %.lr.ph @@ -115,6 +142,12 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture ;CHECK: @example4 ;CHECK: load <4 x i32> ;CHECK: ret void +;UNROLL: @example4 +;UNROLL: load <4 x i32> +;UNROLL: load <4 x i32> +;UNROLL: load <4 x i32> +;UNROLL: load <4 x i32> +;UNROLL: ret void define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp { %1 = add nsw i32 %n, -1 %2 = icmp eq i32 %n, 0 @@ -175,6 +208,12 @@ define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture ;CHECK: @example8 ;CHECK: store <4 x i32> ;CHECK: ret void +;UNROLL: @example8 +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: ret void define void @example8(i32 %x) nounwind uwtable ssp { br label %.preheader