diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 2a8380613e0..fd20c7507e3 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -55,7 +55,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -410,11 +409,10 @@ bool LoadHoisting::canHoistAllLoads() { class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, - DominatorTree *DT, TargetTransformInfo* TTI, - AliasAnalysis *AA, TargetLibraryInfo *TLI) - : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI), + DominatorTree *DT, TargetLibraryInfo *TLI) + : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI), Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false), - LoadSpeculation(L, DT) {} + MaxSafeDepDistBytes(-1U), LoadSpeculation(L, DT) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -501,7 +499,8 @@ public: } /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr); + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, + unsigned DepSetId); /// This flag indicates if we need to add the runtime check. bool Need; @@ -513,6 +512,9 @@ public: SmallVector Ends; /// Holds the information if this pointer is used for writing to memory. SmallVector IsWritePtr; + /// Holds the id of the set of pointers that could be dependent because of a + /// shared underlying object. + SmallVector DependencySetId; }; /// A POD for saving information about induction variables. @@ -533,11 +535,6 @@ public: /// induction descriptor. typedef MapVector InductionList; - /// Alias(Multi)Map stores the values (GEPs or underlying objects and their - /// respective Store/Load instruction(s) to calculate aliasing. - typedef MapVector AliasMap; - typedef DenseMap > AliasMultiMap; - /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -584,6 +581,9 @@ public: /// This function returns the identity element (or neutral element) for /// the operation K. static Constant *getReductionIdentity(ReductionKind K, Type *Tp); + + unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -624,16 +624,6 @@ private: /// Returns the induction kind of Phi. This function may return NoInduction /// if the PHI is not an induction variable. InductionKind isInductionVariable(PHINode *Phi); - /// Return true if can compute the address bounds of Ptr within the loop. - bool hasComputableBounds(Value *Ptr); - /// Return true if there is the chance of write reorder. - bool hasPossibleGlobalWriteReorder(Value *Object, - Instruction *Inst, - AliasMultiMap &WriteObjects, - unsigned MaxByteWidth); - /// Return the AA location for a load or a store. - AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst); - /// The loop that we evaluate. Loop *TheLoop; @@ -643,10 +633,6 @@ private: DataLayout *DL; /// Dominators. DominatorTree *DT; - /// Target Info. - TargetTransformInfo *TTI; - /// Alias Analysis. - AliasAnalysis *AA; /// Target Library Info. TargetLibraryInfo *TLI; @@ -676,6 +662,8 @@ private: /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; + unsigned MaxSafeDepDistBytes; + /// Utility to determine whether loads can be speculated. LoadHoisting LoadSpeculation; }; @@ -904,7 +892,6 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; - AliasAnalysis *AA; TargetLibraryInfo *TLI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { @@ -917,7 +904,6 @@ struct LoopVectorize : public LoopPass { LI = &getAnalysis(); TTI = &getAnalysis(); DT = &getAnalysis(); - AA = getAnalysisIfAvailable(); TLI = getAnalysisIfAvailable(); if (DL == NULL) { @@ -936,7 +922,7 @@ struct LoopVectorize : public LoopPass { } // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI); + LoopVectorizationLegality LVL(L, SE, DL, DT, TLI); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; @@ -1011,7 +997,8 @@ struct LoopVectorize : public LoopPass { void LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, - bool WritePtr) { + bool WritePtr, + unsigned DepSetId) { const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast(Sc); assert(AR && "Invalid addrec expression"); @@ -1021,6 +1008,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, Starts.push_back(AR->getStart()); Ends.push_back(ScEnd); IsWritePtr.push_back(WritePtr); + DependencySetId.push_back(DepSetId); } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { @@ -1358,10 +1346,9 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (!PtrRtCheck->Need) return NULL; - Instruction *MemoryRuntimeCheck = 0; unsigned NumPointers = PtrRtCheck->Pointers.size(); - SmallVector Starts; - SmallVector Ends; + SmallVector , 2> Starts; + SmallVector , 2> Ends; SCEVExpander Exp(*SE, "induction"); @@ -1388,13 +1375,18 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, } IRBuilder<> ChkBuilder(Loc); - + // Our instructions might fold to a constant. + Value *MemoryRuntimeCheck = 0; for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { // No need to check if two readonly pointers intersect. if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) continue; + // Only need to check pointers between two different dependency sets. + if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) + continue; + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); @@ -1406,12 +1398,18 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (MemoryRuntimeCheck) IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); - - MemoryRuntimeCheck = cast(IsConflict); + MemoryRuntimeCheck = IsConflict; } } - return MemoryRuntimeCheck; + // We have to do this trickery because the IRBuilder might fold the check to a + // constant expression in which case there is no Instruction anchored in a + // the block. + LLVMContext &Ctx = Loc->getContext(); + Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, + ConstantInt::getTrue(Ctx)); + ChkBuilder.Insert(Check, "memcheck.conflict"); + return Check; } void @@ -2982,7 +2980,7 @@ bool AccessAnalysis::canCheckPtrAtRT( // Each access has its own dependence set. DepId = RunningDepId++; - //RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId); + RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId); DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n"); } else { @@ -3464,53 +3462,29 @@ MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, return true; } -AliasAnalysis::Location -LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) { - if (StoreInst *Store = dyn_cast(Inst)) - return AA->getLocation(Store); - else if (LoadInst *Load = dyn_cast(Inst)) - return AA->getLocation(Load); - - llvm_unreachable("Should be either load or store instruction"); -} - -bool -LoopVectorizationLegality::hasPossibleGlobalWriteReorder( - Value *Object, - Instruction *Inst, - AliasMultiMap& WriteObjects, - unsigned MaxByteWidth) { - - AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst); - - std::vector::iterator - it = WriteObjects[Object].begin(), - end = WriteObjects[Object].end(); - - for (; it != end; ++it) { - Instruction* I = *it; - if (I == Inst) - continue; - - AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I); - if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth), - ThatLoc.getWithNewSize(MaxByteWidth))) - return true; - } - return false; -} - bool LoopVectorizationLegality::canVectorizeMemory() { typedef SmallVector ValueVector; typedef SmallPtrSet ValueSet; + + // Stores a pair of memory access location and whether the access is a store + // (true) or a load (false). + typedef std::pair MemAccessInfo; + typedef DenseSet PtrAccessSet; + // Holds the Load and Store *instructions*. ValueVector Loads; ValueVector Stores; + + // Holds all the different accesses in the loop. + unsigned NumReads = 0; + unsigned NumReadWrites = 0; + PtrRtCheck.Pointers.clear(); PtrRtCheck.Need = false; const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + MemoryDepChecker DepChecker(SE, DL, TheLoop); // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -3531,6 +3505,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } Loads.push_back(Ld); + DepChecker.addAccess(Ld); continue; } @@ -3543,6 +3518,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } Stores.push_back(St); + DepChecker.addAccess(St); } } // next instr. } // next block. @@ -3557,10 +3533,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } - // Holds the read and read-write *pointers* that we find. These maps hold - // unique values for pointers (so no need for multi-map). - AliasMap Reads; - AliasMap ReadWrites; + AccessAnalysis::DepCandidates DependentAccesses; + AccessAnalysis Accesses(DL, DependentAccesses); // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -3579,10 +3553,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } - // If we did *not* see this pointer before, insert it to - // the read-write list. At this phase it is only a 'write' list. - if (Seen.insert(Ptr)) - ReadWrites.insert(std::make_pair(Ptr, ST)); + // If we did *not* see this pointer before, insert it to the read-write + // list. At this phase it is only a 'write' list. + if (Seen.insert(Ptr)) { + ++NumReadWrites; + Accesses.addStore(Ptr); + } } if (IsAnnotatedParallel) { @@ -3592,6 +3568,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } + SmallPtrSet ReadOnlyPtr; for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { LoadInst *LD = cast(*I); Value* Ptr = LD->getPointerOperand(); @@ -3603,51 +3580,44 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // If the address of i is unknown (for example A[B[i]]) then we may // read a few words, modify, and write a few words, and some of the // words may be written to the same address. - if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) - Reads.insert(std::make_pair(Ptr, LD)); + bool IsReadOnlyPtr = false; + if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) { + ++NumReads; + IsReadOnlyPtr = true; + } + Accesses.addLoad(Ptr, IsReadOnlyPtr); } // If we write (or read-write) to a single destination and there are no // other reads in this loop then is it safe to vectorize. - if (ReadWrites.size() == 1 && Reads.size() == 0) { + if (NumReadWrites == 1 && NumReads == 0) { DEBUG(dbgs() << "LV: Found a write-only loop!\n"); return true; } - unsigned NumReadPtrs = 0; - unsigned NumWritePtrs = 0; + // Build dependence sets and check whether we need a runtime pointer bounds + // check. + Accesses.buildDependenceSets(); + bool NeedRTCheck = Accesses.isRTCheckNeeded(); // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. - bool CanDoRT = true; - AliasMap::iterator MI, ME; - for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { - Value *V = (*MI).first; - if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V, true); - NumWritePtrs++; - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); - } else { - CanDoRT = false; - break; - } - } - for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { - Value *V = (*MI).first; - if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V, false); - NumReadPtrs++; - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); - } else { - CanDoRT = false; - break; - } - } + unsigned NumComparisons = 0; + bool CanDoRT = false; + if (NeedRTCheck) + CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop); - // Check that we did not collect too many pointers or found a - // unsizeable pointer. - unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1)); - DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n"); + + DEBUG(dbgs() << "LV: We need to do " << NumComparisons << + " pointer comparisons.\n"); + + // If we only have one set of dependences to check pointers among we don't + // need a runtime check. + if (NumComparisons == 0 && NeedRTCheck) + NeedRTCheck = false; + + // Check that we did not collect too many pointers or found a unsizeable + // pointer. if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); CanDoRT = false; @@ -3657,113 +3627,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() { DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); } - bool NeedRTCheck = false; - - // Biggest vectorized access possible, vector width * unroll factor. - // TODO: We're being very pessimistic here, find a way to know the - // real access width before getting here. - unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) * - TTI->getMaximumUnrollFactor(); - // Now that the pointers are in two lists (Reads and ReadWrites), we - // can check that there are no conflicts between each of the writes and - // between the writes to the reads. - // Note that WriteObjects duplicates the stores (indexed now by underlying - // objects) to avoid pointing to elements inside ReadWrites. - // TODO: Maybe create a new type where they can interact without duplication. - AliasMultiMap WriteObjects; - ValueVector TempObjects; - - // Check that the read-writes do not conflict with other read-write - // pointers. - bool AllWritesIdentified = true; - for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { - Value *Val = (*MI).first; - Instruction *Inst = (*MI).second; - - GetUnderlyingObjects(Val, TempObjects, DL); - for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); - UI != UE; ++UI) { - if (!isIdentifiedObject(*UI)) { - DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n"); - NeedRTCheck = true; - AllWritesIdentified = false; - } - - // Never seen it before, can't alias. - if (WriteObjects[*UI].empty()) { - DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n"); - WriteObjects[*UI].push_back(Inst); - continue; - } - // Direct alias found. - if (!AA || dyn_cast(*UI) == NULL) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << **UI <<"\n"); - return false; - } - DEBUG(dbgs() << "LV: Found a conflicting global value:" - << **UI <<"\n"); - DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n"); - DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); - - // If global alias, make sure they do alias. - if (hasPossibleGlobalWriteReorder(*UI, - Inst, - WriteObjects, - MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI - << "\n"); - return false; - } - - // Didn't alias, insert into map for further reference. - WriteObjects[*UI].push_back(Inst); - } - TempObjects.clear(); - } - - /// Check that the reads don't conflict with the read-writes. - for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { - Value *Val = (*MI).first; - GetUnderlyingObjects(Val, TempObjects, DL); - for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); - UI != UE; ++UI) { - // If all of the writes are identified then we don't care if the read - // pointer is identified or not. - if (!AllWritesIdentified && !isIdentifiedObject(*UI)) { - DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n"); - NeedRTCheck = true; - } - - // Never seen it before, can't alias. - if (WriteObjects[*UI].empty()) - continue; - // Direct alias found. - if (!AA || dyn_cast(*UI) == NULL) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << **UI <<"\n"); - return false; - } - DEBUG(dbgs() << "LV: Found a global value: " - << **UI <<"\n"); - Instruction *Inst = (*MI).second; - DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n"); - DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); - - // If global alias, make sure they do alias. - if (hasPossibleGlobalWriteReorder(*UI, - Inst, - WriteObjects, - MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI - << "\n"); - return false; - } - } - TempObjects.clear(); - } - - PtrRtCheck.Need = NeedRTCheck; if (NeedRTCheck && !CanDoRT) { DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"); @@ -3771,9 +3634,20 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } + PtrRtCheck.Need = NeedRTCheck; + + bool CanVecMem = true; + if (Accesses.isDependencyCheckNeeded()) { + DEBUG(dbgs() << "LV: Checking memory dependencies\n"); + CanVecMem = DepChecker.areDepsSafe(DependentAccesses, + Accesses.getDependenciesToCheck()); + MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); + } + DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") << " need a runtime memory check.\n"); - return true; + + return CanVecMem; } static bool hasMultipleUsesOf(Instruction *I, @@ -4126,15 +4000,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { return true; } -bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { - const SCEV *PhiScev = SE->getSCEV(Ptr); - const SCEVAddRecExpr *AR = dyn_cast(PhiScev); - if (!AR) - return false; - - return AR->isAffine(); -} - LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, unsigned UserVF) { @@ -4151,6 +4016,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, unsigned WidestType = getWidestType(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); + unsigned MaxSafeDepDist = -1U; + if (Legal->getMaxSafeDepDistBytes() != -1U) + MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; + WidestRegister = WidestRegister < MaxSafeDepDist ? WidestRegister : MaxSafeDepDist; unsigned MaxVectorSize = WidestRegister / WidestType; DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n"); @@ -4284,6 +4153,10 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, if (OptForSize) return 1; + // We used the distance for the unroll factor. + if (Legal->getMaxSafeDepDistBytes() != -1U) + return 1; + // Do not unroll loops with a relatively small trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); @@ -4680,7 +4553,6 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) diff --git a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll index bab6300f2e7..6ef101074de 100644 --- a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll +++ b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll @@ -30,7 +30,7 @@ if.then: ; preds = %for.body if.end: ; preds = %for.body, %if.then %z.0 = phi i32 [ %add1, %if.then ], [ 9, %for.body ] store i32 %z.0, i32* %arrayidx, align 4 - %indvars.iv.next = add i64 %indvars.iv, 1 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp eq i32 %lftr.wideiv, %x br i1 %exitcond, label %for.end, label %for.body diff --git a/test/Transforms/LoopVectorize/memdep.ll b/test/Transforms/LoopVectorize/memdep.ll new file mode 100644 index 00000000000..56e86a4c0dc --- /dev/null +++ b/test/Transforms/LoopVectorize/memdep.ll @@ -0,0 +1,222 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -S | FileCheck %s -check-prefix=WIDTH + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; Vectorization with dependence checks. + +; No plausible dependence - can be vectorized. +; for (i = 0; i < 1024; ++i) +; A[i] = A[i + 1] + 1; + +; CHECK: f1_vec +; CHECK: <2 x i32> + +define void @f1_vec(i32* %A) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = add i32 %indvars.iv, 1 + %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv.next + %0 = load i32* %arrayidx, align 4 + %add1 = add nsw i32 %0, 1 + %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv + store i32 %add1, i32* %arrayidx3, align 4 + %exitcond = icmp ne i32 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + ret void +} + +; Plausible dependence of distance 1 - can't be vectorized. +; for (i = 0; i < 1024; ++i) +; A[i+1] = A[i] + 1; + +; CHECK: f2_novec +; CHECK-NOT: <2 x i32> + +define void @f2_novec(i32* %A) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, 1 + %indvars.iv.next = add i32 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv.next + store i32 %add, i32* %arrayidx3, align 4 + %exitcond = icmp ne i32 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + ret void +} + +; Plausible dependence of distance 2 - can be vectorized with a width of 2. +; for (i = 0; i < 1024; ++i) +; A[i+2] = A[i] + 1; + +; CHECK: f3_vec_len +; CHECK: <2 x i32> + +; WIDTH: f3_vec_len +; WIDTH-NOT: <4 x i32> + +define void @f3_vec_len(i32* %A) { +entry: + br label %for.body + +for.body: + %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %idxprom = sext i32 %i.01 to i64 + %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, 1 + %add1 = add nsw i32 %i.01, 2 + %idxprom2 = sext i32 %add1 to i64 + %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2 + store i32 %add, i32* %arrayidx3, align 4 + %inc = add nsw i32 %i.01, 1 + %cmp = icmp slt i32 %inc, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +; Plausible dependence of distance 1 - cannot be vectorized (without reordering +; accesses). +; for (i = 0; i < 1024; ++i) { +; B[i] = A[i]; +; A[i] = B[i + 1]; +; } + +; CHECK: f5 +; CHECK-NOT: <2 x i32> + +define void @f5(i32* %A, i32* %B) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds i32* %B, i64 %indvars.iv.next + %1 = load i32* %arrayidx4, align 4 + store i32 %1, i32* %arrayidx, align 4 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + ret void +} + +; Dependence through a phi node - must not vectorize. +; for (i = 0; i < 1024; ++i) { +; a[i+1] = tmp; +; tmp = a[i]; +; } + +; CHECK: f6 +; CHECK-NOT: <2 x i32> + +define i32 @f6(i32* %a, i32 %tmp) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp.addr.08 = phi i32 [ %tmp, %entry ], [ %0, %for.body ] + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv.next + store i32 %tmp.addr.08, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32* %a, i64 %indvars.iv + %0 = load i32* %arrayidx3, align 4 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + ret i32 undef +} + +; Don't vectorize true loop carried dependencies that are not a multiple of the +; vector width. +; Example: +; for (int i = ...; ++i) { +; a[i] = a[i-3] + ...; +; It is a bad idea to vectorize this loop because store-load forwarding will not +; happen. +; + +; CHECK: @nostoreloadforward +; CHECK-NOT: <2 x i32> + +define void @nostoreloadforward(i32* %A) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ] + %0 = add nsw i64 %indvars.iv, -3 + %arrayidx = getelementptr inbounds i32* %A, i64 %0 + %1 = load i32* %arrayidx, align 4 + %2 = add nsw i64 %indvars.iv, 4 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %2 + %3 = load i32* %arrayidx2, align 4 + %add3 = add nsw i32 %3, %1 + %arrayidx5 = getelementptr inbounds i32* %A, i64 %indvars.iv + store i32 %add3, i32* %arrayidx5, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 128 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + ret void +} + +; Example: +; for (int i = ...; ++i) { +; a[i] = b[i]; +; c[i] = a[i-3] + ...; +; It is a bad idea to vectorize this loop because store-load forwarding will not +; happen. +; + +; CHECK: @nostoreloadforward2 +; CHECK-NOT: <2 x i32> + +define void @nostoreloadforward2(i32* noalias %A, i32* noalias %B, i32* noalias %C) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 4 + %1 = add nsw i64 %indvars.iv, -3 + %arrayidx4 = getelementptr inbounds i32* %A, i64 %1 + %2 = load i32* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds i32* %C, i64 %indvars.iv + store i32 %2, i32* %arrayidx6, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 128 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll index 014c4fc48f8..47722566e17 100644 --- a/test/Transforms/LoopVectorize/runtime-check.ll +++ b/test/Transforms/LoopVectorize/runtime-check.ll @@ -12,7 +12,7 @@ target triple = "x86_64-apple-macosx10.9.0" ;CHECK: for.body.preheader: ;CHECK: br i1 %cmp.zero, label %middle.block, label %vector.memcheck ;CHECK: vector.memcheck: -;CHECK: br i1 %found.conflict, label %middle.block, label %vector.ph +;CHECK: br i1 %memcheck.conflict, label %middle.block, label %vector.ph ;CHECK: load <4 x float> define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp { entry: