Reapply 184685 after the SetVector iteration order fix.

This should hopefully have fixed the stage2/stage3 miscompare on the dragonegg testers. "LoopVectorize: Use the dependence test utility class We now no longer need alias analysis - the cases that alias analysis would handle are now handled as accesses with a large dependence distance. We can now vectorize loops with simple constant dependence distances. for (i = 8; i < 256; ++i) { a[i] = a[i+4] * a[i+8]; } for (i = 8; i < 256; ++i) { a[i] = a[i-4] * a[i-8]; } We would be able to vectorize about 200 more loops (in many cases the cost model instructs us no to) in the test suite now. Results on x86-64 are a wash. I have seen one degradation in ammp. Interestingly, the function in which we now vectorize a loop is never executed so we probably see some instruction cache effects. There is a 2% improvement in h264ref. There is one or the other TSCV loop kernel that speeds up. radar://13681598" git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@184724 91177308-0d34-0410-b5e6-96231b3b80d8
2025-07-31 09:25:42 +00:00 · 2013-06-24 12:09:15 +00:00
parent 7e96b4dfce
commit bc7c58d2b1
4 changed files with 328 additions and 234 deletions
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -55,7 +55,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
@@ -410,11 +409,10 @@ bool LoadHoisting::canHoistAllLoads() {
 class LoopVectorizationLegality {
 public:
  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
-                            DominatorTree *DT, TargetTransformInfo* TTI,
-                            AliasAnalysis *AA, TargetLibraryInfo *TLI)
-      : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
+                            DominatorTree *DT, TargetLibraryInfo *TLI)
+      : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
        Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
-        LoadSpeculation(L, DT) {}
+        MaxSafeDepDistBytes(-1U), LoadSpeculation(L, DT) {}

  /// This enum represents the kinds of reductions that we support.
  enum ReductionKind {
@@ -501,7 +499,8 @@ public:
    }

    /// Insert a pointer and calculate the start and end SCEVs.
-    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
+                unsigned DepSetId);

    /// This flag indicates if we need to add the runtime check.
    bool Need;
@@ -513,6 +512,9 @@ public:
    SmallVector<const SCEV*, 2> Ends;
    /// Holds the information if this pointer is used for writing to memory.
    SmallVector<bool, 2> IsWritePtr;
+    /// Holds the id of the set of pointers that could be dependent because of a
+    /// shared underlying object.
+    SmallVector<unsigned, 2> DependencySetId;
  };

  /// A POD for saving information about induction variables.
@@ -533,11 +535,6 @@ public:
  /// induction descriptor.
  typedef MapVector<PHINode*, InductionInfo> InductionList;

-  /// Alias(Multi)Map stores the values (GEPs or underlying objects and their
-  /// respective Store/Load instruction(s) to calculate aliasing.
-  typedef MapVector<Value*, Instruction* > AliasMap;
-  typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
-
  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
@@ -584,6 +581,9 @@ public:
  /// This function returns the identity element (or neutral element) for
  /// the operation K.
  static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
+
+  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+
 private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
@@ -624,16 +624,6 @@ private:
  /// Returns the induction kind of Phi. This function may return NoInduction
  /// if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi);
-  /// Return true if can compute the address bounds of Ptr within the loop.
-  bool hasComputableBounds(Value *Ptr);
-  /// Return true if there is the chance of write reorder.
-  bool hasPossibleGlobalWriteReorder(Value *Object,
-                                     Instruction *Inst,
-                                     AliasMultiMap &WriteObjects,
-                                     unsigned MaxByteWidth);
-  /// Return the AA location for a load or a store.
-  AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
-

  /// The loop that we evaluate.
  Loop *TheLoop;
@@ -643,10 +633,6 @@ private:
  DataLayout *DL;
  /// Dominators.
  DominatorTree *DT;
-  /// Target Info.
-  TargetTransformInfo *TTI;
-  /// Alias Analysis.
-  AliasAnalysis *AA;
  /// Target Library Info.
  TargetLibraryInfo *TLI;

@@ -676,6 +662,8 @@ private:
  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

+  unsigned MaxSafeDepDistBytes;
+
  /// Utility to determine whether loads can be speculated.
  LoadHoisting LoadSpeculation;
 };
@@ -904,7 +892,6 @@ struct LoopVectorize : public LoopPass {
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;
-  AliasAnalysis *AA;
  TargetLibraryInfo *TLI;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
@@ -917,7 +904,6 @@ struct LoopVectorize : public LoopPass {
    LI = &getAnalysis<LoopInfo>();
    TTI = &getAnalysis<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();
-    AA = getAnalysisIfAvailable<AliasAnalysis>();
    TLI = getAnalysisIfAvailable<TargetLibraryInfo>();

    if (DL == NULL) {
@@ -936,7 +922,7 @@ struct LoopVectorize : public LoopPass {
    }

    // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
@@ -1011,7 +997,8 @@ struct LoopVectorize : public LoopPass {
 void
 LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
                                                       Loop *Lp, Value *Ptr,
-                                                       bool WritePtr) {
+                                                       bool WritePtr,
+                                                       unsigned DepSetId) {
  const SCEV *Sc = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
  assert(AR && "Invalid addrec expression");
@@ -1021,6 +1008,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
  Starts.push_back(AR->getStart());
  Ends.push_back(ScEnd);
  IsWritePtr.push_back(WritePtr);
+  DependencySetId.push_back(DepSetId);
 }

 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -1358,10 +1346,9 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
  if (!PtrRtCheck->Need)
    return NULL;

-  Instruction *MemoryRuntimeCheck = 0;
  unsigned NumPointers = PtrRtCheck->Pointers.size();
-  SmallVector<Value* , 2> Starts;
-  SmallVector<Value* , 2> Ends;
+  SmallVector<TrackingVH<Value> , 2> Starts;
+  SmallVector<TrackingVH<Value> , 2> Ends;

  SCEVExpander Exp(*SE, "induction");

@@ -1388,13 +1375,18 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
  }

  IRBuilder<> ChkBuilder(Loc);
-
+  // Our instructions might fold to a constant.
+  Value *MemoryRuntimeCheck = 0;
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i+1; j < NumPointers; ++j) {
      // No need to check if two readonly pointers intersect.
      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
        continue;

+      // Only need to check pointers between two different dependency sets.
+      if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
+       continue;
+
      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy, "bc");
@@ -1406,12 +1398,18 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
      if (MemoryRuntimeCheck)
        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
                                         "conflict.rdx");
-
-      MemoryRuntimeCheck = cast<Instruction>(IsConflict);
+      MemoryRuntimeCheck = IsConflict;
    }
  }

-  return MemoryRuntimeCheck;
+  // We have to do this trickery because the IRBuilder might fold the check to a
+  // constant expression in which case there is no Instruction anchored in a
+  // the block.
+  LLVMContext &Ctx = Loc->getContext();
+  Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
+                                                  ConstantInt::getTrue(Ctx));
+  ChkBuilder.Insert(Check, "memcheck.conflict");
+  return Check;
 }

 void
@@ -2982,7 +2980,7 @@ bool AccessAnalysis::canCheckPtrAtRT(
        // Each access has its own dependence set.
        DepId = RunningDepId++;

-      //RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
+      RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);

      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n");
    } else {
@@ -3464,53 +3462,29 @@ MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
  return true;
 }

-AliasAnalysis::Location
-LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
-  if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
-    return AA->getLocation(Store);
-  else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
-    return AA->getLocation(Load);
-
-  llvm_unreachable("Should be either load or store instruction");
-}
-
-bool
-LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
-                                                Value *Object,
-                                                Instruction *Inst,
-                                                AliasMultiMap& WriteObjects,
-                                                unsigned MaxByteWidth) {
-
-  AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
-
-  std::vector<Instruction*>::iterator
-              it = WriteObjects[Object].begin(),
-              end = WriteObjects[Object].end();
-
-  for (; it != end; ++it) {
-    Instruction* I = *it;
-    if (I == Inst)
-      continue;
-
-    AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
-    if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
-                  ThatLoc.getWithNewSize(MaxByteWidth)))
-      return true;
-  }
-  return false;
-}
-
 bool LoopVectorizationLegality::canVectorizeMemory() {

  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;
+
+  // Stores a pair of memory access location and whether the access is a store
+  // (true) or a load (false).
+  typedef std::pair<Value*, char> MemAccessInfo;
+  typedef DenseSet<MemAccessInfo> PtrAccessSet;
+
  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;
+
+  // Holds all the different accesses in the loop.
+  unsigned NumReads = 0;
+  unsigned NumReadWrites = 0;
+
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+  MemoryDepChecker DepChecker(SE, DL, TheLoop);

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
@@ -3531,6 +3505,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
          return false;
        }
        Loads.push_back(Ld);
+        DepChecker.addAccess(Ld);
        continue;
      }

@@ -3543,6 +3518,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
          return false;
        }
        Stores.push_back(St);
+        DepChecker.addAccess(St);
      }
    } // next instr.
  } // next block.
@@ -3557,10 +3533,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
    return true;
  }

-  // Holds the read and read-write *pointers* that we find. These maps hold
-  // unique values for pointers (so no need for multi-map).
-  AliasMap Reads;
-  AliasMap ReadWrites;
+  AccessAnalysis::DepCandidates DependentAccesses;
+  AccessAnalysis Accesses(DL, DependentAccesses);

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
@@ -3579,10 +3553,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
      return false;
    }

-    // If we did *not* see this pointer before, insert it to
-    // the read-write list. At this phase it is only a 'write' list.
-    if (Seen.insert(Ptr))
-      ReadWrites.insert(std::make_pair(Ptr, ST));
+    // If we did *not* see this pointer before, insert it to  the read-write
+    // list. At this phase it is only a 'write' list.
+    if (Seen.insert(Ptr)) {
+      ++NumReadWrites;
+      Accesses.addStore(Ptr);
+    }
  }

  if (IsAnnotatedParallel) {
@@ -3592,6 +3568,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
    return true;
  }

+  SmallPtrSet<Value *, 16> ReadOnlyPtr;
  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = cast<LoadInst>(*I);
    Value* Ptr = LD->getPointerOperand();
@@ -3603,51 +3580,44 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
-    if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
-      Reads.insert(std::make_pair(Ptr, LD));
+    bool IsReadOnlyPtr = false;
+    if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) {
+      ++NumReads;
+      IsReadOnlyPtr = true;
+    }
+    Accesses.addLoad(Ptr, IsReadOnlyPtr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then is it safe to vectorize.
-  if (ReadWrites.size() == 1 && Reads.size() == 0) {
+  if (NumReadWrites == 1 && NumReads == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

-  unsigned NumReadPtrs = 0;
-  unsigned NumWritePtrs = 0;
+  // Build dependence sets and check whether we need a runtime pointer bounds
+  // check.
+  Accesses.buildDependenceSets();
+  bool NeedRTCheck = Accesses.isRTCheckNeeded();

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
-  bool CanDoRT = true;
-  AliasMap::iterator MI, ME;
-  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
-    Value *V = (*MI).first;
-    if (hasComputableBounds(V)) {
-      PtrRtCheck.insert(SE, TheLoop, V, true);
-      NumWritePtrs++;
-      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
-    } else {
-      CanDoRT = false;
-      break;
-    }
-  }
-  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
-    Value *V = (*MI).first;
-    if (hasComputableBounds(V)) {
-      PtrRtCheck.insert(SE, TheLoop, V, false);
-      NumReadPtrs++;
-      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
-    } else {
-      CanDoRT = false;
-      break;
-    }
-  }
+  unsigned NumComparisons = 0;
+  bool CanDoRT = false;
+  if (NeedRTCheck)
+    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop);

-  // Check that we did not collect too many pointers or found a
-  // unsizeable pointer.
-  unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
-  DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
+
+  DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
+        " pointer comparisons.\n");
+
+  // If we only have one set of dependences to check pointers among we don't
+  // need a runtime check.
+  if (NumComparisons == 0 && NeedRTCheck)
+    NeedRTCheck = false;
+
+  // Check that we did not collect too many pointers or found a unsizeable
+  // pointer.
  if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.reset();
    CanDoRT = false;
@@ -3657,113 +3627,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }

-  bool NeedRTCheck = false;
-
-  // Biggest vectorized access possible, vector width * unroll factor.
-  // TODO: We're being very pessimistic here, find a way to know the
-  // real access width before getting here.
-  unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
-                           TTI->getMaximumUnrollFactor();
-  // Now that the pointers are in two lists (Reads and ReadWrites), we
-  // can check that there are no conflicts between each of the writes and
-  // between the writes to the reads.
-  // Note that WriteObjects duplicates the stores (indexed now by underlying
-  // objects) to avoid pointing to elements inside ReadWrites.
-  // TODO: Maybe create a new type where they can interact without duplication.
-  AliasMultiMap WriteObjects;
-  ValueVector TempObjects;
-
-  // Check that the read-writes do not conflict with other read-write
-  // pointers.
-  bool AllWritesIdentified = true;
-  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
-    Value *Val = (*MI).first;
-    Instruction *Inst = (*MI).second;
-
-    GetUnderlyingObjects(Val, TempObjects, DL);
-    for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
-         UI != UE; ++UI) {
-      if (!isIdentifiedObject(*UI)) {
-        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
-        NeedRTCheck = true;
-        AllWritesIdentified = false;
-      }
-
-      // Never seen it before, can't alias.
-      if (WriteObjects[*UI].empty()) {
-        DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
-        WriteObjects[*UI].push_back(Inst);
-        continue;
-      }
-      // Direct alias found.
-      if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
-        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
-              << **UI <<"\n");
-        return false;
-      }
-      DEBUG(dbgs() << "LV: Found a conflicting global value:"
-            << **UI <<"\n");
-      DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
-      DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
-
-      // If global alias, make sure they do alias.
-      if (hasPossibleGlobalWriteReorder(*UI,
-                                        Inst,
-                                        WriteObjects,
-                                        MaxByteWidth)) {
-        DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI
-                     << "\n");
-        return false;
-      }
-
-      // Didn't alias, insert into map for further reference.
-      WriteObjects[*UI].push_back(Inst);
-    }
-    TempObjects.clear();
-  }
-
-  /// Check that the reads don't conflict with the read-writes.
-  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
-    Value *Val = (*MI).first;
-    GetUnderlyingObjects(Val, TempObjects, DL);
-    for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
-         UI != UE; ++UI) {
-      // If all of the writes are identified then we don't care if the read
-      // pointer is identified or not.
-      if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
-        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
-        NeedRTCheck = true;
-      }
-
-      // Never seen it before, can't alias.
-      if (WriteObjects[*UI].empty())
-        continue;
-      // Direct alias found.
-      if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
-        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
-              << **UI <<"\n");
-        return false;
-      }
-      DEBUG(dbgs() << "LV: Found a global value:  "
-            << **UI <<"\n");
-      Instruction *Inst = (*MI).second;
-      DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
-      DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
-
-      // If global alias, make sure they do alias.
-      if (hasPossibleGlobalWriteReorder(*UI,
-                                        Inst,
-                                        WriteObjects,
-                                        MaxByteWidth)) {
-        DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI
-                     << "\n");
-        return false;
-      }
-    }
-    TempObjects.clear();
-  }
-
-  PtrRtCheck.Need = NeedRTCheck;
  if (NeedRTCheck && !CanDoRT) {
    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
          "the array bounds.\n");
@@ -3771,9 +3634,20 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
    return false;
  }

+  PtrRtCheck.Need = NeedRTCheck;
+
+  bool CanVecMem = true;
+  if (Accesses.isDependencyCheckNeeded()) {
+    DEBUG(dbgs() << "LV: Checking memory dependencies\n");
+    CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
+                                       Accesses.getDependenciesToCheck());
+    MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
+  }
+
  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
        " need a runtime memory check.\n");
-  return true;
+
+  return CanVecMem;
 }

 static bool hasMultipleUsesOf(Instruction *I,
@@ -4126,15 +4000,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
  return true;
 }

-bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
-  const SCEV *PhiScev = SE->getSCEV(Ptr);
-  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
-  if (!AR)
-    return false;
-
-  return AR->isAffine();
-}
-
 LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
                                                      unsigned UserVF) {
@@ -4151,6 +4016,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,

  unsigned WidestType = getWidestType();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+  unsigned MaxSafeDepDist = -1U;
+  if (Legal->getMaxSafeDepDistBytes() != -1U)
+    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
+  WidestRegister = WidestRegister < MaxSafeDepDist ?  WidestRegister : MaxSafeDepDist;
  unsigned MaxVectorSize = WidestRegister / WidestType;
  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
  DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
@@ -4284,6 +4153,10 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
  if (OptForSize)
    return 1;

+  // We used the distance for the unroll factor.
+  if (Legal->getMaxSafeDepDistBytes() != -1U)
+    return 1;
+
  // Do not unroll loops with a relatively small trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop,
                                              TheLoop->getLoopLatch());
@@ -4680,7 +4553,6 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
 char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
--- a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
+++ b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
@@ -30,7 +30,7 @@ if.then:                                          ; preds = %for.body
 if.end:                                           ; preds = %for.body, %if.then
  %z.0 = phi i32 [ %add1, %if.then ], [ 9, %for.body ]
  store i32 %z.0, i32* %arrayidx, align 4
-  %indvars.iv.next = add i64 %indvars.iv, 1
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %x
  br i1 %exitcond, label %for.end, label %for.body
--- a/test/Transforms/LoopVectorize/memdep.ll
+++ b/test/Transforms/LoopVectorize/memdep.ll
@@ -0,0 +1,222 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -S | FileCheck %s -check-prefix=WIDTH
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Vectorization with dependence checks.
+
+; No plausible dependence - can be vectorized.
+;  for (i = 0; i < 1024; ++i)
+;    A[i] = A[i + 1] + 1;
+
+; CHECK: f1_vec
+; CHECK: <2 x i32>
+
+define void @f1_vec(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv.next
+  %0 = load i32* %arrayidx, align 4
+  %add1 = add nsw i32 %0, 1
+  %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv
+  store i32 %add1, i32* %arrayidx3, align 4
+  %exitcond = icmp ne i32 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Plausible dependence of distance 1 - can't be vectorized.
+;  for (i = 0; i < 1024; ++i)
+;    A[i+1] = A[i] + 1;
+
+; CHECK: f2_novec
+; CHECK-NOT: <2 x i32>
+
+define void @f2_novec(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 1
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv.next
+  store i32 %add, i32* %arrayidx3, align 4
+  %exitcond = icmp ne i32 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Plausible dependence of distance 2 - can be vectorized with a width of 2.
+;  for (i = 0; i < 1024; ++i)
+;    A[i+2] = A[i] + 1;
+
+; CHECK: f3_vec_len
+; CHECK: <2 x i32>
+
+; WIDTH: f3_vec_len
+; WIDTH-NOT: <4 x i32>
+
+define void @f3_vec_len(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %idxprom = sext i32 %i.01 to i64
+  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 1
+  %add1 = add nsw i32 %i.01, 2
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2
+  store i32 %add, i32* %arrayidx3, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Plausible dependence of distance 1 - cannot be vectorized (without reordering
+; accesses).
+;   for (i = 0; i < 1024; ++i) {
+;     B[i] = A[i];
+;     A[i] = B[i + 1];
+;   }
+
+; CHECK: f5
+; CHECK-NOT: <2 x i32>
+
+define void @f5(i32*  %A, i32* %B) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %arrayidx4 = getelementptr inbounds i32* %B, i64 %indvars.iv.next
+  %1 = load i32* %arrayidx4, align 4
+  store i32 %1, i32* %arrayidx, align 4
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Dependence through a phi node - must not vectorize.
+;   for (i = 0; i < 1024; ++i) {
+;     a[i+1] = tmp;
+;     tmp = a[i];
+;   }
+
+; CHECK: f6
+; CHECK-NOT: <2 x i32>
+
+define i32 @f6(i32* %a, i32 %tmp) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp.addr.08 = phi i32 [ %tmp, %entry ], [ %0, %for.body ]
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv.next
+  store i32 %tmp.addr.08, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx3, align 4
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret i32 undef
+}
+
+; Don't vectorize true loop carried dependencies that are not a multiple of the
+; vector width.
+; Example:
+;   for (int i = ...; ++i) {
+;     a[i] = a[i-3] + ...;
+; It is a bad idea to vectorize this loop because store-load forwarding will not
+; happen.
+;
+
+; CHECK: @nostoreloadforward
+; CHECK-NOT: <2 x i32>
+
+define void @nostoreloadforward(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = add nsw i64 %indvars.iv, -3
+  %arrayidx = getelementptr inbounds i32* %A, i64 %0
+  %1 = load i32* %arrayidx, align 4
+  %2 = add nsw i64 %indvars.iv, 4
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %2
+  %3 = load i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %3, %1
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  store i32 %add3, i32* %arrayidx5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Example:
+;   for (int i = ...; ++i) {
+;     a[i] = b[i];
+;     c[i] = a[i-3] + ...;
+; It is a bad idea to vectorize this loop because store-load forwarding will not
+; happen.
+;
+
+; CHECK: @nostoreloadforward2
+; CHECK-NOT: <2 x i32>
+
+define void @nostoreloadforward2(i32* noalias %A, i32* noalias %B, i32* noalias %C) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  %1 = add nsw i64 %indvars.iv, -3
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 %1
+  %2 = load i32* %arrayidx4, align 4
+  %arrayidx6 = getelementptr inbounds i32* %C, i64 %indvars.iv
+  store i32 %2, i32* %arrayidx6, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
--- a/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@@ -12,7 +12,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ;CHECK: for.body.preheader:
 ;CHECK: br i1 %cmp.zero, label %middle.block, label %vector.memcheck
 ;CHECK: vector.memcheck:
-;CHECK: br i1 %found.conflict, label %middle.block, label %vector.ph
+;CHECK: br i1 %memcheck.conflict, label %middle.block, label %vector.ph
 ;CHECK: load <4 x float>
 define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
 entry: