[LoopVectorize] Use AA to partition potential dependency checks

Prior to this change, the loop vectorizer did not make use of the alias
analysis infrastructure. Instead, it performed memory dependence analysis using
ScalarEvolution-based linear dependence checks within equivalence classes
derived from the results of ValueTracking's GetUnderlyingObjects.

Unfortunately, this meant that:
  1. The loop vectorizer had logic that essentially duplicated that in BasicAA
     for aliasing based on identified objects.
  2. The loop vectorizer could not partition the space of dependency checks
     based on information easily available only from within AA (TBAA metadata
     is currently the prime example).

This meant that, for example, regardless of whether -fno-strict-aliasing was
provided, the vectorizer would vectorize this loop only with a runtime
memory-overlap check:

void foo(int *a, float *b) {
  for (int i = 0; i < 1600; ++i)
    a[i] = b[i];
}

This is suboptimal because the TBAA metadata already provides the information
necessary to show that this check is unnecessary. Of course, the vectorizer
has a limit on the number of such checks it will insert, so in practice,
ignoring TBAA meant not vectorizing more-complicated loops that we otherwise
could.

This change causes the vectorizer to use an AliasSetTracker to keep track of
the pointers in the loop. The resulting alias sets are then used to partition
the space of dependency checks and potential runtime checks; this results in
more-efficient vectorization.
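
As a rough sketch of the scheme (hand-written against the C++ API of this era,
not code from the patch; the LoopInsts range is a stand-in for the legality
analysis's walk over the loop body):

AliasSetTracker AST(*AA);
for (Instruction &I : LoopInsts) {
  // Every pointer accessed in the loop goes into the tracker; the AA
  // implementation folds pointers that may alias (by underlying object,
  // TBAA tag, etc.) into the same AliasSet.
  if (LoadInst *LD = dyn_cast<LoadInst>(&I))
    AST.add(LD->getPointerOperand(), AliasAnalysis::UnknownSize,
            LD->getMetadata(LLVMContext::MD_tbaa));
  else if (StoreInst *ST = dyn_cast<StoreInst>(&I))
    AST.add(ST->getPointerOperand(), AliasAnalysis::UnknownSize,
            ST->getMetadata(LLVMContext::MD_tbaa));
}

// Pointers in different alias sets are known not to alias, so dependence
// analysis and runtime overlap checks are only needed within each set.
for (AliasSet &AS : AST) {
  // ... run the per-set dependence analysis and emit per-set runtime checks.
}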

When pointer locations are added to the AliasSetTracker, two things are done
(both steps are sketched below):
  1. The location size is set to UnknownSize (otherwise you'd not catch
     inter-iteration dependencies).
  2. For instructions in blocks that would need to be predicated, TBAA is
     removed (because the metadata might have a control dependency on the
     condition being speculated).
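
In code, the per-access insertion looks roughly like this (a sketch following
the canVectorizeMemory() changes below, reusing AST and AA from the sketch
above; ST is the store being registered, and blockNeedsPredication() is the
existing legality-analysis helper):

AliasAnalysis::Location Loc = AA->getLocation(ST);
// (1) Use an unknown size so that accesses to the same object on different
// iterations still fall into the same alias set.
Loc.Size = AliasAnalysis::UnknownSize;
// (2) The TBAA tag may be control-dependent on the branch condition being
// speculated, so drop it for accesses in blocks that need predication.
if (blockNeedsPredication(ST->getParent()))
  Loc.TBAATag = nullptr;
AST.add(const_cast<Value*>(Loc.Ptr), Loc.Size, Loc.TBAATag);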

For non-predicated blocks, you can leave the TBAA metadata. This is safe
because you can't have an iteration dependency on the TBAA metadata (if you
did, and you unrolled sufficiently, you'd end up with the same pointer value
used by two accesses that TBAA says should not alias, and that would yield
undefined behavior).
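
To make the unrolling argument concrete (a contrived illustration, not code
from the patch): suppose an iteration dependency existed between a store
through an int-tagged pointer and a load through a float-tagged pointer.
Unrolling by two places the conflicting accesses side by side:

a[i] = 1;       // store, TBAA tag: int
x = b[i + 1];   // load, TBAA tag: float, but &b[i+1] == &a[i]

TBAA asserts that these two accesses cannot alias, yet the dependency requires
that they do; such a program already has undefined behavior, so ignoring the
would-be dependency is safe.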

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213486 91177308-0d34-0410-b5e6-96231b3b80d8
commit 160f9b9c10 (parent 5ee5fc4c47)
Author: Hal Finkel
Date:   2014-07-20 23:07:52 +00:00

10 changed files with 307 additions and 158 deletions


@@ -54,6 +54,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
@@ -409,6 +410,8 @@ protected:
   LoopInfo *LI;
   /// Dominator Tree.
   DominatorTree *DT;
+  /// Alias Analysis.
+  AliasAnalysis *AA;
   /// Data Layout.
   const DataLayout *DL;
   /// Target Library Info.
@@ -567,9 +570,9 @@ public:
   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                             DominatorTree *DT, TargetLibraryInfo *TLI,
-                            Function *F)
+                            AliasAnalysis *AA, Function *F)
       : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
-        DT(DT), TLI(TLI), TheFunction(F), Induction(nullptr),
+        DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr),
         WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
   }
@@ -657,11 +660,12 @@ public:
       Ends.clear();
       IsWritePtr.clear();
       DependencySetId.clear();
+      AliasSetId.clear();
     }
 
     /// Insert a pointer and calculate the start and end SCEVs.
     void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
-                unsigned DepSetId, ValueToValueMap &Strides);
+                unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides);
 
     /// This flag indicates if we need to add the runtime check.
     bool Need;
@@ -676,6 +680,8 @@ public:
     /// Holds the id of the set of pointers that could be dependent because of a
     /// shared underlying object.
     SmallVector<unsigned, 2> DependencySetId;
+    /// Holds the id of the disjoint alias set to which this pointer belongs.
+    SmallVector<unsigned, 2> AliasSetId;
   };
 
   /// A struct for saving information about induction variables.
@@ -820,6 +826,8 @@ private:
   DominatorTree *DT;
   /// Target Library Info.
   TargetLibraryInfo *TLI;
+  /// Alias analysis.
+  AliasAnalysis *AA;
 
   /// Parent function
   Function *TheFunction;
@@ -1158,6 +1166,7 @@ struct LoopVectorize : public FunctionPass {
   DominatorTree *DT;
   BlockFrequencyInfo *BFI;
   TargetLibraryInfo *TLI;
+  AliasAnalysis *AA;
   bool DisableUnrolling;
   bool AlwaysVectorize;
@@ -1172,6 +1181,7 @@ struct LoopVectorize : public FunctionPass {
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     BFI = &getAnalysis<BlockFrequencyInfo>();
     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+    AA = &getAnalysis<AliasAnalysis>();
 
     // Compute some weights outside of the loop over the loops. Compute this
     // using a BranchProbability to re-use its scaling math.
@@ -1283,7 +1293,7 @@ struct LoopVectorize : public FunctionPass {
     }
 
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, F);
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
       emitMissedWarning(F, L, Hints);
@@ -1387,8 +1397,10 @@ struct LoopVectorize : public FunctionPass {
     AU.addRequired<LoopInfo>();
     AU.addRequired<ScalarEvolution>();
     AU.addRequired<TargetTransformInfo>();
+    AU.addRequired<AliasAnalysis>();
     AU.addPreserved<LoopInfo>();
     AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<AliasAnalysis>();
   }
 };
@@ -1444,7 +1456,7 @@ static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
 void LoopVectorizationLegality::RuntimePointerCheck::insert(
     ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
-    ValueToValueMap &Strides) {
+    unsigned ASId, ValueToValueMap &Strides) {
   // Get the stride replaced scev.
   const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
@@ -1456,6 +1468,7 @@ void LoopVectorizationLegality::RuntimePointerCheck::insert(
   Ends.push_back(ScEnd);
   IsWritePtr.push_back(WritePtr);
   DependencySetId.push_back(DepSetId);
+  AliasSetId.push_back(ASId);
 }
 
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -2001,6 +2014,9 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
       // Only need to check pointers between two different dependency sets.
       if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
         continue;
+      // Only need to check pointers in the same alias set.
+      if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j])
+        continue;
 
       unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
       unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
@@ -3912,19 +3928,22 @@ public:
   /// \brief Set of potential dependent memory accesses.
   typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
 
-  AccessAnalysis(const DataLayout *Dl, DepCandidates &DA) :
-    DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
-    AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
+  AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
+    DL(Dl), AA(AA), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
 
   /// \brief Register a load and whether it is only read from.
-  void addLoad(Value *Ptr, bool IsReadOnly) {
+  void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
+    Value *Ptr = const_cast<Value*>(Loc.Ptr);
+    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.TBAATag);
     Accesses.insert(MemAccessInfo(Ptr, false));
     if (IsReadOnly)
       ReadOnlyPtr.insert(Ptr);
   }
 
   /// \brief Register a store.
-  void addStore(Value *Ptr) {
+  void addStore(AliasAnalysis::Location &Loc) {
+    Value *Ptr = const_cast<Value*>(Loc.Ptr);
+    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.TBAATag);
     Accesses.insert(MemAccessInfo(Ptr, true));
   }
@@ -3938,10 +3957,7 @@ public:
   /// \brief Goes over all memory accesses, checks whether a RT check is needed
   /// and builds sets of dependent accesses.
   void buildDependenceSets() {
-    // Process read-write pointers first.
-    processMemAccesses(false);
-    // Next, process read pointers.
-    processMemAccesses(true);
+    processMemAccesses();
   }
 
   bool isRTCheckNeeded() { return IsRTCheckNeeded; }
@@ -3953,40 +3969,32 @@ public:
 private:
   typedef SetVector<MemAccessInfo> PtrAccessSet;
-  typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
 
-  /// \brief Go over all memory access or only the deferred ones if
-  /// \p UseDeferred is true and check whether runtime pointer checks are needed
-  /// and build sets of dependency check candidates.
-  void processMemAccesses(bool UseDeferred);
+  /// \brief Go over all memory access and check whether runtime pointer checks
+  /// are needed and build sets of dependency check candidates.
+  void processMemAccesses();
 
   /// Set of all accesses.
   PtrAccessSet Accesses;
 
-  /// Set of access to check after all writes have been processed.
-  PtrAccessSet DeferredAccesses;
-
-  /// Map of pointers to last access encountered.
-  UnderlyingObjToAccessMap ObjToLastAccess;
-
   /// Set of accesses that need a further dependence check.
   MemAccessInfoSet CheckDeps;
 
   /// Set of pointers that are read only.
   SmallPtrSet<Value*, 16> ReadOnlyPtr;
 
-  /// Set of underlying objects already written to.
-  SmallPtrSet<Value*, 16> WriteObjects;
-
   const DataLayout *DL;
+  AliasAnalysis *AA;
+
+  /// An alias set tracker to partition the access set by underlying object and
+  /// intrinsic property (such as TBAA metadata).
+  AliasSetTracker AST;
 
   /// Sets of potentially dependent accesses - members of one set share an
   /// underlying pointer. The set "CheckDeps" identfies which sets really need a
   /// dependence check.
   DepCandidates &DepCands;
 
-  bool AreAllWritesIdentified;
-  bool AreAllReadsIdentified;
   bool IsRTCheckNeeded;
 };
@@ -4014,62 +4022,67 @@ bool AccessAnalysis::canCheckPtrAtRT(
     ValueToValueMap &StridesMap, bool ShouldCheckStride) {
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
-  unsigned NumReadPtrChecks = 0;
-  unsigned NumWritePtrChecks = 0;
   bool CanDoRT = true;
 
   bool IsDepCheckNeeded = isDependencyCheckNeeded();
+  NumComparisons = 0;
 
-  // We assign consecutive id to access from different dependence sets.
-  // Accesses within the same set don't need a runtime check.
-  unsigned RunningDepId = 1;
-  DenseMap<Value *, unsigned> DepSetId;
-
-  for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end();
-       AI != AE; ++AI) {
-    const MemAccessInfo &Access = *AI;
-    Value *Ptr = Access.getPointer();
-    bool IsWrite = Access.getInt();
-
-    // Just add write checks if we have both.
-    if (!IsWrite && Accesses.count(MemAccessInfo(Ptr, true)))
-      continue;
-
-    if (IsWrite)
-      ++NumWritePtrChecks;
-    else
-      ++NumReadPtrChecks;
-
-    if (hasComputableBounds(SE, StridesMap, Ptr) &&
-        // When we run after a failing dependency check we have to make sure we
-        // don't have wrapping pointers.
-        (!ShouldCheckStride ||
-         isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
-      // The id of the dependence set.
-      unsigned DepId;
-
-      if (IsDepCheckNeeded) {
-        Value *Leader = DepCands.getLeaderValue(Access).getPointer();
-        unsigned &LeaderId = DepSetId[Leader];
-        if (!LeaderId)
-          LeaderId = RunningDepId++;
-        DepId = LeaderId;
-      } else
-        // Each access has its own dependence set.
-        DepId = RunningDepId++;
-
-      RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, StridesMap);
-
-      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
-    } else {
-      CanDoRT = false;
-    }
-  }
-
-  if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
-    NumComparisons = 0; // Only one dependence set.
-  else {
-    NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
-                                           NumWritePtrChecks - 1));
+  // We assign a consecutive id to access from different alias sets.
+  // Accesses between different groups doesn't need to be checked.
+  unsigned ASId = 1;
+  for (auto &AS : AST) {
+    unsigned NumReadPtrChecks = 0;
+    unsigned NumWritePtrChecks = 0;
+
+    // We assign consecutive id to access from different dependence sets.
+    // Accesses within the same set don't need a runtime check.
+    unsigned RunningDepId = 1;
+    DenseMap<Value *, unsigned> DepSetId;
+
+    for (auto A : AS) {
+      Value *Ptr = A.getValue();
+      bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
+      MemAccessInfo Access(Ptr, IsWrite);
+
+      if (IsWrite)
+        ++NumWritePtrChecks;
+      else
+        ++NumReadPtrChecks;
+
+      if (hasComputableBounds(SE, StridesMap, Ptr) &&
+          // When we run after a failing dependency check we have to make sure
+          // we don't have wrapping pointers.
+          (!ShouldCheckStride ||
+           isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
+        // The id of the dependence set.
+        unsigned DepId;
+
+        if (IsDepCheckNeeded) {
+          Value *Leader = DepCands.getLeaderValue(Access).getPointer();
+          unsigned &LeaderId = DepSetId[Leader];
+          if (!LeaderId)
+            LeaderId = RunningDepId++;
+          DepId = LeaderId;
+        } else
+          // Each access has its own dependence set.
+          DepId = RunningDepId++;
+
+        RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
+
+        DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
+      } else {
+        CanDoRT = false;
+      }
+    }
+
+    if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
+      NumComparisons += 0; // Only one dependence set.
+    else {
+      NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
+                                              NumWritePtrChecks - 1));
+    }
+
+    ++ASId;
   }
 
   // If the pointers that we would use for the bounds comparison have different
@@ -4083,6 +4096,9 @@ bool AccessAnalysis::canCheckPtrAtRT(
       // Only need to check pointers between two different dependency sets.
       if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
         continue;
+      // Only need to check pointers in the same alias set.
+      if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
+        continue;
 
       Value *PtrI = RtCheck.Pointers[i];
       Value *PtrJ = RtCheck.Pointers[j];
@@ -4100,90 +4116,99 @@ bool AccessAnalysis::canCheckPtrAtRT(
   return CanDoRT;
 }
 
-static bool isFunctionScopeIdentifiedObject(Value *Ptr) {
-  return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa<AllocaInst>(Ptr);
-}
-
-void AccessAnalysis::processMemAccesses(bool UseDeferred) {
+void AccessAnalysis::processMemAccesses() {
   // We process the set twice: first we process read-write pointers, last we
   // process read-only pointers. This allows us to skip dependence tests for
   // read-only pointers.
 
-  PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
-  for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) {
-    const MemAccessInfo &Access = *AI;
-    Value *Ptr = Access.getPointer();
-    bool IsWrite = Access.getInt();
-
-    DepCands.insert(Access);
-
-    // Memorize read-only pointers for later processing and skip them in the
-    // first round (they need to be checked after we have seen all write
-    // pointers). Note: we also mark pointer that are not consecutive as
-    // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the
-    // second check for "!IsWrite".
-    bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
-    if (!UseDeferred && IsReadOnlyPtr) {
-      DeferredAccesses.insert(Access);
-      continue;
-    }
-
-    bool NeedDepCheck = false;
-    // Check whether there is the possibility of dependency because of
-    // underlying objects being the same.
-    typedef SmallVector<Value*, 16> ValueVector;
-    ValueVector TempObjects;
-    GetUnderlyingObjects(Ptr, TempObjects, DL);
-    for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end();
-         UI != UE; ++UI) {
-      Value *UnderlyingObj = *UI;
-
-      // If this is a write then it needs to be an identified object. If this a
-      // read and all writes (so far) are identified function scope objects we
-      // don't need an identified underlying object but only an Argument (the
-      // next write is going to invalidate this assumption if it is
-      // unidentified).
-      // This is a micro-optimization for the case where all writes are
-      // identified and we have one argument pointer.
-      // Otherwise, we do need a runtime check.
-      if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) ||
-          (!IsWrite && (!AreAllWritesIdentified ||
-                        !isa<Argument>(UnderlyingObj)) &&
-           !isIdentifiedObject(UnderlyingObj))) {
-        DEBUG(dbgs() << "LV: Found an unidentified " <<
-              (IsWrite ? "write" : "read" ) << " ptr: " << *UnderlyingObj <<
-              "\n");
-        IsRTCheckNeeded = (IsRTCheckNeeded ||
-                           !isIdentifiedObject(UnderlyingObj) ||
-                           !AreAllReadsIdentified);
-
-        if (IsWrite)
-          AreAllWritesIdentified = false;
-        if (!IsWrite)
-          AreAllReadsIdentified = false;
-      }
-
-      // If this is a write - check other reads and writes for conflicts. If
-      // this is a read only check other writes for conflicts (but only if there
-      // is no other write to the ptr - this is an optimization to catch "a[i] =
-      // a[i] + " without having to do a dependence check).
-      if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj))
-        NeedDepCheck = true;
-
-      if (IsWrite)
-        WriteObjects.insert(UnderlyingObj);
-
-      // Create sets of pointers connected by shared underlying objects.
-      UnderlyingObjToAccessMap::iterator Prev =
-        ObjToLastAccess.find(UnderlyingObj);
-      if (Prev != ObjToLastAccess.end())
-        DepCands.unionSets(Access, Prev->second);
-
-      ObjToLastAccess[UnderlyingObj] = Access;
-    }
-
-    if (NeedDepCheck)
-      CheckDeps.insert(Access);
-  }
+  DEBUG(dbgs() << "LV: Processing memory accesses...\n");
+  DEBUG(dbgs() << "  AST: "; AST.dump());
+  DEBUG(dbgs() << "LV: Accesses:\n");
+  DEBUG({
+    for (auto A : Accesses)
+      dbgs() << "\t" << *A.getPointer() << " (" <<
+                (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
+                                         "read-only" : "read")) << ")\n";
+  });
+
+  // The AliasSetTracker has nicely partitioned our pointers by metadata
+  // compatibility and potential for underlying-object overlap. As a result, we
+  // only need to check for potential pointer dependencies within each alias
+  // set.
+  for (auto &AS : AST) {
+    // Note that both the alias-set tracker and the alias sets themselves used
+    // linked lists internally and so the iteration order here is deterministic
+    // (matching the original instruction order within each set).
+    bool SetHasWrite = false;
+
+    // Map of pointers to last access encountered.
+    typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
+    UnderlyingObjToAccessMap ObjToLastAccess;
+
+    // Set of access to check after all writes have been processed.
+    PtrAccessSet DeferredAccesses;
+
+    // Iterate over each alias set twice, once to process read/write pointers,
+    // and then to process read-only pointers.
+    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
+      bool UseDeferred = SetIteration > 0;
+      PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
+
+      for (auto A : AS) {
+        Value *Ptr = A.getValue();
+        bool IsWrite = S.count(MemAccessInfo(Ptr, true));
+
+        // If we're using the deferred access set, then it contains only reads.
+        bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
+        if (UseDeferred && !IsReadOnlyPtr)
+          continue;
+        // Otherwise, the pointer must be in the PtrAccessSet, either as a read
+        // or a write.
+        assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
+                 S.count(MemAccessInfo(Ptr, false))) &&
+               "Alias-set pointer not in the access set?");
+
+        MemAccessInfo Access(Ptr, IsWrite);
+        DepCands.insert(Access);
+
+        // Memorize read-only pointers for later processing and skip them in
+        // the first round (they need to be checked after we have seen all
+        // write pointers). Note: we also mark pointer that are not consecutive
+        // as "read-only" pointers (so that we check "a[b[i]] +="). Hence, we
+        // need the second check for "!IsWrite".
+        if (!UseDeferred && IsReadOnlyPtr) {
+          DeferredAccesses.insert(Access);
+          continue;
+        }
+
+        // If this is a write - check other reads and writes for conflicts. If
+        // this is a read only check other writes for conflicts (but only if
+        // there is no other write to the ptr - this is an optimization to
+        // catch "a[i] = a[i] + " without having to do a dependence check).
+        if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
+          CheckDeps.insert(Access);
+          IsRTCheckNeeded = true;
+        }
+
+        if (IsWrite)
+          SetHasWrite = true;
+
+        // Create sets of pointers connected by a shared alias set and
+        // underlying object.
+        typedef SmallVector<Value*, 16> ValueVector;
+        ValueVector TempObjects;
+        GetUnderlyingObjects(Ptr, TempObjects, DL);
+        for (Value *UnderlyingObj : TempObjects) {
+          UnderlyingObjToAccessMap::iterator Prev =
+            ObjToLastAccess.find(UnderlyingObj);
+          if (Prev != ObjToLastAccess.end())
+            DepCands.unionSets(Access, Prev->second);
+
+          ObjToLastAccess[UnderlyingObj] = Access;
+        }
+      }
+    }
+  }
 }
@@ -4443,6 +4468,11 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   if (!AIsWrite && !BIsWrite)
     return false;
 
+  // We cannot check pointers in different address spaces.
+  if (APtr->getType()->getPointerAddressSpace() !=
+      BPtr->getType()->getPointerAddressSpace())
+    return true;
+
   const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
   const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);
@@ -4673,7 +4703,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   }
 
   AccessAnalysis::DepCandidates DependentAccesses;
-  AccessAnalysis Accesses(DL, DependentAccesses);
+  AccessAnalysis Accesses(DL, AA, DependentAccesses);
 
   // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
   // multiple times on the same object. If the ptr is accessed twice, once
@@ -4699,7 +4729,15 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     // list. At this phase it is only a 'write' list.
     if (Seen.insert(Ptr)) {
       ++NumReadWrites;
-      Accesses.addStore(Ptr);
+
+      AliasAnalysis::Location Loc = AA->getLocation(ST);
+      // The TBAA metadata could have a control dependency on the predication
+      // condition, so we cannot rely on it when determining whether or not we
+      // need runtime pointer checks.
+      if (blockNeedsPredication(ST->getParent()))
+        Loc.TBAATag = nullptr;
+
+      Accesses.addStore(Loc);
     }
   }
@@ -4726,7 +4764,15 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       ++NumReads;
       IsReadOnlyPtr = true;
     }
-    Accesses.addLoad(Ptr, IsReadOnlyPtr);
+
+    AliasAnalysis::Location Loc = AA->getLocation(LD);
+    // The TBAA metadata could have a control dependency on the predication
+    // condition, so we cannot rely on it when determining whether or not we
+    // need runtime pointer checks.
+    if (blockNeedsPredication(LD->getParent()))
+      Loc.TBAATag = nullptr;
+
+    Accesses.addLoad(Loc, IsReadOnlyPtr);
   }
 
   // If we write (or read-write) to a single destination and there are no
@@ -5911,6 +5957,7 @@ char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)


@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"


@@ -1,4 +1,4 @@
-; RUN: opt -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s
+; RUN: opt -basicaa -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"


@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"


@@ -1,5 +1,5 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"


@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 ; From a simple program with two address spaces:
 ; char Y[4*10000] __attribute__((address_space(1)));


@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"


@@ -1,4 +1,4 @@
-; RUN: opt -S -march=r600 -mcpu=cayman -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
+; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
 ; Check vectorization that would ordinarily require a runtime bounds
 ; check on the pointers when mixing address spaces. For now we cannot


@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
+; RUN: opt -S -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"


@@ -0,0 +1,102 @@
; RUN: opt < %s -tbaa -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -simplifycfg -S | FileCheck %s
; RUN: opt < %s -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -simplifycfg -S | FileCheck %s --check-prefix=CHECK-NOTBAA
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
define i32 @test1(i32* nocapture %a, float* nocapture readonly %b) #0 {
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
%0 = load float* %arrayidx, align 4, !tbaa !0
%conv = fptosi float %0 to i32
%arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
store i32 %conv, i32* %arrayidx2, align 4, !tbaa !4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1600
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret i32 0
; TBAA partitions the accesses in this loop, so it can be vectorized without
; runtime checks.
; CHECK-LABEL: @test1
; CHECK: entry:
; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
; CHECK: load <4 x float>* %{{.*}}, align 4, !tbaa
; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
; CHECK: ret i32 0
; CHECK-NOTBAA-LABEL: @test1
; CHECK-NOTBAA: icmp uge i32*
; CHECK-NOTBAA: load <4 x float>* %{{.*}}, align 4, !tbaa
; CHECK-NOTBAA: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
; CHECK-NOTBAA: ret i32 0
}
; Function Attrs: nounwind uwtable
define i32 @test2(i32* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c) #0 {
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
%0 = load float* %arrayidx, align 4, !tbaa !0
%arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
%1 = load i32* %arrayidx2, align 4, !tbaa !4
%conv = sitofp i32 %1 to float
%mul = fmul float %0, %conv
%arrayidx4 = getelementptr inbounds float* %c, i64 %indvars.iv
store float %mul, float* %arrayidx4, align 4, !tbaa !0
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1600
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret i32 0
; This test is like the first, except here there is still one runtime check
; required. Without TBAA, however, two checks are required.
; CHECK-LABEL: @test2
; CHECK: icmp uge float*
; CHECK: icmp uge float*
; CHECK-NOT: icmp uge i32*
; CHECK: load <4 x float>* %{{.*}}, align 4, !tbaa
; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
; CHECK: ret i32 0
; CHECK-NOTBAA-LABEL: @test2
; CHECK-NOTBAA: icmp uge float*
; CHECK-NOTBAA: icmp uge float*
; CHECK-NOTBAA-DAG: icmp uge float*
; CHECK-NOTBAA-DAG: icmp uge i32*
; CHECK-NOTBAA: load <4 x float>* %{{.*}}, align 4, !tbaa
; CHECK-NOTBAA: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
; CHECK-NOTBAA: ret i32 0
}
attributes #0 = { nounwind uwtable }
!0 = metadata !{metadata !1, metadata !1, i64 0}
!1 = metadata !{metadata !"float", metadata !2, i64 0}
!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
!3 = metadata !{metadata !"Simple C/C++ TBAA"}
!4 = metadata !{metadata !5, metadata !5, i64 0}
!5 = metadata !{metadata !"int", metadata !2, i64 0}