Vectorize: teach cavVectorizeMemory to distinguish between A[i]+=x and A[B[i]]+=x.

If the pointer is consecutive then it is safe to read and write. If the pointer is non-loop-consecutive then
it is unsafe to vectorize it because we may hit an ordering issue.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166371 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Nadav Rotem
2012-10-20 08:26:33 +00:00
parent 71a1482239
commit bf8772ed2c
3 changed files with 236 additions and 75 deletions

View File

@@ -76,7 +76,7 @@ public:
// Perform the actual loop widening (vectorization). // Perform the actual loop widening (vectorization).
void vectorize(LoopVectorizationLegality *Legal) { void vectorize(LoopVectorizationLegality *Legal) {
///Create a new empty loop. Unlink the old loop and connect the new one. ///Create a new empty loop. Unlink the old loop and connect the new one.
createEmptyLoop(); createEmptyLoop(Legal);
/// Widen each instruction in the old loop to a new one in the new loop. /// Widen each instruction in the old loop to a new one in the new loop.
/// Use the Legality module to find the induction and reduction variables. /// Use the Legality module to find the induction and reduction variables.
vectorizeLoop(Legal); vectorizeLoop(Legal);
@@ -86,7 +86,7 @@ public:
private: private:
/// Create an empty loop, based on the loop ranges of the old loop. /// Create an empty loop, based on the loop ranges of the old loop.
void createEmptyLoop(); void createEmptyLoop(LoopVectorizationLegality *Legal);
/// Copy and widen the instructions from the old loop. /// Copy and widen the instructions from the old loop.
void vectorizeLoop(LoopVectorizationLegality *Legal); void vectorizeLoop(LoopVectorizationLegality *Legal);
/// Insert the new loop to the loop hierarchy and pass manager. /// Insert the new loop to the loop hierarchy and pass manager.
@@ -107,10 +107,6 @@ private:
/// for each element in the vector. Starting from zero. /// for each element in the vector. Starting from zero.
Value *getConsecutiveVector(Value* Val); Value *getConsecutiveVector(Value* Val);
/// Check that the GEP operands are all uniform except for the last index
/// which has to be the induction variable.
bool isConsecutiveGep(GetElementPtrInst *Gep);
/// When we go over instructions in the basic block we rely on previous /// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values. /// values within the current basic block or on loop invariant values.
/// When we widen (vectorize) values we place them in the map. If the values /// When we widen (vectorize) values we place them in the map. If the values
@@ -196,6 +192,10 @@ public:
/// Returns the reduction variables found in the loop. /// Returns the reduction variables found in the loop.
ReductionList *getReductionVars() { return &Reductions; } ReductionList *getReductionVars() { return &Reductions; }
/// Check that the GEP operands are all uniform except for the last index
/// which has to be the induction variable.
bool isConsecutiveGep(Value *Ptr);
private: private:
/// Check if a single basic block loop is vectorizable. /// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count /// At this point we know that this is a loop with a constant trip count
@@ -221,6 +221,8 @@ private:
/// Returns true if the instruction I can be a reduction variable of type /// Returns true if the instruction I can be a reduction variable of type
/// 'Kind'. /// 'Kind'.
bool isReductionInstr(Instruction *I, ReductionKind Kind); bool isReductionInstr(Instruction *I, ReductionKind Kind);
/// Returns True, if 'Phi' is an induction variable.
bool isInductionVariable(PHINode *Phi);
/// The loop that we evaluate. /// The loop that we evaluate.
Loop *TheLoop; Loop *TheLoop;
@@ -338,8 +340,8 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
return Builder.CreateAdd(Val, Cv, "induction"); return Builder.CreateAdd(Val, Cv, "induction");
} }
bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (!Gep) if (!Gep)
return false; return false;
@@ -348,7 +350,7 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
// Check that all of the gep indices are uniform except for the last. // Check that all of the gep indices are uniform except for the last.
for (unsigned i = 0; i < NumOperands - 1; ++i) for (unsigned i = 0; i < NumOperands - 1; ++i)
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig)) if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return false; return false;
// We can emit wide load/stores only of the last index is the induction // We can emit wide load/stores only of the last index is the induction
@@ -460,7 +462,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
WidenMap[Instr] = VecResults; WidenMap[Instr] = VecResults;
} }
void SingleBlockLoopVectorizer::createEmptyLoop() { void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
/* /*
In this function we generate a new loop. The new loop will contain In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the the vectorized instructions while the old loop will continue to run the
@@ -510,7 +512,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
"scalar.preheader"); "scalar.preheader");
// Find the induction variable. // Find the induction variable.
BasicBlock *OldBasicBlock = Orig->getHeader(); BasicBlock *OldBasicBlock = Orig->getHeader();
OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin()); OldInduction = Legal->getInduction();
assert(OldInduction && "We must have a single phi node."); assert(OldInduction && "We must have a single phi node.");
Type *IdxTy = OldInduction->getType(); Type *IdxTy = OldInduction->getType();
@@ -637,7 +639,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Special handling for the induction var. // Special handling for the induction var.
if (OldInduction == Inst) if (OldInduction == Inst)
continue; continue;
// This is phase I of vectorizing PHIs. // This is phase one of vectorizing PHIs.
// This has to be a reduction variable. // This has to be a reduction variable.
assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
Type *VecTy = VectorType::get(Inst->getType(), VF); Type *VecTy = VectorType::get(Inst->getType(), VF);
@@ -704,7 +706,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
unsigned Alignment = SI->getAlignment(); unsigned Alignment = SI->getAlignment();
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
// This store does not use GEPs. // This store does not use GEPs.
if (!isConsecutiveGep(Gep)) { if (!Legal->isConsecutiveGep(Gep)) {
scalarizeInstruction(Inst); scalarizeInstruction(Inst);
break; break;
} }
@@ -728,7 +730,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
// We don't have a gep. Scalarize the load. // We don't have a gep. Scalarize the load.
if (!isConsecutiveGep(Gep)) { if (!Legal->isConsecutiveGep(Gep)) {
scalarizeInstruction(Inst); scalarizeInstruction(Inst);
break; break;
} }
@@ -930,6 +932,16 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
return false; return false;
} }
if (isInductionVariable(Phi)) {
if (Induction) {
DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
return false;
}
DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
Induction = Phi;
continue;
}
if (AddReductionVar(Phi, IntegerAdd)) { if (AddReductionVar(Phi, IntegerAdd)) {
DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
continue; continue;
@@ -938,28 +950,6 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n"); DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
continue; continue;
} }
if (Induction) {
DEBUG(dbgs() << "LV: Found too many PHIs.\n");
return false;
}
// Found the induction variable.
Induction = Phi;
// Check that the PHI is consecutive and starts at zero.
const SCEV *PhiScev = SE->getSCEV(Phi);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
if (!AR) {
DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
return false;
}
const SCEV *Step = AR->getStepRecurrence(*SE);
const SCEV *Start = AR->getStart();
if (!Step->isOne() || !Start->isZero()) {
DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
return false;
}
}// end of PHI handling }// end of PHI handling
// We still don't handle functions. // We still don't handle functions.
@@ -1004,16 +994,19 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
} }
bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
// Holds the read and write pointers that we find. typedef SmallVector<Value*, 16> ValueVector;
typedef SmallVector<Value*, 10> ValueVector; typedef SmallPtrSet<Value*, 16> ValueSet;
ValueVector Reads; // Holds the Load and Store *instructions*.
ValueVector Writes; ValueVector Loads;
ValueVector Stores;
// Scan the BB and collect legal loads and stores.
for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
Instruction *I = it; Instruction *I = it;
// If this is a load, record its pointer. If it is not a load, abort. // If this is a load, save it. If this instruction can read from memory
// Notice that we don't handle function calls that read or write. // but is not a load, then we quit. Notice that we don't handle function
// calls that read or write.
if (I->mayReadFromMemory()) { if (I->mayReadFromMemory()) {
LoadInst *Ld = dyn_cast<LoadInst>(I); LoadInst *Ld = dyn_cast<LoadInst>(I);
if (!Ld) return false; if (!Ld) return false;
@@ -1021,13 +1014,11 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
DEBUG(dbgs() << "LV: Found a non-simple load.\n"); DEBUG(dbgs() << "LV: Found a non-simple load.\n");
return false; return false;
} }
Loads.push_back(Ld);
Value* Ptr = Ld->getPointerOperand(); continue;
GetUnderlyingObjects(Ptr, Reads, DL);
} }
// Record store pointers. Abort on all other instructions that write to // Save store instructions. Abort if other instructions write to memory.
// memory.
if (I->mayWriteToMemory()) { if (I->mayWriteToMemory()) {
StoreInst *St = dyn_cast<StoreInst>(I); StoreInst *St = dyn_cast<StoreInst>(I);
if (!St) return false; if (!St) return false;
@@ -1035,45 +1026,99 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
DEBUG(dbgs() << "LV: Found a non-simple store.\n"); DEBUG(dbgs() << "LV: Found a non-simple store.\n");
return false; return false;
} }
Stores.push_back(St);
Value* Ptr = St->getPointerOperand();
GetUnderlyingObjects(Ptr, Writes, DL);
} }
} // next instr. } // next instr.
// Now we have two lists that hold the loads and the stores.
// Next, we find the pointers that they use.
// Check that the underlying objects of the reads and writes are either // Check if we see any stores. If there are no stores, then we don't
// disjoint memory locations, or that they are no-alias arguments. // care if the pointers are *restrict*.
ValueVector::iterator r, re, w, we; if (!Stores.size()) {
for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { DEBUG(dbgs() << "LV: Found a read-only loop!\n");
if (!isIdentifiedSafeObject(*r)) { return true;
DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n");
return false;
}
} }
for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { // Holds the read and read-write *pointers* that we find.
if (!isIdentifiedSafeObject(*w)) { ValueVector Reads;
DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n"); ValueVector ReadWrites;
return false;
} // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
// multiple times on the same object. If the ptr is accessed twice, once
// for read and once for write, it will only appear once (on the write
// list). This is okay, since we are going to check for conflicts between
// writes and between reads and writes, but not between reads and reads.
ValueSet Seen;
ValueVector::iterator I, IE;
for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
StoreInst *ST = dyn_cast<StoreInst>(*I);
assert(ST && "Bad StoreInst");
Value* Ptr = ST->getPointerOperand();
// If we did *not* see this pointer before, insert it to
// the read-write list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr))
ReadWrites.push_back(Ptr);
} }
// Check that there are no multiple write locations to the same pointer. for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
SmallPtrSet<Value*, 8> WritePointerSet; LoadInst *LD = dyn_cast<LoadInst>(*I);
for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { assert(LD && "Bad LoadInst");
if (!WritePointerSet.insert(*w)) { Value* Ptr = LD->getPointerOperand();
DEBUG(dbgs() << "LV: Multiple writes to the same index :"<< **w << "\n"); // If we did *not* see this pointer before, insert it to the
return false; // read list. If we *did* see it before, then it is already in
} // the read-write list. This allows us to vectorize expressions
// such as A[i] += x; Because the address of A[i] is a read-write
// pointer. This only works if the index of A[i] is consecutive.
// If the address of i is unknown (for example A[B[i]]) then we may
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
Reads.push_back(Ptr);
} }
// Check that the reads and the writes are disjoint. // Now that the pointers are in two lists (Reads and ReadWrites), we
for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { // can check that there are no conflicts between each of the writes and
if (WritePointerSet.count(*r)) { // between the writes to the reads.
DEBUG(dbgs() << "Vectorizer: Found a read/write ptr:"<< **r << "\n"); ValueSet WriteObjects;
return false; ValueVector TempObjects;
// Check that the read-writes do not conflict with other read-write
// pointers.
for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
GetUnderlyingObjects(*I, TempObjects, DL);
for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
it != e; ++it) {
if (!isIdentifiedSafeObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
return false;
}
if (!WriteObjects.insert(*it)) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
<< **it <<"\n");
return false;
}
} }
TempObjects.clear();
}
/// Check that the reads don't conflict with the read-writes.
for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
GetUnderlyingObjects(*I, TempObjects, DL);
for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
it != e; ++it) {
if (!isIdentifiedSafeObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
return false;
}
if (WriteObjects.count(*it)) {
DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
<< **it <<"\n");
return false;
}
}
TempObjects.clear();
} }
// All is okay. // All is okay.
@@ -1198,6 +1243,24 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
} }
} }
bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
// Check that the PHI is consecutive and starts at zero.
const SCEV *PhiScev = SE->getSCEV(Phi);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
if (!AR) {
DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
return false;
}
const SCEV *Step = AR->getStepRecurrence(*SE);
const SCEV *Start = AR->getStart();
if (!Step->isOne() || !Start->isZero()) {
DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
return false;
}
return true;
}
} // namespace } // namespace
char LoopVectorize::ID = 0; char LoopVectorize::ID = 0;

View File

@@ -0,0 +1,66 @@
; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
@a = common global [2048 x i32] zeroinitializer, align 16
; This is the loop.
; for (i=0; i<n; i++){
; a[i] += i;
; }
;CHECK: @inc
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void
define void @inc(i32 %n) nounwind uwtable noinline ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0, %.lr.ph
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
%2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
%3 = load i32* %2, align 4
%4 = trunc i64 %indvars.iv to i32
%5 = add nsw i32 %3, %4
store i32 %5, i32* %2, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
ret void
}
; Can't vectorize this loop because the access to A[X] is non linear.
;
; for (i = 0; i < n; ++i) {
; A[B[i]]++;
;
;CHECK: @histogram
;CHECK-NOT: <4 x i32>
;CHECK: ret i32
define i32 @histogram(i32* nocapture noalias %A, i32* nocapture noalias %B, i32 %n) nounwind uwtable ssp {
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body, label %for.end
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv
%0 = load i32* %arrayidx, align 4
%idxprom1 = sext i32 %0 to i64
%arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1
%1 = load i32* %arrayidx2, align 4
%inc = add nsw i32 %1, 1
store i32 %inc, i32* %arrayidx2, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret i32 0
}

View File

@@ -0,0 +1,32 @@
; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
;CHECK: @read_only_func
;CHECK: load <4 x i32>
;CHECK: ret i32
define i32 @read_only_func(i32* nocapture %A, i32* nocapture %B, i32 %n) nounwind uwtable readonly ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0, %.lr.ph
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
%sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
%2 = getelementptr inbounds i32* %A, i64 %indvars.iv
%3 = load i32* %2, align 4
%4 = add nsw i64 %indvars.iv, 13
%5 = getelementptr inbounds i32* %B, i64 %4
%6 = load i32* %5, align 4
%7 = shl i32 %6, 1
%8 = add i32 %3, %sum.02
%9 = add i32 %8, %7
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
%sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
ret i32 %sum.0.lcssa
}