LoopVectorizer: Optimize the vectorization of consecutive memory accesses when the iteration step is -1

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171114 91177308-0d34-0410-b5e6-96231b3b80d8
Nadav Rotem
2012-12-26 19:08:17 +00:00
parent f1a26cf9df
commit 13eb1e7817
3 changed files with 73 additions and 24 deletions
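For context, a minimal sketch of the access pattern this commit targets (the function below is illustrative, not taken from the commit): a loop whose index steps by -1 still touches consecutive addresses, just in decreasing order, so it can now be emitted as a wide load/store plus a reverse shuffle instead of being scalarized.

// Illustrative only: both the load from b[i] and the store to a[i] move by
// -1 element per iteration, i.e. consecutive but decreasing addresses.
void reverse_add(int *a, const int *b, int n) {
  for (int i = n - 1; i >= 0; --i)
    a[i] = b[i] + 1;
}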


@@ -202,7 +202,7 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
   return Builder.CreateAdd(Val, Cv, "induction");
 }
 
-bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
   // If this value is a pointer induction variable we know it is consecutive.
@@ -210,12 +210,12 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   if (Phi && Inductions.count(Phi)) {
     InductionInfo II = Inductions[Phi];
     if (PtrInduction == II.IK)
-      return true;
+      return 1;
   }
 
   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
   if (!Gep)
-    return false;
+    return 0;
 
   unsigned NumOperands = Gep->getNumOperands();
   Value *LastIndex = Gep->getOperand(NumOperands - 1);
@@ -223,7 +223,7 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   // Check that all of the gep indices are uniform except for the last.
   for (unsigned i = 0; i < NumOperands - 1; ++i)
     if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
-      return false;
+      return 0;
 
   // We can emit wide load/stores only if the last index is the induction
   // variable.
@@ -234,10 +234,12 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
     // The memory is consecutive because the last index is consecutive
     // and all other indices are loop invariant.
     if (Step->isOne())
-      return true;
+      return 1;
+    if (Step->isAllOnesValue())
+      return -1;
   }
 
-  return false;
+  return 0;
 }
 
 bool LoopVectorizationLegality::isUniform(Value *V) {
@@ -263,6 +265,17 @@ InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
   return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
 }
 
+Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
+  assert(Vec->getType()->isVectorTy() && "Invalid type");
+  SmallVector<Constant*, 8> ShuffleMask;
+  for (unsigned i = 0; i < VF; ++i)
+    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
+
+  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
+                                     ConstantVector::get(ShuffleMask),
+                                     "reverse");
+}
+
 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
@@ -941,8 +954,7 @@ Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
 void
 InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
                                           BasicBlock *BB, PhiVector *PV) {
-  Constant *Zero =
-  ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0);
+  Constant *Zero = Builder.getInt32(0);
 
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
@@ -1142,14 +1154,15 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       assert(!Legal->isUniform(Ptr) &&
             "We do not allow storing to uniform addresses");
 
-      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
-
-      // This store does not use GEPs.
-      if (!Legal->isConsecutivePtr(Ptr)) {
+      int Stride = Legal->isConsecutivePtr(Ptr);
+      bool Reverse = Stride < 0;
+      if (Stride == 0) {
         scalarizeInstruction(it);
         break;
       }
 
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
       if (Gep) {
         // The last index does not have to be the induction. It can be
         // consecutive and be a function of the index. For example A[I+1];
@@ -1166,8 +1179,16 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
       }
+
+      // If the address is consecutive but reversed, then the
+      // wide load needs to start at the last vector element.
+      if (Reverse)
+        Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF));
+
       Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
       Value *Val = getVectorValue(SI->getValueOperand());
+      if (Reverse)
+        Val = reverseVector(Val);
       Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
       break;
     }
@@ -1177,16 +1198,17 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       Type *RetTy = VectorType::get(LI->getType(), VF);
       Value *Ptr = LI->getPointerOperand();
       unsigned Alignment = LI->getAlignment();
-      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
       // If the pointer is loop invariant or if it is non consecutive,
       // scalarize the load.
-      bool Con = Legal->isConsecutivePtr(Ptr);
-      if (Legal->isUniform(Ptr) || !Con) {
+      int Stride = Legal->isConsecutivePtr(Ptr);
+      bool Reverse = Stride < 0;
+      if (Legal->isUniform(Ptr) || Stride == 0) {
         scalarizeInstruction(it);
         break;
       }
 
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
       if (Gep) {
         // The last index does not have to be the induction. It can be
         // consecutive and be a function of the index. For example A[I+1];
@@ -1203,12 +1225,17 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
       }
+
+      // If the address is consecutive but reversed, then the
+      // wide load needs to start at the last vector element.
+      if (Reverse)
+        Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF));
+
       Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
       LI = Builder.CreateLoad(Ptr);
       LI->setAlignment(Alignment);
 
       // Use this vector value for all users of the load.
-      WidenMap[it] = LI;
+      WidenMap[it] = Reverse ? reverseVector(LI) : LI;
       break;
     }
     case Instruction::ZExt:
@@ -1625,7 +1652,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     // If the address of i is unknown (for example A[B[i]]) then we may
     // read a few words, modify, and write a few words, and some of the
     // words may be written to the same address.
-    if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
+    if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
       Reads.push_back(Ptr);
   }
@@ -2094,7 +2121,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                    SI->getPointerAddressSpace());
 
     // Scalarized stores.
-    if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
+    int Stride = Legal->isConsecutivePtr(SI->getPointerOperand());
+    bool Reverse = Stride < 0;
+    if (0 == Stride) {
       unsigned Cost = 0;
 
       // The cost of extracting from the value vector and pointer vector.
@@ -2115,8 +2144,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     }
 
     // Wide stores.
-    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
-                                 SI->getPointerAddressSpace());
+    unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy,
+                                          SI->getAlignment(),
+                                          SI->getPointerAddressSpace());
+    if (Reverse)
+      Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse,
+                                   VectorTy, 0);
+    return Cost;
   }
   case Instruction::Load: {
     LoadInst *LI = cast<LoadInst>(I);
@@ -2127,7 +2161,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                    LI->getPointerAddressSpace());
 
     // Scalarized loads.
-    if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
+    int Stride = Legal->isConsecutivePtr(LI->getPointerOperand());
+    bool Reverse = Stride < 0;
+    if (0 == Stride) {
       unsigned Cost = 0;
       Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF);
@@ -2150,8 +2186,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     }
 
     // Wide loads.
-    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
-                                 LI->getPointerAddressSpace());
+    unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy,
+                                          LI->getAlignment(),
+                                          LI->getPointerAddressSpace());
+    if (Reverse)
+      Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse,
+                                   VectorTy, 0);
+    return Cost;
   }
   case Instruction::ZExt:
   case Instruction::SExt:


@@ -161,6 +161,9 @@ private:
   /// vectors of ones and zeros for the reduction code.
   Constant* getUniformVector(unsigned Val, Type* ScalarTy);
 
+  /// Generate a shuffle sequence that will reverse the vector Vec.
+  Value *reverseVector(Value *Vec);
+
   typedef DenseMap<Value*, Value*> ValueMap;
 
   /// The original loop.
@@ -331,7 +334,11 @@ public:
   /// when the last index of the GEP is the induction variable, or that the
   /// pointer itself is an induction variable.
   /// This check allows us to vectorize A[idx] into a wide load/store.
-  bool isConsecutivePtr(Value *Ptr);
+  /// Returns:
+  /// 0 - Stride is unknown or non consecutive.
+  /// 1 - Address is consecutive.
+  /// -1 - Address is consecutive, and decreasing.
+  int isConsecutivePtr(Value *Ptr);
 
   /// Returns true if the value V is uniform within the loop.
   bool isUniform(Value *V);
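With isConsecutivePtr() now a tri-state stride query, a reversed access is handled by rebasing the pointer by 1 - VF elements and reversing the lanes. A hedged sketch (not from the commit) of what that amounts to for an assumed VF of 4; the lane reversal is the <i32 3, i32 2, i32 1, i32 0> shufflevector the updated test below checks for.

#include <algorithm>

// Hypothetical scalar model of one reversed wide load: 'cur' is the address
// the scalar loop would touch on this iteration (the highest of the block).
void widened_reverse_load(int *lanes_out, const int *cur) {
  const int VF = 4;                        // assumed vectorization factor
  const int *base = cur + (1 - VF);        // wide access starts VF-1 elements lower
  int lanes[VF];
  std::copy(base, base + VF, lanes);       // the "wide load" of 4 contiguous ints
  std::reverse(lanes, lanes + VF);         // reverseVector(): mask <3, 2, 1, 0>
  std::copy(lanes, lanes + VF, lanes_out); // lanes now follow the loop's order
}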


@@ -537,7 +537,8 @@ define void @example14(i32** nocapture %in, i32** nocapture %coeff, i32* nocaptu
 }
 
 ;CHECK: @example21
-;CHECK: <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: shufflevector {{.*}} <i32 3, i32 2, i32 1, i32 0>
 ;CHECK: ret i32
 define i32 @example21(i32* nocapture %b, i32 %n) nounwind uwtable readonly ssp {
   %1 = icmp sgt i32 %n, 0