SLPVectorizer: add support for vectorization of diamond shaped trees. We now perform a preliminary traversal of the graph to collect values with multiple users and check where the users came from.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179414 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Nadav Rotem 2013-04-12 21:16:54 +00:00
parent 9eb366acba
commit a74f91e44c
3 changed files with 337 additions and 46 deletions

View File

@ -6,7 +6,7 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "VecUtils"
#define DEBUG_TYPE "SLP"
#include "VecUtils.h"
#include "llvm/ADT/DenseMap.h"
@ -37,6 +37,10 @@
using namespace llvm;
static const unsigned MinVecRegSize = 128;
static const unsigned RecursionMaxDepth = 6;
namespace llvm {
BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
@ -98,9 +102,39 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
return ((-Offset) == Sz);
}
/// \brief Vectorize a sorted chain of consecutive stores in windows of VF.
///
/// VF is the number of stored elements that fit into MinVecRegSize bits.
/// Every window of VF stores is costed with getTreeCost and is vectorized when
/// that cost is below \p CostThreshold.
/// \returns true if at least one window of the chain was vectorized.
bool BoUpSLP::vectorizeStoreChain(ValueList &Chain, int CostThreshold) {
  // A window holds at least two stores, so shorter chains can never be
  // vectorized. This also keeps Chain[0] below from reading an empty list.
  if (Chain.size() < 2) return false;

  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
  // Validate the element size before it is used as a divisor.
  if (!isPowerOf2_32(Sz)) return false;
  unsigned VF = MinVecRegSize / Sz;
  if (VF < 2) return false;

  bool Changed = false;
  for (unsigned i = 0, e = Chain.size(); i < e; ++i) {
    // Stop as soon as fewer than VF stores remain.
    if (i + VF > e) return Changed;
    DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n");
    ValueList Operands(&Chain[i], &Chain[i] + VF);

    int Cost = getTreeCost(Operands);
    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
    if (Cost < CostThreshold) {
      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
      vectorizeTree(Operands, VF);
      // The loop header adds the final +1. Advancing by VF here would skip
      // the first store that follows the window we just vectorized.
      i += VF - 1;
      Changed = true;
    }
  }
  return Changed;
}
bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
ValueSet Heads, Tails;
SmallDenseMap<Value*, Value*> ConsecutiveChain;
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
ValueSet VectorizedStores;
bool Changed = false;
// Do a quadratic search on all of the given stores and find
@ -123,27 +157,17 @@ bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
// to vectorize it.
ValueList Operands;
Value *I = *it;
int MinCost = 0, MinVF = 0;
// Collect the chain into a list.
while (Tails.count(I) || Heads.count(I)) {
if (VectorizedStores.count(I)) break;
Operands.push_back(I);
unsigned VF = Operands.size();
if (isPowerOf2_32(VF) && VF > 1) {
int cost = getTreeRollCost(Operands, 0);
DEBUG(dbgs() << "Found cost=" << cost << " for VF=" << VF << "\n");
if (cost < MinCost) { MinCost = cost; MinVF = VF; }
}
// Move to the next value in the chain.
I = ConsecutiveChain[I];
}
if (MinCost <= costThreshold && MinVF > 1) {
DEBUG(dbgs() << "Decided to vectorize cost=" << MinCost << "\n");
vectorizeTree(Operands, MinVF);
Stores.clear();
// The current numbering is invalid because we added and removed instrs.
numberInstructions();
Changed = true;
}
bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
if (Vectorized) VectorizedStores.insert(Operands.begin(), Operands.end());
Changed |= Vectorized;
}
return Changed;
@ -184,8 +208,138 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
return 0;
}
int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
if (Depth == 6) return max_cost;
/// \brief Compute the cost of vectorizing the tree that is rooted at \p VL.
///
/// Resets the per-run analysis containers, walks the tree once with
/// getTreeUses_rec to record lane usage and values with multiple users, marks
/// the values that have to stay scalar, and then lets getTreeCost_rec compute
/// the cost.
/// \returns the value computed by getTreeCost_rec for \p VL.
int BoUpSLP::getTreeCost(ValueList &VL) {
  // Get rid of the list of stores that were removed, and from the
  // lists of instructions with multiple users.
  MemBarrierIgnoreList.clear();
  LaneMap.clear();
  MultiUserVals.clear();
  MustScalarize.clear();

  // Scan the tree and find which value is used by which lane, and which values
  // must be scalarized.
  getTreeUses_rec(VL, 0);

  // Check that instructions with multiple users can be vectorized. Mark unsafe
  // instructions.
  for (ValueSet::iterator it = MultiUserVals.begin(),
       e = MultiUserVals.end(); it != e; ++it) {
    // Check that all of the users of this instr are within the tree
    // and that they are all from the same lane.
    int Lane = -1;
    for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
         I != E; ++I) {
      // Look the user up once; a miss means the user is not part of the tree.
      std::map<Value*, int>::iterator UserLane = LaneMap.find(*I);
      if (UserLane == LaneMap.end()) {
        MustScalarize.insert(*it);
        DEBUG(dbgs()<<"SLP: Adding " << **it <<
              " to MustScalarize because of an out of tree usage.\n");
        break;
      }
      // The first user fixes the lane; every later user has to agree with it.
      if (Lane == -1) Lane = UserLane->second;
      if (Lane != UserLane->second) {
        MustScalarize.insert(*it);
        DEBUG(dbgs()<<"SLP: Adding " << **it <<
              " to MustScalarize because multiple lanes use it: "
              << Lane << " and " << UserLane->second << ".\n");
        break;
      }
    }
  }

  // Now calculate the cost of vectorizing the tree.
  return getTreeCost_rec(VL, 0);
}
/// \brief Recursively record how the values of the tree at \p VL are used.
///
/// For every bundle that looks vectorizable (same opcode, same basic block,
/// not all-constant, not all-identical) this records the lane of each value in
/// LaneMap and adds values with more than one use to MultiUserVals. It then
/// descends into the operands of binary operators and stores.
/// \param VL    the bundle of scalar values, one per lane.
/// \param Depth the current recursion depth; the walk stops at
///              RecursionMaxDepth.
void BoUpSLP::getTreeUses_rec(ValueList &VL, unsigned Depth) {
  if (Depth == RecursionMaxDepth) return;

  // Don't handle vectors.
  if (VL[0]->getType()->isVectorTy()) return;
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) return;

  // Check if all of the operands are constants.
  bool AllConst = true;
  bool AllSameScalar = true;
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    AllConst &= isa<Constant>(VL[i]);
    AllSameScalar &= (VL[0] == VL[i]);
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // If one of the instructions is out of this BB, we need to scalarize all.
    if (I && I->getParent() != BB) return;
  }

  // If all of the operands are identical or constant we have a simple solution.
  if (AllConst || AllSameScalar) return;

  // Scalarize unknown structures.
  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
  if (!VL0) return;

  unsigned Opcode = VL0->getOpcode();
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // If not all of the instructions are identical then we have to scalarize.
    if (!I || Opcode != I->getOpcode()) return;
  }

  // Mark instructions with multiple users.
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // Remember to check if all of the users of this instr are vectorized
    // within our tree.
    if (I && I->getNumUses() > 1) MultiUserVals.insert(I);
  }

  for (int i = 0, e = VL.size(); i < e; ++i) {
    // Check that the instruction is only used within
    // one lane.
    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return;
    // Make this instruction as 'seen' and remember the lane.
    LaneMap[VL[i]] = i;
  }

  switch (Opcode) {
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Visit every operand position as its own bundle.
      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        getTreeUses_rec(Operands, Depth+1);
      }
      // Do not fall through into the store case: that would walk operand 0
      // a second time at every level of the tree.
      return;
    }
    case Instruction::Store: {
      // Only operand 0 of each store is followed (presumably the stored
      // value; the pointer operand is not part of the tree walk).
      ValueList Operands;
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
      getTreeUses_rec(Operands, Depth+1);
      return;
    }
    default:
      return;
  }
}
int BoUpSLP::getTreeCost_rec(ValueList &VL, unsigned Depth) {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@ -193,9 +347,10 @@ int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
/// Don't mess with vectors.
if (ScalarTy->isVectorTy()) return max_cost;
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy);
// Check if all of the operands are constants.
bool AllConst = true;
bool AllSameScalar = true;
@ -204,8 +359,8 @@ int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
AllSameScalar &= (VL[0] == VL[i]);
// Must have a single use.
Instruction *I = dyn_cast<Instruction>(VL[i]);
// Need to scalarize instructions with multiple users or from other BBs.
if (I && ((I->getNumUses() > 1) || (I->getParent() != BB)))
// This instruction is outside the basic block or if it is a known hazard.
if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
return getScalarizationCost(VecTy);
}
@ -239,7 +394,7 @@ int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
if (VL[i] == Last) continue;
Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
if (Barrier) {
DEBUG(dbgs() << "LR: Can't sink " << *VL[i] << "\n down to " <<
DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " <<
*Last << "\n because of " << *Barrier << "\n");
return max_cost;
}
@ -265,20 +420,22 @@ int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
ValueList Operands;
int Cost = 0;
// Calculate the cost of all of the operands.
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (unsigned j = 0; j < VL.size(); ++j)
Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
Cost += getTreeRollCost(Operands, Depth+1);
Operands.clear();
Cost += getTreeCost_rec(Operands, Depth+1);
if (Cost >= max_cost) return max_cost;
}
// Calculate the cost of this instruction.
int ScalarCost = VecTy->getNumElements() *
TTI->getArithmeticInstrCost(Opcode, ScalarTy);
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
Cost += (VecCost - ScalarCost);
return Cost;
@ -308,8 +465,7 @@ int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
MemBarrierIgnoreList.insert(VL[j]);
}
int TotalCost = StoreCost + getTreeRollCost(Operands, Depth + 1);
MemBarrierIgnoreList.clear();
int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
return TotalCost;
}
default:
@ -334,6 +490,15 @@ Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) {
}
/// \brief Vectorize the tree that is rooted at \p VL using \p VF lanes.
///
/// This is the non-recursive entry point: it runs vectorizeTree_rec and then
/// brings the per-run state back to a clean slate so that the next analysis
/// does not see stale data.
/// \returns the value returned by vectorizeTree_rec for the root bundle.
Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
  Value *V = vectorizeTree_rec(VL, VF);
  // We moved some instructions around. We have to number them again
  // before we can do any analysis.
  numberInstructions();
  MustScalarize.clear();
  // The scalar-to-vector cache is only valid for the tree that was just
  // built. Drop it so a later tree cannot pick up a vector from this run.
  VectorizedValues.clear();
  return V;
}
Value *BoUpSLP::vectorizeTree_rec(ValueList &VL, int VF) {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
@ -345,19 +510,21 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
for (unsigned i = 0, e = VF; i < e; ++i) {
AllConst &= !!dyn_cast<Constant>(VL[i]);
AllSameScalar &= (VL[0] == VL[i]);
// Must have a single use.
// The instruction must be in the same BB, and it must be vectorizable.
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (I && (I->getNumUses() > 1 || I->getParent() != BB))
if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
return Scalarize(VL, VecTy);
}
// Is this a simple vector constant.
// Check that this is a simple vector constant.
if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
// Scalarize unknown structures.
Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
if (!VL0) return Scalarize(VL, VecTy);
if (VectorizedValues.count(VL0)) return VectorizedValues[VL0];
unsigned Opcode = VL0->getOpcode();
for (unsigned i = 0, e = VF; i < e; ++i) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
@ -390,11 +557,13 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
}
Value *RHS = vectorizeTree(RHSVL, VF);
Value *LHS = vectorizeTree(LHSVL, VF);
Value *RHS = vectorizeTree_rec(RHSVL, VF);
Value *LHS = vectorizeTree_rec(LHSVL, VF);
IRBuilder<> Builder(GetLastInstr(VL, VF));
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(VL0);
return Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS);
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS);
VectorizedValues[VL0] = V;
return V;
}
case Instruction::Load: {
LoadInst *LI = dyn_cast<LoadInst>(VL0);
@ -410,6 +579,7 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
VecTy->getPointerTo());
LI = Builder.CreateLoad(VecPtr);
LI->setAlignment(Alignment);
VectorizedValues[VL0] = LI;
return LI;
}
case Instruction::Store: {
@ -420,7 +590,7 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
for (int i = 0; i < VF; ++i)
ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
Value *VecValue = vectorizeTree(ValueOp, VF);
Value *VecValue = vectorizeTree_rec(ValueOp, VF);
IRBuilder<> Builder(GetLastInstr(VL, VF));
Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
@ -432,7 +602,9 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
return 0;
}
default:
return Scalarize(VL, VecTy);
Value *S = Scalarize(VL, VecTy);
VectorizedValues[VL0] = S;
return S;
}
}

View File

@ -42,6 +42,14 @@ struct BoUpSLP {
BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
TargetTransformInfo *Tti, AliasAnalysis *Aa);
/// \brief Take the pointer operand from the Load/Store instruction.
/// \returns NULL if this is not a valid Load/Store instruction.
static Value *getPointerOperand(Value *I);
/// \brief Take the address space operand from the Load/Store instruction.
/// \returns -1 if this is not a valid Load/Store instruction.
static unsigned getAddressSpaceOperand(Value *I);
/// \returns true if the memory operations A and B are consecutive.
bool isConsecutiveAccess(Value *A, Value *B);
@ -51,25 +59,31 @@ struct BoUpSLP {
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
int getTreeRollCost(ValueList &VL, unsigned Depth);
/// \brief Take the pointer operand from the Load/Store instruction.
/// \returns NULL if this is not a valid Load/Store instruction.
static Value *getPointerOperand(Value *I);
/// \brief Take the address space operand from the Load/Store instruction.
/// \returns -1 if this is not a valid Load/Store instruction.
static unsigned getAddressSpaceOperand(Value *I);
int getTreeCost(ValueList &VL);
/// \brief Attempts to order and vectorize a sequence of stores. This
/// function does a quadratic scan of the given stores.
/// \returns true if the basic block was modified.
bool vectorizeStores(StoreList &Stores, int costThreshold);
private:
/// \brief This method contains the recursive part of getTreeCost.
int getTreeCost_rec(ValueList &VL, unsigned Depth);
/// \brief This recursive method looks for vectorization hazards such as
/// values that are used by multiple users and checks that values are used
/// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
void getTreeUses_rec(ValueList &VL, unsigned Depth);
/// \brief This method contains the recursive part of vectorizeTree.
Value *vectorizeTree_rec(ValueList &VL, int VF);
/// \brief Number all of the instructions in the block.
void numberInstructions();
private:
/// \brief Vectorize a sorted sequence of stores.
bool vectorizeStoreChain(ValueList &Chain, int CostThreshold);
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getScalarizationCost(Type *Ty);
@ -89,12 +103,34 @@ private:
/// \returns a vector from a collection of scalars in \p VL.
Value *Scalarize(ValueList &VL, VectorType *Ty);
private:
// Maps instructions to numbers and back.
SmallDenseMap<Value*, int> InstrIdx;
// Maps integers to Instructions.
std::vector<Instruction*> InstrVec;
// -- containers that are used during getTreeCost -- //
/// Contains values that must be scalarized because they are used
/// by multiple lanes, or by users outside the tree.
/// NOTICE: The vectorization methods also use this set.
ValueSet MustScalarize;
// Contains the values in the tree that have more than one use. getTreeCost
// inspects the users of these values to decide which of them must be
// scalarized. This set must be reset between runs.
ValueSet MultiUserVals;
// Maps values in the tree to the vector lane that uses them. This map must
// be reset between runs of getTreeCost.
std::map<Value*, int> LaneMap;
// A list of instructions to ignore while sinking
// memory instructions.
// memory instructions. This map must be reset between runs of getCost.
SmallSet<Value*, 8> MemBarrierIgnoreList;
// -- containers that are used during vectorizeTree -- //
// Maps between the first scalar to the vector. This map must be reset between
// runs.
DenseMap<Value*, Value*> VectorizedValues;
// Analysis and block reference.
BasicBlock *BB;
ScalarEvolution *SE;

View File

@ -0,0 +1,83 @@
; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
; int foo(int * restrict B, int * restrict A, int n, int m) {
; B[0] = n * A[0] + m * A[0];
; B[1] = n * A[1] + m * A[1];
; B[2] = n * A[2] + m * A[2];
; B[3] = n * A[3] + m * A[3];
; return 0;
; }
; CHECK: @foo
; CHECK: load <4 x i32>
; CHECK: mul <4 x i32>
; CHECK: store <4 x i32>
; CHECK: ret
; Positive case: four consecutive i32 elements of %A are each multiplied by the
; same scalar (%m + %n) and stored to the matching consecutive element of %B.
; None of the loaded values has a user outside of its own multiply.
define i32 @foo(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) #0 {
entry:
; Lane 0: B[0] = A[0] * (m + n). %mul238 is shared by all four lanes.
%0 = load i32* %A, align 4, !tbaa !0
%mul238 = add i32 %m, %n
%add = mul i32 %0, %mul238
store i32 %add, i32* %B, align 4, !tbaa !0
; Lane 1: B[1] = A[1] * (m + n).
%arrayidx4 = getelementptr inbounds i32* %A, i64 1
%1 = load i32* %arrayidx4, align 4, !tbaa !0
%add8 = mul i32 %1, %mul238
%arrayidx9 = getelementptr inbounds i32* %B, i64 1
store i32 %add8, i32* %arrayidx9, align 4, !tbaa !0
; Lane 2: B[2] = A[2] * (m + n).
%arrayidx10 = getelementptr inbounds i32* %A, i64 2
%2 = load i32* %arrayidx10, align 4, !tbaa !0
%add14 = mul i32 %2, %mul238
%arrayidx15 = getelementptr inbounds i32* %B, i64 2
store i32 %add14, i32* %arrayidx15, align 4, !tbaa !0
; Lane 3: B[3] = A[3] * (m + n).
%arrayidx16 = getelementptr inbounds i32* %A, i64 3
%3 = load i32* %arrayidx16, align 4, !tbaa !0
%add20 = mul i32 %3, %mul238
%arrayidx21 = getelementptr inbounds i32* %B, i64 3
store i32 %add20, i32* %arrayidx21, align 4, !tbaa !0
ret i32 0
}
; int foo_fail(int * restrict B, int * restrict A, int n, int m) {
; B[0] = n * A[0] + m * A[0];
; B[1] = n * A[1] + m * A[1];
; B[2] = n * A[2] + m * A[2];
; B[3] = n * A[3] + m * A[3];
; return A[0];
; }
; CHECK: @foo_fail
; CHECK-NOT: load <4 x i32>
; CHECK: ret
; Negative case: the body is the same as @foo, except that the first loaded
; value %0 is also returned. That extra user is outside of the store tree, and
; the test expects that no vector load is produced for this function.
define i32 @foo_fail(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) #0 {
entry:
; Lane 0: B[0] = A[0] * (m + n). %0 is used again by the ret below.
%0 = load i32* %A, align 4, !tbaa !0
%mul238 = add i32 %m, %n
%add = mul i32 %0, %mul238
store i32 %add, i32* %B, align 4, !tbaa !0
; Lane 1: B[1] = A[1] * (m + n).
%arrayidx4 = getelementptr inbounds i32* %A, i64 1
%1 = load i32* %arrayidx4, align 4, !tbaa !0
%add8 = mul i32 %1, %mul238
%arrayidx9 = getelementptr inbounds i32* %B, i64 1
store i32 %add8, i32* %arrayidx9, align 4, !tbaa !0
; Lane 2: B[2] = A[2] * (m + n).
%arrayidx10 = getelementptr inbounds i32* %A, i64 2
%2 = load i32* %arrayidx10, align 4, !tbaa !0
%add14 = mul i32 %2, %mul238
%arrayidx15 = getelementptr inbounds i32* %B, i64 2
store i32 %add14, i32* %arrayidx15, align 4, !tbaa !0
; Lane 3: B[3] = A[3] * (m + n).
%arrayidx16 = getelementptr inbounds i32* %A, i64 3
%3 = load i32* %arrayidx16, align 4, !tbaa !0
%add20 = mul i32 %3, %mul238
%arrayidx21 = getelementptr inbounds i32* %B, i64 3
store i32 %add20, i32* %arrayidx21, align 4, !tbaa !0
ret i32 %0 ;<--------- This value has multiple users and can't be vectorized.
}
attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
!0 = metadata !{metadata !"int", metadata !1}
!1 = metadata !{metadata !"omnipotent char", metadata !2}
!2 = metadata !{metadata !"Simple C/C++ TBAA"}