diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 7ae082f55e0..07967d883a9 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -3,7 +3,6 @@ add_llvm_library(LLVMVectorize
   Vectorize.cpp
   LoopVectorize.cpp
   SLPVectorizer.cpp
-  VecUtils.cpp
   )
 
 add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c3cb03764b2..1adc1ba8e2c 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -18,17 +18,20 @@
 #define SV_NAME "slp-vectorizer"
 #define DEBUG_TYPE "SLP"
 
-#include "VecUtils.h"
 #include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -36,6 +39,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <map>
 
 using namespace llvm;
@@ -46,9 +50,1138 @@ static cl::opt<int>
     "number. (gain = -cost of vectorization)"));
 
 namespace {
+static const unsigned MinVecRegSize = 128;
+
+static const unsigned RecursionMaxDepth = 6;
+
+/// RAII pattern to save the insertion point of the IR builder.
+class BuilderLocGuard {
+public:
+  BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()) {}
+  ~BuilderLocGuard() { Builder.SetInsertPoint(Loc); }
+
+private:
+  // Prevent copying.
+  BuilderLocGuard(const BuilderLocGuard &);
+  BuilderLocGuard &operator=(const BuilderLocGuard &);
+  IRBuilder<> &Builder;
+  BasicBlock::iterator Loc;
+};
+
+/// A helper class for numbering instructions in multiple blocks.
+/// Numbering starts at zero for each basic block.
+struct BlockNumbering {
+
+  BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {}
+
+  BlockNumbering() : BB(0), Valid(false) {}
+
+  void numberInstructions() {
+    unsigned Loc = 0;
+    InstrIdx.clear();
+    InstrVec.clear();
+    // Number the instructions in the block.
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      InstrIdx[it] = Loc++;
+      InstrVec.push_back(it);
+      assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
+    }
+    Valid = true;
+  }
+
+  int getIndex(Instruction *I) {
+    if (!Valid)
+      numberInstructions();
+    assert(InstrIdx.count(I) && "Unknown instruction");
+    return InstrIdx[I];
+  }
+
+  Instruction *getInstruction(unsigned loc) {
+    if (!Valid)
+      numberInstructions();
+    assert(InstrVec.size() > loc && "Invalid Index");
+    return InstrVec[loc];
+  }
+
+  void forget() { Valid = false; }
+
+private:
+  /// The block we are numbering.
+  BasicBlock *BB;
+  /// Is the block numbered.
+  bool Valid;
+  /// Maps instructions to numbers and back.
+  SmallDenseMap<Instruction *, int> InstrIdx;
+  /// Maps integers to Instructions.
+ std::vector InstrVec; +}; + +class FuncSLP { + typedef SmallVector ValueList; + typedef SmallVector InstrList; + typedef SmallPtrSet ValueSet; + typedef SmallVector StoreList; + +public: + static const int MAX_COST = INT_MIN; + + FuncSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li) + : F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), + Builder(Se->getContext()) { + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { + BasicBlock *BB = it; + BlocksNumbers[BB] = BlockNumbering(BB); + } + } + + /// \brief Take the pointer operand from the Load/Store instruction. + /// \returns NULL if this is not a valid Load/Store instruction. + static Value *getPointerOperand(Value *I); + + /// \brief Take the address space operand from the Load/Store instruction. + /// \returns -1 if this is not a valid Load/Store instruction. + static unsigned getAddressSpaceOperand(Value *I); + + /// \returns true if the memory operations A and B are consecutive. + bool isConsecutiveAccess(Value *A, Value *B); + + /// \brief Vectorize the tree that starts with the elements in \p VL. + /// \returns the vectorized value. + Value *vectorizeTree(ArrayRef VL); + + /// \returns the vectorization cost of the subtree that starts at \p VL. + /// A negative number means that this is profitable. + int getTreeCost(ArrayRef VL); + + /// \returns the scalarization cost for this list of values. Assuming that + /// this subtree gets vectorized, we may need to extract the values from the + /// roots. This method calculates the cost of extracting the values. + int getGatherCost(ArrayRef VL); + + /// \brief Attempts to order and vectorize a sequence of stores. This + /// function does a quadratic scan of the given stores. + /// \returns true if the basic block was modified. + bool vectorizeStores(ArrayRef Stores, int costThreshold); + + /// \brief Vectorize a group of scalars into a vector tree. + /// \returns the vectorized value. + Value *vectorizeArith(ArrayRef Operands); + + /// \brief This method contains the recursive part of getTreeCost. + int getTreeCost_rec(ArrayRef VL, unsigned Depth); + + /// \brief This recursive method looks for vectorization hazards such as + /// values that are used by multiple users and checks that values are used + /// by only one vector lane. It updates the variables LaneMap, MultiUserVals. + void getTreeUses_rec(ArrayRef VL, unsigned Depth); + + /// \brief This method contains the recursive part of vectorizeTree. + Value *vectorizeTree_rec(ArrayRef VL); + + /// \brief Vectorize a sorted sequence of stores. + bool vectorizeStoreChain(ArrayRef Chain, int CostThreshold); + + /// \returns the scalarization cost for this type. Scalarization in this + /// context means the creation of vectors from a group of scalars. + int getGatherCost(Type *Ty); + + /// \returns the AA location that is being access by the instruction. + AliasAnalysis::Location getLocation(Instruction *I); + + /// \brief Checks if it is possible to sink an instruction from + /// \p Src to \p Dst. + /// \returns the pointer to the barrier instruction if we can't sink. + Value *getSinkBarrier(Instruction *Src, Instruction *Dst); + + /// \returns the index of the last instrucion in the BB from \p VL. + int getLastIndex(ArrayRef VL); + + /// \returns the Instrucion in the bundle \p VL. + Instruction *getLastInstruction(ArrayRef VL); + + /// \returns the Instruction at index \p Index which is in Block \p BB. 
+ Instruction *getInstructionForIndex(unsigned Index, BasicBlock *BB); + + /// \returns the index of the first User of \p VL. + int getFirstUserIndex(ArrayRef VL); + + /// \returns a vector from a collection of scalars in \p VL. + Value *Gather(ArrayRef VL, VectorType *Ty); + + /// \brief Try to hoist gather sequences outside of the loop in cases where + /// all of the sources are loop invariant. + void hoistGatherSequence(); + + bool needToGatherAny(ArrayRef VL) { + for (int i = 0, e = VL.size(); i < e; ++i) + if (MustGather.count(VL[i])) + return true; + return false; + } + + /// -- Vectorization State -- + + /// Maps values in the tree to the vector lanes that uses them. This map must + /// be reset between runs of getCost. + std::map LaneMap; + /// A list of instructions to ignore while sinking + /// memory instructions. This map must be reset between runs of getCost. + ValueSet MemBarrierIgnoreList; + + /// Maps between the first scalar to the vector. This map must be reset + /// between runs. + DenseMap VectorizedValues; + + /// Contains values that must be gathered because they are used + /// by multiple lanes, or by users outside the tree. + /// NOTICE: The vectorization methods also use this set. + ValueSet MustGather; + + /// Contains a list of values that are used outside the current tree. This + /// set must be reset between runs. + SetVector MultiUserVals; + + /// Holds all of the instructions that we gathered. + SetVector GatherSeq; + + /// Numbers instructions in different blocks. + std::map BlocksNumbers; + + // Analysis and block reference. + Function *F; + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; + LoopInfo *LI; + /// Instruction builder to construct the vectorized tree. + IRBuilder<> Builder; +}; + +int FuncSLP::getGatherCost(Type *Ty) { + int Cost = 0; + for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + return Cost; +} + +int FuncSLP::getGatherCost(ArrayRef VL) { + // Find the type of the operands in VL. + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + // Find the cost of inserting/extracting values from the vector. + return getGatherCost(VecTy); +} + +AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) { + if (StoreInst *SI = dyn_cast(I)) + return AA->getLocation(SI); + if (LoadInst *LI = dyn_cast(I)) + return AA->getLocation(LI); + return AliasAnalysis::Location(); +} + +Value *FuncSLP::getPointerOperand(Value *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast(I)) + return SI->getPointerOperand(); + return 0; +} + +unsigned FuncSLP::getAddressSpaceOperand(Value *I) { + if (LoadInst *L = dyn_cast(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast(I)) + return S->getPointerAddressSpace(); + return -1; +} + +bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Check that A and B are of the same type. + if (PtrA->getType() != PtrB->getType()) + return false; + + // Calculate the distance. 
+ const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); + const SCEVConstant *ConstOffSCEV = dyn_cast(OffsetSCEV); + + // Non constant distance. + if (!ConstOffSCEV) + return false; + + int64_t Offset = ConstOffSCEV->getValue()->getSExtValue(); + Type *Ty = cast(PtrA->getType())->getElementType(); + // The Instructions are connsecutive if the size of the first load/store is + // the same as the offset. + int64_t Sz = DL->getTypeStoreSize(Ty); + return ((-Offset) == Sz); +} + +Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { + assert(Src->getParent() == Dst->getParent() && "Not the same BB"); + BasicBlock::iterator I = Src, E = Dst; + /// Scan all of the instruction from SRC to DST and check if + /// the source may alias. + for (++I; I != E; ++I) { + // Ignore store instructions that are marked as 'ignore'. + if (MemBarrierIgnoreList.count(I)) + continue; + if (Src->mayWriteToMemory()) /* Write */ { + if (!I->mayReadOrWriteMemory()) + continue; + } else /* Read */ { + if (!I->mayWriteToMemory()) + continue; + } + AliasAnalysis::Location A = getLocation(&*I); + AliasAnalysis::Location B = getLocation(Src); + + if (!A.Ptr || !B.Ptr || AA->alias(A, B)) + return I; + } + return 0; +} + +static BasicBlock *getSameBlock(ArrayRef VL) { + BasicBlock *BB = 0; + for (int i = 0, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast(VL[i]); + if (!I) + return 0; + + if (!BB) { + BB = I->getParent(); + continue; + } + + if (BB != I->getParent()) + return 0; + } + return BB; +} + +static bool allConstant(ArrayRef VL) { + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (!isa(VL[i])) + return false; + return true; +} + +static bool isSplat(ArrayRef VL) { + for (unsigned i = 1, e = VL.size(); i < e; ++i) + if (VL[i] != VL[0]) + return false; + return true; +} + +static unsigned getSameOpcode(ArrayRef VL) { + unsigned Opcode = 0; + for (int i = 0, e = VL.size(); i < e; i++) { + if (Instruction *I = dyn_cast(VL[i])) { + if (!Opcode) { + Opcode = I->getOpcode(); + continue; + } + if (Opcode != I->getOpcode()) + return 0; + } + } + return Opcode; +} + +static bool CanReuseExtract(ArrayRef VL, unsigned VF, + VectorType *VecTy) { + assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); + // Check if all of the extracts come from the same vector and from the + // correct offset. + Value *VL0 = VL[0]; + ExtractElementInst *E0 = cast(VL0); + Value *Vec = E0->getOperand(0); + + // We have to extract from the same vector type. + if (Vec->getType() != VecTy) + return false; + + // Check that all of the indices extract from the correct offset. + ConstantInt *CI = dyn_cast(E0->getOperand(1)); + if (!CI || CI->getZExtValue()) + return false; + + for (unsigned i = 1, e = VF; i < e; ++i) { + ExtractElementInst *E = cast(VL[i]); + ConstantInt *CI = dyn_cast(E->getOperand(1)); + + if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) + return false; + } + + return true; +} + +void FuncSLP::getTreeUses_rec(ArrayRef VL, unsigned Depth) { + if (Depth == RecursionMaxDepth) + return MustGather.insert(VL.begin(), VL.end()); + + // Don't handle vectors. + if (VL[0]->getType()->isVectorTy()) + return; + + if (StoreInst *SI = dyn_cast(VL[0])) + if (SI->getValueOperand()->getType()->isVectorTy()) + return; + + // If all of the operands are identical or constant we have a simple solution. 
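The consecutiveness test in isConsecutiveAccess above boils down to one rule: ScalarEvolution folds PtrA - PtrB to a constant, and the accesses are adjacent only when that constant is minus the store size of the element type. A minimal stand-alone sketch of that rule, with plain byte addresses standing in for the SCEV expressions (the names below are illustrative, not LLVM API):

#include <cassert>
#include <cstdint>

// Stand-in for SE->getMinusSCEV(PtrA, PtrB) collapsing to a SCEVConstant.
// Here the "pointers" are plain byte addresses, so the distance is always
// a constant; in the pass a non-constant SCEV makes the check fail.
static int64_t constantDistance(int64_t PtrA, int64_t PtrB) {
  return PtrA - PtrB;
}

// Mirrors the tail of isConsecutiveAccess: B directly follows A exactly
// when -(PtrA - PtrB) equals the element's store size.
static bool isConsecutive(int64_t PtrA, int64_t PtrB, int64_t ElemStoreSize) {
  int64_t Offset = constantDistance(PtrA, PtrB);
  return -Offset == ElemStoreSize;
}

int main() {
  // Two float stores, to &A[3] (byte 12) and &A[4] (byte 16): the offset is
  // -4 and the float store size is 4, so the pair is consecutive.
  assert(isConsecutive(12, 16, 4));
  // The same pointers in the opposite order are not "A followed by B".
  assert(!isConsecutive(16, 12, 4));
  return 0;
}

In the pass the subtraction is symbolic, so A[i] and A[i+1] compare as consecutive even when i is unknown at compile time. The diff resumes inside getTreeUses_rec, where bundles that are all-constant, splats, or split across blocks are sent straight to MustGather.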
+ if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL)) + return MustGather.insert(VL.begin(), VL.end()); + + // Stop the scan at unknown IR. + Instruction *VL0 = dyn_cast(VL[0]); + assert(VL0 && "Invalid instruction"); + + // Mark instructions with multiple users. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast(VL[i]); + // Remember to check if all of the users of this instruction are vectorized + // within our tree. At depth zero we have no local users, only external + // users that we don't care about. + if (Depth && I && I->getNumUses() > 1) { + DEBUG(dbgs() << "SLP: Adding to MultiUserVals " + "because it has multiple users:" << *I << " \n"); + MultiUserVals.insert(I); + } + } + + // Check that the instruction is only used within one lane. + for (int i = 0, e = VL.size(); i < e; ++i) { + if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) { + DEBUG(dbgs() << "SLP: Value used by multiple lanes:" << *VL[i] << "\n"); + return MustGather.insert(VL.begin(), VL.end()); + } + // Make this instruction as 'seen' and remember the lane. + LaneMap[VL[i]] = i; + } + + unsigned Opcode = getSameOpcode(VL); + if (!Opcode) + return MustGather.insert(VL.begin(), VL.end()); + + switch (Opcode) { + case Instruction::ExtractElement: { + VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size()); + // No need to follow ExtractElements that are going to be optimized away. + if (CanReuseExtract(VL, VL.size(), VecTy)) + return; + // Fall through. + } + case Instruction::Load: + return; + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + case Instruction::Select: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. 
+ for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + getTreeUses_rec(Operands, Depth + 1); + } + return; + } + case Instruction::Store: { + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(0)); + getTreeUses_rec(Operands, Depth + 1); + return; + } + default: + return MustGather.insert(VL.begin(), VL.end()); + } +} + +int FuncSLP::getLastIndex(ArrayRef VL) { + BasicBlock *BB = cast(VL[0])->getParent(); + assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); + BlockNumbering &BN = BlocksNumbers[BB]; + + int MaxIdx = BN.getIndex(BB->getFirstNonPHI()); + for (unsigned i = 0, e = VL.size(); i < e; ++i) + MaxIdx = std::max(MaxIdx, BN.getIndex(cast(VL[i]))); + return MaxIdx; +} + +Instruction *FuncSLP::getLastInstruction(ArrayRef VL) { + BasicBlock *BB = cast(VL[0])->getParent(); + assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); + BlockNumbering &BN = BlocksNumbers[BB]; + + int MaxIdx = BN.getIndex(cast(VL[0])); + for (unsigned i = 1, e = VL.size(); i < e; ++i) + MaxIdx = std::max(MaxIdx, BN.getIndex(cast(VL[i]))); + return BN.getInstruction(MaxIdx); +} + +Instruction *FuncSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) { + BlockNumbering &BN = BlocksNumbers[BB]; + return BN.getInstruction(Index); +} + +int FuncSLP::getFirstUserIndex(ArrayRef VL) { + BasicBlock *BB = getSameBlock(VL); + BlockNumbering &BN = BlocksNumbers[BB]; + + // Find the first user of the values. + int FirstUser = BN.getIndex(BB->getTerminator()); + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); + U != UE; ++U) { + Instruction *Instr = dyn_cast(*U); + + if (!Instr || Instr->getParent() != BB) + continue; + + FirstUser = std::min(FirstUser, BN.getIndex(Instr)); + } + } + return FirstUser; +} + +int FuncSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { + Type *ScalarTy = VL[0]->getType(); + + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + + /// Don't mess with vectors. + if (ScalarTy->isVectorTy()) + return FuncSLP::MAX_COST; + + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (allConstant(VL)) + return 0; + + if (isSplat(VL)) + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + + if (Depth == RecursionMaxDepth || needToGatherAny(VL)) + return getGatherCost(VecTy); + + BasicBlock *BB = getSameBlock(VL); + unsigned Opcode = getSameOpcode(VL); + assert(Opcode && BB && "Invalid Instruction Value"); + + // Check if it is safe to sink the loads or the stores. 
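getLastIndex and getFirstUserIndex above reduce the in-tree-user check to integer comparisons on BlockNumbering positions: a tree is rejected when some user of the roots appears before the last root, because the wide value would be emitted after one of its own users. A stand-alone sketch of that ordering test, with hand-picked indices in place of real instructions (illustrative names, not the pass's classes):

#include <algorithm>
#include <cassert>
#include <vector>

// A bundle of scalar roots in one basic block, described only by the
// positions BlockNumbering would assign to the roots and to their users.
struct Bundle {
  std::vector<int> RootIdx;
  std::vector<int> UserIdx;
};

static int getLastIndex(const Bundle &B) {
  return *std::max_element(B.RootIdx.begin(), B.RootIdx.end());
}

static int getFirstUserIndex(const Bundle &B, int TerminatorIdx) {
  int First = TerminatorIdx; // default when there is no in-block user
  for (int U : B.UserIdx)
    First = std::min(First, U);
  return First;
}

int main() {
  // Roots at positions 4 and 7, single user at 9: the last root comes
  // before the first user, so the vector value can be placed safely.
  Bundle Legal = {{4, 7}, {9}};
  assert(getLastIndex(Legal) < getFirstUserIndex(Legal, 20));

  // A user at position 5 sits between the roots: emitting the vector value
  // at the last root would put it after one of its users, so such a tree
  // is not vectorized.
  Bundle Illegal = {{4, 7}, {5}};
  assert(getLastIndex(Illegal) > getFirstUserIndex(Illegal, 20));
  return 0;
}

The hunk continues with the matching sink-safety scan that getTreeCost_rec performs for load and store bundles.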
+ if (Opcode == Instruction::Load || Opcode == Instruction::Store) { + int MaxIdx = getLastIndex(VL); + Instruction *Last = getInstructionForIndex(MaxIdx, BB); + + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (VL[i] == Last) + continue; + Value *Barrier = getSinkBarrier(cast(VL[i]), Last); + if (Barrier) { + DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last + << "\n because of " << *Barrier << "\n"); + return MAX_COST; + } + } + } + + Instruction *VL0 = cast(VL[0]); + switch (Opcode) { + case Instruction::ExtractElement: { + if (CanReuseExtract(VL, VL.size(), VecTy)) + return 0; + return getGatherCost(VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList Operands; + Type *SrcTy = VL0->getOperand(0)->getType(); + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) { + Operands.push_back(cast(VL[j])->getOperand(0)); + // Check that the casted type is the same for all users. + if (cast(VL[j])->getOperand(0)->getType() != SrcTy) + return getGatherCost(VecTy); + } + + int Cost = getTreeCost_rec(Operands, Depth + 1); + if (Cost == FuncSLP::MAX_COST) + return Cost; + + // Calculate the cost of this instruction. + int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), + VL0->getType(), SrcTy); + + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + Cost += (VecCost - ScalarCost); + return Cost; + } + case Instruction::FCmp: + case Instruction::ICmp: { + // Check that all of the compares have the same predicate. + CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + CmpInst *Cmp = cast(VL[i]); + if (Cmp->getPredicate() != P0) + return getGatherCost(VecTy); + } + // Fall through. + } + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + int TotalCost = 0; + // Calculate the cost of all of the operands. + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + int Cost = getTreeCost_rec(Operands, Depth + 1); + if (Cost == MAX_COST) + return MAX_COST; + TotalCost += TotalCost; + } + + // Calculate the cost of this instruction. 
+ int ScalarCost = 0; + int VecCost = 0; + if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || + Opcode == Instruction::Select) { + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + ScalarCost = + VecTy->getNumElements() * + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); + VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + } else { + ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + } + TotalCost += (VecCost - ScalarCost); + return TotalCost; + } + case Instruction::Load: { + // If we are scalarize the loads, add the cost of forming the vector. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) + return getGatherCost(VecTy); + + // Cost of wide load - cost of scalar loads. + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { + // We know that we can merge the stores. Calculate the cost. + int ScalarStCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int StoreCost = VecStCost - ScalarStCost; + + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) { + Operands.push_back(cast(VL[j])->getOperand(0)); + MemBarrierIgnoreList.insert(VL[j]); + } + + int Cost = getTreeCost_rec(Operands, Depth + 1); + if (Cost == MAX_COST) + return MAX_COST; + + int TotalCost = StoreCost + Cost; + return TotalCost; + } + default: + // Unable to vectorize unknown instructions. + return getGatherCost(VecTy); + } +} + +int FuncSLP::getTreeCost(ArrayRef VL) { + // Get rid of the list of stores that were removed, and from the + // lists of instructions with multiple users. + MemBarrierIgnoreList.clear(); + LaneMap.clear(); + MultiUserVals.clear(); + MustGather.clear(); + + if (!getSameBlock(VL)) + return MAX_COST; + + // Find the location of the last root. + int LastRootIndex = getLastIndex(VL); + int FirstUserIndex = getFirstUserIndex(VL); + + // Don't vectorize if there are users of the tree roots inside the tree + // itself. + if (LastRootIndex > FirstUserIndex) + return MAX_COST; + + // Scan the tree and find which value is used by which lane, and which values + // must be scalarized. + getTreeUses_rec(VL, 0); + + // Check that instructions with multiple users can be vectorized. Mark unsafe + // instructions. + for (SetVector::iterator it = MultiUserVals.begin(), + e = MultiUserVals.end(); + it != e; ++it) { + // Check that all of the users of this instr are within the tree. + for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); + I != E; ++I) { + if (LaneMap.find(*I) == LaneMap.end()) { + DEBUG(dbgs() << "SLP: Adding to MustExtract " + "because of an out of tree usage.\n"); + MustGather.insert(*it); + continue; + } + } + } + + // Now calculate the cost of vectorizing the tree. 
+ return getTreeCost_rec(VL, 0); +} +bool FuncSLP::vectorizeStoreChain(ArrayRef Chain, int CostThreshold) { + unsigned ChainLen = Chain.size(); + DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + << "\n"); + Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); + unsigned Sz = DL->getTypeSizeInBits(StoreTy); + unsigned VF = MinVecRegSize / Sz; + + if (!isPowerOf2_32(Sz) || VF < 2) + return false; + + bool Changed = false; + // Look for profitable vectorizable trees at all offsets, starting at zero. + for (unsigned i = 0, e = ChainLen; i < e; ++i) { + if (i + VF > e) + break; + DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i + << "\n"); + ArrayRef Operands = Chain.slice(i, VF); + + int Cost = getTreeCost(Operands); + if (Cost == FuncSLP::MAX_COST) + continue; + DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + vectorizeTree(Operands); + i += VF - 1; + Changed = true; + } + } + + if (Changed || ChainLen > VF) + return Changed; + + // Handle short chains. This helps us catch types such as <3 x float> that + // are smaller than vector size. + int Cost = getTreeCost(Chain); + if (Cost == FuncSLP::MAX_COST) + return false; + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost + << " for size = " << ChainLen << "\n"); + vectorizeTree(Chain); + return true; + } + + return false; +} + +bool FuncSLP::vectorizeStores(ArrayRef Stores, int costThreshold) { + SetVector Heads, Tails; + SmallDenseMap ConsecutiveChain; + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we vectorized so that we don't visit the same store twice. + ValueSet VectorizedStores; + bool Changed = false; + + // Do a quadratic search on all of the given stores and find + // all of the pairs of loads that follow each other. + for (unsigned i = 0, e = Stores.size(); i < e; ++i) + for (unsigned j = 0; j < e; ++j) { + if (i == j) + continue; + + if (isConsecutiveAccess(Stores[i], Stores[j])) { + Tails.insert(Stores[j]); + Heads.insert(Stores[i]); + ConsecutiveChain[Stores[i]] = Stores[j]; + } + } + + // For stores that start but don't end a link in the chain: + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to vectorize it. + ValueList Operands; + Value *I = *it; + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (VectorizedStores.count(I)) + break; + Operands.push_back(I); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + bool Vectorized = vectorizeStoreChain(Operands, costThreshold); + + // Mark the vectorized stores so that we don't vectorize them again. + if (Vectorized) + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed |= Vectorized; + } + + return Changed; +} + +Value *FuncSLP::Gather(ArrayRef VL, VectorType *Ty) { + Value *Vec = UndefValue::get(Ty); + // Generate the 'InsertElement' instruction. 
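vectorizeStores above discovers chains with a quadratic scan: every store records its consecutive successor, and each "head" that is nobody's successor is walked forward to collect a candidate bundle for vectorizeStoreChain. A stand-alone sketch of that discovery step, with byte addresses in place of StoreInst pointers (illustrative only, not the pass's code):

#include <cassert>
#include <map>
#include <set>
#include <vector>

// Quadratic chain discovery: record, for every store, the store that is
// consecutive to it, then follow each chain from a store that starts one.
static std::vector<std::vector<int>>
collectChains(const std::vector<int> &Addr, int ElemSize) {
  std::set<int> Heads, Tails;
  std::map<int, int> ConsecutiveChain; // store -> store that follows it
  for (int A : Addr)
    for (int B : Addr)
      if (A != B && B - A == ElemSize) {
        Heads.insert(A);
        Tails.insert(B);
        ConsecutiveChain[A] = B;
      }

  std::vector<std::vector<int>> Chains;
  for (int H : Heads) {
    if (Tails.count(H))
      continue; // H continues another chain; it does not start one.
    std::vector<int> Chain;
    int I = H;
    while (true) {
      Chain.push_back(I);
      std::map<int, int>::iterator It = ConsecutiveChain.find(I);
      if (It == ConsecutiveChain.end())
        break;
      I = It->second;
    }
    Chains.push_back(Chain);
  }
  return Chains;
}

int main() {
  // Four adjacent 4-byte stores plus one unrelated store: a single chain of
  // length four is found and would be handed to vectorizeStoreChain.
  std::vector<std::vector<int>> C = collectChains({0, 4, 8, 12, 100}, 4);
  assert(C.size() == 1 && C[0].size() == 4);
  return 0;
}

The diff resumes inside Gather(), which builds the insertelement sequence for bundles that could not be vectorized.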
+ for (unsigned i = 0; i < Ty->getNumElements(); ++i) { + Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); + if (Instruction *I = dyn_cast(Vec)) + GatherSeq.insert(I); + } + + VectorizedValues[VL[0]] = Vec; + return Vec; +} + +Value *FuncSLP::vectorizeTree_rec(ArrayRef VL) { + BuilderLocGuard Guard(Builder); + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (needToGatherAny(VL)) + return Gather(VL, VecTy); + + if (VectorizedValues.count(VL[0])) { + DEBUG(dbgs() << "SLP: Diamond merged at depth.\n"); + return VectorizedValues[VL[0]]; + } + + Instruction *VL0 = cast(VL[0]); + unsigned Opcode = VL0->getOpcode(); + assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); + + switch (Opcode) { + case Instruction::ExtractElement: { + if (CanReuseExtract(VL, VL.size(), VecTy)) + return VL0->getOperand(0); + return Gather(VL, VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList INVL; + for (int i = 0, e = VL.size(); i < e; ++i) + INVL.push_back(cast(VL[i])->getOperand(0)); + + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *InVec = vectorizeTree_rec(INVL); + CastInst *CI = dyn_cast(VL0); + Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + VectorizedValues[VL0] = V; + return V; + } + case Instruction::FCmp: + case Instruction::ICmp: { + // Check that all of the compares have the same predicate. 
+ CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + CmpInst *Cmp = cast(VL[i]); + if (Cmp->getPredicate() != P0) + return Gather(VL, VecTy); + } + + ValueList LHSV, RHSV; + for (int i = 0, e = VL.size(); i < e; ++i) { + LHSV.push_back(cast(VL[i])->getOperand(0)); + RHSV.push_back(cast(VL[i])->getOperand(1)); + } + + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *L = vectorizeTree_rec(LHSV); + Value *R = vectorizeTree_rec(RHSV); + Value *V; + + if (Opcode == Instruction::FCmp) + V = Builder.CreateFCmp(P0, L, R); + else + V = Builder.CreateICmp(P0, L, R); + + VectorizedValues[VL0] = V; + return V; + } + case Instruction::Select: { + ValueList TrueVec, FalseVec, CondVec; + for (int i = 0, e = VL.size(); i < e; ++i) { + CondVec.push_back(cast(VL[i])->getOperand(0)); + TrueVec.push_back(cast(VL[i])->getOperand(1)); + FalseVec.push_back(cast(VL[i])->getOperand(2)); + } + + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *True = vectorizeTree_rec(TrueVec); + Value *False = vectorizeTree_rec(FalseVec); + Value *Cond = vectorizeTree_rec(CondVec); + Value *V = Builder.CreateSelect(Cond, True, False); + VectorizedValues[VL0] = V; + return V; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + ValueList LHSVL, RHSVL; + for (int i = 0, e = VL.size(); i < e; ++i) { + LHSVL.push_back(cast(VL[i])->getOperand(0)); + RHSVL.push_back(cast(VL[i])->getOperand(1)); + } + + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *LHS = vectorizeTree_rec(LHSVL); + Value *RHS = vectorizeTree_rec(RHSVL); + + if (LHS == RHS) { + assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); + } + + BinaryOperator *BinOp = cast(VL0); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); + VectorizedValues[VL0] = V; + return V; + } + case Instruction::Load: { + // Check if all of the loads are consecutive. + for (unsigned i = 1, e = VL.size(); i < e; ++i) + if (!isConsecutiveAccess(VL[i - 1], VL[i])) + return Gather(VL, VecTy); + + // Loads are inserted at the head of the tree because we don't want to + // sink them all the way down past store instructions. 
+ Builder.SetInsertPoint(getLastInstruction(VL)); + LoadInst *LI = cast(VL0); + Value *VecPtr = + Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo()); + unsigned Alignment = LI->getAlignment(); + LI = Builder.CreateLoad(VecPtr); + LI->setAlignment(Alignment); + + VectorizedValues[VL0] = LI; + return LI; + } + case Instruction::Store: { + StoreInst *SI = cast(VL0); + unsigned Alignment = SI->getAlignment(); + + ValueList ValueOp; + for (int i = 0, e = VL.size(); i < e; ++i) + ValueOp.push_back(cast(VL[i])->getValueOperand()); + + Value *VecValue = vectorizeTree_rec(ValueOp); + + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *VecPtr = + Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); + Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); + + for (int i = 0, e = VL.size(); i < e; ++i) + cast(VL[i])->eraseFromParent(); + return 0; + } + default: + return Gather(VL, VecTy); + } +} + +Value *FuncSLP::vectorizeTree(ArrayRef VL) { + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *V = vectorizeTree_rec(VL); + + // We moved some instructions around. We have to number them again + // before we can do any analysis. + MustGather.clear(); + VectorizedValues.clear(); + MemBarrierIgnoreList.clear(); + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) + BlocksNumbers[it].forget(); + return V; +} + +Value *FuncSLP::vectorizeArith(ArrayRef Operands) { + Value *Vec = vectorizeTree(Operands); + // After vectorizing the operands we need to generate extractelement + // instructions and replace all of the uses of the scalar values with + // the values that we extracted from the vectorized tree. + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); + Operands[i]->replaceAllUsesWith(S); + } + + return Vec; +} + +void FuncSLP::hoistGatherSequence() { + for (SetVector::iterator it = GatherSeq.begin(), + e = GatherSeq.end(); + it != e; ++it) { + InsertElementInst *Insert = dyn_cast_or_null(*it); + + // The InsertElement sequence can be simplified into a constant. + // Also Ignore NULL pointers because they are only here to separate + // sequences. + if (!Insert) + continue; + + BasicBlock *BB = Insert->getParent(); + + // Check if this block is inside a loop. + Loop *L = LI->getLoopFor(BB); + if (!L) + return; + + // Check if it has a preheader. + BasicBlock *PreHeader = L->getLoopPreheader(); + if (!PreHeader) + return; + + // If the vector or the element that we insert into it are + // instructions that are defined in this basic block then we can't + // hoist this instruction. + Instruction *CurrVec = dyn_cast(Insert->getOperand(0)); + Instruction *NewElem = dyn_cast(Insert->getOperand(1)); + if (CurrVec && L->contains(CurrVec)) + continue; + if (NewElem && L->contains(NewElem)) + continue; + + // Mark the insertion point for the block. + Instruction *Location = PreHeader->getTerminator(); + // We can hoist this instruction. Move it to the pre-header. + Insert->moveBefore(Location); + } +} + /// The SLPVectorizer Pass. 
 struct SLPVectorizer : public FunctionPass {
-  typedef MapVector<Value *, BoUpSLP::StoreList> StoreListMap;
+  typedef SmallVector<StoreInst *, 8> StoreList;
+  typedef MapVector<Value *, StoreList> StoreListMap;
 
   /// Pass identification, replacement for typeid
   static char ID;
@@ -80,34 +1213,26 @@ struct SLPVectorizer : public FunctionPass {
     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
 
+    // Use the bottom up slp vectorizer to construct chains that start with
+    // the store instructions.
+    FuncSLP R(&F, SE, DL, TTI, AA, LI);
+
     for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) {
       BasicBlock *BB = it;
-      bool BBChanged = false;
-
-      // Use the bollom up slp vectorizer to construct chains that start with
-      // he store instructions.
-      BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB));
 
       // Vectorize trees that end at reductions.
-      BBChanged |= vectorizeChainsInBlock(BB, R);
+      Changed |= vectorizeChainsInBlock(BB, R);
 
       // Vectorize trees that end at stores.
       if (unsigned count = collectStores(BB, R)) {
        (void)count;
        DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
-        BBChanged |= vectorizeStoreChains(R);
+        Changed |= vectorizeStoreChains(R);
      }
-
-      // Try to hoist some of the scalarization code to the preheader.
-      if (BBChanged) {
-        hoistGatherSequence(LI, BB, R);
-        Changed |= vectorizeUsingGatherHints(R.getGatherSeqInstructions());
-      }
-
-      Changed |= BBChanged;
     }
 
     if (Changed) {
+      R.hoistGatherSequence();
       DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
       DEBUG(verifyFunction(F));
     }
@@ -128,42 +1253,31 @@ private:
   /// object. We sort the stores to their base objects to reduce the cost of the
   /// quadratic search on the stores. TODO: We can further reduce this cost
   /// if we flush the chain creation every time we run into a memory barrier.
-  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
+  unsigned collectStores(BasicBlock *BB, FuncSLP &R);
 
   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
-  bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
+  bool tryToVectorizePair(Value *A, Value *B, FuncSLP &R);
 
   /// \brief Try to vectorize a list of operands. If \p NeedExtracts is true
   /// then we calculate the cost of extracting the scalars from the vector.
   /// \returns true if a value was vectorized.
-  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool NeedExtracts);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R, bool NeedExtracts);
 
   /// \brief Try to vectorize a chain that may start at the operands of \V;
-  bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
+  bool tryToVectorize(BinaryOperator *V, FuncSLP &R);
 
   /// \brief Vectorize the stores that were collected in StoreRefs.
-  bool vectorizeStoreChains(BoUpSLP &R);
-
-  /// \brief Try to hoist gather sequences outside of the loop in cases where
-  /// all of the sources are loop invariant.
-  void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R);
-
-  /// \brief Try to vectorize additional sequences in different basic blocks
-  /// based on values that we gathered in previous blocks. The list \p Gathers
-  /// holds the gather InsertElement instructions that were generated during
-  /// vectorization.
-  /// \returns True if some code was vectorized.
-  bool vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers);
+  bool vectorizeStoreChains(FuncSLP &R);
 
   /// \brief Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
- bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); + bool vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R); private: StoreListMap StoreRefs; }; -unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { +unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) { unsigned count = 0; StoreRefs.clear(); for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { @@ -188,14 +1302,14 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { return count; } -bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { +bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, FuncSLP &R) { if (!A || !B) return false; Value *VL[] = { A, B }; return tryToVectorizeList(VL, R, true); } -bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, +bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, FuncSLP &R, bool NeedExtracts) { if (VL.size() < 2) return false; @@ -219,7 +1333,10 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, } int Cost = R.getTreeCost(VL); - int ExtrCost = NeedExtracts ? R.getScalarizationCost(VL) : 0; + if (Cost == FuncSLP::MAX_COST) + return false; + + int ExtrCost = NeedExtracts ? R.getGatherCost(VL) : 0; DEBUG(dbgs() << "SLP: Cost of pair:" << Cost << " Cost of extract:" << ExtrCost << ".\n"); if ((Cost + ExtrCost) >= -SLPCostThreshold) @@ -229,10 +1346,10 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, return true; } -bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { +bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) { if (!V) return false; - + // Try to vectorize V. if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) return true; @@ -269,7 +1386,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { return 0; } -bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { +bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) { bool Changed = false; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { if (isa(it)) @@ -292,7 +1409,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { Value *Inst = BI->getOperand(0); if (Inst == P) Inst = BI->getOperand(1); - + Changed |= tryToVectorize(dyn_cast(Inst), R); continue; } @@ -337,7 +1454,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return Changed; } -bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { +bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) { bool Changed = false; // Attempt to sort and vectorize each of the store-groups. for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); @@ -353,92 +1470,6 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { return Changed; } -bool SLPVectorizer::vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers) { - SmallVector Seq; - bool Changed = false; - for (int i = 0, e = Gathers.size(); i < e; ++i) { - InsertElementInst *IEI = dyn_cast_or_null(Gathers[i]); - - if (IEI) { - if (Instruction *I = dyn_cast(IEI->getOperand(1))) - Seq.push_back(I); - } else { - - if (!Seq.size()) - continue; - - Instruction *I = cast(Seq[0]); - BasicBlock *BB = I->getParent(); - - DEBUG(dbgs() << "SLP: Inspecting a gather list of size " << Seq.size() - << " in " << BB->getName() << ".\n"); - - // Check if the gathered values have multiple uses. If they only have one - // user then we know that the insert/extract pair will go away. 
- bool HasMultipleUsers = false; - for (int i = 0; e = Seq.size(), i < e; ++i) { - if (!Seq[i]->hasOneUse()) { - HasMultipleUsers = true; - break; - } - } - - BoUpSLP BO(BB, SE, DL, TTI, AA, LI->getLoopFor(BB)); - - if (tryToVectorizeList(Seq, BO, HasMultipleUsers)) { - DEBUG(dbgs() << "SLP: Vectorized a gather list of len " << Seq.size() - << " in " << BB->getName() << ".\n"); - Changed = true; - } - - Seq.clear(); - } - } - - return Changed; -} - -void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, - BoUpSLP &R) { - // Check if this block is inside a loop. - Loop *L = LI->getLoopFor(BB); - if (!L) - return; - - // Check if it has a preheader. - BasicBlock *PreHeader = L->getLoopPreheader(); - if (!PreHeader) - return; - - // Mark the insertion point for the block. - Instruction *Location = PreHeader->getTerminator(); - - BoUpSLP::InstrList &Gathers = R.getGatherSeqInstructions(); - for (BoUpSLP::InstrList::iterator it = Gathers.begin(), e = Gathers.end(); - it != e; ++it) { - InsertElementInst *Insert = dyn_cast_or_null(*it); - - // The InsertElement sequence can be simplified into a constant. - // Also Ignore NULL pointers because they are only here to separate - // sequences. - if (!Insert) - continue; - - // If the vector or the element that we insert into it are - // instructions that are defined in this basic block then we can't - // hoist this instruction. - Instruction *CurrVec = dyn_cast(Insert->getOperand(0)); - Instruction *NewElem = dyn_cast(Insert->getOperand(1)); - if (CurrVec && L->contains(CurrVec)) - continue; - if (NewElem && L->contains(NewElem)) - continue; - - // We can hoist this instruction. Move it to the pre-header. - Insert->moveBefore(Location); - } -} - } // end anonymous namespace char SLPVectorizer::ID = 0; diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp deleted file mode 100644 index 3db4adf95c8..00000000000 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ /dev/null @@ -1,1031 +0,0 @@ -//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "SLP" - -#include "VecUtils.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Value.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include -#include - -using namespace llvm; - -static const unsigned MinVecRegSize = 128; - -static const unsigned RecursionMaxDepth = 6; - -namespace llvm { - -BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) - : Builder(S->getContext()), BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { - numberInstructions(); -} - -void BoUpSLP::numberInstructions() { - int Loc = 0; - InstrIdx.clear(); - InstrVec.clear(); - // Number the instructions in the block. - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - InstrIdx[it] = Loc++; - InstrVec.push_back(it); - assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); - } -} - -Value *BoUpSLP::getPointerOperand(Value *I) { - if (LoadInst *LI = dyn_cast(I)) - return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast(I)) - return SI->getPointerOperand(); - return 0; -} - -unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { - if (LoadInst *L = dyn_cast(I)) - return L->getPointerAddressSpace(); - if (StoreInst *S = dyn_cast(I)) - return S->getPointerAddressSpace(); - return -1; -} - -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { - Value *PtrA = getPointerOperand(A); - Value *PtrB = getPointerOperand(B); - unsigned ASA = getAddressSpaceOperand(A); - unsigned ASB = getAddressSpaceOperand(B); - - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) - return false; - - // Check that A and B are of the same type. - if (PtrA->getType() != PtrB->getType()) - return false; - - // Calculate the distance. - const SCEV *PtrSCEVA = SE->getSCEV(PtrA); - const SCEV *PtrSCEVB = SE->getSCEV(PtrB); - const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); - const SCEVConstant *ConstOffSCEV = dyn_cast(OffsetSCEV); - - // Non constant distance. - if (!ConstOffSCEV) - return false; - - int64_t Offset = ConstOffSCEV->getValue()->getSExtValue(); - Type *Ty = cast(PtrA->getType())->getElementType(); - // The Instructions are connsecutive if the size of the first load/store is - // the same as the offset. 
- int64_t Sz = DL->getTypeStoreSize(Ty); - return ((-Offset) == Sz); -} - -bool BoUpSLP::vectorizeStoreChain(ArrayRef Chain, int CostThreshold) { - unsigned ChainLen = Chain.size(); - DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen - << "\n"); - Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; - - if (!isPowerOf2_32(Sz) || VF < 2) - return false; - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i < e; ++i) { - if (i + VF > e) - break; - DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - ArrayRef Operands = Chain.slice(i, VF); - - int Cost = getTreeCost(Operands); - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Operands, VF))); - vectorizeTree(Operands, VF); - i += VF - 1; - Changed = true; - } - } - - if (Changed || ChainLen > VF) - return Changed; - - // Handle short chains. This helps us catch types such as <3 x float> that - // are smaller than vector size. - int Cost = getTreeCost(Chain); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost - << " for size = " << ChainLen << "\n"); - Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Chain, ChainLen))); - vectorizeTree(Chain, ChainLen); - return true; - } - - return false; -} - -bool BoUpSLP::vectorizeStores(ArrayRef Stores, int costThreshold) { - SetVector Heads, Tails; - SmallDenseMap ConsecutiveChain; - - // We may run into multiple chains that merge into a single chain. We mark the - // stores that we vectorized so that we don't visit the same store twice. - ValueSet VectorizedStores; - bool Changed = false; - - // Do a quadratic search on all of the given stores and find - // all of the pairs of loads that follow each other. - for (unsigned i = 0, e = Stores.size(); i < e; ++i) - for (unsigned j = 0; j < e; ++j) { - if (i == j) - continue; - - if (isConsecutiveAccess(Stores[i], Stores[j])) { - Tails.insert(Stores[j]); - Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; - } - } - - // For stores that start but don't end a link in the chain: - for (SetVector::iterator it = Heads.begin(), e = Heads.end(); - it != e; ++it) { - if (Tails.count(*it)) - continue; - - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. - ValueList Operands; - Value *I = *it; - // Collect the chain into a list. - while (Tails.count(I) || Heads.count(I)) { - if (VectorizedStores.count(I)) - break; - Operands.push_back(I); - // Move to the next value in the chain. - I = ConsecutiveChain[I]; - } - - bool Vectorized = vectorizeStoreChain(Operands, costThreshold); - - // Mark the vectorized stores so that we don't vectorize them again. - if (Vectorized) - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed |= Vectorized; - } - - return Changed; -} - -int BoUpSLP::getScalarizationCost(ArrayRef VL) { - // Find the type of the operands in VL. - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - // Find the cost of inserting/extracting values from the vector. 
- return getScalarizationCost(VecTy); -} - -int BoUpSLP::getScalarizationCost(Type *Ty) { - int Cost = 0; - for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); - return Cost; -} - -AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { - if (StoreInst *SI = dyn_cast(I)) - return AA->getLocation(SI); - if (LoadInst *LI = dyn_cast(I)) - return AA->getLocation(LI); - return AliasAnalysis::Location(); -} - -Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { - assert(Src->getParent() == Dst->getParent() && "Not the same BB"); - BasicBlock::iterator I = Src, E = Dst; - /// Scan all of the instruction from SRC to DST and check if - /// the source may alias. - for (++I; I != E; ++I) { - // Ignore store instructions that are marked as 'ignore'. - if (MemBarrierIgnoreList.count(I)) - continue; - if (Src->mayWriteToMemory()) /* Write */ { - if (!I->mayReadOrWriteMemory()) - continue; - } else /* Read */ { - if (!I->mayWriteToMemory()) - continue; - } - AliasAnalysis::Location A = getLocation(&*I); - AliasAnalysis::Location B = getLocation(Src); - - if (!A.Ptr || !B.Ptr || AA->alias(A, B)) - return I; - } - return 0; -} - -Value *BoUpSLP::vectorizeArith(ArrayRef Operands) { - int LastIdx = getLastIndex(Operands, Operands.size()); - Instruction *Loc = getInsertionPoint(LastIdx); - Builder.SetInsertPoint(Loc); - - assert(getFirstUserIndex(Operands, Operands.size()) > LastIdx && - "Vectorizing with in-tree users"); - - Value *Vec = vectorizeTree(Operands, Operands.size()); - // After vectorizing the operands we need to generate extractelement - // instructions and replace all of the uses of the scalar values with - // the values that we extracted from the vectorized tree. - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); - Operands[i]->replaceAllUsesWith(S); - } - - return Vec; -} - -int BoUpSLP::getTreeCost(ArrayRef VL) { - // Get rid of the list of stores that were removed, and from the - // lists of instructions with multiple users. - MemBarrierIgnoreList.clear(); - LaneMap.clear(); - MultiUserVals.clear(); - MustScalarize.clear(); - MustExtract.clear(); - - // Find the location of the last root. - int LastRootIndex = getLastIndex(VL, VL.size()); - int FirstUserIndex = getFirstUserIndex(VL, VL.size()); - - // Don't vectorize if there are users of the tree roots inside the tree - // itself. - if (LastRootIndex > FirstUserIndex) - return max_cost; - - // Scan the tree and find which value is used by which lane, and which values - // must be scalarized. - getTreeUses_rec(VL, 0); - - // Check that instructions with multiple users can be vectorized. Mark unsafe - // instructions. - for (SetVector::iterator it = MultiUserVals.begin(), - e = MultiUserVals.end(); - it != e; ++it) { - // Check that all of the users of this instr are within the tree - // and that they are all from the same lane. - int Lane = -1; - for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); - I != E; ++I) { - if (LaneMap.find(*I) == LaneMap.end()) { - DEBUG(dbgs() << "SLP: Instr " << **it << " has multiple users.\n"); - - // We don't have an ordering problem if the user is not in this basic - // block. - Instruction *Inst = cast(*I); - if (Inst->getParent() != BB) { - MustExtract.insert(*it); - continue; - } - - // We don't have an ordering problem if the user is after the last root. 
- int Idx = InstrIdx[Inst]; - if (Idx < LastRootIndex) { - MustScalarize.insert(*it); - DEBUG(dbgs() << "SLP: Adding to MustScalarize " - "because of an unsafe out of tree usage.\n"); - break; - } - - DEBUG(dbgs() << "SLP: Adding to MustExtract " - "because of a safe out of tree usage.\n"); - MustExtract.insert(*it); - continue; - } - if (Lane == -1) - Lane = LaneMap[*I]; - if (Lane != LaneMap[*I]) { - MustScalarize.insert(*it); - DEBUG(dbgs() << "SLP: Adding " << **it - << " to MustScalarize because multiple lane use it: " - << Lane << " and " << LaneMap[*I] << ".\n"); - break; - } - } - } - - // Now calculate the cost of vectorizing the tree. - return getTreeCost_rec(VL, 0); -} - -static bool CanReuseExtract(ArrayRef VL, unsigned VF, - VectorType *VecTy) { - // Check if all of the extracts come from the same vector and from the - // correct offset. - Value *VL0 = VL[0]; - ExtractElementInst *E0 = cast(VL0); - Value *Vec = E0->getOperand(0); - - // We have to extract from the same vector type. - if (Vec->getType() != VecTy) - return false; - - // Check that all of the indices extract from the correct offset. - ConstantInt *CI = dyn_cast(E0->getOperand(1)); - if (!CI || CI->getZExtValue()) - return false; - - for (unsigned i = 1, e = VF; i < e; ++i) { - ExtractElementInst *E = cast(VL[i]); - ConstantInt *CI = dyn_cast(E->getOperand(1)); - - if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) - return false; - } - - return true; -} - -void BoUpSLP::getTreeUses_rec(ArrayRef VL, unsigned Depth) { - if (Depth == RecursionMaxDepth) - return; - - // Don't handle vectors. - if (VL[0]->getType()->isVectorTy()) - return; - - if (StoreInst *SI = dyn_cast(VL[0])) - if (SI->getValueOperand()->getType()->isVectorTy()) - return; - - // Check if all of the operands are constants. - bool AllConst = true; - bool AllSameScalar = true; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - AllConst &= isa(VL[i]); - AllSameScalar &= (VL[0] == VL[i]); - Instruction *I = dyn_cast(VL[i]); - // If one of the instructions is out of this BB, we need to scalarize all. - if (I && I->getParent() != BB) - return; - } - - // If all of the operands are identical or constant we have a simple solution. - if (AllConst || AllSameScalar) - return; - - // Scalarize unknown structures. - Instruction *VL0 = dyn_cast(VL[0]); - if (!VL0) - return; - - unsigned Opcode = VL0->getOpcode(); - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast(VL[i]); - // If not all of the instructions are identical then we have to scalarize. - if (!I || Opcode != I->getOpcode()) - return; - } - - for (int i = 0, e = VL.size(); i < e; ++i) { - // Check that the instruction is only used within - // one lane. - if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) - return; - // Make this instruction as 'seen' and remember the lane. - LaneMap[VL[i]] = i; - } - - // Mark instructions with multiple users. - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast(VL[i]); - // Remember to check if all of the users of this instr are vectorized - // within our tree. At depth zero we have no local users, only external - // users that we don't care about. 
-    if (Depth && I && I->getNumUses() > 1) {
-      DEBUG(dbgs() << "SLP: Adding to MultiUserVals "
-                      "because it has multiple users:" << *I << " \n");
-      MultiUserVals.insert(I);
-    }
-  }
-
-  switch (Opcode) {
-  case Instruction::ExtractElement: {
-    VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size());
-    // No need to follow ExtractElements that are going to be optimized away.
-    if (CanReuseExtract(VL, VL.size(), VecTy))
-      return;
-    // Fall through.
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast:
-  case Instruction::Select:
-  case Instruction::ICmp:
-  case Instruction::FCmp:
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-      getTreeUses_rec(Operands, Depth + 1);
-    }
-    return;
-  }
-  case Instruction::Store: {
-    ValueList Operands;
-    for (unsigned j = 0; j < VL.size(); ++j)
-      Operands.push_back(cast<StoreInst>(VL[j])->getOperand(0));
-    getTreeUses_rec(Operands, Depth + 1);
-    return;
-  }
-  default:
-    return;
-  }
-}
-
-int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
-  Type *ScalarTy = VL[0]->getType();
-
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-
-  /// Don't mess with vectors.
-  if (ScalarTy->isVectorTy())
-    return max_cost;
-
-  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
-
-  if (Depth == RecursionMaxDepth)
-    return getScalarizationCost(VecTy);
-
-  // Check if all of the operands are constants.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  bool MustScalarizeFlag = false;
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    // Must have a single use.
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    MustScalarizeFlag |= MustScalarize.count(VL[i]);
-    // This instruction is outside the basic block.
-    if (I && I->getParent() != BB)
-      return getScalarizationCost(VecTy);
-  }
-
-  // Is this a simple vector constant.
-  if (AllConst)
-    return 0;
-
-  // If all of the operands are identical we can broadcast them.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (AllSameScalar) {
-    // If we are in a loop, and this is not an instruction (e.g. constant or
-    // argument) or the instruction is defined outside the loop then assume
-    // that the cost is zero.
-    if (L && (!VL0 || !L->contains(VL0)))
-      return 0;
-
-    // We need to broadcast the scalar.
-    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
-  }
-
-  // If this is not a constant, or a scalar from outside the loop then we
-  // need to scalarize it.
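// [Editor's aside -- an illustrative sketch, not taken from this patch.
//  getScalarizationCost(VecTy) models building the bundle lane by lane, i.e.
//  the sum over all lanes of
//  TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, Lane).
//  The AllSameScalar shortcut above is priced as SK_Broadcast instead,
//  because a single splat is usually cheaper than N inserts, e.g.
//  (made-up IR):
//
//    %t     = insertelement <4 x i32> undef, i32 %x, i32 0
//    %splat = shufflevector <4 x i32> %t, <4 x i32> undef,
//                           <4 x i32> zeroinitializer
//  ]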
-  if (MustScalarizeFlag)
-    return getScalarizationCost(VecTy);
-
-  if (!VL0)
-    return getScalarizationCost(VecTy);
-  assert(VL0->getParent() == BB && "Wrong BB");
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode())
-      return getScalarizationCost(VecTy);
-  }
-
-  // Check if it is safe to sink the loads or the stores.
-  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
-    int MaxIdx = getLastIndex(VL, VL.size());
-    Instruction *Last = InstrVec[MaxIdx];
-
-    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-      if (VL[i] == Last)
-        continue;
-      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
-      if (Barrier) {
-        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
-                     << "\n because of " << *Barrier << "\n");
-        return max_cost;
-      }
-    }
-  }
-
-  // Calculate the extract cost.
-  unsigned ExternalUserExtractCost = 0;
-  for (unsigned i = 0, e = VL.size(); i < e; ++i)
-    if (MustExtract.count(VL[i]))
-      ExternalUserExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
-
-  switch (Opcode) {
-  case Instruction::ExtractElement: {
-    if (CanReuseExtract(VL, VL.size(), VecTy))
-      return 0;
-    return getScalarizationCost(VecTy);
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    int Cost = ExternalUserExtractCost;
-    ValueList Operands;
-    Type *SrcTy = VL0->getOperand(0)->getType();
-    // Prepare the operand vector.
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      // Check that the casted type is the same for all users.
-      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
-        return getScalarizationCost(VecTy);
-    }
-
-    Cost += getTreeCost_rec(Operands, Depth + 1);
-    if (Cost >= max_cost)
-      return max_cost;
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    Cost += (VecCost - ScalarCost);
-    return Cost;
-  }
-  case Instruction::FCmp:
-  case Instruction::ICmp: {
-    // Check that all of the compares have the same predicate.
-    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
-    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
-      CmpInst *Cmp = cast<CmpInst>(VL[i]);
-      if (Cmp->getPredicate() != P0)
-        return getScalarizationCost(VecTy);
-    }
-    // Fall through.
-  }
-  case Instruction::Select:
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    int Cost = ExternalUserExtractCost;
-    // Calculate the cost of all of the operands.
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-      Cost += getTreeCost_rec(Operands, Depth + 1);
-      if (Cost >= max_cost)
-        return max_cost;
-    }
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = 0;
-    int VecCost = 0;
-    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
-        Opcode == Instruction::Select) {
-      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-      ScalarCost =
-          VecTy->getNumElements() *
-          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
-    } else {
-      ScalarCost = VecTy->getNumElements() *
-                   TTI->getArithmeticInstrCost(Opcode, ScalarTy);
-      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
-    }
-    Cost += (VecCost - ScalarCost);
-    return Cost;
-  }
-  case Instruction::Load: {
-    // If we have to scalarize the loads, add the cost of forming the vector.
-    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
-      if (!isConsecutiveAccess(VL[i], VL[i + 1]))
-        return getScalarizationCost(VecTy);
-
-    // Cost of wide load - cost of scalar loads.
-    int ScalarLdCost = VecTy->getNumElements() *
-                       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-    return VecLdCost - ScalarLdCost + ExternalUserExtractCost;
-  }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    int ScalarStCost = VecTy->getNumElements() *
-                       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
-    int StoreCost = VecStCost - ScalarStCost;
-
-    ValueList Operands;
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<StoreInst>(VL[j])->getOperand(0));
-      MemBarrierIgnoreList.insert(VL[j]);
-    }
-
-    int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
-    return TotalCost + ExternalUserExtractCost;
-  }
-  default:
-    // Unable to vectorize unknown instructions.
-    return getScalarizationCost(VecTy);
-  }
-}
-
-int BoUpSLP::getLastIndex(ArrayRef<Value *> VL, unsigned VF) {
-  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
-  for (unsigned i = 0; i < VF; ++i)
-    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
-  return MaxIdx;
-}
-
-int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF) {
-  // Find the first user of the values.
-  int FirstUser = InstrVec.size();
-  for (unsigned i = 0; i < VF; ++i) {
-    for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
-         U != UE; ++U) {
-      Instruction *Instr = dyn_cast<Instruction>(*U);
-      if (!Instr || Instr->getParent() != BB)
-        continue;
-
-      FirstUser = std::min(FirstUser, InstrIdx[Instr]);
-    }
-  }
-  return FirstUser;
-}
-
-int BoUpSLP::getLastIndex(Instruction *I, Instruction *J) {
-  assert(I->getParent() == BB && "Invalid parent for instruction I");
-  assert(J->getParent() == BB && "Invalid parent for instruction J");
-  return std::max(InstrIdx[I], InstrIdx[J]);
-}
-
-Instruction *BoUpSLP::getInsertionPoint(unsigned Index) {
-  return InstrVec[Index + 1];
-}
-
-Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
-  Value *Vec = UndefValue::get(Ty);
-  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
-    // Generate the 'InsertElement' instruction.
-    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
-    // Remember that this instruction is used as part of a 'gather' sequence.
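// [Editor's aside -- an illustrative sketch, not taken from this patch; the
//  value names are made up. For a 4-wide bundle {%a, %b, %c, %d} the loop
//  above emits a gather chain of this shape, and records each
//  InsertElementInst in GatherInstructions so the sequence can be hoisted
//  later (a null entry is pushed afterwards to terminate the sequence):
//
//    %v0 = insertelement <4 x i32> undef, i32 %a, i32 0
//    %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
//    %v2 = insertelement <4 x i32> %v1, i32 %c, i32 2
//    %v3 = insertelement <4 x i32> %v2, i32 %d, i32 3
//  ]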
-    // The caller of the bottom-up slp vectorizer can try to hoist the sequence
-    // if the users are outside of the basic block.
-    if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(Vec))
-      GatherInstructions.push_back(IEI);
-  }
-
-  // Mark the end of the gather sequence.
-  GatherInstructions.push_back(0);
-
-  for (unsigned i = 0; i < Ty->getNumElements(); ++i)
-    VectorizedValues[VL[i]] = Vec;
-
-  return Vec;
-}
-
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
-  Value *V = vectorizeTree_rec(VL, VF);
-
-  int LastInstrIdx = getLastIndex(VL, VL.size());
-  for (SetVector<Value *>::iterator it = MustExtract.begin(),
-                                    e = MustExtract.end();
-       it != e; ++it) {
-    Instruction *I = cast<Instruction>(*it);
-
-    // This is a scalarized value, so we can use the original value.
-    // No need to extract from the vector.
-    if (!LaneMap.count(I))
-      continue;
-
-    Value *Vec = VectorizedValues[I];
-    // We decided not to vectorize I because one of its users was not
-    // vectorized. This is okay.
-    if (!Vec)
-      continue;
-
-    Value *Idx = Builder.getInt32(LaneMap[I]);
-    Value *Extract = Builder.CreateExtractElement(Vec, Idx);
-    bool Replaced = false;
-    for (Value::use_iterator U = I->use_begin(), UE = I->use_end(); U != UE;
-         ++U) {
-      Instruction *UI = cast<Instruction>(*U);
-      if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx)
-        UI->replaceUsesOfWith(I, Extract);
-      Replaced = true;
-    }
-    assert(Replaced && "Must replace at least one outside user");
-    (void)Replaced;
-  }
-
-  // We moved some instructions around. We have to number them again
-  // before we can do any analysis.
-  numberInstructions();
-  MustScalarize.clear();
-  MustExtract.clear();
-  VectorizedValues.clear();
-  return V;
-}
-
-Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
-  Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  VectorType *VecTy = VectorType::get(ScalarTy, VF);
-
-  // Check if all of the operands are constants or identical.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  for (unsigned i = 0, e = VF; i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    // The instruction must be in the same BB, and it must be vectorizable.
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
-      return Scalarize(VL, VecTy);
-  }
-
-  // Check that this is a simple vector constant.
-  if (AllConst || AllSameScalar)
-    return Scalarize(VL, VecTy);
-
-  // Scalarize unknown structures.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (!VL0)
-    return Scalarize(VL, VecTy);
-
-  if (VectorizedValues.count(VL0)) {
-    Value *Vec = VectorizedValues[VL0];
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = Vec;
-    return Vec;
-  }
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VF; i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode())
-      return Scalarize(VL, VecTy);
-  }
-
-  switch (Opcode) {
-  case Instruction::ExtractElement: {
-    if (CanReuseExtract(VL, VL.size(), VecTy))
-      return VL0->getOperand(0);
-    return Scalarize(VL, VecTy);
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    ValueList INVL;
-    for (int i = 0; i < VF; ++i)
-      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-    Value *InVec = vectorizeTree_rec(INVL, VF);
-    CastInst *CI = dyn_cast<CastInst>(VL0);
-    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
-
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = V;
-
-    return V;
-  }
-  case Instruction::FCmp:
-  case Instruction::ICmp: {
-    // Check that all of the compares have the same predicate.
-    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
-    for (unsigned i = 1, e = VF; i < e; ++i) {
-      CmpInst *Cmp = cast<CmpInst>(VL[i]);
-      if (Cmp->getPredicate() != P0)
-        return Scalarize(VL, VecTy);
-    }
-
-    ValueList LHSV, RHSV;
-    for (int i = 0; i < VF; ++i) {
-      LHSV.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      RHSV.push_back(cast<Instruction>(VL[i])->getOperand(1));
-    }
-
-    Value *L = vectorizeTree_rec(LHSV, VF);
-    Value *R = vectorizeTree_rec(RHSV, VF);
-    Value *V;
-    if (VL0->getOpcode() == Instruction::FCmp)
-      V = Builder.CreateFCmp(P0, L, R);
-    else
-      V = Builder.CreateICmp(P0, L, R);
-
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = V;
-
-    return V;
-  }
-  case Instruction::Select: {
-    ValueList TrueVec, FalseVec, CondVec;
-    for (int i = 0; i < VF; ++i) {
-      CondVec.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      TrueVec.push_back(cast<Instruction>(VL[i])->getOperand(1));
-      FalseVec.push_back(cast<Instruction>(VL[i])->getOperand(2));
-    }
-
-    Value *True = vectorizeTree_rec(TrueVec, VF);
-    Value *False = vectorizeTree_rec(FalseVec, VF);
-    Value *Cond = vectorizeTree_rec(CondVec, VF);
-    Value *V = Builder.CreateSelect(Cond, True, False);
-
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = V;
-
-    return V;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    ValueList LHSVL, RHSVL;
-    for (int i = 0; i < VF; ++i) {
-      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
-    }
-
-    Value *LHS = vectorizeTree_rec(LHSVL, VF);
-    Value *RHS = vectorizeTree_rec(RHSVL, VF);
-    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
-    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
-
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = V;
-
-    return V;
-  }
-  case Instruction::Load: {
-    LoadInst *LI = cast<LoadInst>(VL0);
-    unsigned Alignment = LI->getAlignment();
-
-    // Check if all of the loads are consecutive.
-    for (unsigned i = 1, e = VF; i < e; ++i)
-      if (!isConsecutiveAccess(VL[i - 1], VL[i]))
-        return Scalarize(VL, VecTy);
-
-    // Loads are inserted at the head of the tree because we don't want to sink
-    // them all the way down past store instructions.
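// [Editor's aside -- an illustrative sketch, not taken from this patch; the
//  value names are made up and the IR uses the same typed-pointer syntax as
//  the tests in this patch. The code that follows turns a bundle of
//  consecutive scalar loads into one wide load through a bitcast of the
//  first pointer:
//
//    %a = load double* %p                          ; lane 0
//    %q = getelementptr inbounds double* %p, i64 1
//    %b = load double* %q                          ; lane 1
//  ==>
//    %vp   = bitcast double* %p to <2 x double>*
//    %wide = load <2 x double>* %vp
//
//  The wide load is emitted right after the last scalar load of the bundle
//  (the insertion point of the last index) rather than at the current
//  builder position, so that it is not sunk past intervening stores.]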
-    Instruction *Loc = getInsertionPoint(getLastIndex(VL, VL.size()));
-    IRBuilder<> LoadBuilder(Loc);
-    Value *VecPtr = LoadBuilder.CreateBitCast(LI->getPointerOperand(),
-                                              VecTy->getPointerTo());
-    LI = LoadBuilder.CreateLoad(VecPtr);
-    LI->setAlignment(Alignment);
-
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = LI;
-
-    return LI;
-  }
-  case Instruction::Store: {
-    StoreInst *SI = cast<StoreInst>(VL0);
-    unsigned Alignment = SI->getAlignment();
-
-    ValueList ValueOp;
-    for (int i = 0; i < VF; ++i)
-      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
-
-    Value *VecValue = vectorizeTree_rec(ValueOp, VF);
-    Value *VecPtr =
-        Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
-    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
-
-    for (int i = 0; i < VF; ++i)
-      cast<StoreInst>(VL[i])->eraseFromParent();
-    return 0;
-  }
-  default:
-    return Scalarize(VL, VecTy);
-  }
-}
-
-} // end of namespace
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
deleted file mode 100644
index c9fe6d23ab6..00000000000
--- a/lib/Transforms/Vectorize/VecUtils.h
+++ /dev/null
@@ -1,194 +0,0 @@
-//===- VecUtils.h - Vectorization Utilities -------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of classes and functions manipulate vectors and chains of
-// vectors.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
-#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/IR/IRBuilder.h"
-#include <vector>
-
-namespace llvm {
-
-class BasicBlock;
-class Instruction;
-class Type;
-class VectorType;
-class StoreInst;
-class Value;
-class ScalarEvolution;
-class DataLayout;
-class TargetTransformInfo;
-class AliasAnalysis;
-class Loop;
-
-/// Bottom Up SLP vectorization utility class.
-struct BoUpSLP {
-  typedef SmallVector<Value *, 8> ValueList;
-  typedef SmallVector<Instruction *, 16> InstrList;
-  typedef SmallPtrSet<Value *, 16> ValueSet;
-  typedef SmallVector<StoreInst *, 8> StoreList;
-  static const int max_cost = 1 << 20;
-
-  // \brief C'tor.
-  BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
-          TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp);
-
-  /// \brief Take the pointer operand from the Load/Store instruction.
-  /// \returns NULL if this is not a valid Load/Store instruction.
-  static Value *getPointerOperand(Value *I);
-
-  /// \brief Take the address space operand from the Load/Store instruction.
-  /// \returns -1 if this is not a valid Load/Store instruction.
-  static unsigned getAddressSpaceOperand(Value *I);
-
-  /// \returns true if the memory operations A and B are consecutive.
-  bool isConsecutiveAccess(Value *A, Value *B);
-
-  /// \brief Vectorize the tree that starts with the elements in \p VL.
-  /// \returns the vectorized value.
-  Value *vectorizeTree(ArrayRef<Value *> VL, int VF);
-
-  /// \returns the vectorization cost of the subtree that starts at \p VL.
-  /// A negative number means that this is profitable.
-  int getTreeCost(ArrayRef<Value *> VL);
-
-  /// \returns the scalarization cost for this list of values. Assuming that
-  /// this subtree gets vectorized, we may need to extract the values from the
-  /// roots. This method calculates the cost of extracting the values.
-  int getScalarizationCost(ArrayRef<Value *> VL);
-
-  /// \brief Attempts to order and vectorize a sequence of stores. This
-  /// function does a quadratic scan of the given stores.
-  /// \returns true if the basic block was modified.
-  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
-
-  /// \brief Vectorize a group of scalars into a vector tree.
-  /// \returns the vectorized value.
-  Value *vectorizeArith(ArrayRef<Value *> Operands);
-
-  /// \returns the list of new instructions that were added in order to collect
-  /// scalars into vectors. This list can be used to further optimize the gather
-  /// sequences.
-  InstrList &getGatherSeqInstructions() { return GatherInstructions; }
-
-private:
-  /// \brief This method contains the recursive part of getTreeCost.
-  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
-
-  /// \brief This recursive method looks for vectorization hazards such as
-  /// values that are used by multiple users and checks that values are used
-  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
-  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
-
-  /// \brief This method contains the recursive part of vectorizeTree.
-  Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
-
-  /// \brief Number all of the instructions in the block.
-  void numberInstructions();
-
-  /// \brief Vectorize a sorted sequence of stores.
-  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
-
-  /// \returns the scalarization cost for this type. Scalarization in this
-  /// context means the creation of vectors from a group of scalars.
-  int getScalarizationCost(Type *Ty);
-
-  /// \returns the AA location that is accessed by the instruction.
-  AliasAnalysis::Location getLocation(Instruction *I);
-
-  /// \brief Checks if it is possible to sink an instruction from
-  /// \p Src to \p Dst.
-  /// \returns the pointer to the barrier instruction if we can't sink.
-  Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
-
-  /// \returns the index of the last instruction in the BB from \p VL.
-  /// Only consider the first \p VF elements.
-  int getLastIndex(ArrayRef<Value *> VL, unsigned VF);
-
-  /// \returns the index of the first User of \p VL.
-  /// Only consider the first \p VF elements.
-  int getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF);
-
-  /// \returns the index of the instruction \p I or \p J that appears last
-  /// in the BB.
-  int getLastIndex(Instruction *I, Instruction *J);
-
-  /// \returns the insertion point for \p Index.
-  Instruction *getInsertionPoint(unsigned Index);
-
-  /// \returns a vector from a collection of scalars in \p VL.
-  Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
-
-private:
-  /// Maps instructions to numbers and back.
-  SmallDenseMap<Value *, int> InstrIdx;
-  /// Maps integers to Instructions.
-  std::vector<Instruction *> InstrVec;
-
-  // -- containers that are used during getTreeCost -- //
-
-  /// Contains values that must be scalarized because they are used
-  /// by multiple lanes, or by users outside the tree.
-  /// NOTICE: The vectorization methods also use this set.
-  ValueSet MustScalarize;
-
-  /// Contains values that have users outside of the vectorized graph.
-  /// We need to generate extract instructions for these values.
-  /// NOTICE: The vectorization methods also use this set.
-  SetVector<Value *> MustExtract;
-
-  /// Contains a list of values that are used outside the current tree. This
-  /// set must be reset between runs.
-  SetVector<Value *> MultiUserVals;
-  /// Maps values in the tree to the vector lanes that use them. This map must
-  /// be reset between runs of getCost.
-  std::map<Value *, int> LaneMap;
-  /// A list of instructions to ignore while sinking
-  /// memory instructions. This map must be reset between runs of getCost.
-  ValueSet MemBarrierIgnoreList;
-
-  // -- Containers that are used during vectorizeTree -- //
-
-  /// Maps between the first scalar to the vector. This map must be reset
-  /// between runs.
-  DenseMap<Value *, Value *> VectorizedValues;
-
-  // -- Containers that are used after vectorization by the caller -- //
-
-  /// A list of instructions that are used when gathering scalars into vectors.
-  /// In many cases these instructions can be hoisted outside of the BB.
-  /// Iterating over this list is faster than calling LICM.
-  /// Notice: We insert NULL ptrs to separate between the different gather
-  /// sequences.
-  InstrList GatherInstructions;
-
-  /// Instruction builder to construct the vectorized tree.
-  IRBuilder<> Builder;
-
-  // Analysis and block reference.
-  BasicBlock *BB;
-  ScalarEvolution *SE;
-  DataLayout *DL;
-  TargetTransformInfo *TTI;
-  AliasAnalysis *AA;
-  Loop *L;
-};
-
-} // end of namespace
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
diff --git a/test/Transforms/SLPVectorizer/X86/diamond.ll b/test/Transforms/SLPVectorizer/X86/diamond.ll
index 8959b0d9eec..008f09db454 100644
--- a/test/Transforms/SLPVectorizer/X86/diamond.ll
+++ b/test/Transforms/SLPVectorizer/X86/diamond.ll
@@ -50,9 +50,9 @@ entry:
 ; }
 
 ; CHECK: @extr_user
+; CHECK: load i32*
 ; CHECK: store <4 x i32>
-; CHECK-NEXT: extractelement <4 x i32>
-; CHECK: ret
+; CHECK-NEXT: ret
 define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
 entry:
   %0 = load i32* %A, align 4
@@ -79,9 +79,9 @@ entry:
 ; In this example we have an external user that is not the first element in the vector.
 
 ; CHECK: @extr_user1
+; CHECK: load i32*
 ; CHECK: store <4 x i32>
-; CHECK-NEXT: extractelement <4 x i32>
-; CHECK: ret
+; CHECK-NEXT: ret
 define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
 entry:
   %0 = load i32* %A, align 4
diff --git a/test/Transforms/SLPVectorizer/X86/multi_block.ll b/test/Transforms/SLPVectorizer/X86/multi_block.ll
new file mode 100644
index 00000000000..eed3f371b80
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/multi_block.ll
@@ -0,0 +1,55 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+; int bar(double *A, int d) {
+;   double A0 = A[0];
+;   double A1 = A[1];
+;   float F0 = A0;
+;   float F1 = A1;
+;   if (d) foo(); <----- This splits the blocks
+;   F0+=4.0;
+;   F1+=5.0;
+;   A[8] = 9.0 + F0;
+;   A[9] = 5.0 + F1;
+; }
+
+
+;CHECK: @bar
+;CHECK: load <2 x double>
+;CHECK: fptrunc <2 x double>
+;CHECK: call i32
+;CHECK: fadd <2 x float>
+;CHECK: fpext <2 x float>
+;CHECK: store <2 x double>
+;CHECK: ret
+define i32 @bar(double* nocapture %A, i32 %d) {
+  %1 = load double* %A, align 8
+  %2 = getelementptr inbounds double* %A, i64 1
+  %3 = load double* %2, align 8
+  %4 = fptrunc double %1 to float
+  %5 = fptrunc double %3 to float
+  %6 = icmp eq i32 %d, 0
+  br i1 %6, label %9, label %7
+
+;