From 53a0552b06cb8288004f7698f6e4640fe2a74f61 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sat, 22 Jun 2013 21:34:10 +0000 Subject: [PATCH] SLP Vectorizer: Implement multi-block slp-vectorization. Rewrote the SLP-vectorization as a whole-function vectorization pass. It is now able to vectorize chains across multiple basic blocks. It still does not vectorize PHIs, but this should be easy to do now that we scan the entire function. I removed the support for extracting values from trees. We are now able to vectorize more programs, but there are some serious regressions in many workloads (such as flops-6 and mandel-2). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@184647 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/CMakeLists.txt | 1 - lib/Transforms/Vectorize/SLPVectorizer.cpp | 1289 +++++++++++++++-- lib/Transforms/Vectorize/VecUtils.cpp | 1031 ------------- lib/Transforms/Vectorize/VecUtils.h | 194 --- test/Transforms/SLPVectorizer/X86/diamond.ll | 8 +- .../SLPVectorizer/X86/multi_block.ll | 55 + .../SLPVectorizer/X86/multi_user.ll | 2 +- 7 files changed, 1220 insertions(+), 1360 deletions(-) delete mode 100644 lib/Transforms/Vectorize/VecUtils.cpp delete mode 100644 lib/Transforms/Vectorize/VecUtils.h create mode 100644 test/Transforms/SLPVectorizer/X86/multi_block.ll diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 7ae082f55e0..07967d883a9 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -3,7 +3,6 @@ add_llvm_library(LLVMVectorize Vectorize.cpp LoopVectorize.cpp SLPVectorizer.cpp - VecUtils.cpp ) add_dependencies(LLVMVectorize intrinsics_gen) diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index c3cb03764b2..1adc1ba8e2c 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -18,17 +18,20 @@ #define SV_NAME "slp-vectorizer" #define DEBUG_TYPE "SLP" -#include "VecUtils.h" #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -36,6 +39,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include #include using namespace llvm; @@ -46,9 +50,1138 @@ static cl::opt "number. (gain = -cost of vectorization)")); namespace { +static const unsigned MinVecRegSize = 128; + +static const unsigned RecursionMaxDepth = 6; + +/// RAII pattern to save the insertion point of the IR builder. +class BuilderLocGuard { +public: + BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()) {} + ~BuilderLocGuard() { Builder.SetInsertPoint(Loc); } + +private: + // Prevent copying. + BuilderLocGuard(const BuilderLocGuard &); + BuilderLocGuard &operator=(const BuilderLocGuard &); + IRBuilder<> &Builder; + BasicBlock::iterator Loc; +}; + +/// A helper class for numbering instructions in multible blocks. 
+/// Numbers starts at zero for each basic block. +struct BlockNumbering { + + BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {} + + BlockNumbering() : BB(0), Valid(false) {} + + void numberInstructions() { + unsigned Loc = 0; + InstrIdx.clear(); + InstrVec.clear(); + // Number the instructions in the block. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + InstrIdx[it] = Loc++; + InstrVec.push_back(it); + assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); + } + Valid = true; + } + + int getIndex(Instruction *I) { + if (!Valid) + numberInstructions(); + assert(InstrIdx.count(I) && "Unknown instruction"); + return InstrIdx[I]; + } + + Instruction *getInstruction(unsigned loc) { + if (!Valid) + numberInstructions(); + assert(InstrVec.size() > loc && "Invalid Index"); + return InstrVec[loc]; + } + + void forget() { Valid = false; } + +private: + /// The block we are numbering. + BasicBlock *BB; + /// Is the block numbered. + bool Valid; + /// Maps instructions to numbers and back. + SmallDenseMap InstrIdx; + /// Maps integers to Instructions. + std::vector InstrVec; +}; + +class FuncSLP { + typedef SmallVector ValueList; + typedef SmallVector InstrList; + typedef SmallPtrSet ValueSet; + typedef SmallVector StoreList; + +public: + static const int MAX_COST = INT_MIN; + + FuncSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li) + : F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), + Builder(Se->getContext()) { + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { + BasicBlock *BB = it; + BlocksNumbers[BB] = BlockNumbering(BB); + } + } + + /// \brief Take the pointer operand from the Load/Store instruction. + /// \returns NULL if this is not a valid Load/Store instruction. + static Value *getPointerOperand(Value *I); + + /// \brief Take the address space operand from the Load/Store instruction. + /// \returns -1 if this is not a valid Load/Store instruction. + static unsigned getAddressSpaceOperand(Value *I); + + /// \returns true if the memory operations A and B are consecutive. + bool isConsecutiveAccess(Value *A, Value *B); + + /// \brief Vectorize the tree that starts with the elements in \p VL. + /// \returns the vectorized value. + Value *vectorizeTree(ArrayRef VL); + + /// \returns the vectorization cost of the subtree that starts at \p VL. + /// A negative number means that this is profitable. + int getTreeCost(ArrayRef VL); + + /// \returns the scalarization cost for this list of values. Assuming that + /// this subtree gets vectorized, we may need to extract the values from the + /// roots. This method calculates the cost of extracting the values. + int getGatherCost(ArrayRef VL); + + /// \brief Attempts to order and vectorize a sequence of stores. This + /// function does a quadratic scan of the given stores. + /// \returns true if the basic block was modified. + bool vectorizeStores(ArrayRef Stores, int costThreshold); + + /// \brief Vectorize a group of scalars into a vector tree. + /// \returns the vectorized value. + Value *vectorizeArith(ArrayRef Operands); + + /// \brief This method contains the recursive part of getTreeCost. + int getTreeCost_rec(ArrayRef VL, unsigned Depth); + + /// \brief This recursive method looks for vectorization hazards such as + /// values that are used by multiple users and checks that values are used + /// by only one vector lane. It updates the variables LaneMap, MultiUserVals. 
+ void getTreeUses_rec(ArrayRef VL, unsigned Depth); + + /// \brief This method contains the recursive part of vectorizeTree. + Value *vectorizeTree_rec(ArrayRef VL); + + /// \brief Vectorize a sorted sequence of stores. + bool vectorizeStoreChain(ArrayRef Chain, int CostThreshold); + + /// \returns the scalarization cost for this type. Scalarization in this + /// context means the creation of vectors from a group of scalars. + int getGatherCost(Type *Ty); + + /// \returns the AA location that is being access by the instruction. + AliasAnalysis::Location getLocation(Instruction *I); + + /// \brief Checks if it is possible to sink an instruction from + /// \p Src to \p Dst. + /// \returns the pointer to the barrier instruction if we can't sink. + Value *getSinkBarrier(Instruction *Src, Instruction *Dst); + + /// \returns the index of the last instrucion in the BB from \p VL. + int getLastIndex(ArrayRef VL); + + /// \returns the Instrucion in the bundle \p VL. + Instruction *getLastInstruction(ArrayRef VL); + + /// \returns the Instruction at index \p Index which is in Block \p BB. + Instruction *getInstructionForIndex(unsigned Index, BasicBlock *BB); + + /// \returns the index of the first User of \p VL. + int getFirstUserIndex(ArrayRef VL); + + /// \returns a vector from a collection of scalars in \p VL. + Value *Gather(ArrayRef VL, VectorType *Ty); + + /// \brief Try to hoist gather sequences outside of the loop in cases where + /// all of the sources are loop invariant. + void hoistGatherSequence(); + + bool needToGatherAny(ArrayRef VL) { + for (int i = 0, e = VL.size(); i < e; ++i) + if (MustGather.count(VL[i])) + return true; + return false; + } + + /// -- Vectorization State -- + + /// Maps values in the tree to the vector lanes that uses them. This map must + /// be reset between runs of getCost. + std::map LaneMap; + /// A list of instructions to ignore while sinking + /// memory instructions. This map must be reset between runs of getCost. + ValueSet MemBarrierIgnoreList; + + /// Maps between the first scalar to the vector. This map must be reset + /// between runs. + DenseMap VectorizedValues; + + /// Contains values that must be gathered because they are used + /// by multiple lanes, or by users outside the tree. + /// NOTICE: The vectorization methods also use this set. + ValueSet MustGather; + + /// Contains a list of values that are used outside the current tree. This + /// set must be reset between runs. + SetVector MultiUserVals; + + /// Holds all of the instructions that we gathered. + SetVector GatherSeq; + + /// Numbers instructions in different blocks. + std::map BlocksNumbers; + + // Analysis and block reference. + Function *F; + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; + LoopInfo *LI; + /// Instruction builder to construct the vectorized tree. + IRBuilder<> Builder; +}; + +int FuncSLP::getGatherCost(Type *Ty) { + int Cost = 0; + for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + return Cost; +} + +int FuncSLP::getGatherCost(ArrayRef VL) { + // Find the type of the operands in VL. + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + // Find the cost of inserting/extracting values from the vector. 
+ return getGatherCost(VecTy); +} + +AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) { + if (StoreInst *SI = dyn_cast(I)) + return AA->getLocation(SI); + if (LoadInst *LI = dyn_cast(I)) + return AA->getLocation(LI); + return AliasAnalysis::Location(); +} + +Value *FuncSLP::getPointerOperand(Value *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast(I)) + return SI->getPointerOperand(); + return 0; +} + +unsigned FuncSLP::getAddressSpaceOperand(Value *I) { + if (LoadInst *L = dyn_cast(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast(I)) + return S->getPointerAddressSpace(); + return -1; +} + +bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Check that A and B are of the same type. + if (PtrA->getType() != PtrB->getType()) + return false; + + // Calculate the distance. + const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); + const SCEVConstant *ConstOffSCEV = dyn_cast(OffsetSCEV); + + // Non constant distance. + if (!ConstOffSCEV) + return false; + + int64_t Offset = ConstOffSCEV->getValue()->getSExtValue(); + Type *Ty = cast(PtrA->getType())->getElementType(); + // The Instructions are connsecutive if the size of the first load/store is + // the same as the offset. + int64_t Sz = DL->getTypeStoreSize(Ty); + return ((-Offset) == Sz); +} + +Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { + assert(Src->getParent() == Dst->getParent() && "Not the same BB"); + BasicBlock::iterator I = Src, E = Dst; + /// Scan all of the instruction from SRC to DST and check if + /// the source may alias. + for (++I; I != E; ++I) { + // Ignore store instructions that are marked as 'ignore'. 
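// Editor's sketch, not part of the patch: the conflict filter used by the scan
// in getSinkBarrier. When sinking Src down to the bundle's last instruction,
// only memory operations can stand in the way: a store conflicts with any
// read or write, a load only with writes, and alias analysis then decides
// whether the two locations can really overlap. A trivial equal-address test
// stands in for AA->alias() below; all names and numbers are illustrative.
#include <cassert>

struct MemOp {
  bool Writes; // store-like?
  int Addr;    // stand-in for the AA location.
};

// Returns true if Other blocks sinking Src past it.
static bool isSinkBarrier(const MemOp &Src, const MemOp &Other) {
  bool MayConflict = Src.Writes || Other.Writes; // read/read never conflicts.
  return MayConflict && Src.Addr == Other.Addr;  // stand-in for AA->alias(A, B).
}

int main() {
  MemOp LoadA = {false, 0x10};
  MemOp StoreB = {true, 0x20};
  assert(!isSinkBarrier(LoadA, StoreB));             // disjoint: free to sink.
  assert(isSinkBarrier(StoreB, MemOp{false, 0x20})); // a read of the stored address.
  return 0;
}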
+ if (MemBarrierIgnoreList.count(I)) + continue; + if (Src->mayWriteToMemory()) /* Write */ { + if (!I->mayReadOrWriteMemory()) + continue; + } else /* Read */ { + if (!I->mayWriteToMemory()) + continue; + } + AliasAnalysis::Location A = getLocation(&*I); + AliasAnalysis::Location B = getLocation(Src); + + if (!A.Ptr || !B.Ptr || AA->alias(A, B)) + return I; + } + return 0; +} + +static BasicBlock *getSameBlock(ArrayRef VL) { + BasicBlock *BB = 0; + for (int i = 0, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast(VL[i]); + if (!I) + return 0; + + if (!BB) { + BB = I->getParent(); + continue; + } + + if (BB != I->getParent()) + return 0; + } + return BB; +} + +static bool allConstant(ArrayRef VL) { + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (!isa(VL[i])) + return false; + return true; +} + +static bool isSplat(ArrayRef VL) { + for (unsigned i = 1, e = VL.size(); i < e; ++i) + if (VL[i] != VL[0]) + return false; + return true; +} + +static unsigned getSameOpcode(ArrayRef VL) { + unsigned Opcode = 0; + for (int i = 0, e = VL.size(); i < e; i++) { + if (Instruction *I = dyn_cast(VL[i])) { + if (!Opcode) { + Opcode = I->getOpcode(); + continue; + } + if (Opcode != I->getOpcode()) + return 0; + } + } + return Opcode; +} + +static bool CanReuseExtract(ArrayRef VL, unsigned VF, + VectorType *VecTy) { + assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); + // Check if all of the extracts come from the same vector and from the + // correct offset. + Value *VL0 = VL[0]; + ExtractElementInst *E0 = cast(VL0); + Value *Vec = E0->getOperand(0); + + // We have to extract from the same vector type. + if (Vec->getType() != VecTy) + return false; + + // Check that all of the indices extract from the correct offset. + ConstantInt *CI = dyn_cast(E0->getOperand(1)); + if (!CI || CI->getZExtValue()) + return false; + + for (unsigned i = 1, e = VF; i < e; ++i) { + ExtractElementInst *E = cast(VL[i]); + ConstantInt *CI = dyn_cast(E->getOperand(1)); + + if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) + return false; + } + + return true; +} + +void FuncSLP::getTreeUses_rec(ArrayRef VL, unsigned Depth) { + if (Depth == RecursionMaxDepth) + return MustGather.insert(VL.begin(), VL.end()); + + // Don't handle vectors. + if (VL[0]->getType()->isVectorTy()) + return; + + if (StoreInst *SI = dyn_cast(VL[0])) + if (SI->getValueOperand()->getType()->isVectorTy()) + return; + + // If all of the operands are identical or constant we have a simple solution. + if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL)) + return MustGather.insert(VL.begin(), VL.end()); + + // Stop the scan at unknown IR. + Instruction *VL0 = dyn_cast(VL[0]); + assert(VL0 && "Invalid instruction"); + + // Mark instructions with multiple users. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast(VL[i]); + // Remember to check if all of the users of this instruction are vectorized + // within our tree. At depth zero we have no local users, only external + // users that we don't care about. + if (Depth && I && I->getNumUses() > 1) { + DEBUG(dbgs() << "SLP: Adding to MultiUserVals " + "because it has multiple users:" << *I << " \n"); + MultiUserVals.insert(I); + } + } + + // Check that the instruction is only used within one lane. 
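// Editor's sketch, not part of the patch: the one-lane rule enforced by the
// loop below. Every scalar is pinned to the vector lane in which it was first
// seen; if the same scalar later shows up in a different lane, the whole
// bundle is sent to MustGather instead of being vectorized in place. Plain
// integers stand in for LLVM values here.
#include <cassert>
#include <map>
#include <vector>

// Returns false if some value in Bundle was already claimed by another lane.
static bool assignLanes(const std::vector<int> &Bundle,
                        std::map<int, int> &LaneMap) {
  for (int Lane = 0, E = (int)Bundle.size(); Lane < E; ++Lane) {
    auto It = LaneMap.find(Bundle[Lane]);
    if (It != LaneMap.end() && It->second != Lane)
      return false; // Used by multiple lanes -> must gather.
    LaneMap[Bundle[Lane]] = Lane;
  }
  return true;
}

int main() {
  std::map<int, int> LaneMap;                      // persists across bundles.
  assert(assignLanes({10, 11, 12, 13}, LaneMap));  // each scalar in one lane.
  assert(!assignLanes({11, 10, 12, 13}, LaneMap)); // 11 reused in lane 0.
  return 0;
}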
+ for (int i = 0, e = VL.size(); i < e; ++i) { + if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) { + DEBUG(dbgs() << "SLP: Value used by multiple lanes:" << *VL[i] << "\n"); + return MustGather.insert(VL.begin(), VL.end()); + } + // Make this instruction as 'seen' and remember the lane. + LaneMap[VL[i]] = i; + } + + unsigned Opcode = getSameOpcode(VL); + if (!Opcode) + return MustGather.insert(VL.begin(), VL.end()); + + switch (Opcode) { + case Instruction::ExtractElement: { + VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size()); + // No need to follow ExtractElements that are going to be optimized away. + if (CanReuseExtract(VL, VL.size(), VecTy)) + return; + // Fall through. + } + case Instruction::Load: + return; + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + case Instruction::Select: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + getTreeUses_rec(Operands, Depth + 1); + } + return; + } + case Instruction::Store: { + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(0)); + getTreeUses_rec(Operands, Depth + 1); + return; + } + default: + return MustGather.insert(VL.begin(), VL.end()); + } +} + +int FuncSLP::getLastIndex(ArrayRef VL) { + BasicBlock *BB = cast(VL[0])->getParent(); + assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); + BlockNumbering &BN = BlocksNumbers[BB]; + + int MaxIdx = BN.getIndex(BB->getFirstNonPHI()); + for (unsigned i = 0, e = VL.size(); i < e; ++i) + MaxIdx = std::max(MaxIdx, BN.getIndex(cast(VL[i]))); + return MaxIdx; +} + +Instruction *FuncSLP::getLastInstruction(ArrayRef VL) { + BasicBlock *BB = cast(VL[0])->getParent(); + assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); + BlockNumbering &BN = BlocksNumbers[BB]; + + int MaxIdx = BN.getIndex(cast(VL[0])); + for (unsigned i = 1, e = VL.size(); i < e; ++i) + MaxIdx = std::max(MaxIdx, BN.getIndex(cast(VL[i]))); + return BN.getInstruction(MaxIdx); +} + +Instruction *FuncSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) { + BlockNumbering &BN = BlocksNumbers[BB]; + return BN.getInstruction(Index); +} + +int FuncSLP::getFirstUserIndex(ArrayRef VL) { + BasicBlock *BB = getSameBlock(VL); + BlockNumbering &BN = BlocksNumbers[BB]; + + // Find the first user of the values. 
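// Editor's sketch, not part of the patch: what the BlockNumbering bookkeeping
// buys. Instructions get dense positions in block order; the last index of a
// bundle is the max position of its members, and the first-user index
// (computed just below) is the min position of any in-block user, starting
// from the terminator. Strings stand in for instructions; the names are made
// up for illustration.
#include <algorithm>
#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Block = {"a", "b", "c", "user_of_a_and_c", "ret"};
  std::map<std::string, int> Index; // BlockNumbering: instruction -> position.
  for (int i = 0, e = (int)Block.size(); i < e; ++i)
    Index[Block[i]] = i;

  std::vector<std::string> Roots = {"a", "c"};
  int LastRoot = 0;
  for (size_t i = 0; i < Roots.size(); ++i)
    LastRoot = std::max(LastRoot, Index[Roots[i]]);

  int FirstUser = Index["ret"]; // default: the terminator.
  FirstUser = std::min(FirstUser, Index["user_of_a_and_c"]);

  assert(LastRoot == 2 && FirstUser == 3);
  // getTreeCost refuses bundles whose last root comes after their first user.
  assert(LastRoot <= FirstUser);
  return 0;
}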
+ int FirstUser = BN.getIndex(BB->getTerminator()); + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); + U != UE; ++U) { + Instruction *Instr = dyn_cast(*U); + + if (!Instr || Instr->getParent() != BB) + continue; + + FirstUser = std::min(FirstUser, BN.getIndex(Instr)); + } + } + return FirstUser; +} + +int FuncSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { + Type *ScalarTy = VL[0]->getType(); + + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + + /// Don't mess with vectors. + if (ScalarTy->isVectorTy()) + return FuncSLP::MAX_COST; + + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (allConstant(VL)) + return 0; + + if (isSplat(VL)) + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + + if (Depth == RecursionMaxDepth || needToGatherAny(VL)) + return getGatherCost(VecTy); + + BasicBlock *BB = getSameBlock(VL); + unsigned Opcode = getSameOpcode(VL); + assert(Opcode && BB && "Invalid Instruction Value"); + + // Check if it is safe to sink the loads or the stores. + if (Opcode == Instruction::Load || Opcode == Instruction::Store) { + int MaxIdx = getLastIndex(VL); + Instruction *Last = getInstructionForIndex(MaxIdx, BB); + + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (VL[i] == Last) + continue; + Value *Barrier = getSinkBarrier(cast(VL[i]), Last); + if (Barrier) { + DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last + << "\n because of " << *Barrier << "\n"); + return MAX_COST; + } + } + } + + Instruction *VL0 = cast(VL[0]); + switch (Opcode) { + case Instruction::ExtractElement: { + if (CanReuseExtract(VL, VL.size(), VecTy)) + return 0; + return getGatherCost(VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList Operands; + Type *SrcTy = VL0->getOperand(0)->getType(); + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) { + Operands.push_back(cast(VL[j])->getOperand(0)); + // Check that the casted type is the same for all users. + if (cast(VL[j])->getOperand(0)->getType() != SrcTy) + return getGatherCost(VecTy); + } + + int Cost = getTreeCost_rec(Operands, Depth + 1); + if (Cost == FuncSLP::MAX_COST) + return Cost; + + // Calculate the cost of this instruction. + int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), + VL0->getType(), SrcTy); + + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + Cost += (VecCost - ScalarCost); + return Cost; + } + case Instruction::FCmp: + case Instruction::ICmp: { + // Check that all of the compares have the same predicate. + CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + CmpInst *Cmp = cast(VL[i]); + if (Cmp->getPredicate() != P0) + return getGatherCost(VecTy); + } + // Fall through. 
+ } + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + int TotalCost = 0; + // Calculate the cost of all of the operands. + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + int Cost = getTreeCost_rec(Operands, Depth + 1); + if (Cost == MAX_COST) + return MAX_COST; + TotalCost += TotalCost; + } + + // Calculate the cost of this instruction. + int ScalarCost = 0; + int VecCost = 0; + if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || + Opcode == Instruction::Select) { + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + ScalarCost = + VecTy->getNumElements() * + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); + VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + } else { + ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + } + TotalCost += (VecCost - ScalarCost); + return TotalCost; + } + case Instruction::Load: { + // If we are scalarize the loads, add the cost of forming the vector. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) + return getGatherCost(VecTy); + + // Cost of wide load - cost of scalar loads. + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { + // We know that we can merge the stores. Calculate the cost. + int ScalarStCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int StoreCost = VecStCost - ScalarStCost; + + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) { + Operands.push_back(cast(VL[j])->getOperand(0)); + MemBarrierIgnoreList.insert(VL[j]); + } + + int Cost = getTreeCost_rec(Operands, Depth + 1); + if (Cost == MAX_COST) + return MAX_COST; + + int TotalCost = StoreCost + Cost; + return TotalCost; + } + default: + // Unable to vectorize unknown instructions. + return getGatherCost(VecTy); + } +} + +int FuncSLP::getTreeCost(ArrayRef VL) { + // Get rid of the list of stores that were removed, and from the + // lists of instructions with multiple users. + MemBarrierIgnoreList.clear(); + LaneMap.clear(); + MultiUserVals.clear(); + MustGather.clear(); + + if (!getSameBlock(VL)) + return MAX_COST; + + // Find the location of the last root. + int LastRootIndex = getLastIndex(VL); + int FirstUserIndex = getFirstUserIndex(VL); + + // Don't vectorize if there are users of the tree roots inside the tree + // itself. + if (LastRootIndex > FirstUserIndex) + return MAX_COST; + + // Scan the tree and find which value is used by which lane, and which values + // must be scalarized. 
+ getTreeUses_rec(VL, 0); + + // Check that instructions with multiple users can be vectorized. Mark unsafe + // instructions. + for (SetVector::iterator it = MultiUserVals.begin(), + e = MultiUserVals.end(); + it != e; ++it) { + // Check that all of the users of this instr are within the tree. + for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); + I != E; ++I) { + if (LaneMap.find(*I) == LaneMap.end()) { + DEBUG(dbgs() << "SLP: Adding to MustExtract " + "because of an out of tree usage.\n"); + MustGather.insert(*it); + continue; + } + } + } + + // Now calculate the cost of vectorizing the tree. + return getTreeCost_rec(VL, 0); +} +bool FuncSLP::vectorizeStoreChain(ArrayRef Chain, int CostThreshold) { + unsigned ChainLen = Chain.size(); + DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + << "\n"); + Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); + unsigned Sz = DL->getTypeSizeInBits(StoreTy); + unsigned VF = MinVecRegSize / Sz; + + if (!isPowerOf2_32(Sz) || VF < 2) + return false; + + bool Changed = false; + // Look for profitable vectorizable trees at all offsets, starting at zero. + for (unsigned i = 0, e = ChainLen; i < e; ++i) { + if (i + VF > e) + break; + DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i + << "\n"); + ArrayRef Operands = Chain.slice(i, VF); + + int Cost = getTreeCost(Operands); + if (Cost == FuncSLP::MAX_COST) + continue; + DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + vectorizeTree(Operands); + i += VF - 1; + Changed = true; + } + } + + if (Changed || ChainLen > VF) + return Changed; + + // Handle short chains. This helps us catch types such as <3 x float> that + // are smaller than vector size. + int Cost = getTreeCost(Chain); + if (Cost == FuncSLP::MAX_COST) + return false; + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost + << " for size = " << ChainLen << "\n"); + vectorizeTree(Chain); + return true; + } + + return false; +} + +bool FuncSLP::vectorizeStores(ArrayRef Stores, int costThreshold) { + SetVector Heads, Tails; + SmallDenseMap ConsecutiveChain; + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we vectorized so that we don't visit the same store twice. + ValueSet VectorizedStores; + bool Changed = false; + + // Do a quadratic search on all of the given stores and find + // all of the pairs of loads that follow each other. + for (unsigned i = 0, e = Stores.size(); i < e; ++i) + for (unsigned j = 0; j < e; ++j) { + if (i == j) + continue; + + if (isConsecutiveAccess(Stores[i], Stores[j])) { + Tails.insert(Stores[j]); + Heads.insert(Stores[i]); + ConsecutiveChain[Stores[i]] = Stores[j]; + } + } + + // For stores that start but don't end a link in the chain: + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to vectorize it. + ValueList Operands; + Value *I = *it; + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (VectorizedStores.count(I)) + break; + Operands.push_back(I); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + bool Vectorized = vectorizeStoreChain(Operands, costThreshold); + + // Mark the vectorized stores so that we don't vectorize them again. 
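// Editor's sketch, not part of the patch: how the Heads/Tails/ConsecutiveChain
// bookkeeping in vectorizeStores turns pairwise "B directly follows A" facts
// into maximal chains. Stores are modelled as integer offsets, so being
// consecutive simply means offset + 1.
#include <cassert>
#include <map>
#include <set>
#include <vector>

static std::vector<std::vector<int>> buildChains(const std::vector<int> &Stores) {
  std::set<int> Heads, Tails;
  std::map<int, int> Next; // ConsecutiveChain: a store -> the store after it.
  for (int A : Stores)
    for (int B : Stores)
      if (B == A + 1) { // stand-in for isConsecutiveAccess(A, B).
        Heads.insert(A);
        Tails.insert(B);
        Next[A] = B;
      }
  std::vector<std::vector<int>> Chains;
  for (int H : Heads) {
    if (Tails.count(H))
      continue; // Only stores that start (and do not continue) a chain.
    std::vector<int> Chain;
    for (int I = H; Heads.count(I) || Tails.count(I); I = Next[I])
      Chain.push_back(I);
    Chains.push_back(Chain);
  }
  return Chains;
}

int main() {
  std::vector<std::vector<int>> Chains = buildChains({4, 1, 3, 2, 9, 8});
  assert(Chains.size() == 2); // {1,2,3,4} and {8,9}.
  assert(Chains[0].size() == 4 && Chains[1].size() == 2);
  return 0;
}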
+ if (Vectorized) + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed |= Vectorized; + } + + return Changed; +} + +Value *FuncSLP::Gather(ArrayRef VL, VectorType *Ty) { + Value *Vec = UndefValue::get(Ty); + // Generate the 'InsertElement' instruction. + for (unsigned i = 0; i < Ty->getNumElements(); ++i) { + Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); + if (Instruction *I = dyn_cast(Vec)) + GatherSeq.insert(I); + } + + VectorizedValues[VL[0]] = Vec; + return Vec; +} + +Value *FuncSLP::vectorizeTree_rec(ArrayRef VL) { + BuilderLocGuard Guard(Builder); + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (needToGatherAny(VL)) + return Gather(VL, VecTy); + + if (VectorizedValues.count(VL[0])) { + DEBUG(dbgs() << "SLP: Diamond merged at depth.\n"); + return VectorizedValues[VL[0]]; + } + + Instruction *VL0 = cast(VL[0]); + unsigned Opcode = VL0->getOpcode(); + assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); + + switch (Opcode) { + case Instruction::ExtractElement: { + if (CanReuseExtract(VL, VL.size(), VecTy)) + return VL0->getOperand(0); + return Gather(VL, VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList INVL; + for (int i = 0, e = VL.size(); i < e; ++i) + INVL.push_back(cast(VL[i])->getOperand(0)); + + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *InVec = vectorizeTree_rec(INVL); + CastInst *CI = dyn_cast(VL0); + Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + VectorizedValues[VL0] = V; + return V; + } + case Instruction::FCmp: + case Instruction::ICmp: { + // Check that all of the compares have the same predicate. 
+ CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + CmpInst *Cmp = cast(VL[i]); + if (Cmp->getPredicate() != P0) + return Gather(VL, VecTy); + } + + ValueList LHSV, RHSV; + for (int i = 0, e = VL.size(); i < e; ++i) { + LHSV.push_back(cast(VL[i])->getOperand(0)); + RHSV.push_back(cast(VL[i])->getOperand(1)); + } + + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *L = vectorizeTree_rec(LHSV); + Value *R = vectorizeTree_rec(RHSV); + Value *V; + + if (Opcode == Instruction::FCmp) + V = Builder.CreateFCmp(P0, L, R); + else + V = Builder.CreateICmp(P0, L, R); + + VectorizedValues[VL0] = V; + return V; + } + case Instruction::Select: { + ValueList TrueVec, FalseVec, CondVec; + for (int i = 0, e = VL.size(); i < e; ++i) { + CondVec.push_back(cast(VL[i])->getOperand(0)); + TrueVec.push_back(cast(VL[i])->getOperand(1)); + FalseVec.push_back(cast(VL[i])->getOperand(2)); + } + + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *True = vectorizeTree_rec(TrueVec); + Value *False = vectorizeTree_rec(FalseVec); + Value *Cond = vectorizeTree_rec(CondVec); + Value *V = Builder.CreateSelect(Cond, True, False); + VectorizedValues[VL0] = V; + return V; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + ValueList LHSVL, RHSVL; + for (int i = 0, e = VL.size(); i < e; ++i) { + LHSVL.push_back(cast(VL[i])->getOperand(0)); + RHSVL.push_back(cast(VL[i])->getOperand(1)); + } + + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *LHS = vectorizeTree_rec(LHSVL); + Value *RHS = vectorizeTree_rec(RHSVL); + + if (LHS == RHS) { + assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); + } + + BinaryOperator *BinOp = cast(VL0); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); + VectorizedValues[VL0] = V; + return V; + } + case Instruction::Load: { + // Check if all of the loads are consecutive. + for (unsigned i = 1, e = VL.size(); i < e; ++i) + if (!isConsecutiveAccess(VL[i - 1], VL[i])) + return Gather(VL, VecTy); + + // Loads are inserted at the head of the tree because we don't want to + // sink them all the way down past store instructions. 
+ Builder.SetInsertPoint(getLastInstruction(VL)); + LoadInst *LI = cast(VL0); + Value *VecPtr = + Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo()); + unsigned Alignment = LI->getAlignment(); + LI = Builder.CreateLoad(VecPtr); + LI->setAlignment(Alignment); + + VectorizedValues[VL0] = LI; + return LI; + } + case Instruction::Store: { + StoreInst *SI = cast(VL0); + unsigned Alignment = SI->getAlignment(); + + ValueList ValueOp; + for (int i = 0, e = VL.size(); i < e; ++i) + ValueOp.push_back(cast(VL[i])->getValueOperand()); + + Value *VecValue = vectorizeTree_rec(ValueOp); + + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *VecPtr = + Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); + Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); + + for (int i = 0, e = VL.size(); i < e; ++i) + cast(VL[i])->eraseFromParent(); + return 0; + } + default: + return Gather(VL, VecTy); + } +} + +Value *FuncSLP::vectorizeTree(ArrayRef VL) { + Builder.SetInsertPoint(getLastInstruction(VL)); + Value *V = vectorizeTree_rec(VL); + + // We moved some instructions around. We have to number them again + // before we can do any analysis. + MustGather.clear(); + VectorizedValues.clear(); + MemBarrierIgnoreList.clear(); + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) + BlocksNumbers[it].forget(); + return V; +} + +Value *FuncSLP::vectorizeArith(ArrayRef Operands) { + Value *Vec = vectorizeTree(Operands); + // After vectorizing the operands we need to generate extractelement + // instructions and replace all of the uses of the scalar values with + // the values that we extracted from the vectorized tree. + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); + Operands[i]->replaceAllUsesWith(S); + } + + return Vec; +} + +void FuncSLP::hoistGatherSequence() { + for (SetVector::iterator it = GatherSeq.begin(), + e = GatherSeq.end(); + it != e; ++it) { + InsertElementInst *Insert = dyn_cast_or_null(*it); + + // The InsertElement sequence can be simplified into a constant. + // Also Ignore NULL pointers because they are only here to separate + // sequences. + if (!Insert) + continue; + + BasicBlock *BB = Insert->getParent(); + + // Check if this block is inside a loop. + Loop *L = LI->getLoopFor(BB); + if (!L) + return; + + // Check if it has a preheader. + BasicBlock *PreHeader = L->getLoopPreheader(); + if (!PreHeader) + return; + + // If the vector or the element that we insert into it are + // instructions that are defined in this basic block then we can't + // hoist this instruction. + Instruction *CurrVec = dyn_cast(Insert->getOperand(0)); + Instruction *NewElem = dyn_cast(Insert->getOperand(1)); + if (CurrVec && L->contains(CurrVec)) + continue; + if (NewElem && L->contains(NewElem)) + continue; + + // Mark the insertion point for the block. + Instruction *Location = PreHeader->getTerminator(); + // We can hoist this instruction. Move it to the pre-header. + Insert->moveBefore(Location); + } +} + /// The SLPVectorizer Pass. 
struct SLPVectorizer : public FunctionPass { - typedef MapVector StoreListMap; + typedef SmallVector StoreList; + typedef MapVector StoreListMap; /// Pass identification, replacement for typeid static char ID; @@ -80,34 +1213,26 @@ struct SLPVectorizer : public FunctionPass { DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); + // Use the bollom up slp vectorizer to construct chains that start with + // he store instructions. + FuncSLP R(&F, SE, DL, TTI, AA, LI); + for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) { BasicBlock *BB = it; - bool BBChanged = false; - - // Use the bollom up slp vectorizer to construct chains that start with - // he store instructions. - BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB)); // Vectorize trees that end at reductions. - BBChanged |= vectorizeChainsInBlock(BB, R); + Changed |= vectorizeChainsInBlock(BB, R); // Vectorize trees that end at stores. if (unsigned count = collectStores(BB, R)) { (void)count; DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n"); - BBChanged |= vectorizeStoreChains(R); + Changed |= vectorizeStoreChains(R); } - - // Try to hoist some of the scalarization code to the preheader. - if (BBChanged) { - hoistGatherSequence(LI, BB, R); - Changed |= vectorizeUsingGatherHints(R.getGatherSeqInstructions()); - } - - Changed |= BBChanged; } if (Changed) { + R.hoistGatherSequence(); DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); DEBUG(verifyFunction(F)); } @@ -128,42 +1253,31 @@ private: /// object. We sort the stores to their base objects to reduce the cost of the /// quadratic search on the stores. TODO: We can further reduce this cost /// if we flush the chain creation every time we run into a memory barrier. - unsigned collectStores(BasicBlock *BB, BoUpSLP &R); + unsigned collectStores(BasicBlock *BB, FuncSLP &R); /// \brief Try to vectorize a chain that starts at two arithmetic instrs. - bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); + bool tryToVectorizePair(Value *A, Value *B, FuncSLP &R); /// \brief Try to vectorize a list of operands. If \p NeedExtracts is true /// then we calculate the cost of extracting the scalars from the vector. /// \returns true if a value was vectorized. - bool tryToVectorizeList(ArrayRef VL, BoUpSLP &R, bool NeedExtracts); + bool tryToVectorizeList(ArrayRef VL, FuncSLP &R, bool NeedExtracts); /// \brief Try to vectorize a chain that may start at the operands of \V; - bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); + bool tryToVectorize(BinaryOperator *V, FuncSLP &R); /// \brief Vectorize the stores that were collected in StoreRefs. - bool vectorizeStoreChains(BoUpSLP &R); - - /// \brief Try to hoist gather sequences outside of the loop in cases where - /// all of the sources are loop invariant. - void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R); - - /// \brief Try to vectorize additional sequences in different basic blocks - /// based on values that we gathered in previous blocks. The list \p Gathers - /// holds the gather InsertElement instructions that were generated during - /// vectorization. - /// \returns True if some code was vectorized. - bool vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers); + bool vectorizeStoreChains(FuncSLP &R); /// \brief Scan the basic block and look for patterns that are likely to start /// a vectorization chain. 
- bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); + bool vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R); private: StoreListMap StoreRefs; }; -unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { +unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) { unsigned count = 0; StoreRefs.clear(); for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { @@ -188,14 +1302,14 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { return count; } -bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { +bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, FuncSLP &R) { if (!A || !B) return false; Value *VL[] = { A, B }; return tryToVectorizeList(VL, R, true); } -bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, +bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, FuncSLP &R, bool NeedExtracts) { if (VL.size() < 2) return false; @@ -219,7 +1333,10 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, } int Cost = R.getTreeCost(VL); - int ExtrCost = NeedExtracts ? R.getScalarizationCost(VL) : 0; + if (Cost == FuncSLP::MAX_COST) + return false; + + int ExtrCost = NeedExtracts ? R.getGatherCost(VL) : 0; DEBUG(dbgs() << "SLP: Cost of pair:" << Cost << " Cost of extract:" << ExtrCost << ".\n"); if ((Cost + ExtrCost) >= -SLPCostThreshold) @@ -229,10 +1346,10 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, return true; } -bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { +bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) { if (!V) return false; - + // Try to vectorize V. if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) return true; @@ -269,7 +1386,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { return 0; } -bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { +bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) { bool Changed = false; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { if (isa(it)) @@ -292,7 +1409,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { Value *Inst = BI->getOperand(0); if (Inst == P) Inst = BI->getOperand(1); - + Changed |= tryToVectorize(dyn_cast(Inst), R); continue; } @@ -337,7 +1454,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return Changed; } -bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { +bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) { bool Changed = false; // Attempt to sort and vectorize each of the store-groups. for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); @@ -353,92 +1470,6 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { return Changed; } -bool SLPVectorizer::vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers) { - SmallVector Seq; - bool Changed = false; - for (int i = 0, e = Gathers.size(); i < e; ++i) { - InsertElementInst *IEI = dyn_cast_or_null(Gathers[i]); - - if (IEI) { - if (Instruction *I = dyn_cast(IEI->getOperand(1))) - Seq.push_back(I); - } else { - - if (!Seq.size()) - continue; - - Instruction *I = cast(Seq[0]); - BasicBlock *BB = I->getParent(); - - DEBUG(dbgs() << "SLP: Inspecting a gather list of size " << Seq.size() - << " in " << BB->getName() << ".\n"); - - // Check if the gathered values have multiple uses. If they only have one - // user then we know that the insert/extract pair will go away. 
- bool HasMultipleUsers = false; - for (int i = 0; e = Seq.size(), i < e; ++i) { - if (!Seq[i]->hasOneUse()) { - HasMultipleUsers = true; - break; - } - } - - BoUpSLP BO(BB, SE, DL, TTI, AA, LI->getLoopFor(BB)); - - if (tryToVectorizeList(Seq, BO, HasMultipleUsers)) { - DEBUG(dbgs() << "SLP: Vectorized a gather list of len " << Seq.size() - << " in " << BB->getName() << ".\n"); - Changed = true; - } - - Seq.clear(); - } - } - - return Changed; -} - -void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, - BoUpSLP &R) { - // Check if this block is inside a loop. - Loop *L = LI->getLoopFor(BB); - if (!L) - return; - - // Check if it has a preheader. - BasicBlock *PreHeader = L->getLoopPreheader(); - if (!PreHeader) - return; - - // Mark the insertion point for the block. - Instruction *Location = PreHeader->getTerminator(); - - BoUpSLP::InstrList &Gathers = R.getGatherSeqInstructions(); - for (BoUpSLP::InstrList::iterator it = Gathers.begin(), e = Gathers.end(); - it != e; ++it) { - InsertElementInst *Insert = dyn_cast_or_null(*it); - - // The InsertElement sequence can be simplified into a constant. - // Also Ignore NULL pointers because they are only here to separate - // sequences. - if (!Insert) - continue; - - // If the vector or the element that we insert into it are - // instructions that are defined in this basic block then we can't - // hoist this instruction. - Instruction *CurrVec = dyn_cast(Insert->getOperand(0)); - Instruction *NewElem = dyn_cast(Insert->getOperand(1)); - if (CurrVec && L->contains(CurrVec)) - continue; - if (NewElem && L->contains(NewElem)) - continue; - - // We can hoist this instruction. Move it to the pre-header. - Insert->moveBefore(Location); - } -} - } // end anonymous namespace char SLPVectorizer::ID = 0; diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp deleted file mode 100644 index 3db4adf95c8..00000000000 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ /dev/null @@ -1,1031 +0,0 @@ -//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "SLP" - -#include "VecUtils.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Value.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include -#include - -using namespace llvm; - -static const unsigned MinVecRegSize = 128; - -static const unsigned RecursionMaxDepth = 6; - -namespace llvm { - -BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) - : Builder(S->getContext()), BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { - numberInstructions(); -} - -void BoUpSLP::numberInstructions() { - int Loc = 0; - InstrIdx.clear(); - InstrVec.clear(); - // Number the instructions in the block. - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - InstrIdx[it] = Loc++; - InstrVec.push_back(it); - assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); - } -} - -Value *BoUpSLP::getPointerOperand(Value *I) { - if (LoadInst *LI = dyn_cast(I)) - return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast(I)) - return SI->getPointerOperand(); - return 0; -} - -unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { - if (LoadInst *L = dyn_cast(I)) - return L->getPointerAddressSpace(); - if (StoreInst *S = dyn_cast(I)) - return S->getPointerAddressSpace(); - return -1; -} - -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { - Value *PtrA = getPointerOperand(A); - Value *PtrB = getPointerOperand(B); - unsigned ASA = getAddressSpaceOperand(A); - unsigned ASB = getAddressSpaceOperand(B); - - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) - return false; - - // Check that A and B are of the same type. - if (PtrA->getType() != PtrB->getType()) - return false; - - // Calculate the distance. - const SCEV *PtrSCEVA = SE->getSCEV(PtrA); - const SCEV *PtrSCEVB = SE->getSCEV(PtrB); - const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); - const SCEVConstant *ConstOffSCEV = dyn_cast(OffsetSCEV); - - // Non constant distance. - if (!ConstOffSCEV) - return false; - - int64_t Offset = ConstOffSCEV->getValue()->getSExtValue(); - Type *Ty = cast(PtrA->getType())->getElementType(); - // The Instructions are connsecutive if the size of the first load/store is - // the same as the offset. 
- int64_t Sz = DL->getTypeStoreSize(Ty); - return ((-Offset) == Sz); -} - -bool BoUpSLP::vectorizeStoreChain(ArrayRef Chain, int CostThreshold) { - unsigned ChainLen = Chain.size(); - DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen - << "\n"); - Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; - - if (!isPowerOf2_32(Sz) || VF < 2) - return false; - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i < e; ++i) { - if (i + VF > e) - break; - DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - ArrayRef Operands = Chain.slice(i, VF); - - int Cost = getTreeCost(Operands); - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Operands, VF))); - vectorizeTree(Operands, VF); - i += VF - 1; - Changed = true; - } - } - - if (Changed || ChainLen > VF) - return Changed; - - // Handle short chains. This helps us catch types such as <3 x float> that - // are smaller than vector size. - int Cost = getTreeCost(Chain); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost - << " for size = " << ChainLen << "\n"); - Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Chain, ChainLen))); - vectorizeTree(Chain, ChainLen); - return true; - } - - return false; -} - -bool BoUpSLP::vectorizeStores(ArrayRef Stores, int costThreshold) { - SetVector Heads, Tails; - SmallDenseMap ConsecutiveChain; - - // We may run into multiple chains that merge into a single chain. We mark the - // stores that we vectorized so that we don't visit the same store twice. - ValueSet VectorizedStores; - bool Changed = false; - - // Do a quadratic search on all of the given stores and find - // all of the pairs of loads that follow each other. - for (unsigned i = 0, e = Stores.size(); i < e; ++i) - for (unsigned j = 0; j < e; ++j) { - if (i == j) - continue; - - if (isConsecutiveAccess(Stores[i], Stores[j])) { - Tails.insert(Stores[j]); - Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; - } - } - - // For stores that start but don't end a link in the chain: - for (SetVector::iterator it = Heads.begin(), e = Heads.end(); - it != e; ++it) { - if (Tails.count(*it)) - continue; - - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. - ValueList Operands; - Value *I = *it; - // Collect the chain into a list. - while (Tails.count(I) || Heads.count(I)) { - if (VectorizedStores.count(I)) - break; - Operands.push_back(I); - // Move to the next value in the chain. - I = ConsecutiveChain[I]; - } - - bool Vectorized = vectorizeStoreChain(Operands, costThreshold); - - // Mark the vectorized stores so that we don't vectorize them again. - if (Vectorized) - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed |= Vectorized; - } - - return Changed; -} - -int BoUpSLP::getScalarizationCost(ArrayRef VL) { - // Find the type of the operands in VL. - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - // Find the cost of inserting/extracting values from the vector. 
- return getScalarizationCost(VecTy); -} - -int BoUpSLP::getScalarizationCost(Type *Ty) { - int Cost = 0; - for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); - return Cost; -} - -AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { - if (StoreInst *SI = dyn_cast(I)) - return AA->getLocation(SI); - if (LoadInst *LI = dyn_cast(I)) - return AA->getLocation(LI); - return AliasAnalysis::Location(); -} - -Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { - assert(Src->getParent() == Dst->getParent() && "Not the same BB"); - BasicBlock::iterator I = Src, E = Dst; - /// Scan all of the instruction from SRC to DST and check if - /// the source may alias. - for (++I; I != E; ++I) { - // Ignore store instructions that are marked as 'ignore'. - if (MemBarrierIgnoreList.count(I)) - continue; - if (Src->mayWriteToMemory()) /* Write */ { - if (!I->mayReadOrWriteMemory()) - continue; - } else /* Read */ { - if (!I->mayWriteToMemory()) - continue; - } - AliasAnalysis::Location A = getLocation(&*I); - AliasAnalysis::Location B = getLocation(Src); - - if (!A.Ptr || !B.Ptr || AA->alias(A, B)) - return I; - } - return 0; -} - -Value *BoUpSLP::vectorizeArith(ArrayRef Operands) { - int LastIdx = getLastIndex(Operands, Operands.size()); - Instruction *Loc = getInsertionPoint(LastIdx); - Builder.SetInsertPoint(Loc); - - assert(getFirstUserIndex(Operands, Operands.size()) > LastIdx && - "Vectorizing with in-tree users"); - - Value *Vec = vectorizeTree(Operands, Operands.size()); - // After vectorizing the operands we need to generate extractelement - // instructions and replace all of the uses of the scalar values with - // the values that we extracted from the vectorized tree. - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); - Operands[i]->replaceAllUsesWith(S); - } - - return Vec; -} - -int BoUpSLP::getTreeCost(ArrayRef VL) { - // Get rid of the list of stores that were removed, and from the - // lists of instructions with multiple users. - MemBarrierIgnoreList.clear(); - LaneMap.clear(); - MultiUserVals.clear(); - MustScalarize.clear(); - MustExtract.clear(); - - // Find the location of the last root. - int LastRootIndex = getLastIndex(VL, VL.size()); - int FirstUserIndex = getFirstUserIndex(VL, VL.size()); - - // Don't vectorize if there are users of the tree roots inside the tree - // itself. - if (LastRootIndex > FirstUserIndex) - return max_cost; - - // Scan the tree and find which value is used by which lane, and which values - // must be scalarized. - getTreeUses_rec(VL, 0); - - // Check that instructions with multiple users can be vectorized. Mark unsafe - // instructions. - for (SetVector::iterator it = MultiUserVals.begin(), - e = MultiUserVals.end(); - it != e; ++it) { - // Check that all of the users of this instr are within the tree - // and that they are all from the same lane. - int Lane = -1; - for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); - I != E; ++I) { - if (LaneMap.find(*I) == LaneMap.end()) { - DEBUG(dbgs() << "SLP: Instr " << **it << " has multiple users.\n"); - - // We don't have an ordering problem if the user is not in this basic - // block. - Instruction *Inst = cast(*I); - if (Inst->getParent() != BB) { - MustExtract.insert(*it); - continue; - } - - // We don't have an ordering problem if the user is after the last root. 
- int Idx = InstrIdx[Inst]; - if (Idx < LastRootIndex) { - MustScalarize.insert(*it); - DEBUG(dbgs() << "SLP: Adding to MustScalarize " - "because of an unsafe out of tree usage.\n"); - break; - } - - DEBUG(dbgs() << "SLP: Adding to MustExtract " - "because of a safe out of tree usage.\n"); - MustExtract.insert(*it); - continue; - } - if (Lane == -1) - Lane = LaneMap[*I]; - if (Lane != LaneMap[*I]) { - MustScalarize.insert(*it); - DEBUG(dbgs() << "SLP: Adding " << **it - << " to MustScalarize because multiple lane use it: " - << Lane << " and " << LaneMap[*I] << ".\n"); - break; - } - } - } - - // Now calculate the cost of vectorizing the tree. - return getTreeCost_rec(VL, 0); -} - -static bool CanReuseExtract(ArrayRef VL, unsigned VF, - VectorType *VecTy) { - // Check if all of the extracts come from the same vector and from the - // correct offset. - Value *VL0 = VL[0]; - ExtractElementInst *E0 = cast(VL0); - Value *Vec = E0->getOperand(0); - - // We have to extract from the same vector type. - if (Vec->getType() != VecTy) - return false; - - // Check that all of the indices extract from the correct offset. - ConstantInt *CI = dyn_cast(E0->getOperand(1)); - if (!CI || CI->getZExtValue()) - return false; - - for (unsigned i = 1, e = VF; i < e; ++i) { - ExtractElementInst *E = cast(VL[i]); - ConstantInt *CI = dyn_cast(E->getOperand(1)); - - if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) - return false; - } - - return true; -} - -void BoUpSLP::getTreeUses_rec(ArrayRef VL, unsigned Depth) { - if (Depth == RecursionMaxDepth) - return; - - // Don't handle vectors. - if (VL[0]->getType()->isVectorTy()) - return; - - if (StoreInst *SI = dyn_cast(VL[0])) - if (SI->getValueOperand()->getType()->isVectorTy()) - return; - - // Check if all of the operands are constants. - bool AllConst = true; - bool AllSameScalar = true; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - AllConst &= isa(VL[i]); - AllSameScalar &= (VL[0] == VL[i]); - Instruction *I = dyn_cast(VL[i]); - // If one of the instructions is out of this BB, we need to scalarize all. - if (I && I->getParent() != BB) - return; - } - - // If all of the operands are identical or constant we have a simple solution. - if (AllConst || AllSameScalar) - return; - - // Scalarize unknown structures. - Instruction *VL0 = dyn_cast(VL[0]); - if (!VL0) - return; - - unsigned Opcode = VL0->getOpcode(); - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast(VL[i]); - // If not all of the instructions are identical then we have to scalarize. - if (!I || Opcode != I->getOpcode()) - return; - } - - for (int i = 0, e = VL.size(); i < e; ++i) { - // Check that the instruction is only used within - // one lane. - if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) - return; - // Make this instruction as 'seen' and remember the lane. - LaneMap[VL[i]] = i; - } - - // Mark instructions with multiple users. - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast(VL[i]); - // Remember to check if all of the users of this instr are vectorized - // within our tree. At depth zero we have no local users, only external - // users that we don't care about. 
- if (Depth && I && I->getNumUses() > 1) { - DEBUG(dbgs() << "SLP: Adding to MultiUserVals " - "because it has multiple users:" << *I << " \n"); - MultiUserVals.insert(I); - } - } - - switch (Opcode) { - case Instruction::ExtractElement: { - VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size()); - // No need to follow ExtractElements that are going to be optimized away. - if (CanReuseExtract(VL, VL.size(), VecTy)) - return; - // Fall through. - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: - case Instruction::Select: - case Instruction::ICmp: - case Instruction::FCmp: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getOperand(i)); - - getTreeUses_rec(Operands, Depth + 1); - } - return; - } - case Instruction::Store: { - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getOperand(0)); - getTreeUses_rec(Operands, Depth + 1); - return; - } - default: - return; - } -} - -int BoUpSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { - Type *ScalarTy = VL[0]->getType(); - - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - - /// Don't mess with vectors. - if (ScalarTy->isVectorTy()) - return max_cost; - - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - - if (Depth == RecursionMaxDepth) - return getScalarizationCost(VecTy); - - // Check if all of the operands are constants. - bool AllConst = true; - bool AllSameScalar = true; - bool MustScalarizeFlag = false; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - AllConst &= isa(VL[i]); - AllSameScalar &= (VL[0] == VL[i]); - // Must have a single use. - Instruction *I = dyn_cast(VL[i]); - MustScalarizeFlag |= MustScalarize.count(VL[i]); - // This instruction is outside the basic block. - if (I && I->getParent() != BB) - return getScalarizationCost(VecTy); - } - - // Is this a simple vector constant. - if (AllConst) - return 0; - - // If all of the operands are identical we can broadcast them. - Instruction *VL0 = dyn_cast(VL[0]); - if (AllSameScalar) { - // If we are in a loop, and this is not an instruction (e.g. constant or - // argument) or the instruction is defined outside the loop then assume - // that the cost is zero. - if (L && (!VL0 || !L->contains(VL0))) - return 0; - - // We need to broadcast the scalar. - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); - } - - // If this is not a constant, or a scalar from outside the loop then we - // need to scalarize it. 
- if (MustScalarizeFlag) - return getScalarizationCost(VecTy); - - if (!VL0) - return getScalarizationCost(VecTy); - assert(VL0->getParent() == BB && "Wrong BB"); - - unsigned Opcode = VL0->getOpcode(); - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast(VL[i]); - // If not all of the instructions are identical then we have to scalarize. - if (!I || Opcode != I->getOpcode()) - return getScalarizationCost(VecTy); - } - - // Check if it is safe to sink the loads or the stores. - if (Opcode == Instruction::Load || Opcode == Instruction::Store) { - int MaxIdx = getLastIndex(VL, VL.size()); - Instruction *Last = InstrVec[MaxIdx]; - - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - if (VL[i] == Last) - continue; - Value *Barrier = isUnsafeToSink(cast(VL[i]), Last); - if (Barrier) { - DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last - << "\n because of " << *Barrier << "\n"); - return max_cost; - } - } - } - - // Calculate the extract cost. - unsigned ExternalUserExtractCost = 0; - for (unsigned i = 0, e = VL.size(); i < e; ++i) - if (MustExtract.count(VL[i])) - ExternalUserExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); - - switch (Opcode) { - case Instruction::ExtractElement: { - if (CanReuseExtract(VL, VL.size(), VecTy)) - return 0; - return getScalarizationCost(VecTy); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - int Cost = ExternalUserExtractCost; - ValueList Operands; - Type *SrcTy = VL0->getOperand(0)->getType(); - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast(VL[j])->getOperand(0)); - // Check that the casted type is the same for all users. - if (cast(VL[j])->getOperand(0)->getType() != SrcTy) - return getScalarizationCost(VecTy); - } - - Cost += getTreeCost_rec(Operands, Depth + 1); - if (Cost >= max_cost) - return max_cost; - - // Calculate the cost of this instruction. - int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy); - - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); - Cost += (VecCost - ScalarCost); - return Cost; - } - case Instruction::FCmp: - case Instruction::ICmp: { - // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); - if (Cmp->getPredicate() != P0) - return getScalarizationCost(VecTy); - } - // Fall through. - } - case Instruction::Select: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - int Cost = ExternalUserExtractCost; - // Calculate the cost of all of the operands. 
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-      Cost += getTreeCost_rec(Operands, Depth + 1);
-      if (Cost >= max_cost)
-        return max_cost;
-    }
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = 0;
-    int VecCost = 0;
-    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
-        Opcode == Instruction::Select) {
-      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-      ScalarCost =
-          VecTy->getNumElements() *
-          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
-    } else {
-      ScalarCost = VecTy->getNumElements() *
-                   TTI->getArithmeticInstrCost(Opcode, ScalarTy);
-      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
-    }
-    Cost += (VecCost - ScalarCost);
-    return Cost;
-  }
-  case Instruction::Load: {
-    // If we scalarize the loads, add the cost of forming the vector.
-    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
-      if (!isConsecutiveAccess(VL[i], VL[i + 1]))
-        return getScalarizationCost(VecTy);
-
-    // Cost of wide load - cost of scalar loads.
-    int ScalarLdCost = VecTy->getNumElements() *
-                       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
-    return VecLdCost - ScalarLdCost + ExternalUserExtractCost;
-  }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    int ScalarStCost = VecTy->getNumElements() *
-                       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
-    int StoreCost = VecStCost - ScalarStCost;
-
-    ValueList Operands;
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      MemBarrierIgnoreList.insert(VL[j]);
-    }
-
-    int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
-    return TotalCost + ExternalUserExtractCost;
-  }
-  default:
-    // Unable to vectorize unknown instructions.
-    return getScalarizationCost(VecTy);
-  }
-}
-
-int BoUpSLP::getLastIndex(ArrayRef<Value *> VL, unsigned VF) {
-  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
-  for (unsigned i = 0; i < VF; ++i)
-    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
-  return MaxIdx;
-}
-
-int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF) {
-  // Find the first user of the values.
-  int FirstUser = InstrVec.size();
-  for (unsigned i = 0; i < VF; ++i) {
-    for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
-         U != UE; ++U) {
-      Instruction *Instr = dyn_cast<Instruction>(*U);
-      if (!Instr || Instr->getParent() != BB)
-        continue;
-
-      FirstUser = std::min(FirstUser, InstrIdx[Instr]);
-    }
-  }
-  return FirstUser;
-}
-
-int BoUpSLP::getLastIndex(Instruction *I, Instruction *J) {
-  assert(I->getParent() == BB && "Invalid parent for instruction I");
-  assert(J->getParent() == BB && "Invalid parent for instruction J");
-  return std::max(InstrIdx[I], InstrIdx[J]);
-}
-
-Instruction *BoUpSLP::getInsertionPoint(unsigned Index) {
-  return InstrVec[Index + 1];
-}
-
-Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
-  Value *Vec = UndefValue::get(Ty);
-  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
-    // Generate the 'InsertElement' instruction.
-    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
-    // Remember that this instruction is used as part of a 'gather' sequence.
-    // The caller of the bottom-up slp vectorizer can try to hoist the sequence
-    // if the users are outside of the basic block.
-    if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(Vec))
-      GatherInstructions.push_back(IEI);
-  }
-
-  // Mark the end of the gather sequence.
-  GatherInstructions.push_back(0);
-
-  for (unsigned i = 0; i < Ty->getNumElements(); ++i)
-    VectorizedValues[VL[i]] = Vec;
-
-  return Vec;
-}
-
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
-  Value *V = vectorizeTree_rec(VL, VF);
-
-  int LastInstrIdx = getLastIndex(VL, VL.size());
-  for (SetVector<Value *>::iterator it = MustExtract.begin(),
-                                    e = MustExtract.end();
-       it != e; ++it) {
-    Instruction *I = cast<Instruction>(*it);
-
-    // This is a scalarized value, so we can use the original value.
-    // No need to extract from the vector.
-    if (!LaneMap.count(I))
-      continue;
-
-    Value *Vec = VectorizedValues[I];
-    // We decided not to vectorize I because one of its users was not
-    // vectorized. This is okay.
-    if (!Vec)
-      continue;
-
-    Value *Idx = Builder.getInt32(LaneMap[I]);
-    Value *Extract = Builder.CreateExtractElement(Vec, Idx);
-    bool Replaced = false;
-    for (Value::use_iterator U = I->use_begin(), UE = I->use_end(); U != UE;
-         ++U) {
-      Instruction *UI = cast<Instruction>(*U);
-      if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx)
-        UI->replaceUsesOfWith(I, Extract);
-      Replaced = true;
-    }
-    assert(Replaced && "Must replace at least one outside user");
-    (void)Replaced;
-  }
-
-  // We moved some instructions around. We have to number them again
-  // before we can do any analysis.
-  numberInstructions();
-  MustScalarize.clear();
-  MustExtract.clear();
-  VectorizedValues.clear();
-  return V;
-}
-
-Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
-  Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  VectorType *VecTy = VectorType::get(ScalarTy, VF);
-
-  // Check if all of the operands are constants or identical.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  for (unsigned i = 0, e = VF; i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    // The instruction must be in the same BB, and it must be vectorizable.
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
-      return Scalarize(VL, VecTy);
-  }
-
-  // Check that this is a simple vector constant.
-  if (AllConst || AllSameScalar)
-    return Scalarize(VL, VecTy);
-
-  // Scalarize unknown structures.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (!VL0)
-    return Scalarize(VL, VecTy);
-
-  if (VectorizedValues.count(VL0)) {
-    Value *Vec = VectorizedValues[VL0];
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = Vec;
-    return Vec;
-  }
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VF; i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
- if (!I || Opcode != I->getOpcode()) - return Scalarize(VL, VecTy); - } - - switch (Opcode) { - case Instruction::ExtractElement: { - if (CanReuseExtract(VL, VL.size(), VecTy)) - return VL0->getOperand(0); - return Scalarize(VL, VecTy); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - ValueList INVL; - for (int i = 0; i < VF; ++i) - INVL.push_back(cast(VL[i])->getOperand(0)); - Value *InVec = vectorizeTree_rec(INVL, VF); - CastInst *CI = dyn_cast(VL0); - Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - - for (int i = 0; i < VF; ++i) - VectorizedValues[VL[i]] = V; - - return V; - } - case Instruction::FCmp: - case Instruction::ICmp: { - // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); - for (unsigned i = 1, e = VF; i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); - if (Cmp->getPredicate() != P0) - return Scalarize(VL, VecTy); - } - - ValueList LHSV, RHSV; - for (int i = 0; i < VF; ++i) { - LHSV.push_back(cast(VL[i])->getOperand(0)); - RHSV.push_back(cast(VL[i])->getOperand(1)); - } - - Value *L = vectorizeTree_rec(LHSV, VF); - Value *R = vectorizeTree_rec(RHSV, VF); - Value *V; - if (VL0->getOpcode() == Instruction::FCmp) - V = Builder.CreateFCmp(P0, L, R); - else - V = Builder.CreateICmp(P0, L, R); - - for (int i = 0; i < VF; ++i) - VectorizedValues[VL[i]] = V; - - return V; - } - case Instruction::Select: { - ValueList TrueVec, FalseVec, CondVec; - for (int i = 0; i < VF; ++i) { - CondVec.push_back(cast(VL[i])->getOperand(0)); - TrueVec.push_back(cast(VL[i])->getOperand(1)); - FalseVec.push_back(cast(VL[i])->getOperand(2)); - } - - Value *True = vectorizeTree_rec(TrueVec, VF); - Value *False = vectorizeTree_rec(FalseVec, VF); - Value *Cond = vectorizeTree_rec(CondVec, VF); - Value *V = Builder.CreateSelect(Cond, True, False); - - for (int i = 0; i < VF; ++i) - VectorizedValues[VL[i]] = V; - - return V; - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - ValueList LHSVL, RHSVL; - for (int i = 0; i < VF; ++i) { - LHSVL.push_back(cast(VL[i])->getOperand(0)); - RHSVL.push_back(cast(VL[i])->getOperand(1)); - } - - Value *LHS = vectorizeTree_rec(LHSVL, VF); - Value *RHS = vectorizeTree_rec(RHSVL, VF); - BinaryOperator *BinOp = cast(VL0); - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); - - for (int i = 0; i < VF; ++i) - VectorizedValues[VL[i]] = V; - - return V; - } - case Instruction::Load: { - LoadInst *LI = cast(VL0); - unsigned Alignment = LI->getAlignment(); - - // Check if all of the loads are consecutive. - for (unsigned i = 1, e = VF; i < e; ++i) - if (!isConsecutiveAccess(VL[i - 1], VL[i])) - return Scalarize(VL, VecTy); - - // Loads are inserted at the head of the tree because we don't want to sink - // them all the way down past store instructions. 
- Instruction *Loc = getInsertionPoint(getLastIndex(VL, VL.size())); - IRBuilder<> LoadBuilder(Loc); - Value *VecPtr = LoadBuilder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo()); - LI = LoadBuilder.CreateLoad(VecPtr); - LI->setAlignment(Alignment); - - for (int i = 0; i < VF; ++i) - VectorizedValues[VL[i]] = LI; - - return LI; - } - case Instruction::Store: { - StoreInst *SI = cast(VL0); - unsigned Alignment = SI->getAlignment(); - - ValueList ValueOp; - for (int i = 0; i < VF; ++i) - ValueOp.push_back(cast(VL[i])->getValueOperand()); - - Value *VecValue = vectorizeTree_rec(ValueOp, VF); - Value *VecPtr = - Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); - Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); - - for (int i = 0; i < VF; ++i) - cast(VL[i])->eraseFromParent(); - return 0; - } - default: - return Scalarize(VL, VecTy); - } -} - -} // end of namespace diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h deleted file mode 100644 index c9fe6d23ab6..00000000000 --- a/lib/Transforms/Vectorize/VecUtils.h +++ /dev/null @@ -1,194 +0,0 @@ -//===- VecUtils.h - Vectorization Utilities -------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This family of classes and functions manipulate vectors and chains of -// vectors. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H -#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/IRBuilder.h" -#include - -namespace llvm { - -class BasicBlock; -class Instruction; -class Type; -class VectorType; -class StoreInst; -class Value; -class ScalarEvolution; -class DataLayout; -class TargetTransformInfo; -class AliasAnalysis; -class Loop; - -/// Bottom Up SLP vectorization utility class. -struct BoUpSLP { - typedef SmallVector ValueList; - typedef SmallVector InstrList; - typedef SmallPtrSet ValueSet; - typedef SmallVector StoreList; - static const int max_cost = 1 << 20; - - // \brief C'tor. - BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp); - - /// \brief Take the pointer operand from the Load/Store instruction. - /// \returns NULL if this is not a valid Load/Store instruction. - static Value *getPointerOperand(Value *I); - - /// \brief Take the address space operand from the Load/Store instruction. - /// \returns -1 if this is not a valid Load/Store instruction. - static unsigned getAddressSpaceOperand(Value *I); - - /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B); - - /// \brief Vectorize the tree that starts with the elements in \p VL. - /// \returns the vectorized value. - Value *vectorizeTree(ArrayRef VL, int VF); - - /// \returns the vectorization cost of the subtree that starts at \p VL. - /// A negative number means that this is profitable. - int getTreeCost(ArrayRef VL); - - /// \returns the scalarization cost for this list of values. 
Assuming that
-  /// this subtree gets vectorized, we may need to extract the values from the
-  /// roots. This method calculates the cost of extracting the values.
-  int getScalarizationCost(ArrayRef<Value *> VL);
-
-  /// \brief Attempts to order and vectorize a sequence of stores. This
-  /// function does a quadratic scan of the given stores.
-  /// \returns true if the basic block was modified.
-  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
-
-  /// \brief Vectorize a group of scalars into a vector tree.
-  /// \returns the vectorized value.
-  Value *vectorizeArith(ArrayRef<Value *> Operands);
-
-  /// \returns the list of new instructions that were added in order to collect
-  /// scalars into vectors. This list can be used to further optimize the gather
-  /// sequences.
-  InstrList &getGatherSeqInstructions() { return GatherInstructions; }
-
-private:
-  /// \brief This method contains the recursive part of getTreeCost.
-  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
-
-  /// \brief This recursive method looks for vectorization hazards such as
-  /// values that are used by multiple users and checks that values are used
-  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
-  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
-
-  /// \brief This method contains the recursive part of vectorizeTree.
-  Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
-
-  /// \brief Number all of the instructions in the block.
-  void numberInstructions();
-
-  /// \brief Vectorize a sorted sequence of stores.
-  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
-
-  /// \returns the scalarization cost for this type. Scalarization in this
-  /// context means the creation of vectors from a group of scalars.
-  int getScalarizationCost(Type *Ty);
-
-  /// \returns the AA location that is being accessed by the instruction.
-  AliasAnalysis::Location getLocation(Instruction *I);
-
-  /// \brief Checks if it is possible to sink an instruction from
-  /// \p Src to \p Dst.
-  /// \returns the pointer to the barrier instruction if we can't sink.
-  Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
-
-  /// \returns the index of the last instruction in the BB from \p VL.
-  /// Only consider the first \p VF elements.
-  int getLastIndex(ArrayRef<Value *> VL, unsigned VF);
-
-  /// \returns the index of the first User of \p VL.
-  /// Only consider the first \p VF elements.
-  int getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF);
-
-  /// \returns the instruction \p I or \p J that appears last in the BB.
-  int getLastIndex(Instruction *I, Instruction *J);
-
-  /// \returns the insertion point for \p Index.
-  Instruction *getInsertionPoint(unsigned Index);
-
-  /// \returns a vector from a collection of scalars in \p VL.
-  Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
-
-private:
-  /// Maps instructions to numbers and back.
-  SmallDenseMap<Value *, int> InstrIdx;
-  /// Maps integers to Instructions.
-  std::vector<Instruction *> InstrVec;
-
-  // -- containers that are used during getTreeCost -- //
-
-  /// Contains values that must be scalarized because they are used
-  /// by multiple lanes, or by users outside the tree.
-  /// NOTICE: The vectorization methods also use this set.
-  ValueSet MustScalarize;
-
-  /// Contains values that have users outside of the vectorized graph.
-  /// We need to generate extract instructions for these values.
-  /// NOTICE: The vectorization methods also use this set.
-  SetVector<Value *> MustExtract;
-
-  /// Contains a list of values that are used outside the current tree. This
-  /// set must be reset between runs.
-  SetVector<Value *> MultiUserVals;
-  /// Maps values in the tree to the vector lanes that use them. This map must
-  /// be reset between runs of getCost.
-  std::map<Value *, int> LaneMap;
-  /// A list of instructions to ignore while sinking
-  /// memory instructions. This map must be reset between runs of getCost.
-  ValueSet MemBarrierIgnoreList;
-
-  // -- Containers that are used during vectorizeTree -- //
-
-  /// Maps the first scalar to the vector. This map must be reset
-  /// between runs.
-  DenseMap<Value *, Value *> VectorizedValues;
-
-  // -- Containers that are used after vectorization by the caller -- //
-
-  /// A list of instructions that are used when gathering scalars into vectors.
-  /// In many cases these instructions can be hoisted outside of the BB.
-  /// Iterating over this list is faster than calling LICM.
-  /// Notice: We insert NULL ptrs to separate the different gather
-  /// sequences.
-  InstrList GatherInstructions;
-
-  /// Instruction builder to construct the vectorized tree.
-  IRBuilder<> Builder;
-
-  // Analysis and block reference.
-  BasicBlock *BB;
-  ScalarEvolution *SE;
-  DataLayout *DL;
-  TargetTransformInfo *TTI;
-  AliasAnalysis *AA;
-  Loop *L;
-};
-
-} // end of namespace
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
diff --git a/test/Transforms/SLPVectorizer/X86/diamond.ll b/test/Transforms/SLPVectorizer/X86/diamond.ll
index 8959b0d9eec..008f09db454 100644
--- a/test/Transforms/SLPVectorizer/X86/diamond.ll
+++ b/test/Transforms/SLPVectorizer/X86/diamond.ll
@@ -50,9 +50,9 @@ entry:
 ; }
 
 ; CHECK: @extr_user
+; CHECK: load i32*
 ; CHECK: store <4 x i32>
-; CHECK-NEXT: extractelement <4 x i32>
-; CHECK: ret
+; CHECK-NEXT: ret
 define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
 entry:
   %0 = load i32* %A, align 4
@@ -79,9 +79,9 @@ entry:
 ; In this example we have an external user that is not the first element in the vector.
 
 ; CHECK: @extr_user1
+; CHECK: load i32*
 ; CHECK: store <4 x i32>
-; CHECK-NEXT: extractelement <4 x i32>
-; CHECK: ret
+; CHECK-NEXT: ret
 define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
 entry:
   %0 = load i32* %A, align 4
diff --git a/test/Transforms/SLPVectorizer/X86/multi_block.ll b/test/Transforms/SLPVectorizer/X86/multi_block.ll
new file mode 100644
index 00000000000..eed3f371b80
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/multi_block.ll
@@ -0,0 +1,55 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+; int bar(double *A, int d) {
+;   double A0 = A[0];
+;   double A1 = A[1];
+;   float F0 = A0;
+;   float F1 = A1;
+;   if (d) foo(); <----- This splits the blocks
+;   F0+=4.0;
+;   F1+=5.0;
+;   A[8] = 9.0 + F0;
+;   A[9] = 5.0 + F1;
+; }
+
+
+;CHECK: @bar
+;CHECK: load <2 x double>
+;CHECK: fptrunc <2 x double>
+;CHECK: call i32
+;CHECK: fadd <2 x float>
+;CHECK: fpext <2 x float>
+;CHECK: store <2 x double>
+;CHECK: ret
+define i32 @bar(double* nocapture %A, i32 %d) {
+  %1 = load double* %A, align 8
+  %2 = getelementptr inbounds double* %A, i64 1
+  %3 = load double* %2, align 8
+  %4 = fptrunc double %1 to float
+  %5 = fptrunc double %3 to float
+  %6 = icmp eq i32 %d, 0
+  br i1 %6, label %9, label %7
+
+;