Mirror of https://github.com/c64scene-ar/llvm-6502.git (synced 2025-10-30 16:17:05 +00:00)
	Reapply "SLPVectorizer: Handle more horizontal reductions (disabled)""
Reapply r191108 with a fix for a memory corruption error I introduced.

Of course, we can't reference the scalars that we replace by vectorizing and
then call their eraseFromParent method. I only 'needed' the scalars to get the
DebugLoc. Just store the DebugLoc before actually vectorizing instead. As a
nice side effect, this also simplifies the interface between BoUpSLP and the
HorizontalReduction class to returning a value pointer (the vectorized tree
root).

radar://14607682

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@191123 91177308-0d34-0410-b5e6-96231b3b80d8
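The fix is purely an ordering change: capture the DebugLoc from a scalar while
it is still alive, then vectorize. A minimal standalone sketch of that bug
class, with simplified stand-in types rather than the real
llvm::DebugLoc/BoUpSLP API:

#include <cassert>
#include <memory>

// Hypothetical stand-ins for illustration; not LLVM's types.
struct DebugLoc { int Line; };

struct ScalarInst {
  DebugLoc Loc;
  DebugLoc getDebugLoc() const { return Loc; }
};

struct Vectorizer {
  std::unique_ptr<ScalarInst> Scalar{new ScalarInst{{42}}};
  // Vectorizing replaces the scalars and erases them (eraseFromParent in the
  // real pass), so pointers into them are dangling afterwards.
  void vectorizeTree() { Scalar.reset(); }
};

int main() {
  Vectorizer V;
  // Correct order: read the DebugLoc *before* vectorizeTree(). Reading it
  // afterwards would dereference a freed instruction -- the r191108 bug.
  DebugLoc Loc = V.Scalar->getDebugLoc();
  V.vectorizeTree();
  assert(Loc.Line == 42);
  return 0;
}

The same ordering appears in tryToReduce below: Loc is taken from
ReducedVals[i] before the call to V.vectorizeTree().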
lib/Transforms/Vectorize/SLPVectorizer.cpp

@@ -49,6 +49,11 @@ static cl::opt<int>
     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                      cl::desc("Only vectorize if you gain more than this "
                               "number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
+                   cl::desc("Attempt to vectorize horizontal reductions"));
+
 namespace {

 static const unsigned MinVecRegSize = 128;
@@ -238,17 +243,20 @@ public:
     }

   /// \brief Vectorize the tree that starts with the elements in \p VL.
-  void vectorizeTree();
+  /// Returns the vectorized root.
+  Value *vectorizeTree();

   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
   int getTreeCost();

-  /// Construct a vectorizable tree that starts at \p Roots.
-  void buildTree(ArrayRef<Value *> Roots);
+  /// Construct a vectorizable tree that starts at \p Roots and is possibly
+  /// used by a reduction of \p RdxOps.
+  void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);

   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
+    RdxOps = 0;
     VectorizableTree.clear();
     ScalarToTreeEntry.clear();
     MustGather.clear();
@@ -401,6 +409,9 @@ private:
   /// Numbers instructions in different blocks.
   DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;

+  /// Reduction operators.
+  ValueSet *RdxOps;
+
   // Analysis and block reference.
   Function *F;
   ScalarEvolution *SE;
@@ -413,8 +424,9 @@ private:
   IRBuilder<> Builder;
 };

-void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
   deleteTree();
+  RdxOps = Rdx;
   if (!getSameType(Roots))
     return;
   buildTree_rec(Roots, 0);
@@ -445,8 +457,12 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
           assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
           continue;
         }
-        if (!isa<Instruction>(*User))
+        Instruction *UserInst = dyn_cast<Instruction>(*User);
+        if (!UserInst)
+          continue;
+
+        // Ignore uses that are part of the reduction.
+        if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
           continue;

         DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " <<
@@ -578,6 +594,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         continue;
       }

+      // This user is part of the reduction.
+      if (RdxOps && RdxOps->count(User))
+        continue;
+
       // Make sure that we can schedule this unknown user.
       BlockNumbering &BN = BlocksNumbers[BB];
       int UserIndex = BN.getIndex(User);
@@ -1372,7 +1392,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   return 0;
 }

-void BoUpSLP::vectorizeTree() {
+Value *BoUpSLP::vectorizeTree() {
   Builder.SetInsertPoint(F->getEntryBlock().begin());
   vectorizeTree(&VectorizableTree[0]);

@@ -1449,7 +1469,10 @@ void BoUpSLP::vectorizeTree() {
           DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
           assert(!MustGather.count(*User) &&
                  "Replacing gathered value with undef");
-          assert(ScalarToTreeEntry.count(*User) &&
+          assert((ScalarToTreeEntry.count(*User) ||
+                  // It is legal to replace the reduction users by undef.
+                  (RdxOps && RdxOps->count(*User))) &&
                  "Replacing out-of-tree value with undef");
         }
         Value *Undef = UndefValue::get(Ty);
@@ -1464,6 +1487,8 @@ void BoUpSLP::vectorizeTree() {
     BlocksNumbers[it].forget();
   }
   Builder.ClearInsertionPoint();
+
+  return VectorizableTree[0].VectorizedValue;
 }

 void BoUpSLP::optimizeGatherSequence() {
@@ -1887,6 +1912,308 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
   return 0;
 }

+/// \brief Generate a shuffle mask to be used in a reduction tree.
+///
+/// \param VecLen The length of the vector to be reduced.
+/// \param NumEltsToRdx The number of elements that should be reduced in the
+///        vector.
+/// \param IsPairwise Whether the reduction is a pairwise or splitting
+///        reduction. A pairwise reduction will generate a mask of
+///        <0,2,...> or <1,3,..> while a splitting reduction will generate
+///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
+/// \param IsLeft True will generate a mask of even elements, odd otherwise.
+static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
+                                   bool IsPairwise, bool IsLeft,
+                                   IRBuilder<> &Builder) {
+  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
+
+  SmallVector<Constant *, 32> ShuffleMask(
+      VecLen, UndefValue::get(Builder.getInt32Ty()));
+
+  if (IsPairwise)
+    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
+    for (unsigned i = 0; i != NumEltsToRdx; ++i)
+      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+  else
+    // Move the upper half of the vector to the lower half.
+    for (unsigned i = 0; i != NumEltsToRdx; ++i)
+      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+
+  return ConstantVector::get(ShuffleMask);
+}
+
+
+/// Model horizontal reductions.
+///
+/// A horizontal reduction is a tree of reduction operations (currently add and
+/// fadd) that has operations that can be put into a vector as its leaves.
+/// For example, this tree:
+///
+/// mul mul mul mul
+///  \  /    \  /
+///   +       +
+///    \     /
+///       +
+/// This tree has "mul" as its reduced values and "+" as its reduction
+/// operations. A reduction might be feeding into a store or a binary operation
+/// feeding a phi.
+///    ...
+///    \  /
+///     +
+///     \
+///  phi +=
+///
+///  Or:
+///    ...
+///    \  /
+///     +
+///     \
+///   *p =
+///
+class HorizontalReduction {
+  SmallPtrSet<Value *, 16> ReductionOps;
+  SmallVector<Value *, 32> ReducedVals;
+
+  BinaryOperator *ReductionRoot;
+  PHINode *ReductionPHI;
+
+  /// The opcode of the reduction.
+  unsigned ReductionOpcode;
+  /// The opcode of the values we perform a reduction on.
+  unsigned ReducedValueOpcode;
+  /// The width of one full horizontal reduction operation.
+  unsigned ReduxWidth;
+  /// Should we model this reduction as a pairwise reduction tree or a tree that
+  /// splits the vector in halves and adds those halves.
+  bool IsPairwiseReduction;
+
+public:
+  HorizontalReduction()
+    : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
+    ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
+
+  /// \brief Try to find a reduction tree.
+  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
+                                 DataLayout *DL) {
+    assert((!Phi ||
+            std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
+           "This phi needs to use the binary operator");
+
+    // We could have an initial reduction that is not an add.
+    //  r *= v1 + v2 + v3 + v4
+    // In such a case start looking for a tree rooted in the first '+'.
+    if (Phi) {
+      if (B->getOperand(0) == Phi) {
+        Phi = 0;
+        B = dyn_cast<BinaryOperator>(B->getOperand(1));
+      } else if (B->getOperand(1) == Phi) {
+        Phi = 0;
+        B = dyn_cast<BinaryOperator>(B->getOperand(0));
+      }
+    }
+
+    if (!B)
+      return false;
+
+    Type *Ty = B->getType();
+    if (Ty->isVectorTy())
+      return false;
+
+    ReductionOpcode = B->getOpcode();
+    ReducedValueOpcode = 0;
+    ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+    ReductionRoot = B;
+    ReductionPHI = Phi;
+
+    if (ReduxWidth < 4)
+      return false;
+
+    // We currently only support adds.
+    if (ReductionOpcode != Instruction::Add &&
+        ReductionOpcode != Instruction::FAdd)
+      return false;
+
+    // Post-order traverse the reduction tree starting at B. We only handle
+    // true trees containing only binary operators.
+    SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
+    Stack.push_back(std::make_pair(B, 0));
+    while (!Stack.empty()) {
+      BinaryOperator *TreeN = Stack.back().first;
+      unsigned EdgeToVist = Stack.back().second++;
+      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+
+      // Only handle trees in the current basic block.
+      if (TreeN->getParent() != B->getParent())
+        return false;
+
+      // Each tree node needs to have one user except for the ultimate
+      // reduction.
+      if (!TreeN->hasOneUse() && TreeN != B)
+        return false;
+
+      // Postorder visit.
+      if (EdgeToVist == 2 || IsReducedValue) {
+        if (IsReducedValue) {
+          // Make sure that the opcodes of the operations that we are going to
+          // reduce match.
+          if (!ReducedValueOpcode)
+            ReducedValueOpcode = TreeN->getOpcode();
+          else if (ReducedValueOpcode != TreeN->getOpcode())
+            return false;
+          ReducedVals.push_back(TreeN);
+        } else {
+          // We need to be able to reassociate the adds.
+          if (!TreeN->isAssociative())
+            return false;
+          ReductionOps.insert(TreeN);
+        }
+        // Retract.
+        Stack.pop_back();
+        continue;
+      }
+
+      // Visit left or right.
+      Value *NextV = TreeN->getOperand(EdgeToVist);
+      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
+      if (Next)
+        Stack.push_back(std::make_pair(Next, 0));
+      else if (NextV != Phi)
+        return false;
+    }
+    return true;
+  }
+
+  /// \brief Attempt to vectorize the tree found by
+  /// matchAssociativeReduction.
+  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+    if (ReducedVals.empty())
+      return false;
+
+    unsigned NumReducedVals = ReducedVals.size();
+    if (NumReducedVals < ReduxWidth)
+      return false;
+
+    Value *VectorizedTree = 0;
+    IRBuilder<> Builder(ReductionRoot);
+    FastMathFlags Unsafe;
+    Unsafe.setUnsafeAlgebra();
+    Builder.SetFastMathFlags(Unsafe);
+    unsigned i = 0;
+
+    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+      ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
+      V.buildTree(ValsToReduce, &ReductionOps);
+
+      // Estimate cost.
+      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+      if (Cost >= -SLPCostThreshold)
+        break;
+
+      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
+                   << ". (HorRdx)\n");
+
+      // Vectorize a tree.
+      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+      Value *VectorizedRoot = V.vectorizeTree();
+
+      // Emit a reduction.
+      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+      if (VectorizedTree) {
+        Builder.SetCurrentDebugLocation(Loc);
+        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+                                     ReducedSubTree, "bin.rdx");
+      } else
+        VectorizedTree = ReducedSubTree;
+    }
+
+    if (VectorizedTree) {
+      // Finish the reduction.
+      for (; i < NumReducedVals; ++i) {
+        Builder.SetCurrentDebugLocation(
+          cast<Instruction>(ReducedVals[i])->getDebugLoc());
+        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+                                     ReducedVals[i]);
+      }
+      // Update users.
+      if (ReductionPHI) {
+        assert(ReductionRoot != NULL && "Need a reduction operation");
+        ReductionRoot->setOperand(0, VectorizedTree);
+        ReductionRoot->setOperand(1, ReductionPHI);
+      } else
+        ReductionRoot->replaceAllUsesWith(VectorizedTree);
+    }
+    return VectorizedTree != 0;
+  }
+
+private:
+
+  /// \brief Calculate the cost of a reduction.
+  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+    Type *ScalarTy = FirstReducedVal->getType();
+    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+
+    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
+    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+
+    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
+    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+
+    int ScalarReduxCost =
+        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+
+    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+                 << " for reduction that starts with " << *FirstReducedVal
+                 << " (It is a "
+                 << (IsPairwiseReduction ? "pairwise" : "splitting")
+                 << " reduction)\n");
+
+    return VecReduxCost - ScalarReduxCost;
+  }
+
+  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
+                            Value *R, const Twine &Name = "") {
+    if (Opcode == Instruction::FAdd)
+      return Builder.CreateFAdd(L, R, Name);
+    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
+  }
+
+  /// \brief Emit a horizontal reduction of the vectorized value.
+  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+    assert(VectorizedValue && "Need to have a vectorized tree node");
+    Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
+    assert(isPowerOf2_32(ReduxWidth) &&
+           "We only handle power-of-two reductions for now");
+
+    SmallVector<Constant *, 32> ShuffleMask(ReduxWidth, 0);
+    Value *TmpVec = ValToReduce;
+    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+      if (IsPairwiseReduction) {
+        Value *LeftMask =
+          createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+        Value *RightMask =
+          createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+        Value *LeftShuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+        Value *RightShuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
+          "rdx.shuf.r");
+        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
+                             "bin.rdx");
+      } else {
+        Value *UpperHalf =
+          createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
+        Value *Shuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
+        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+      }
+    }
+
+    // The result is in the first element of the vector.
+    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+  }
+};
+
 /// \brief Recognize construction of vectors like
 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
@@ -1981,7 +2308,18 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       if (!BI)
         continue;

-      Value *Inst = BI->getOperand(0);
+      // Try to match and vectorize a horizontal reduction.
+      HorizontalReduction HorRdx;
+      if (ShouldVectorizeHor &&
+          HorRdx.matchAssociativeReduction(P, BI, DL) &&
+          HorRdx.tryToReduce(R, TTI)) {
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
+        continue;
+      }
+
+      Value *Inst = BI->getOperand(0);
       if (Inst == P)
         Inst = BI->getOperand(1);

@@ -1991,10 +2329,28 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
         Changed = true;
         it = BB->begin();
         e = BB->end();
+        continue;
       }

       continue;
     }
+
+    // Try to vectorize horizontal reductions feeding into a store.
+    if (StoreInst *SI = dyn_cast<StoreInst>(it))
+      if (BinaryOperator *BinOp =
+              dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+        HorizontalReduction HorRdx;
+        if (ShouldVectorizeHor &&
+            ((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
+              HorRdx.tryToReduce(R, TTI)) ||
+             tryToVectorize(BinOp, R))) {
+          Changed = true;
+          it = BB->begin();
+          e = BB->end();
+          continue;
+        }
+      }
+
     // Try to vectorize trees that start at compare instructions.
     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
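Before turning to the new test file, here is a self-contained sketch of the
shuffle masks the createRdxShuffleMask helper above computes, using plain ints
with -1 standing in for undef lanes (an assumed simplification, not the LLVM
API):

#include <cassert>
#include <vector>

// Mirrors createRdxShuffleMask's index arithmetic for intuition.
static std::vector<int> rdxMask(unsigned VecLen, unsigned NumEltsToRdx,
                                bool IsPairwise, bool IsLeft) {
  std::vector<int> Mask(VecLen, -1); // -1 == undef lane
  if (IsPairwise)
    // Pairwise: even lanes (left) or odd lanes (right).
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      Mask[i] = 2 * i + !IsLeft;
  else
    // Splitting: move the upper half of the vector to the lower half.
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      Mask[i] = NumEltsToRdx + i;
  return Mask;
}

int main() {
  // VecLen = 4, reducing 2 elements per step, as in the doc comment above.
  assert(rdxMask(4, 2, true, true) == (std::vector<int>{0, 2, -1, -1}));
  assert(rdxMask(4, 2, true, false) == (std::vector<int>{1, 3, -1, -1}));
  assert(rdxMask(4, 2, false, false) == (std::vector<int>{2, 3, -1, -1}));
  return 0;
}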
test/Transforms/SLPVectorizer/X86/horizontal.ll (new file, 415 lines)

@@ -0,0 +1,415 @@
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; #include <stdint.h>
+;
+; int foo(float *A, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += 7*A[i*4  ] +
+;            7*A[i*4+1] +
+;            7*A[i*4+2] +
+;            7*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: add_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @add_red(float* %A, i32 %n) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
+  %mul = shl nsw i64 %i.033, 2
+  %arrayidx = getelementptr inbounds float* %A, i64 %mul
+  %1 = load float* %arrayidx, align 4
+  %mul2 = fmul float %1, 7.000000e+00
+  %add28 = or i64 %mul, 1
+  %arrayidx4 = getelementptr inbounds float* %A, i64 %add28
+  %2 = load float* %arrayidx4, align 4
+  %mul5 = fmul float %2, 7.000000e+00
+  %add6 = fadd fast float %mul2, %mul5
+  %add829 = or i64 %mul, 2
+  %arrayidx9 = getelementptr inbounds float* %A, i64 %add829
+  %3 = load float* %arrayidx9, align 4
+  %mul10 = fmul float %3, 7.000000e+00
+  %add11 = fadd fast float %add6, %mul10
+  %add1330 = or i64 %mul, 3
+  %arrayidx14 = getelementptr inbounds float* %A, i64 %add1330
+  %4 = load float* %arrayidx14, align 4
+  %mul15 = fmul float %4, 7.000000e+00
+  %add16 = fadd fast float %add11, %mul15
+  %add17 = fadd fast float %sum.032, %add16
+  %inc = add nsw i64 %i.033, 1
+  %exitcond = icmp eq i64 %inc, %0
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %add17 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum *= B[0]*A[i*4  ] +
+;       B[1]*A[i*4+1] +
+;       B[2]*A[i*4+2] +
+;       B[3]*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: mul_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
+entry:
+  %cmp38 = icmp sgt i32 %n, 0
+  br i1 %cmp38, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load float* %B, align 4
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %1 = load float* %arrayidx4, align 4
+  %arrayidx9 = getelementptr inbounds float* %B, i64 2
+  %2 = load float* %arrayidx9, align 4
+  %arrayidx15 = getelementptr inbounds float* %B, i64 3
+  %3 = load float* %arrayidx15, align 4
+  %4 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
+  %mul = shl nsw i64 %i.040, 2
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %5 = load float* %arrayidx2, align 4
+  %mul3 = fmul float %0, %5
+  %add35 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds float* %A, i64 %add35
+  %6 = load float* %arrayidx6, align 4
+  %mul7 = fmul float %1, %6
+  %add8 = fadd fast float %mul3, %mul7
+  %add1136 = or i64 %mul, 2
+  %arrayidx12 = getelementptr inbounds float* %A, i64 %add1136
+  %7 = load float* %arrayidx12, align 4
+  %mul13 = fmul float %2, %7
+  %add14 = fadd fast float %add8, %mul13
+  %add1737 = or i64 %mul, 3
+  %arrayidx18 = getelementptr inbounds float* %A, i64 %add1737
+  %8 = load float* %arrayidx18, align 4
+  %mul19 = fmul float %3, %8
+  %add20 = fadd fast float %add14, %mul19
+  %mul21 = fmul float %sum.039, %add20
+  %inc = add nsw i64 %i.040, 1
+  %exitcond = icmp eq i64 %inc, %4
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %mul21 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += B[0]*A[i*6  ] +
+;            B[1]*A[i*6+1] +
+;            B[2]*A[i*6+2] +
+;            B[3]*A[i*6+3] +
+;            B[4]*A[i*6+4] +
+;            B[5]*A[i*6+5] +
+;            B[6]*A[i*6+6] +
+;            B[7]*A[i*6+7] +
+;            B[8]*A[i*6+8];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: long_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
+entry:
+  %cmp81 = icmp sgt i32 %n, 0
+  br i1 %cmp81, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load float* %B, align 4
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %1 = load float* %arrayidx4, align 4
+  %arrayidx9 = getelementptr inbounds float* %B, i64 2
+  %2 = load float* %arrayidx9, align 4
+  %arrayidx15 = getelementptr inbounds float* %B, i64 3
+  %3 = load float* %arrayidx15, align 4
+  %arrayidx21 = getelementptr inbounds float* %B, i64 4
+  %4 = load float* %arrayidx21, align 4
+  %arrayidx27 = getelementptr inbounds float* %B, i64 5
+  %5 = load float* %arrayidx27, align 4
+  %arrayidx33 = getelementptr inbounds float* %B, i64 6
+  %6 = load float* %arrayidx33, align 4
+  %arrayidx39 = getelementptr inbounds float* %B, i64 7
+  %7 = load float* %arrayidx39, align 4
+  %arrayidx45 = getelementptr inbounds float* %B, i64 8
+  %8 = load float* %arrayidx45, align 4
+  %9 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
+  %mul = mul nsw i64 %i.083, 6
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %10 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %10
+  %add80 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds float* %A, i64 %add80
+  %11 = load float* %arrayidx6, align 4
+  %mul7 = fmul fast float %1, %11
+  %add8 = fadd fast float %mul3, %mul7
+  %add11 = add nsw i64 %mul, 2
+  %arrayidx12 = getelementptr inbounds float* %A, i64 %add11
+  %12 = load float* %arrayidx12, align 4
+  %mul13 = fmul fast float %2, %12
+  %add14 = fadd fast float %add8, %mul13
+  %add17 = add nsw i64 %mul, 3
+  %arrayidx18 = getelementptr inbounds float* %A, i64 %add17
+  %13 = load float* %arrayidx18, align 4
+  %mul19 = fmul fast float %3, %13
+  %add20 = fadd fast float %add14, %mul19
+  %add23 = add nsw i64 %mul, 4
+  %arrayidx24 = getelementptr inbounds float* %A, i64 %add23
+  %14 = load float* %arrayidx24, align 4
+  %mul25 = fmul fast float %4, %14
+  %add26 = fadd fast float %add20, %mul25
+  %add29 = add nsw i64 %mul, 5
+  %arrayidx30 = getelementptr inbounds float* %A, i64 %add29
+  %15 = load float* %arrayidx30, align 4
+  %mul31 = fmul fast float %5, %15
+  %add32 = fadd fast float %add26, %mul31
+  %add35 = add nsw i64 %mul, 6
+  %arrayidx36 = getelementptr inbounds float* %A, i64 %add35
+  %16 = load float* %arrayidx36, align 4
+  %mul37 = fmul fast float %6, %16
+  %add38 = fadd fast float %add32, %mul37
+  %add41 = add nsw i64 %mul, 7
+  %arrayidx42 = getelementptr inbounds float* %A, i64 %add41
+  %17 = load float* %arrayidx42, align 4
+  %mul43 = fmul fast float %7, %17
+  %add44 = fadd fast float %add38, %mul43
+  %add47 = add nsw i64 %mul, 8
+  %arrayidx48 = getelementptr inbounds float* %A, i64 %add47
+  %18 = load float* %arrayidx48, align 4
+  %mul49 = fmul fast float %8, %18
+  %add50 = fadd fast float %add44, %mul49
+  %add51 = fadd fast float %sum.082, %add50
+  %inc = add nsw i64 %i.083, 1
+  %exitcond = icmp eq i64 %inc, %9
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %add51 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += B[0]*A[i*4  ];
+;     sum += B[1]*A[i*4+1];
+;     sum += B[2]*A[i*4+2];
+;     sum += B[3]*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: chain_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
+entry:
+  %cmp41 = icmp sgt i32 %n, 0
+  br i1 %cmp41, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load float* %B, align 4
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %1 = load float* %arrayidx4, align 4
+  %arrayidx10 = getelementptr inbounds float* %B, i64 2
+  %2 = load float* %arrayidx10, align 4
+  %arrayidx16 = getelementptr inbounds float* %B, i64 3
+  %3 = load float* %arrayidx16, align 4
+  %4 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
+  %mul = shl nsw i64 %i.043, 2
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %5 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %5
+  %add = fadd fast float %sum.042, %mul3
+  %add638 = or i64 %mul, 1
+  %arrayidx7 = getelementptr inbounds float* %A, i64 %add638
+  %6 = load float* %arrayidx7, align 4
+  %mul8 = fmul fast float %1, %6
+  %add9 = fadd fast float %add, %mul8
+  %add1239 = or i64 %mul, 2
+  %arrayidx13 = getelementptr inbounds float* %A, i64 %add1239
+  %7 = load float* %arrayidx13, align 4
+  %mul14 = fmul fast float %2, %7
+  %add15 = fadd fast float %add9, %mul14
+  %add1840 = or i64 %mul, 3
+  %arrayidx19 = getelementptr inbounds float* %A, i64 %add1840
+  %8 = load float* %arrayidx19, align 4
+  %mul20 = fmul fast float %3, %8
+  %add21 = fadd fast float %add15, %mul20
+  %inc = add nsw i64 %i.043, 1
+  %exitcond = icmp eq i64 %inc, %4
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %add21 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     C[i] = B[0] *A[i*4  ] +
+;          B[1] *A[i*4+1] +
+;          B[2] *A[i*4+2] +
+;          B[3] *A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: store_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
+entry:
+  %cmp37 = icmp sgt i32 %n, 0
+  br i1 %cmp37, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %arrayidx9 = getelementptr inbounds float* %B, i64 2
+  %arrayidx15 = getelementptr inbounds float* %B, i64 3
+  %0 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+  %1 = load float* %B, align 4
+  %mul = shl nsw i64 %i.039, 2
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %2 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %1, %2
+  %3 = load float* %arrayidx4, align 4
+  %add34 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds float* %A, i64 %add34
+  %4 = load float* %arrayidx6, align 4
+  %mul7 = fmul fast float %3, %4
+  %add8 = fadd fast float %mul3, %mul7
+  %5 = load float* %arrayidx9, align 4
+  %add1135 = or i64 %mul, 2
+  %arrayidx12 = getelementptr inbounds float* %A, i64 %add1135
+  %6 = load float* %arrayidx12, align 4
+  %mul13 = fmul fast float %5, %6
+  %add14 = fadd fast float %add8, %mul13
+  %7 = load float* %arrayidx15, align 4
+  %add1736 = or i64 %mul, 3
+  %arrayidx18 = getelementptr inbounds float* %A, i64 %add1736
+  %8 = load float* %arrayidx18, align 4
+  %mul19 = fmul fast float %7, %8
+  %add20 = fadd fast float %add14, %mul19
+  store float %add20, float* %C.addr.038, align 4
+  %incdec.ptr = getelementptr inbounds float* %C.addr.038, i64 1
+  %inc = add nsw i64 %i.039, 1
+  %exitcond = icmp eq i64 %inc, %0
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 0
+}
+
+
+; void foo(double * restrict A, double * restrict B, double * restrict C,
+;          int n) {
+;   for (intptr_t i=0; i < n; ++i) {
+;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
+;   }
+; }
+
+; CHECK-LABEL: store_red_double
+; CHECK: fmul <2 x double>
+; CHECK: extractelement <2 x double>
+; CHECK: extractelement <2 x double>
+
+define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
+entry:
+  %cmp17 = icmp sgt i32 %n, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load double* %B, align 8
+  %arrayidx4 = getelementptr inbounds double* %B, i64 1
+  %1 = load double* %arrayidx4, align 8
+  %2 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = shl nsw i64 %i.018, 2
+  %arrayidx2 = getelementptr inbounds double* %A, i64 %mul
+  %3 = load double* %arrayidx2, align 8
+  %mul3 = fmul fast double %0, %3
+  %add16 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds double* %A, i64 %add16
+  %4 = load double* %arrayidx6, align 8
+  %mul7 = fmul fast double %1, %4
+  %add8 = fadd fast double %mul3, %mul7
+  %arrayidx9 = getelementptr inbounds double* %C, i64 %i.018
+  store double %add8, double* %arrayidx9, align 8
+  %inc = add nsw i64 %i.018, 1
+  %exitcond = icmp eq i64 %inc, %2
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
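Note that the CHECK lines in this file only pin down the vector multiply and
the first shufflevector. The splitting reduction emitReduction generates
actually halves the vector log2(ReduxWidth) times before the final
extractelement; a minimal scalar simulation of that strategy (illustrative
C++, not the emitted IR):

#include <cassert>
#include <cstddef>

int main() {
  // Four "lanes" standing in for a <4 x float> value.
  float Vec[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  // Two halving steps for width 4: each step adds the upper half onto the
  // lower half, mirroring the rdx.shuf/bin.rdx pairs in the generated code.
  for (std::size_t Half = 2; Half != 0; Half /= 2)
    for (std::size_t i = 0; i != Half; ++i)
      Vec[i] += Vec[Half + i];
  assert(Vec[0] == 10.0f); // extractelement lane 0 holds the sum
  return 0;
}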