[LoopVectorize] Teach the Loop Vectorizer about interleaved memory accesses.

Interleaved memory accesses are grouped and vectorized into vector load/store
and shufflevector.

E.g. for (i = 0; i < N; i+=2) {
       a = A[i];         // load of even element
       b = A[i+1];       // load of odd element
       ...               // operations on a, b, c, d
       A[i] = c;         // store of even element
       A[i+1] = d;       // store of odd element
     }

The loads of even and odd elements are identified as an interleave load
group, which will be transformed into vectorized IR like:

     %wide.vec = load <8 x i32>, <8 x i32>* %ptr
     %vec.even = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
     %vec.odd = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

The stores of even and odd elements are identified as an interleave store
group, which will be transformed into vectorized IR like:

     %interleaved.vec = shufflevector <4 x i32> %vec.even, %vec.odd, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
     store <8 x i32> %interleaved.vec, <8 x i32>* %ptr

This optimization is currently disabled by default. To try it, add
'-enable-interleaved-mem-accesses=true'.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239291 91177308-0d34-0410-b5e6-96231b3b80d8
2025-11-25 15:19:14 +00:00 · 2015-06-08 06:39:56 +00:00
parent f57b36041b
commit 43be1d53d1
9 changed files with 1299 additions and 29 deletions
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -501,6 +501,11 @@ const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
                                      const ValueToValueMap &PtrToStride,
                                      Value *Ptr, Value *OrigPtr = nullptr);

+/// \brief Check the stride of the pointer and ensure that it does not wrap in
+/// the address space.
+int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp,
+                 const ValueToValueMap &StridesMap);
+
 /// \brief This analysis provides dependence information for the memory accesses
 /// of a loop.
 ///
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -448,6 +448,20 @@ public:
  unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) const;

+  /// \return The cost of the interleaved memory operation.
+  /// \p Opcode is the memory operation code (e.g. Instruction::Load).
+  /// \p VecTy is the vector type of the whole interleaved (wide) access.
+  /// \p Factor is the interleave factor.
+  /// \p Indices are the member indices of an interleaved load (a load group
+  ///    may have gaps, so not every index below \p Factor need be present).
+  /// \p Alignment is the alignment of the memory operation.
+  /// \p AddressSpace is the address space of the pointer.
+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Factor,
+                                      ArrayRef<unsigned> Indices,
+                                      unsigned Alignment,
+                                      unsigned AddressSpace) const;
+
  /// \brief Calculate the cost of performing a vector reduction.
  ///
  /// This is the cost of reducing the vector value of type \p Ty to a scalar
@@ -587,6 +601,11 @@ public:
  virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         unsigned Alignment,
                                         unsigned AddressSpace) = 0;
+  virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                              unsigned Factor,
+                                              ArrayRef<unsigned> Indices,
+                                              unsigned Alignment,
+                                              unsigned AddressSpace) = 0;
  virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
                                    bool IsPairwiseForm) = 0;
  virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
@@ -748,6 +767,14 @@ public:
                                 unsigned AddressSpace) override {
    return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
  }
+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Factor,
+                                      ArrayRef<unsigned> Indices,
+                                      unsigned Alignment,
+                                      unsigned AddressSpace) override {
+    return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+  } // Pure forwarder: every argument is passed to Impl unchanged.
  unsigned getReductionCost(unsigned Opcode, Type *Ty,
                            bool IsPairwiseForm) override {
    return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -302,6 +302,14 @@ public:
    return 1;
  }

+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Factor,
+                                      ArrayRef<unsigned> Indices,
+                                      unsigned Alignment,
+                                      unsigned AddressSpace) {
+    return 1; // Placeholder: constant cost, all arguments are ignored.
+  }
+
  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                 ArrayRef<Type *> Tys) {
    return 1;
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -523,6 +523,73 @@ public:
    return Cost;
  }

+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Factor,
+                                      ArrayRef<unsigned> Indices,
+                                      unsigned Alignment,
+                                      unsigned AddressSpace) {
+    VectorType *VT = dyn_cast<VectorType>(VecTy);
+    assert(VT && "Expect a vector type for interleaved memory op");
+
+    unsigned NumElts = VT->getNumElements();
+    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
+
+    unsigned NumSubElts = NumElts / Factor;
+    VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
+
+    // First, the cost of the wide load/store operation itself.
+    unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
+
+    // Then add the cost of the interleave operation.
+    if (Opcode == Instruction::Load) {
+      // The interleave cost is modeled as extracting each sub vector's
+      // elements from the wide vector and inserting them into that sub vector.
+      //
+      // E.g. An interleaved load of factor 2 (with one member of index 0):
+      //      %vec = load <8 x i32>, <8 x i32>* %ptr
+      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
+      // The cost is estimated as extracting the elements at 0, 2, 4, 6 from
+      // the <8 x i32> vector and inserting them into a <4 x i32> vector.
+
+      assert(Indices.size() <= Factor &&
+             "Interleaved memory op has too many members");
+      for (unsigned Index : Indices) {
+        assert(Index < Factor && "Invalid index for interleaved memory op");
+
+        // Extract this member's elements from the loaded wide vector.
+        for (unsigned i = 0; i < NumSubElts; i++)
+          Cost += getVectorInstrCost(Instruction::ExtractElement, VT,
+                                     Index + i * Factor);
+      }
+
+      unsigned InsSubCost = 0; // Inserts needed to build one sub vector.
+      for (unsigned i = 0; i < NumSubElts; i++)
+        InsSubCost += getVectorInstrCost(Instruction::InsertElement, SubVT, i);
+
+      Cost += Indices.size() * InsSubCost;
+    } else { // Any opcode other than Load is costed as a store.
+      // The interleave cost is modeled as extracting all elements from the
+      // sub vectors and inserting them into the wide vector.
+      //
+      // E.g. An interleaved store of factor 2:
+      //      %interleaved.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
+      //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
+      // The cost is estimated as extracting all elements from both <4 x i32>
+      // vectors and inserting them into the <8 x i32> vector.
+
+      unsigned ExtSubCost = 0; // Extracts needed to read one sub vector.
+      for (unsigned i = 0; i < NumSubElts; i++)
+        ExtSubCost += getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
+
+      Cost += Factor * ExtSubCost;
+
+      for (unsigned i = 0; i < NumElts; i++)
+        Cost += getVectorInstrCost(Instruction::InsertElement, VT, i);
+    }
+
+    return Cost;
+  }
+
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Type *> Tys) {
    unsigned ISD = 0;