[LoopVectorize] Teach Loop Vectorizor about interleaved memory accesses.

Interleaved memory accesses are grouped and vectorized into vector load/store and shufflevector.
E.g. for (i = 0; i < N; i+=2) {
       a = A[i];         // load of even element
       b = A[i+1];       // load of odd element
       ...               // operations on a, b, c, d
       A[i] = c;         // store of even element
       A[i+1] = d;       // store of odd element
     }

  The loads of even and odd elements are identified as an interleave load group, which will be transfered into vectorized IRs like:
     %wide.vec = load <8 x i32>, <8 x i32>* %ptr
     %vec.even = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
     %vec.odd = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  The stores of even and odd elements are identified as an interleave store group, which will be transfered into vectorized IRs like:
     %interleaved.vec = shufflevector <4 x i32> %vec.even, %vec.odd, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 
     store <8 x i32> %interleaved.vec, <8 x i32>* %ptr

This optimization is currently disabled by defaut. To try it by adding '-enable-interleaved-mem-accesses=true'. 



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239291 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Hao Liu
2015-06-08 06:39:56 +00:00
parent f57b36041b
commit 43be1d53d1
9 changed files with 1299 additions and 29 deletions

View File

@@ -501,6 +501,11 @@ const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
const ValueToValueMap &PtrToStride,
Value *Ptr, Value *OrigPtr = nullptr);
/// \brief Check the stride of the pointer and ensure that it does not wrap in
/// the address space.
int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp,
const ValueToValueMap &StridesMap);
/// \brief This analysis provides dependence information for the memory accesses
/// of a loop.
///

View File

@@ -448,6 +448,20 @@ public:
unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const;
/// \return The cost of the interleaved memory operation.
/// \p Opcode is the memory operation code
/// \p VecTy is the vector type of the interleaved access.
/// \p Factor is the interleave factor
/// \p Indices is the indices for interleaved load members (as interleaved
/// load allows gaps)
/// \p Alignment is the alignment of the memory operation
/// \p AddressSpace is address space of the pointer.
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) const;
/// \brief Calculate the cost of performing a vector reduction.
///
/// This is the cost of reducing the vector value of type \p Ty to a scalar
@@ -587,6 +601,11 @@ public:
virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
unsigned AddressSpace) = 0;
virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) = 0;
virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) = 0;
virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
@@ -748,6 +767,14 @@ public:
unsigned AddressSpace) override {
return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
}
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) override {
return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
}
unsigned getReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) override {
return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);

View File

@@ -302,6 +302,14 @@ public:
return 1;
}
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) {
return 1;
}
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys) {
return 1;

View File

@@ -523,6 +523,73 @@ public:
return Cost;
}
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) {
VectorType *VT = dyn_cast<VectorType>(VecTy);
assert(VT && "Expect a vector type for interleaved memory op");
unsigned NumElts = VT->getNumElements();
assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
unsigned NumSubElts = NumElts / Factor;
VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
// Firstly, the cost of load/store operation.
unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
// Then plus the cost of interleave operation.
if (Opcode == Instruction::Load) {
// The interleave cost is similar to extract sub vectors' elements
// from the wide vector, and insert them into sub vectors.
//
// E.g. An interleaved load of factor 2 (with one member of index 0):
// %vec = load <8 x i32>, <8 x i32>* %ptr
// %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
// The cost is estimated as extract elements at 0, 2, 4, 6 from the
// <8 x i32> vector and insert them into a <4 x i32> vector.
assert(Indices.size() <= Factor &&
"Interleaved memory op has too many members");
for (unsigned Index : Indices) {
assert(Index < Factor && "Invalid index for interleaved memory op");
// Extract elements from loaded vector for each sub vector.
for (unsigned i = 0; i < NumSubElts; i++)
Cost += getVectorInstrCost(Instruction::ExtractElement, VT,
Index + i * Factor);
}
unsigned InsSubCost = 0;
for (unsigned i = 0; i < NumSubElts; i++)
InsSubCost += getVectorInstrCost(Instruction::InsertElement, SubVT, i);
Cost += Indices.size() * InsSubCost;
} else {
// The interleave cost is extract all elements from sub vectors, and
// insert them into the wide vector.
//
// E.g. An interleaved store of factor 2:
// %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
// store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
// The cost is estimated as extract all elements from both <4 x i32>
// vectors and insert into the <8 x i32> vector.
unsigned ExtSubCost = 0;
for (unsigned i = 0; i < NumSubElts; i++)
ExtSubCost += getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
Cost += Factor * ExtSubCost;
for (unsigned i = 0; i < NumElts; i++)
Cost += getVectorInstrCost(Instruction::InsertElement, VT, i);
}
return Cost;
}
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys) {
unsigned ISD = 0;