[LoopVectorize] Teach Loop Vectorizer about interleaved memory accesses.
Interleaved memory accesses are grouped and vectorized into wide vector loads/stores plus shufflevector instructions.
E.g.  for (i = 0; i < N; i += 2) {
        a = A[i];    // load of even element
        b = A[i+1];  // load of odd element
        ...          // operations on a, b, c, d
        A[i] = c;    // store of even element
        A[i+1] = d;  // store of odd element
      }
The loads of the even and odd elements are identified as an interleaved load group, which will be transformed into vectorized IR like:
%wide.vec = load <8 x i32>, <8 x i32>* %ptr
%vec.even = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%vec.odd = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
The stores of the even and odd elements are identified as an interleaved store group, which will be transformed into vectorized IR like:
%interleaved.vec = shufflevector <4 x i32> %vec.even, <4 x i32> %vec.odd, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
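
These shuffle masks follow a simple pattern for any interleave factor: member i of a load group is extracted with the strided mask <i, i+Factor, i+2*Factor, ...>, and a store group is rebuilt with a mask that picks lane j of member m from position m*NumSubElts+j of the concatenated shuffle operands. A minimal standalone sketch (illustrative only, not part of this patch) that prints both kinds of masks:

// Illustrative only: prints the de-interleave (load) and interleave (store)
// shuffle masks for a wide vector of NumElts elements and a given Factor.
#include <cstdio>

int main() {
  const unsigned Factor = 2, NumElts = 8;       // matches the <8 x i32> example
  const unsigned NumSubElts = NumElts / Factor; // elements per member vector

  // Load side: one strided mask per member (even elements, odd elements, ...).
  for (unsigned Member = 0; Member < Factor; ++Member) {
    printf("load mask %u:", Member);
    for (unsigned i = 0; i < NumSubElts; ++i)
      printf(" %u", Member + i * Factor);       // <Member, Member+Factor, ...>
    printf("\n");
  }

  // Store side: one mask that interleaves all member vectors back together.
  printf("store mask:");
  for (unsigned i = 0; i < NumSubElts; ++i)
    for (unsigned Member = 0; Member < Factor; ++Member)
      printf(" %u", Member * NumSubElts + i);   // <0, 4, 1, 5, 2, 6, 3, 7> for 2x4
  printf("\n");
  return 0;
}

For Factor = 2 and an <8 x i32> wide vector this prints exactly the masks used above: <0, 2, 4, 6>, <1, 3, 5, 7> and <0, 4, 1, 5, 2, 6, 3, 7>.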
This optimization is currently disabled by default. To try it, pass '-enable-interleaved-mem-accesses=true'.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239291 91177308-0d34-0410-b5e6-96231b3b80d8
@@ -501,6 +501,11 @@ const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
                                      const ValueToValueMap &PtrToStride,
                                      Value *Ptr, Value *OrigPtr = nullptr);

/// \brief Check the stride of the pointer and ensure that it does not wrap in
/// the address space.
int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp,
                 const ValueToValueMap &StridesMap);

/// \brief This analysis provides dependence information for the memory accesses
/// of a loop.
///
@@ -448,6 +448,20 @@ public:
  unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) const;

  /// \return The cost of the interleaved memory operation.
  /// \p Opcode is the memory operation code
  /// \p VecTy is the vector type of the interleaved access.
  /// \p Factor is the interleave factor
  /// \p Indices is the indices for interleaved load members (as interleaved
  ///    load allows gaps)
  /// \p Alignment is the alignment of the memory operation
  /// \p AddressSpace is address space of the pointer.
  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment,
                                      unsigned AddressSpace) const;

  /// \brief Calculate the cost of performing a vector reduction.
  ///
  /// This is the cost of reducing the vector value of type \p Ty to a scalar
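
As a rough usage sketch, a cost-model client such as the vectorizer could query this hook for the factor-2 load group from the commit message as follows. The TargetTransformInfo instance and LLVMContext are assumed to come from the surrounding pass; only the hook's signature is taken from this patch, and the interface shown is the one as of this revision:

// Sketch only: query the cost of a factor-2 interleaved load of <8 x i32>
// whose two members (indices 0 and 1) are both used.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

unsigned queryInterleavedLoadCost(const TargetTransformInfo &TTI,
                                  LLVMContext &Ctx) {
  VectorType *WideTy = VectorType::get(Type::getInt32Ty(Ctx), 8);
  SmallVector<unsigned, 2> Indices = {0, 1}; // both group members are live
  return TTI.getInterleavedMemoryOpCost(Instruction::Load, WideTy,
                                        /*Factor=*/2, Indices,
                                        /*Alignment=*/4, /*AddressSpace=*/0);
}

The default implementation of this hook in the generic cost model is shown further below.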
@@ -587,6 +601,11 @@ public:
  virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         unsigned Alignment,
                                         unsigned AddressSpace) = 0;
  virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                              unsigned Factor,
                                              ArrayRef<unsigned> Indices,
                                              unsigned Alignment,
                                              unsigned AddressSpace) = 0;
  virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
                                    bool IsPairwiseForm) = 0;
  virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
@@ -748,6 +767,14 @@ public:
                                 unsigned AddressSpace) override {
    return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
  }
  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment,
                                      unsigned AddressSpace) override {
    return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace);
  }
  unsigned getReductionCost(unsigned Opcode, Type *Ty,
                            bool IsPairwiseForm) override {
    return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
@@ -302,6 +302,14 @@ public:
    return 1;
  }

  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment,
                                      unsigned AddressSpace) {
    return 1;
  }

  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                 ArrayRef<Type *> Tys) {
    return 1;
@@ -523,6 +523,73 @@ public:
    return Cost;
  }

  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment,
                                      unsigned AddressSpace) {
    VectorType *VT = dyn_cast<VectorType>(VecTy);
    assert(VT && "Expect a vector type for interleaved memory op");

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);

    // Firstly, the cost of load/store operation.
    unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

    // Then plus the cost of interleave operation.
    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extract sub vectors' elements
      // from the wide vector, and insert them into sub vectors.
      //
      // E.g. An interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
      // The cost is estimated as extract elements at 0, 2, 4, 6 from the
      // <8 x i32> vector and insert them into a <4 x i32> vector.

      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");
      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");

        // Extract elements from loaded vector for each sub vector.
        for (unsigned i = 0; i < NumSubElts; i++)
          Cost += getVectorInstrCost(Instruction::ExtractElement, VT,
                                     Index + i * Factor);
      }

      unsigned InsSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        InsSubCost += getVectorInstrCost(Instruction::InsertElement, SubVT, i);

      Cost += Indices.size() * InsSubCost;
    } else {
      // The interleave cost is extract all elements from sub vectors, and
      // insert them into the wide vector.
      //
      // E.g. An interleaved store of factor 2:
      //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
      // The cost is estimated as extract all elements from both <4 x i32>
      // vectors and insert into the <8 x i32> vector.

      unsigned ExtSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        ExtSubCost += getVectorInstrCost(Instruction::ExtractElement, SubVT, i);

      Cost += Factor * ExtSubCost;

      for (unsigned i = 0; i < NumElts; i++)
        Cost += getVectorInstrCost(Instruction::InsertElement, VT, i);
    }

    return Cost;
  }

  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Type *> Tys) {
    unsigned ISD = 0;
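
With every per-element insert/extract and the wide memory operation costed at 1, this default model prices the factor-2 <8 x i32> example at 1 + 8 extracts + 8 inserts = 17 for the load group (both members used), and likewise 1 + 8 extracts + 8 inserts = 17 for the store group. A small standalone sketch of that arithmetic (illustrative only; real targets override getMemoryOpCost and getVectorInstrCost, so these are not the numbers a target would report):

// Illustrative only: reproduces the shape of the default interleaved-access
// cost formula above with unit costs for the wide memory op and for each
// per-element extract/insert.
#include <cstdio>
#include <vector>

unsigned interleavedCost(bool IsLoad, unsigned Factor, unsigned NumElts,
                         const std::vector<unsigned> &Indices) {
  unsigned NumSubElts = NumElts / Factor;
  unsigned Cost = 1; // wide vector load/store
  if (IsLoad) {
    // Extract the used lanes from the wide vector, insert into sub-vectors.
    Cost += Indices.size() * NumSubElts;  // extracts
    Cost += Indices.size() * NumSubElts;  // inserts
  } else {
    // Extract every lane of every sub-vector, insert into the wide vector.
    Cost += Factor * NumSubElts;          // extracts
    Cost += NumElts;                      // inserts
  }
  return Cost;
}

int main() {
  // Factor-2 <8 x i32> group from the commit message, both members used.
  printf("load  cost: %u\n", interleavedCost(true, 2, 8, {0, 1}));  // 17
  printf("store cost: %u\n", interleavedCost(false, 2, 8, {0, 1})); // 17
  return 0;
}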