PR14055: Implement support for sub-vector operations in SROA.

Now if we can transform an alloca into a single vector value, but it has
subvector, non-element accesses, we form the appropriate shufflevectors to
allow SROA to proceed. This fixes PR14055 which pointed out a very common
pattern that SROA couldn't handle -- mixed vec3 and vec4 operations on a
single alloca.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168418 91177308-0d34-0410-b5e6-96231b3b80d8
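To make the promoted pattern concrete, here is a minimal sketch in the era's LLVM IR (an illustrative reduction, not taken from the commit's test file): a sub-vector, non-element access to a vector alloca, which previously forced SROA to give up.

define <2 x float> @subvec_sketch(<4 x float> %v) {
entry:
  %a = alloca <4 x float>
  store <4 x float> %v, <4 x float>* %a
  %a.cast = bitcast <4 x float>* %a to <2 x float>*
  %lo = load <2 x float>* %a.cast        ; sub-vector, non-element access
  ret <2 x float> %lo
}
; With this patch SROA promotes %a and rewrites the load roughly as:
;   %lo = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 0, i32 1>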
@@ -2116,12 +2116,11 @@ static bool isVectorPromotionViable(const DataLayout &TD,
         EndIndex > Ty->getNumElements())
       return false;
 
-    // FIXME: We should build shuffle vector instructions to handle
-    // non-element-sized accesses. See PR14055 for an example of where this
-    // matters.
-    if ((EndOffset - BeginOffset) != ElementSize &&
-        (EndOffset - BeginOffset) != VecSize)
-      return false;
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    uint64_t NumElements = EndIndex - BeginIndex;
+    Type *PartitionTy
+      = (NumElements == 1) ? Ty->getElementType()
+                           : VectorType::get(Ty->getElementType(), NumElements);
 
     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
       if (MI->isVolatile())
@@ -2138,9 +2137,13 @@ static bool isVectorPromotionViable(const DataLayout &TD,
     } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
       if (LI->isVolatile())
         return false;
+      if (!canConvertValue(TD, PartitionTy, LI->getType()))
+        return false;
     } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
       if (SI->isVolatile())
         return false;
+      if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy))
+        return false;
     } else {
       return false;
     }
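A worked example of the new viability logic under assumed values (the numbers are illustrative, not from the commit): for an alloca of <4 x i32> with ElementSize = 4 bytes, a partition covering bytes [4, 12) gives BeginIndex = 1 and EndIndex = 3, so NumElements = 2 and PartitionTy = <2 x i32>; a partition covering [4, 8) gives NumElements = 1 and PartitionTy = i32. The canConvertValue checks then only ask whether the loaded or stored type can be losslessly converted to that partition type, so a same-sized access such as the following stays viable because <2 x float> and the <2 x i32> partition share a 64-bit store size:

define <2 x float> @viability_sketch(<4 x i32> %v) {
entry:
  %a = alloca <4 x i32>
  store <4 x i32> %v, <4 x i32>* %a
  %a.cast = bitcast <4 x i32>* %a to <2 x float>*
  %f = load <2 x float>* %a.cast         ; PartitionTy = <2 x i32>, bitcast-compatible
  ret <2 x float> %f
}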
@@ -2448,13 +2451,13 @@ private:
       return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset);
   }
 
-  ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) {
+  unsigned getIndex(uint64_t Offset) {
     assert(VecTy && "Can only call getIndex when rewriting a vector");
     uint64_t RelOffset = Offset - NewAllocaBeginOffset;
     assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
     uint32_t Index = RelOffset / ElementSize;
     assert(Index * ElementSize == RelOffset);
-    return IRB.getInt32(Index);
+    return Index;
   }
 
   void deleteIfTriviallyDead(Value *V) {
@@ -2466,10 +2469,24 @@ private:
   Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) {
     Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                      getName(".load"));
-    if (LI.getType() == VecTy->getElementType() ||
-        BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
-      V = IRB.CreateExtractElement(V, getIndex(IRB, BeginOffset),
+    unsigned BeginIndex = getIndex(BeginOffset);
+    unsigned EndIndex = getIndex(EndOffset);
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    unsigned NumElements = EndIndex - BeginIndex;
+    assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+    if (NumElements == 1) {
+      V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
                                    getName(".extract"));
+      DEBUG(dbgs() << " extract: " << *V << "\n");
+    } else if (NumElements < VecTy->getNumElements()) {
+      SmallVector<Constant*, 8> Mask;
+      Mask.reserve(NumElements);
+      for (unsigned i = BeginIndex; i != EndIndex; ++i)
+        Mask.push_back(IRB.getInt32(i));
+      V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                  ConstantVector::get(Mask),
+                                  getName(".extract"));
+      DEBUG(dbgs() << " shuffle: " << *V << "\n");
     }
     return V;
   }
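A sketch of the two load-rewrite paths above, assuming the alloca was promoted to a <4 x i32> value %vec with BeginIndex = 1 (illustrative values, not from the commit): a single-element partition becomes an extractelement, while a multi-element sub-vector partition becomes a shufflevector whose mask enumerates BeginIndex..EndIndex-1.

%one = extractelement <4 x i32> %vec, i32 1                ; NumElements == 1
%sub = shufflevector <4 x i32> %vec, <4 x i32> undef,
                     <2 x i32> <i32 1, i32 2>              ; NumElements == 2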
@@ -2569,15 +2586,52 @@ private:
 
   bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V,
                                   StoreInst &SI, Value *OldOp) {
-    if (V->getType() == ElementTy ||
-        BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
-      if (V->getType() != ElementTy)
-        V = convertValue(TD, IRB, V, ElementTy);
+    unsigned BeginIndex = getIndex(BeginOffset);
+    unsigned EndIndex = getIndex(EndOffset);
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    unsigned NumElements = EndIndex - BeginIndex;
+    assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+    Type *PartitionTy
+      = (NumElements == 1) ? ElementTy
+                           : VectorType::get(ElementTy, NumElements);
+    if (V->getType() != PartitionTy)
+      V = convertValue(TD, IRB, V, PartitionTy);
+    if (NumElements < VecTy->getNumElements()) {
+      // We need to mix in the existing elements.
       LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                            getName(".load"));
-      V = IRB.CreateInsertElement(LI, V, getIndex(IRB, BeginOffset),
-                                  getName(".insert"));
-    } else if (V->getType() != VecTy) {
+      if (NumElements == 1) {
+        V = IRB.CreateInsertElement(LI, V, IRB.getInt32(BeginIndex),
                                     getName(".insert"));
+        DEBUG(dbgs() << " insert: " << *V << "\n");
+      } else {
+        // When inserting a smaller vector into the larger to store, we first
+        // use a shuffle vector to widen it with undef elements, and then
+        // a second shuffle vector to select between the loaded vector and the
+        // incoming vector.
+        SmallVector<Constant*, 8> Mask;
+        Mask.reserve(VecTy->getNumElements());
+        for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+          if (i >= BeginIndex && i < EndIndex)
+            Mask.push_back(IRB.getInt32(i - BeginIndex));
+          else
+            Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
+        V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                    ConstantVector::get(Mask),
+                                    getName(".expand"));
+        DEBUG(dbgs() << " shuffle1: " << *V << "\n");
+
+        Mask.clear();
+        for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+          if (i >= BeginIndex && i < EndIndex)
+            Mask.push_back(IRB.getInt32(i));
+          else
+            Mask.push_back(IRB.getInt32(i + VecTy->getNumElements()));
+        V = IRB.CreateShuffleVector(V, LI, ConstantVector::get(Mask),
+                                    getName("insert"));
+        DEBUG(dbgs() << " shuffle2: " << *V << "\n");
+      }
+    } else {
       V = convertValue(TD, IRB, V, VecTy);
     }
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
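The two-shuffle store path above is easiest to see on concrete values (a sketch under assumed indices, not from the commit): storing a <2 x i32> %v into elements [1, 3) of the loaded <4 x i32> %old first widens %v to full width with undef lanes (the ".expand" shuffle), then selects lane by lane between the widened value (shuffle-operand lanes 0-3) and %old (lanes 4-7).

%wide = shufflevector <2 x i32> %v, <2 x i32> undef,
                      <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
%new  = shufflevector <4 x i32> %wide, <4 x i32> %old,
                      <4 x i32> <i32 4, i32 1, i32 2, i32 7>
; resulting lanes: %old[0], %v[0], %v[1], %old[3]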
@@ -2731,7 +2785,7 @@ private:
         IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI,
                                                       NewAI.getAlignment(),
                                                       getName(".load")),
-                                V, getIndex(IRB, BeginOffset),
+                                V, IRB.getInt32(getIndex(BeginOffset)),
                                 getName(".insert")),
         &NewAI, NewAI.getAlignment());
       (void)Store;
@@ -2899,7 +2953,7 @@ private:
       // We have to extract rather than load.
       Src = IRB.CreateExtractElement(
         IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")),
-        getIndex(IRB, BeginOffset),
+        IRB.getInt32(getIndex(BeginOffset)),
         getName(".copyextract"));
     } else if (IntTy && !IsWholeAlloca && !IsDest) {
       Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
@@ -2927,7 +2981,7 @@ private:
       // We have to insert into a loaded copy before storing.
       Src = IRB.CreateInsertElement(
         IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")),
-        Src, getIndex(IRB, BeginOffset),
+        Src, IRB.getInt32(getIndex(BeginOffset)),
         getName(".insert"));
     }
 
@@ -36,15 +36,15 @@ entry:
 
 define i32 @test2(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK: @test2
-; FIXME: This should be handled!
 entry:
   %a = alloca [2 x <4 x i32>]
-; CHECK: alloca <4 x i32>
+; CHECK-NOT: alloca
 
   %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
   store <4 x i32> %x, <4 x i32>* %a.x
   %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
   store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
 
   %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
   %tmp1 = load i32* %a.tmp1
@@ -54,10 +54,18 @@ entry:
   %a.tmp3.cast = bitcast i32* %a.tmp3 to <2 x i32>*
   %tmp3.vec = load <2 x i32>* %a.tmp3.cast
   %tmp3 = extractelement <2 x i32> %tmp3.vec, i32 0
+; CHECK-NOT: load
+; CHECK: %[[extract1:.*]] = extractelement <4 x i32> %x, i32 2
+; CHECK-NEXT: %[[extract2:.*]] = extractelement <4 x i32> %y, i32 3
+; CHECK-NEXT: %[[extract3:.*]] = shufflevector <4 x i32> %y, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: %[[extract4:.*]] = extractelement <2 x i32> %[[extract3]], i32 0
 
   %tmp4 = add i32 %tmp1, %tmp2
   %tmp5 = add i32 %tmp3, %tmp4
   ret i32 %tmp5
+; CHECK-NEXT: %[[sum1:.*]] = add i32 %[[extract1]], %[[extract2]]
+; CHECK-NEXT: %[[sum2:.*]] = add i32 %[[extract4]], %[[sum1]]
+; CHECK-NEXT: ret i32 %[[sum2]]
 }
 
 define i32 @test3(<4 x i32> %x, <4 x i32> %y) {
@@ -206,6 +214,71 @@ define i64 @test6(<4 x i64> %x, <4 x i64> %y, i64 %n) {
   ret i64 %res
 }
 
+define <4 x i32> @test_subvec_store() {
+; CHECK: @test_subvec_store
+entry:
+  %a = alloca <4 x i32>
+; CHECK-NOT: alloca
+
+  %a.gep0 = getelementptr <4 x i32>* %a, i32 0, i32 0
+  %a.cast0 = bitcast i32* %a.gep0 to <2 x i32>*
+  store <2 x i32> <i32 0, i32 0>, <2 x i32>* %a.cast0
+; CHECK-NOT: store
+; CHECK: %[[insert1:.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, {{.*}}>
+
+  %a.gep1 = getelementptr <4 x i32>* %a, i32 0, i32 1
+  %a.cast1 = bitcast i32* %a.gep1 to <2 x i32>*
+  store <2 x i32> <i32 1, i32 1>, <2 x i32>* %a.cast1
+; CHECK-NEXT: %[[insert2:.*]] = shufflevector <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>, <4 x i32> %[[insert1]], <4 x i32> <i32 4, i32 1, i32 2, {{.*}}>
+
+  %a.gep2 = getelementptr <4 x i32>* %a, i32 0, i32 2
+  %a.cast2 = bitcast i32* %a.gep2 to <2 x i32>*
+  store <2 x i32> <i32 2, i32 2>, <2 x i32>* %a.cast2
+; CHECK-NEXT: %[[insert3:.*]] = shufflevector <4 x i32> <i32 undef, i32 undef, i32 2, i32 2>, <4 x i32> %[[insert2]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+
+  %a.gep3 = getelementptr <4 x i32>* %a, i32 0, i32 3
+  store i32 3, i32* %a.gep3
+; CHECK-NEXT: %[[insert4:.*]] = insertelement <4 x i32> %[[insert3]], i32 3, i32 3
+
+  %ret = load <4 x i32>* %a
+
+  ret <4 x i32> %ret
+; CHECK-NEXT: ret <4 x i32> %[[insert4]]
+}
+
+define <4 x i32> @test_subvec_load() {
+; CHECK: @test_subvec_load
+entry:
+  %a = alloca <4 x i32>
+; CHECK-NOT: alloca
+  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a
+; CHECK-NOT: store
+
+  %a.gep0 = getelementptr <4 x i32>* %a, i32 0, i32 0
+  %a.cast0 = bitcast i32* %a.gep0 to <2 x i32>*
+  %first = load <2 x i32>* %a.cast0
+; CHECK-NOT: load
+; CHECK: %[[extract1:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %a.gep1 = getelementptr <4 x i32>* %a, i32 0, i32 1
+  %a.cast1 = bitcast i32* %a.gep1 to <2 x i32>*
+  %second = load <2 x i32>* %a.cast1
+; CHECK-NEXT: %[[extract2:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 1, i32 2>
+
+  %a.gep2 = getelementptr <4 x i32>* %a, i32 0, i32 2
+  %a.cast2 = bitcast i32* %a.gep2 to <2 x i32>*
+  %third = load <2 x i32>* %a.cast2
+; CHECK-NEXT: %[[extract3:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+  %tmp = shufflevector <2 x i32> %first, <2 x i32> %second, <2 x i32> <i32 0, i32 2>
+  %ret = shufflevector <2 x i32> %tmp, <2 x i32> %third, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: %[[tmp:.*]] = shufflevector <2 x i32> %[[extract1]], <2 x i32> %[[extract2]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: %[[ret:.*]] = shufflevector <2 x i32> %[[tmp]], <2 x i32> %[[extract3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  ret <4 x i32> %ret
+; CHECK-NEXT: ret <4 x i32> %[[ret]]
+}
+
 define i32 @PR14212() {
 ; CHECK: @PR14212
 ; This caused a crash when "splitting" the load of the i32 in order to promote