mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-11-01 15:11:24 +00:00
Improve DAG combine pass on certain IR vector patterns
Loading two 2x32-bit float vectors into the bottom half of a 256-bit vector produced suboptimal code in AVX2 mode with certain IR combinations. In particular, the IR optimizer folded 2f32 + 2f32 -> 4f32, 4f32 + 4f32 (undef) -> 8f32 into a single 2f32 + 2f32 -> 8f32, which seems more canonical, but then mysteriously generated rather bad code; the movq/movhpd combination didn't match. The problem lay in the BUILD_VECTOR optimization path. The 2f32 inputs would get promoted to 4f32 by the type legalizer, eventually resulting in a BUILD_VECTOR on two 4f32 into an 8f32. The BUILD_VECTOR path then, recognizing that these were both half the output size, concatenated them and then produced a shuffle. However, the resulting concat + shuffle was more complex than it should be; in the case where the upper half of the output is undef, we probably want to generate shuffle + concat instead. This enhancement causes the vector_shuffle combine step to recognize this suboptimal pattern and correct it. I included it there instead of in BUILD_VECTOR in case the same suboptimal pattern occurs for other reasons. This results in the optimizer correctly producing the optimal movq + movhpd sequence for all three variations on this IR, even with AVX2. I've included a test case. Radar link: rdar://problem/19287012 Fix for PR 21943. From: Fiona Glaser <fglaser@apple.com> git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226360 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
f2a51a78f5
commit
5eed637b34
@ -11347,7 +11347,8 @@ static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
|
||||
return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());
|
||||
}
|
||||
|
||||
// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat.
|
||||
// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
|
||||
// or turn a shuffle of a single concat into a simpler shuffle followed by a concat.
|
||||
static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
|
||||
EVT VT = N->getValueType(0);
|
||||
unsigned NumElts = VT.getVectorNumElements();
|
||||
@ -11361,6 +11362,18 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
|
||||
unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
|
||||
unsigned NumConcats = NumElts / NumElemsPerConcat;
|
||||
|
||||
// Special case: shuffle(concat(A,B)) can be more efficiently represented
|
||||
// as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
|
||||
// half vector elements.
|
||||
if (NumElemsPerConcat * 2 == NumElts && N1.getOpcode() == ISD::UNDEF &&
|
||||
std::all_of(SVN->getMask().begin() + NumElemsPerConcat,
|
||||
SVN->getMask().end(), [](int i) { return i == -1; })) {
|
||||
N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1),
|
||||
ArrayRef<int>(SVN->getMask().begin(), NumElemsPerConcat));
|
||||
N1 = DAG.getUNDEF(ConcatVT);
|
||||
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
|
||||
}
|
||||
|
||||
// Look at every vector that's inserted. We're looking for exact
|
||||
// subvector-sized copies from a concatenated vector
|
||||
for (unsigned I = 0; I != NumConcats; ++I) {
|
||||
|
@ -1849,3 +1849,45 @@ define <8 x float> @splat_v8f32(<4 x float> %r) {
|
||||
%1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
|
||||
ret <8 x float> %1
|
||||
}
|
||||
|
||||
define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
|
||||
; ALL-LABEL: concat_v2f32_1:
|
||||
; ALL: # BB#0: # %entry
|
||||
; ALL-NEXT: vmovq (%rdi), %xmm0
|
||||
; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
|
||||
; ALL-NEXT: retq
|
||||
entry:
|
||||
%tmp74 = load <2 x float>* %tmp65, align 8
|
||||
%tmp72 = load <2 x float>* %tmp64, align 8
|
||||
%tmp73 = shufflevector <2 x float> %tmp72, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%tmp75 = shufflevector <2 x float> %tmp74, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%tmp76 = shufflevector <8 x float> %tmp73, <8 x float> %tmp75, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
ret <8 x float> %tmp76
|
||||
}
|
||||
|
||||
define <8x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) {
|
||||
; ALL-LABEL: concat_v2f32_2:
|
||||
; ALL: # BB#0: # %entry
|
||||
; ALL-NEXT: vmovq (%rdi), %xmm0
|
||||
; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
|
||||
; ALL-NEXT: retq
|
||||
entry:
|
||||
%tmp74 = load <2 x float>* %tmp65, align 8
|
||||
%tmp72 = load <2 x float>* %tmp64, align 8
|
||||
%tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
ret <8 x float> %tmp76
|
||||
}
|
||||
|
||||
define <8x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) {
|
||||
; ALL-LABEL: concat_v2f32_3:
|
||||
; ALL: # BB#0: # %entry
|
||||
; ALL-NEXT: vmovq (%rdi), %xmm0
|
||||
; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
|
||||
; ALL-NEXT: retq
|
||||
entry:
|
||||
%tmp74 = load <2 x float>* %tmp65, align 8
|
||||
%tmp72 = load <2 x float>* %tmp64, align 8
|
||||
%tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%res = shufflevector <4 x float> %tmp76, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user