mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-15 04:30:12 +00:00
[x86] Make the previous logic significantly less conservative and get
a bunch more improvements. Non-lane-crossing is fine, the key is that lane merging only makes sense for single-input shuffles. Not sure why I got so turned around here. The code all works, I was just using the wrong model for it. This only updates v4 and v8 lowering. The v16 and v32 lowering requires restructuring the entire check sequence. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222537 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
607099b697
commit
0889d65fd5
@ -9999,8 +9999,8 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
|
||||
static SDValue lowerVectorShuffleByMerging128BitLanes(
|
||||
SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
|
||||
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
|
||||
assert(is128BitLaneCrossingShuffleMask(VT, Mask) &&
|
||||
"This is only useful when there are cross-128-bit-lane shuffles.");
|
||||
assert(!isSingleInputShuffleMask(Mask) &&
|
||||
"This is only useful with multiple inputs.");
|
||||
|
||||
int Size = Mask.size();
|
||||
int LaneSize = 128 / VT.getScalarSizeInBits();
|
||||
@ -10170,8 +10170,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
// shuffle. However, if we have AVX2 and either inputs are already in place,
|
||||
// we will be able to shuffle even across lanes the other input in a single
|
||||
// instruction so skip this pattern.
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
|
||||
!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
|
||||
if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
|
||||
isShuffleMaskInputInPlace(1, Mask))))
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
|
||||
@ -10251,8 +10250,7 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
// shuffle. However, if we have AVX2 and either inputs are already in place,
|
||||
// we will be able to shuffle even across lanes the other input in a single
|
||||
// instruction so skip this pattern.
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v4i64, Mask) &&
|
||||
!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
|
||||
if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
|
||||
isShuffleMaskInputInPlace(1, Mask))))
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
|
||||
@ -10337,10 +10335,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
|
||||
return Result;
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
|
||||
return Result;
|
||||
|
||||
// If we have AVX2 then we always want to lower with a blend because at v8 we
|
||||
// can fully permute the elements.
|
||||
@ -10407,10 +10404,9 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v8i32, Mask))
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
|
||||
return Result;
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
|
||||
return Result;
|
||||
|
||||
// Otherwise fall back on generic blend lowering.
|
||||
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
|
||||
|
@ -331,20 +331,11 @@ define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
|
||||
}
|
||||
|
||||
define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1-LABEL: shuffle_v4f64_1076:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4f64_1076:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: shuffle_v4f64_1076:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
|
||||
ret <4 x double> %shuffle
|
||||
}
|
||||
@ -708,17 +699,14 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
|
||||
define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) {
|
||||
; AVX1-LABEL: shuffle_v4i64_1076:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_1076:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
|
||||
ret <4 x i64> %shuffle
|
||||
|
@ -746,20 +746,11 @@ define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
|
||||
}
|
||||
|
||||
define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
|
||||
; AVX1-LABEL: shuffle_v8f32_3210fedc:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8f32_3210fedc:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: shuffle_v8f32_3210fedc:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
|
||||
ret <8 x float> %shuffle
|
||||
}
|
||||
@ -785,39 +776,21 @@ define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
|
||||
}
|
||||
|
||||
define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
|
||||
; AVX1-LABEL: shuffle_v8f32_ba987654:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8f32_ba987654:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: shuffle_v8f32_ba987654:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
|
||||
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
|
||||
ret <8 x float> %shuffle
|
||||
}
|
||||
|
||||
define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
|
||||
; AVX1-LABEL: shuffle_v8f32_ba983210:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8f32_ba983210:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: shuffle_v8f32_ba983210:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
|
||||
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
|
||||
ret <8 x float> %shuffle
|
||||
}
|
||||
@ -1774,17 +1747,14 @@ define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
|
||||
define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
|
||||
; AVX1-LABEL: shuffle_v8i32_3210fedc:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i32_3210fedc:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
|
||||
ret <8 x i32> %shuffle
|
||||
@ -1825,17 +1795,14 @@ define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
|
||||
define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
|
||||
; AVX1-LABEL: shuffle_v8i32_ba987654:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i32_ba987654:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
|
||||
ret <8 x i32> %shuffle
|
||||
@ -1844,17 +1811,14 @@ define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
|
||||
define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
|
||||
; AVX1-LABEL: shuffle_v8i32_ba983210:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i32_ba983210:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
|
||||
ret <8 x i32> %shuffle
|
||||
|
Loading…
Reference in New Issue
Block a user