mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-11-02 07:11:49 +00:00
[X86][AVX2] vpslldq/vpsrldq byte shifts for AVX2
This patch refactors the existing lowerVectorShuffleAsByteShift function to add support for 256-bit vectors on AVX2 targets. It also fixes a tablegen issue that prevented the lowering of vpslldq/vpsrldq vec256 instructions. Differential Revision: http://reviews.llvm.org/D7596 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229311 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
3e93916175
commit
28f299b62d
@ -7834,12 +7834,10 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
|
||||
|
||||
/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
|
||||
///
|
||||
/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
|
||||
/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ
|
||||
/// byte-shift instructions. The mask must consist of a shifted sequential
|
||||
/// shuffle from one of the input vectors and zeroable elements for the
|
||||
/// remaining 'shifted in' elements.
|
||||
///
|
||||
/// Note that this only handles 128-bit vector widths currently.
|
||||
static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
|
||||
SDValue V2, ArrayRef<int> Mask,
|
||||
SelectionDAG &DAG) {
|
||||
@ -7847,63 +7845,56 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
|
||||
|
||||
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
|
||||
|
||||
int Size = Mask.size();
|
||||
int Scale = 16 / Size;
|
||||
int NumElts = VT.getVectorNumElements();
|
||||
int NumLanes = VT.getSizeInBits() / 128;
|
||||
int NumLaneElts = NumElts / NumLanes;
|
||||
int Scale = 16 / NumLaneElts;
|
||||
MVT ShiftVT = MVT::getVectorVT(MVT::i64, 2 * NumLanes);
|
||||
|
||||
// PSLLDQ : (little-endian) left byte shift
|
||||
// [ zz, 0, 1, 2, 3, 4, 5, 6]
|
||||
// [ zz, zz, -1, -1, 2, 3, 4, -1]
|
||||
// [ zz, zz, zz, zz, zz, zz, -1, 1]
|
||||
// PSRLDQ : (little-endian) right byte shift
|
||||
// [ 5, 6, 7, zz, zz, zz, zz, zz]
|
||||
// [ -1, 5, 6, 7, zz, zz, zz, zz]
|
||||
// [ 1, 2, -1, -1, -1, -1, zz, zz]
|
||||
auto MatchByteShift = [&](int Shift) -> SDValue {
|
||||
bool MatchLeft = true, MatchRight = true;
|
||||
for (int l = 0; l < NumElts; l += NumLaneElts) {
|
||||
for (int i = 0; i < Shift; ++i)
|
||||
MatchLeft &= Zeroable[l + i];
|
||||
for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i)
|
||||
MatchRight &= Zeroable[l + i];
|
||||
}
|
||||
if (!(MatchLeft || MatchRight))
|
||||
return SDValue();
|
||||
|
||||
bool MatchV1 = true, MatchV2 = true;
|
||||
for (int l = 0; l < NumElts; l += NumLaneElts) {
|
||||
unsigned Pos = MatchLeft ? Shift + l : l;
|
||||
unsigned Low = MatchLeft ? l : Shift + l;
|
||||
unsigned Len = NumLaneElts - Shift;
|
||||
MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
|
||||
MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts);
|
||||
}
|
||||
if (!(MatchV1 || MatchV2))
|
||||
return SDValue();
|
||||
|
||||
for (int Shift = 1; Shift < Size; Shift++) {
|
||||
int ByteShift = Shift * Scale;
|
||||
unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ;
|
||||
SDValue V = MatchV1 ? V1 : V2;
|
||||
V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
|
||||
V = DAG.getNode(Op, DL, ShiftVT, V,
|
||||
DAG.getConstant(ByteShift * 8, MVT::i8));
|
||||
return DAG.getNode(ISD::BITCAST, DL, VT, V);
|
||||
};
|
||||
|
||||
// PSRLDQ : (little-endian) right byte shift
|
||||
// [ 5, 6, 7, zz, zz, zz, zz, zz]
|
||||
// [ -1, 5, 6, 7, zz, zz, zz, zz]
|
||||
// [ 1, 2, -1, -1, -1, -1, zz, zz]
|
||||
bool ZeroableRight = true;
|
||||
for (int i = Size - Shift; i < Size; i++) {
|
||||
ZeroableRight &= Zeroable[i];
|
||||
}
|
||||
|
||||
if (ZeroableRight) {
|
||||
bool ValidShiftRight1 =
|
||||
isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
|
||||
bool ValidShiftRight2 =
|
||||
isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
|
||||
|
||||
if (ValidShiftRight1 || ValidShiftRight2) {
|
||||
// Cast the inputs to v2i64 to match PSRLDQ.
|
||||
SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
|
||||
SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
|
||||
SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
|
||||
DAG.getConstant(ByteShift * 8, MVT::i8));
|
||||
return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
|
||||
}
|
||||
}
|
||||
|
||||
// PSLLDQ : (little-endian) left byte shift
|
||||
// [ zz, 0, 1, 2, 3, 4, 5, 6]
|
||||
// [ zz, zz, -1, -1, 2, 3, 4, -1]
|
||||
// [ zz, zz, zz, zz, zz, zz, -1, 1]
|
||||
bool ZeroableLeft = true;
|
||||
for (int i = 0; i < Shift; i++) {
|
||||
ZeroableLeft &= Zeroable[i];
|
||||
}
|
||||
|
||||
if (ZeroableLeft) {
|
||||
bool ValidShiftLeft1 =
|
||||
isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
|
||||
bool ValidShiftLeft2 =
|
||||
isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
|
||||
|
||||
if (ValidShiftLeft1 || ValidShiftLeft2) {
|
||||
// Cast the inputs to v2i64 to match PSLLDQ.
|
||||
SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
|
||||
SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
|
||||
SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
|
||||
DAG.getConstant(ByteShift * 8, MVT::i8));
|
||||
return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int Shift = 1; Shift < NumLaneElts; ++Shift)
|
||||
if (SDValue S = MatchByteShift(Shift))
|
||||
return S;
|
||||
|
||||
// no match
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
@ -10674,12 +10665,6 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
|
||||
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
|
||||
}
|
||||
|
||||
// Use dedicated unpack instructions for masks that match their pattern.
|
||||
if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
|
||||
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
|
||||
if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
|
||||
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
|
||||
}
|
||||
|
||||
// AVX2 provides a direct instruction for permuting a single input across
|
||||
@ -10688,6 +10673,17 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
|
||||
getV4X86ShuffleImm8ForMask(Mask, DAG));
|
||||
|
||||
// Try to use byte shift instructions.
|
||||
if (SDValue Shift = lowerVectorShuffleAsByteShift(
|
||||
DL, MVT::v4i64, V1, V2, Mask, DAG))
|
||||
return Shift;
|
||||
|
||||
// Use dedicated unpack instructions for masks that match their pattern.
|
||||
if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
|
||||
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
|
||||
if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
|
||||
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle. However, if we have AVX2 and either input is already in place,
|
||||
// we will be able to shuffle even across lanes the other input in a single
|
||||
@ -10863,6 +10859,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
DL, MVT::v8i32, V1, V2, Mask, DAG))
|
||||
return Shift;
|
||||
|
||||
// Try to use byte shift instructions.
|
||||
if (SDValue Shift = lowerVectorShuffleAsByteShift(
|
||||
DL, MVT::v8i32, V1, V2, Mask, DAG))
|
||||
return Shift;
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
@ -10951,6 +10952,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
DL, MVT::v16i16, V1, V2, Mask, DAG))
|
||||
return Shift;
|
||||
|
||||
// Try to use byte shift instructions.
|
||||
if (SDValue Shift = lowerVectorShuffleAsByteShift(
|
||||
DL, MVT::v16i16, V1, V2, Mask, DAG))
|
||||
return Shift;
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
@ -11034,6 +11040,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
DL, MVT::v32i8, V1, V2, Mask, DAG))
|
||||
return Shift;
|
||||
|
||||
// Try to use byte shift instructions.
|
||||
if (SDValue Shift = lowerVectorShuffleAsByteShift(
|
||||
DL, MVT::v32i8, V1, V2, Mask, DAG))
|
||||
return Shift;
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
|
@ -4296,6 +4296,12 @@ let Predicates = [HasAVX2] in {
|
||||
(VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
|
||||
def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
|
||||
(VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
|
||||
|
||||
// Shift up / down and insert zeros.
|
||||
def : Pat<(v4i64 (X86vshldq VR256:$src, (i8 imm:$amt))),
|
||||
(VPSLLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
|
||||
def : Pat<(v4i64 (X86vshrdq VR256:$src, (i8 imm:$amt))),
|
||||
(VPSRLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
|
||||
}
|
||||
|
||||
let Predicates = [UseSSE2] in {
|
||||
|
@ -1349,6 +1349,40 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_2
|
||||
ret <16 x i16> %shuffle
|
||||
}
|
||||
|
||||
define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) {
|
||||
; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24>
|
||||
ret <16 x i16> %shuffle
|
||||
}
|
||||
|
||||
define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz(<16 x i16> %a) {
|
||||
; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
|
||||
ret <16 x i16> %shuffle
|
||||
}
|
||||
|
||||
;
|
||||
; Shuffle to logical bit shifts
|
||||
;
|
||||
|
@ -1627,6 +1627,40 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
|
||||
ret <32 x i8> %shuffle
|
||||
}
|
||||
|
||||
define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) {
|
||||
; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
|
||||
ret <32 x i8> %shuffle
|
||||
}
|
||||
|
||||
define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
|
||||
; AVX1-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
|
||||
ret <32 x i8> %shuffle
|
||||
}
|
||||
|
||||
;
|
||||
; Shuffle to logical bit shifts
|
||||
;
|
||||
|
@ -745,6 +745,36 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
|
||||
ret <4 x i64> %shuffle
|
||||
}
|
||||
|
||||
define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) {
|
||||
; AVX1-LABEL: shuffle_v4i64_z4z6:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
|
||||
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_z4z6:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 0, i32 4, i32 0, i32 6>
|
||||
ret <4 x i64> %shuffle
|
||||
}
|
||||
|
||||
define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) {
|
||||
; AVX1-LABEL: shuffle_v4i64_5zuz:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_5zuz:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 5, i32 0, i32 undef, i32 0>
|
||||
ret <4 x i64> %shuffle
|
||||
}
|
||||
|
||||
define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
|
||||
; ALL-LABEL: stress_test1:
|
||||
; ALL: retq
|
||||
|
@ -1815,6 +1815,37 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
|
||||
ret <8 x i32> %shuffle
|
||||
}
|
||||
|
||||
define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
|
||||
; AVX1-LABEL: shuffle_v8i32_zuu8zuuc:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i32_zuu8zuuc:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 8, i32 0, i32 undef, i32 undef, i32 12>
|
||||
ret <8 x i32> %shuffle
|
||||
}
|
||||
|
||||
define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
|
||||
; AVX1-LABEL: shuffle_v8i32_9ubzdefz:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i32_9ubzdefz:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,ymm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 9, i32 undef, i32 11, i32 0, i32 13, i32 14, i32 15, i32 0>
|
||||
ret <8 x i32> %shuffle
|
||||
}
|
||||
|
||||
define <8 x float> @splat_mem_v8f32_2(float* %p) {
|
||||
; ALL-LABEL: splat_mem_v8f32_2:
|
||||
; ALL: # BB#0:
|
||||
|
Loading…
Reference in New Issue
Block a user