Reverted AVX-512 vector shuffle

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@240258 91177308-0d34-0410-b5e6-96231b3b80d8
Elena Demikhovsky
2015-06-22 09:01:15 +00:00
parent 12219f8c85
commit 42ceb12123
4 changed files with 573 additions and 792 deletions


@@ -6259,42 +6259,6 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return true;
}
/// \brief Test whether a shuffle mask is equivalent within each 256-bit lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// 256-bit lane-relative shuffle in each 256-bit lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// *not* suitable for use with existing 256-bit shuffles as it will contain
/// entries from both V1 and V2 inputs to the wider mask.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
int LaneSize = 256 / VT.getScalarSizeInBits();
RepeatedMask.resize(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
if (RepeatedMask[i % LaneSize] == -1)
// This is the first non-undef entry in this slot of a 256-bit lane.
RepeatedMask[i % LaneSize] =
Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
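To make the repetition test above concrete, here is a minimal standalone sketch (an editorial illustration, not part of this commit) that mirrors the same logic on plain std::vector<int>, with LaneSize = 4 as it would be for v8f64 (256 / 64):

#include <cstdio>
#include <vector>

// Mirrors is256BitLaneRepeatedShuffleMask on plain ints.
static bool isLaneRepeated(const std::vector<int> &Mask, int LaneSize,
                           std::vector<int> &Repeated) {
  Repeated.assign(LaneSize, -1);
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;                                 // Undef constrains nothing.
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false;                             // Entry crosses a lane.
    int Slot = i % LaneSize;
    if (Repeated[Slot] == -1)                   // First non-undef in slot.
      Repeated[Slot] = Mask[i] < Size ? Mask[i] % LaneSize
                                      : Mask[i] % LaneSize + Size;
    else if (Repeated[Slot] + (i / LaneSize) * LaneSize != Mask[i])
      return false;                             // A later lane disagrees.
  }
  return true;
}

int main() {
  std::vector<int> Rep;
  // {1,0,3,2,5,4,7,6} repeats {1,0,3,2} in both 256-bit lanes of a v8f64.
  printf("%d\n", isLaneRepeated({1, 0, 3, 2, 5, 4, 7, 6}, 4, Rep)); // 1
  // {4,5,6,7,0,1,2,3} swaps the two lanes, so it is rejected.
  printf("%d\n", isLaneRepeated({4, 5, 6, 7, 0, 1, 2, 3}, 4, Rep)); // 0
}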
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
@@ -6354,22 +6318,6 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
return DAG.getConstant(Imm, DL, MVT::i8);
}
/// \brief Get a 8-bit shuffle, 1 bit per lane, immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 8 lanes.
static SDValue get1bitLaneShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
SelectionDAG &DAG) {
assert(Mask.size() <= 8 &&
"Up to 8 elts may be in Imm8 1-bit lane shuffle mask");
unsigned Imm = 0;
for (unsigned i = 0; i < Mask.size(); ++i)
if (Mask[i] >= 0)
Imm |= (Mask[i] % 2) << i;
return DAG.getConstant(Imm, DL, MVT::i8);
}
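As a quick illustration (again editorial, not from the commit), the immediate is just one mask bit per element; a self-contained sketch:

#include <cstdio>
#include <vector>

// Each element contributes its low bit (Mask[i] % 2) at bit position i.
static unsigned imm8For1BitLanes(const std::vector<int> &Mask) {
  unsigned Imm = 0;
  for (unsigned i = 0; i < Mask.size(); ++i)
    if (Mask[i] >= 0)
      Imm |= (Mask[i] % 2) << i;
  return Imm;
}

int main() {
  // PERMILPD-style v8f64 mask choosing low/high within each 64-bit pair:
  // bits 1,0,1,1,1,0,0,1 from the LSB up.
  printf("0x%02X\n", imm8For1BitLanes({1, 0, 3, 3, 5, 4, 6, 7})); // 0x9D
}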
/// \brief Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
@@ -9385,30 +9333,6 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
DAG.getConstant(PermMask, DL, MVT::i8));
}
/// \brief Handle lowering 4-lane 128-bit shuffles.
static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> WidenedMask,
SelectionDAG &DAG) {
assert(WidenedMask.size() == 4 && "Unexpected mask size for 128bit shuffle!");
// Form a 128-bit permutation: convert the 64-bit shuffle mask selection
// values into 128-bit selection bits defined by a vshuf64x2 instruction's
// immediate control byte.
unsigned PermMask = 0, Imm = 0;
for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
if (WidenedMask[i] == SM_SentinelZero)
return SDValue();
// Use the first element in place of an undef mask.
Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
PermMask |= (Imm % 4) << (i * 2);
}
return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
DAG.getConstant(PermMask, DL, MVT::i8));
}
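For reference, a standalone sketch (editorial, not from the commit) of how the widened mask is packed into the vshuf64x2 immediate, two bits per 128-bit lane:

#include <cstdio>

int main() {
  // Widened v4 mask: 128-bit lanes 0,1 of V1 followed by lanes 0,1 of V2.
  int WidenedMask[4] = {0, 1, 4, 5};
  unsigned PermMask = 0;
  for (int i = 0; i < 4; ++i)
    PermMask |= (WidenedMask[i] % 4) << (i * 2); // Lane index within source.
  printf("0x%02X\n", PermMask);                  // 0x44: fields 0,1,0,1.
}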
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
@@ -10144,105 +10068,35 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
}
static SDValue lowerVectorShuffleWithVALIGN(SDLoc DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() >= 32 && "Unexpected data type for VALIGN");
// VALIGN pattern: 2, 3, 4, 5, ... (sequential, shifted right)
int AlignVal = -1;
for (int i = 0; i < (signed)VT.getVectorNumElements(); ++i) {
if (Mask[i] < 0)
continue;
if (Mask[i] < i)
return SDValue();
if (AlignVal == -1)
AlignVal = Mask[i] - i;
else if (Mask[i] - i != AlignVal)
return SDValue();
}
// Vector source operands should be swapped
return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1,
DAG.getConstant(AlignVal, DL, MVT::i8));
}
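A standalone sketch (not from the commit) of the detection loop: VALIGN applies only when every defined mask entry sits at one constant non-negative offset from its slot:

#include <cstdio>
#include <vector>

// Returns the shift amount, or -1 if the mask is not a VALIGN pattern.
static int valignShift(const std::vector<int> &Mask) {
  int AlignVal = -1;
  for (int i = 0, e = (int)Mask.size(); i < e; ++i) {
    if (Mask[i] < 0)
      continue;                      // Undef matches any shift.
    if (Mask[i] < i)
      return -1;                     // Would require a left rotate.
    if (AlignVal == -1)
      AlignVal = Mask[i] - i;
    else if (Mask[i] - i != AlignVal)
      return -1;                     // Offsets disagree.
  }
  return AlignVal;
}

int main() {
  // v8i64 mask {2..9}: entries 8 and 9 name the second vector, which is
  // why the lowering above passes V2 and V1 to the node in swapped order.
  printf("%d\n", valignShift({2, 3, 4, 5, 6, 7, 8, 9})); // 2
}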
static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
SmallVector<SDValue, 32> VPermMask;
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
VPermMask.push_back(Mask[i] < 0 ? DAG.getUNDEF(MaskEltVT) :
DAG.getConstant(Mask[i], DL, MaskEltVT));
SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecVT,
VPermMask);
if (isSingleInputShuffleMask(Mask))
return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
return DAG.getNode(X86ISD::VPERMV3, DL, VT, MaskNode, V1, V2);
}
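What the emitted node computes can be shown with a scalar emulation (an editorial sketch, not part of the commit): VPERMV3 indexes the 2*N-element concatenation of both inputs:

#include <cstdio>
#include <vector>

// Scalar model: entry i picks element Mask[i] of the concatenation V1|V2.
static std::vector<int> permv3(const std::vector<int> &Mask,
                               const std::vector<int> &V1,
                               const std::vector<int> &V2) {
  std::vector<int> Out(Mask.size());
  int N = (int)V1.size();
  for (size_t i = 0; i < Mask.size(); ++i)
    Out[i] = Mask[i] < N ? V1[Mask[i]] : V2[Mask[i] - N];
  return Out;
}

int main() {
  for (int v : permv3({0, 8, 1, 9, 2, 10, 3, 11},
                      {10, 11, 12, 13, 14, 15, 16, 17},
                      {20, 21, 22, 23, 24, 25, 26, 27}))
    printf("%d ", v); // 10 20 11 21 12 22 13 23
  printf("\n");
}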
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
assert((V1.getSimpleValueType() == MVT::v8f64 ||
V1.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
assert((V2.getSimpleValueType() == MVT::v8f64 ||
V2.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
SmallVector<int, 4> WidenedMask;
if (canWidenShuffleElements(Mask, WidenedMask))
if (SDValue Op = lowerV4X128VectorShuffle(DL, VT, V1, V2, WidenedMask, DAG))
return Op;
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
return Op;
if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, VT, Mask, V1, V2, DAG))
return Op;
// PERMILPD instruction - mask 0/1, 0/1, 2/3, 2/3, 4/5, 4/5, 6/7, 6/7
if (isSingleInputShuffleMask(Mask)) {
if (!is128BitLaneCrossingShuffleMask(VT, Mask))
return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
get1bitLaneShuffleImm8ForMask(Mask, DL, DAG));
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
}
return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
}
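As an aside (an editorial sketch, not from the commit), the two unpack masks checked above fall out of interleaving within each 128-bit lane:

#include <cstdio>

int main() {
  int V1[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  int V2[8] = {8, 9, 10, 11, 12, 13, 14, 15};
  int Lo[8], Hi[8];
  for (int Lane = 0; Lane < 4; ++Lane) {  // Four 128-bit lanes of 2 x f64.
    Lo[2 * Lane]     = V1[2 * Lane];      // UNPCKL: low element of V1...
    Lo[2 * Lane + 1] = V2[2 * Lane];      // ...then low element of V2.
    Hi[2 * Lane]     = V1[2 * Lane + 1];  // UNPCKH: the high elements.
    Hi[2 * Lane + 1] = V2[2 * Lane + 1];
  }
  for (int v : Lo) printf("%d ", v); // 0 8 2 10 4 12 6 14
  printf("\n");
  for (int v : Hi) printf("%d ", v); // 1 9 3 11 5 13 7 15
  printf("\n");
}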
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
assert((V1.getSimpleValueType() == MVT::v16i32 ||
V1.getSimpleValueType() == MVT::v16f32) && "Bad operand type!");
assert((V2.getSimpleValueType() == MVT::v16i32 ||
V2.getSimpleValueType() == MVT::v16f32) && "Bad operand type!");
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -10253,39 +10107,67 @@ static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
0, 16, 1, 17, 4, 20, 5, 21,
// Second 128-bit lane.
8, 24, 9, 25, 12, 28, 13, 29}))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask,
{// First 128-bit lane.
2, 18, 3, 19, 6, 22, 7, 23,
// Second 128-bit lane.
10, 26, 11, 27, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
12, 12, 14, 14}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11,
13, 13, 15, 15}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
}
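The MOVSLDUP/MOVSHDUP masks tested above duplicate the even and odd elements respectively; a scalar sketch of one 128-bit lane (editorial, not from the commit):

#include <cstdio>

int main() {
  int V[4] = {10, 11, 12, 13}; // One 128-bit lane of a v16f32.
  int SL[4], SH[4];
  for (int i = 0; i < 4; i += 2) {
    SL[i] = SL[i + 1] = V[i];     // MOVSLDUP: mask pattern 0,0,2,2,...
    SH[i] = SH[i + 1] = V[i + 1]; // MOVSHDUP: mask pattern 1,1,3,3,...
  }
  for (int v : SL) printf("%d ", v); // 10 10 12 12
  printf("\n");
  for (int v : SH) printf("%d ", v); // 11 11 13 13
  printf("\n");
}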
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
if (isSingleInputShuffleMask(Mask)) {
unsigned Opc = VT.isInteger() ? X86ISD::PSHUFD : X86ISD::VPERMILPI;
return DAG.getNode(Opc, DL, VT, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
}
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
for (int i = 0; i < 4; ++i)
if (RepeatedMask[i] >= 16)
RepeatedMask[i] -= 12;
return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG);
}
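The "-= 12" fix-up above can be puzzling; as a standalone illustration (not from the commit): the repeated 4-element mask encodes second-vector entries as idx % 4 + 16, while the SHUFPS helper expects them in the range 4..7:

#include <cstdio>

int main() {
  // Lane-repeated blend of V1 and V2 for a v16f32: V2 entries are 16..19.
  int RepeatedMask[4] = {0, 16, 1, 17};
  for (int i = 0; i < 4; ++i)
    if (RepeatedMask[i] >= 16)
      RepeatedMask[i] -= 12;      // Remap 16..19 down to 4..7 for SHUFPS.
  for (int v : RepeatedMask)
    printf("%d ", v);             // 0 4 1 5
  printf("\n");
}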
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
return Op;
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
}
return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask,
{// First 128-bit lane.
0, 16, 1, 17, 4, 20, 5, 21,
// Second 128-bit lane.
8, 24, 9, 25, 12, 28, 13, 29}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask,
{// First 128-bit lane.
2, 18, 3, 19, 6, 22, 7, 23,
// Second 128-bit lane.
10, 26, 11, 27, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
}
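Finally, a rough sketch (editorial, not from the commit) of the idea behind the splitAndLowerVectorShuffle fallback that this revert reinstates, simplified to the case where neither half of the mask crosses the 256-bit boundary (the real helper also handles cross-half references by blending):

#include <cstdio>
#include <vector>

int main() {
  // v8i64 mask where each half only references its own 256-bit half.
  std::vector<int> Mask = {1, 0, 3, 2, 5, 4, 7, 6};
  std::vector<int> LoMask(Mask.begin(), Mask.begin() + 4), HiMask;
  for (int i = 4; i < 8; ++i)
    HiMask.push_back(Mask[i] - 4); // Rebase the upper half onto v4i64.
  for (int v : LoMask) printf("%d ", v); // 1 0 3 2 -> one v4i64 shuffle
  printf("| ");
  for (int v : HiMask) printf("%d ", v); // 1 0 3 2 -> the other, then concat
  printf("\n");
}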
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
@@ -10345,11 +10227,13 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
case MVT::v8i64:
return lowerV8X64VectorShuffle(Op, V1, V2, Subtarget, DAG);
return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v16f32:
return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v8i64:
return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v16i32:
return lowerV16X32VectorShuffle(Op, V1, V2, Subtarget, DAG);
return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v32i16:
if (Subtarget->hasBWI())
return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);