[X86][SSE4A] Shuffle lowering using SSE4A EXTRQ/INSERTQ instructions

This patch adds support for v8i16 and v16i8 shuffle lowering using the immediate versions of the SSE4A EXTRQ and INSERTQ instructions. Although rather limited (they can only act on the lower 64-bits of the source vectors, leave the upper 64-bits of the result vector undefined and don't have VEX encoded variants), the instructions are still useful for the zero extension of any lane (EXTRQ) or inserting a lane into another vector (INSERTQ). Testing demonstrated that it wasn't typically worth it to use these instructions for v2i64 or v4i32 vector shuffles although they are capable of it. As well as adding specific pattern matching for the shuffles, the patch uses EXTRQ for zero extension cases where SSE41 isn't available and its more efficient than the SSE2 'unpack' default approach. It also adds shuffle decode support for the EXTRQ / INSERTQ cases when the instructions are handling full byte-sized extractions / insertions. From this foundation, future patches will be able to make use of the instructions for situations that use their ability to extract/insert at the bit level. Differential Revision: http://reviews.llvm.org/D10146 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@241508 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-04 10:30:01 +00:00 · 2015-07-06 20:46:41 +00:00 · 2015-07-06 20:46:41 +00:00 · 3ecdd44e5d
commit 3ecdd44e5d
parent ecb00f403c
9 changed files with 521 additions and 6 deletions
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@ -878,6 +878,29 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
    DestName = getRegName(MI->getOperand(0).getReg());
    break;

+  case X86::EXTRQI:
+    if (MI->getOperand(2).isImm() &&
+        MI->getOperand(3).isImm())
+      DecodeEXTRQIMask(MI->getOperand(2).getImm(),
+                       MI->getOperand(3).getImm(),
+                       ShuffleMask);
+
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    break;
+
+  case X86::INSERTQI:
+    if (MI->getOperand(3).isImm() &&
+        MI->getOperand(4).isImm())
+      DecodeINSERTQIMask(MI->getOperand(3).getImm(),
+                         MI->getOperand(4).getImm(),
+                         ShuffleMask);
+
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    break;
+
  case X86::PMOVZXBWrr:
  case X86::PMOVZXBDrr:
  case X86::PMOVZXBQrr:
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@ -431,4 +431,78 @@ void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
  for (unsigned i = 1; i < NumElts; i++)
    Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
 }
+
+void DecodeEXTRQIMask(int Len, int Idx,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  // Only the bottom 6 bits are valid for each immediate.
+  Len &= 0x3F;
+  Idx &= 0x3F;
+
+  // We can only decode this bit extraction instruction as a shuffle if both the
+  // length and index work with whole bytes.
+  if (0 != (Len % 8) || 0 != (Idx % 8))
+    return;
+
+  // A length of zero is equivalent to a bit length of 64.
+  if (Len == 0)
+    Len = 64;
+
+  // If the length + index exceeds the bottom 64 bits the result is undefined.
+  if ((Len + Idx) > 64) {
+    ShuffleMask.append(16, SM_SentinelUndef);
+    return;
+  }
+
+  // Convert index and index to work with bytes.
+  Len /= 8;
+  Idx /= 8;
+
+  // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes
+  // of the lower 64-bits. The upper 64-bits are undefined.
+  for (int i = 0; i != Len; ++i)
+    ShuffleMask.push_back(i + Idx);
+  for (int i = Len; i != 8; ++i)
+    ShuffleMask.push_back(SM_SentinelZero);
+  for (int i = 8; i != 16; ++i)
+    ShuffleMask.push_back(SM_SentinelUndef);
+}
+
+void DecodeINSERTQIMask(int Len, int Idx,
+                        SmallVectorImpl<int> &ShuffleMask) {
+  // Only the bottom 6 bits are valid for each immediate.
+  Len &= 0x3F;
+  Idx &= 0x3F;
+
+  // We can only decode this bit insertion instruction as a shuffle if both the
+  // length and index work with whole bytes.
+  if (0 != (Len % 8) || 0 != (Idx % 8))
+    return;
+
+  // A length of zero is equivalent to a bit length of 64.
+  if (Len == 0)
+    Len = 64;
+
+  // If the length + index exceeds the bottom 64 bits the result is undefined.
+  if ((Len + Idx) > 64) {
+    ShuffleMask.append(16, SM_SentinelUndef);
+    return;
+  }
+
+  // Convert index and index to work with bytes.
+  Len /= 8;
+  Idx /= 8;
+
+  // INSERTQ: Extract lowest Len bytes from lower half of second source and
+  // insert over first source starting at Idx byte. The upper 64-bits are
+  // undefined.
+  for (int i = 0; i != Idx; ++i)
+    ShuffleMask.push_back(i);
+  for (int i = 0; i != Len; ++i)
+    ShuffleMask.push_back(i + 16);
+  for (int i = Idx + Len; i != 8; ++i)
+    ShuffleMask.push_back(i);
+  for (int i = 8; i != 16; ++i)
+    ShuffleMask.push_back(SM_SentinelUndef);
+}
+
 } // llvm namespace
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@ -100,6 +100,14 @@ void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
 /// \brief Decode a scalar float move instruction as a shuffle mask.
 void DecodeScalarMoveMask(MVT VT, bool IsLoad,
                          SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask.
+void DecodeEXTRQIMask(int Len, int Idx,
+                      SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask.
+void DecodeINSERTQIMask(int Len, int Idx,
+                        SmallVectorImpl<int> &ShuffleMask);
 } // llvm namespace

 #endif
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -3938,6 +3938,15 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasLZCNT();
 }

+/// isUndefInRange - Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is undef.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+    if (0 <= Mask[i])
+      return false;
+  return true;
+}
+
 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
 /// the specified range (L, H].
 static bool isUndefOrInRange(int Val, int Low, int Hi) {
@ -6914,6 +6923,136 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
  return SDValue();
 }

+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef<int> Mask,
+                                           SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+  int Size = Mask.size();
+  int HalfSize = Size / 2;
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  // Upper half must be undefined.
+  if (!isUndefInRange(Mask, HalfSize, HalfSize))
+    return SDValue();
+
+  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+  // Remainder of lower half result is zero and upper half is all undef.
+  auto LowerAsEXTRQ = [&]() {
+    // Determine the extraction length from the part of the
+    // lower half that isn't zeroable.
+    int Len = HalfSize;
+    for (; Len >= 0; --Len)
+      if (!Zeroable[Len - 1])
+        break;
+    assert(Len > 0 && "Zeroable shuffle mask");
+
+    // Attempt to match first Len sequential elements from the lower half.
+    SDValue Src;
+    int Idx = -1;
+    for (int i = 0; i != Len; ++i) {
+      int M = Mask[i];
+      if (M < 0)
+        continue;
+      SDValue &V = (M < Size ? V1 : V2);
+      M = M % Size;
+
+      // All mask elements must be in the lower half.
+      if (M > HalfSize)
+        return SDValue();
+
+      if (Idx < 0 || (Src == V && Idx == (M - i))) {
+        Src = V;
+        Idx = M - i;
+        continue;
+      }
+      return SDValue();
+    }
+
+    if (Idx < 0)
+      return SDValue();
+
+    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+                       DAG.getConstant(BitLen, DL, MVT::i8),
+                       DAG.getConstant(BitIdx, DL, MVT::i8));
+  };
+
+  if (SDValue ExtrQ = LowerAsEXTRQ())
+    return ExtrQ;
+
+  // INSERTQ: Extract lowest Len elements from lower half of second source and
+  // insert over first source, starting at Idx.
+  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+  auto LowerAsInsertQ = [&]() {
+    for (int Idx = 0; Idx != HalfSize; ++Idx) {
+      SDValue Base;
+
+      // Attempt to match first source from mask before insertion point.
+      if (isUndefInRange(Mask, 0, Idx)) {
+        /* EMPTY */
+      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+        Base = V1;
+      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+        Base = V2;
+      } else {
+        continue;
+      }
+
+      // Extend the extraction length looking to match both the insertion of
+      // the second source and the remaining elements of the first.
+      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+        SDValue Insert;
+        int Len = Hi - Idx;
+
+        // Match insertion.
+        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+          Insert = V1;
+        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+          Insert = V2;
+        } else {
+          continue;
+        }
+
+        // Match the remaining elements of the lower half.
+        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+          /* EMPTY */
+        } else if ((!Base || (Base == V1)) &&
+                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+          Base = V1;
+        } else if ((!Base || (Base == V2)) &&
+                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+                                              Size + Hi)) {
+          Base = V2;
+        } else {
+          continue;
+        }
+
+        // We may not have a base (first source) - this can safely be undefined.
+        if (!Base)
+          Base = DAG.getUNDEF(VT);
+
+        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
+                           DAG.getConstant(BitLen, DL, MVT::i8),
+                           DAG.getConstant(BitIdx, DL, MVT::i8));
+      }
+    }
+
+    return SDValue();
+  };
+
+  if (SDValue InsertQ = LowerAsInsertQ())
+    return InsertQ;
+
+  return SDValue();
+}
+
 /// \brief Lower a vector shuffle as a zero or any extension.
 ///
 /// Given a specific number of elements, element bit width, and extension
@ -6921,7 +7060,7 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
 /// features of the subtarget.
 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
-    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+    ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int NumElements = VT.getVectorNumElements();
  int EltBits = VT.getScalarSizeInBits();
@ -6958,6 +7097,28 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
                        getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
  }

+  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
+  // to 64-bits.
+  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
+    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+    assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
+
+    SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                             DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                         DAG.getConstant(EltBits, DL, MVT::i8),
+                                         DAG.getConstant(0, DL, MVT::i8)));
+    if (isUndefInRange(Mask, NumElements/2, NumElements/2))
+      return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
+
+    SDValue Hi =
+        DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                    DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                DAG.getConstant(EltBits, DL, MVT::i8),
+                                DAG.getConstant(EltBits, DL, MVT::i8)));
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+  }
+
  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
@ -7048,7 +7209,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
-        DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
+        DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
@ -8575,6 +8736,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
          lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return Shift;

+  // See if we can use SSE4A Extraction / Insertion.
+  if (Subtarget->hasSSE4A())
+    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
+      return V;
+
  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
@ -8727,6 +8893,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return ZExt;

+  // See if we can use SSE4A Extraction / Insertion.
+  if (Subtarget->hasSSE4A())
+    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
+      return V;
+
  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });

@ -15116,6 +15287,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
    case INTR_TYPE_3OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
        Op.getOperand(2), Op.getOperand(3));
+    case INTR_TYPE_4OP:
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
    case INTR_TYPE_1OP_MASK_RM: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
@ -18509,6 +18683,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::FMINC:              return "X86ISD::FMINC";
  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
  case X86ISD::FRCP:               return "X86ISD::FRCP";
+  case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
+  case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@ -394,6 +394,9 @@ namespace llvm {
      VINSERT,
      VEXTRACT,

+      /// SSE4A Extraction and Insertion.
+      EXTRQI, INSERTQI,
+
      // Vector multiply packed unsigned doubleword integers
      PMULUDQ,
      // Vector multiply packed signed doubleword integers
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@ -204,6 +204,14 @@ def X86pmuldq  : SDNode<"X86ISD::PMULDQ",
                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
                                       SDTCisSameAs<1,2>]>>;

+def X86extrqi : SDNode<"X86ISD::EXTRQI",
+                  SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+                                       SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>;
+def X86insertqi : SDNode<"X86ISD::INSERTQI",
+                    SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+                                         SDTCisSameAs<1,2>, SDTCisVT<3, i8>,
+                                         SDTCisVT<4, i8>]>>;
+
 // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
 // translated into one of the target nodes below during lowering.
 // Note: this is a work in progress...
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@ -7773,7 +7773,7 @@ let Constraints = "$src = $dst" in {
 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
-                 [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
+                 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>, PD;
 def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
@ -7784,8 +7784,8 @@ def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
-                   [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
-                                      VR128:$src2, imm:$len, imm:$idx))]>, XD;
+                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
+                                      imm:$len, imm:$idx))]>, XD;
 def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@ -19,7 +19,7 @@ namespace llvm {
 enum IntrinsicType {
  INTR_NO_TYPE,
  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
-  INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
+  INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
  CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
  INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
  INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
@ -1079,6 +1079,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
  X86_INTRINSIC_DATA(sse41_pmovzxwd,    INTR_TYPE_1OP, X86ISD::VZEXT, 0),
  X86_INTRINSIC_DATA(sse41_pmovzxwq,    INTR_TYPE_1OP, X86ISD::VZEXT, 0),
  X86_INTRINSIC_DATA(sse41_pmuldq,      INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+  X86_INTRINSIC_DATA(sse4a_extrqi,      INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
+  X86_INTRINSIC_DATA(sse4a_insertqi,    INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
  X86_INTRINSIC_DATA(sse_comieq_ss,     COMI, X86ISD::COMI, ISD::SETEQ),
  X86_INTRINSIC_DATA(sse_comige_ss,     COMI, X86ISD::COMI, ISD::SETGE),
  X86_INTRINSIC_DATA(sse_comigt_ss,     COMI, X86ISD::COMI, ISD::SETGT),
--- a/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse4a.ll
@ -0,0 +1,221 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER2
+
+;
+; EXTRQI
+;
+
+define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu:
+; BTVER1:       # BB#0:
+; BTVER1-NEXT:    extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT:    retq
+;
+; BTVER2-LABEL: shuf_0zzzuuuuuuuuuuuu:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; BTVER2-NEXT:    retq
+  %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz:
+; BTVER1:       # BB#0:
+; BTVER1-NEXT:    movaps %xmm0, %xmm1
+; BTVER1-NEXT:    extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT:    extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT:    retq
+;
+; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; BTVER2-NEXT:    retq
+  %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu:
+; BTVER1:       # BB#0:
+; BTVER1-NEXT:    extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT:    retq
+;
+; BTVER2-LABEL: shuf_01zzuuuuuuuuuuuu:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BTVER2-NEXT:    retq
+  %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz:
+; BTVER1:       # BB#0:
+; BTVER1-NEXT:    movaps %xmm0, %xmm1
+; BTVER1-NEXT:    extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT:    extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT:    retq
+;
+; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BTVER2-NEXT:    retq
+  %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_1zzzuuuuuuuuuuuu(<16 x i8> %a0) {
+; ALL-LABEL: shuf_1zzzuuuuuuuuuuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    extrq {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %s
+}
+
+define <8 x i16> @shuf_1zzzuuuu(<8 x i16> %a0) {
+; ALL-LABEL: shuf_1zzzuuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) {
+; ALL-LABEL: shuf_12zzuuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    extrq {{.*#+}} xmm0 = xmm0[2,3,4,5],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) {
+; ALL-LABEL: shuf_012zuuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
+; BTVER1-LABEL: shuf_0zzz1zzz:
+; BTVER1:       # BB#0:
+; BTVER1-NEXT:    movaps %xmm0, %xmm1
+; BTVER1-NEXT:    extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT:    extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT:    retq
+;
+; BTVER2-LABEL: shuf_0zzz1zzz:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BTVER2-NEXT:    retq
+  %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8>
+  ret <8 x i16> %s
+}
+
+define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
+; BTVER1-LABEL: shuf_0z1z:
+; BTVER1:       # BB#0:
+; BTVER1-NEXT:    pxor %xmm1, %xmm1
+; BTVER1-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; BTVER1-NEXT:    retq
+;
+; BTVER2-LABEL: shuf_0z1z:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; BTVER2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+  ret <4 x i32> %s
+}
+
+;
+; INSERTQI
+;
+
+define <16 x i8> @shuf_0_0_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: shuf_0_0_2_3_uuuu_uuuu_uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 0, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_0_16_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: shuf_0_16_2_3_uuuu_uuuu_uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_16_1_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: shuf_16_1_2_3_uuuu_uuuu_uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    insertq {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %s
+}
+
+define <8 x i16> @shuf_0823uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0823uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1],xmm0[4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0183uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0183uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[0,1],xmm0[6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0128uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0128uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[0,1],xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0893uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0893uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_089Auuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_089Auuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3,4,5],xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_089uuuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %s
+}