diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 6e2f9f4851d..cdf8d0d40f8 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -2710,38 +2710,6 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { return Mask; } -/// isPSHUFHW_PSHUFLWMask - true if the specified VECTOR_SHUFFLE operand -/// specifies a 8 element shuffle that can be broken into a pair of -/// PSHUFHW and PSHUFLW. -static bool isPSHUFHW_PSHUFLWMask(SDNode *N) { - assert(N->getOpcode() == ISD::BUILD_VECTOR); - - if (N->getNumOperands() != 8) - return false; - - // Lower quadword shuffled. - for (unsigned i = 0; i != 4; ++i) { - SDValue Arg = N->getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - assert(isa(Arg) && "Invalid VECTOR_SHUFFLE mask!"); - unsigned Val = cast(Arg)->getZExtValue(); - if (Val >= 4) - return false; - } - - // Upper quadword shuffled. - for (unsigned i = 4; i != 8; ++i) { - SDValue Arg = N->getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - assert(isa(Arg) && "Invalid VECTOR_SHUFFLE mask!"); - unsigned Val = cast(Arg)->getZExtValue(); - if (Val < 4 || Val > 7) - return false; - } - - return true; -} - /// CommuteVectorShuffle - Swap vector_shuffle operands as well as /// values in ther permute mask. static SDValue CommuteVectorShuffle(SDValue Op, SDValue &V1, @@ -3556,262 +3524,382 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return SDValue(); } +// v8i16 shuffles - Prefer shuffles in the following order: +// 1. [all] pshuflw, pshufhw, optional move +// 2. [ssse3] 1 x pshufb +// 3. [ssse3] 2 x pshufb + 1 x por +// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) static SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2, SDValue PermMask, SelectionDAG &DAG, - TargetLowering &TLI, DebugLoc dl) { - SDValue NewV; - MVT MaskVT = MVT::getIntVectorWithNumElements(8); - MVT MaskEVT = MaskVT.getVectorElementType(); - MVT PtrVT = TLI.getPointerTy(); + X86TargetLowering &TLI, DebugLoc dl) { SmallVector MaskElts(PermMask.getNode()->op_begin(), PermMask.getNode()->op_end()); + SmallVector MaskVals; - // First record which half of which vector the low elements come from. - SmallVector LowQuad(4); - for (unsigned i = 0; i < 4; ++i) { + // Determine if more than 1 of the words in each of the low and high quadwords + // of the result come from the same quadword of one of the two inputs. Undef + // mask values count as coming from any quadword, for better codegen. + SmallVector LoQuad(4); + SmallVector HiQuad(4); + BitVector InputQuads(4); + for (unsigned i = 0; i < 8; ++i) { + SmallVectorImpl &Quad = i < 4 ? LoQuad : HiQuad; SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) + int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 : + cast(Elt)->getZExtValue(); + MaskVals.push_back(EltIdx); + if (EltIdx < 0) { + ++Quad[0]; + ++Quad[1]; + ++Quad[2]; + ++Quad[3]; continue; - unsigned EltIdx = cast(Elt)->getZExtValue(); - int QuadIdx = EltIdx / 4; - ++LowQuad[QuadIdx]; + } + ++Quad[EltIdx / 4]; + InputQuads.set(EltIdx / 4); } - int BestLowQuad = -1; + int BestLoQuad = -1; unsigned MaxQuad = 1; for (unsigned i = 0; i < 4; ++i) { - if (LowQuad[i] > MaxQuad) { - BestLowQuad = i; - MaxQuad = LowQuad[i]; + if (LoQuad[i] > MaxQuad) { + BestLoQuad = i; + MaxQuad = LoQuad[i]; } } - // Record which half of which vector the high elements come from. - SmallVector HighQuad(4); - for (unsigned i = 4; i < 8; ++i) { - SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) - continue; - unsigned EltIdx = cast(Elt)->getZExtValue(); - int QuadIdx = EltIdx / 4; - ++HighQuad[QuadIdx]; - } - - int BestHighQuad = -1; + int BestHiQuad = -1; MaxQuad = 1; for (unsigned i = 0; i < 4; ++i) { - if (HighQuad[i] > MaxQuad) { - BestHighQuad = i; - MaxQuad = HighQuad[i]; + if (HiQuad[i] > MaxQuad) { + BestHiQuad = i; + MaxQuad = HiQuad[i]; } } - // If it's possible to sort parts of either half with PSHUF{H|L}W, then do it. - if (BestLowQuad != -1 || BestHighQuad != -1) { - // First sort the 4 chunks in order using shufpd. - SmallVector MaskVec; + // For SSSE3, If all 8 words of the result come from only 1 quadword of each + // of the two input vectors, shuffle them into one input vector so only a + // single pshufb instruction is necessary. If There are more than 2 input + // quads, disable the next transformation since it does not help SSSE3. + bool V1Used = InputQuads[0] || InputQuads[1]; + bool V2Used = InputQuads[2] || InputQuads[3]; + if (TLI.getSubtarget()->hasSSSE3()) { + if (InputQuads.count() == 2 && V1Used && V2Used) { + BestLoQuad = InputQuads.find_first(); + BestHiQuad = InputQuads.find_next(BestLoQuad); + } + if (InputQuads.count() > 2) { + BestLoQuad = -1; + BestHiQuad = -1; + } + } - if (BestLowQuad != -1) - MaskVec.push_back(DAG.getConstant(BestLowQuad, MVT::i32)); - else - MaskVec.push_back(DAG.getConstant(0, MVT::i32)); - - if (BestHighQuad != -1) - MaskVec.push_back(DAG.getConstant(BestHighQuad, MVT::i32)); - else - MaskVec.push_back(DAG.getConstant(1, MVT::i32)); - - SDValue Mask= DAG.getBUILD_VECTOR(MVT::v2i32, dl, &MaskVec[0],2); + // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update + // the shuffle mask. If a quad is scored as -1, that means that it contains + // words from all 4 input quadwords. + SDValue NewV; + if (BestLoQuad >= 0 || BestHiQuad >= 0) { + SmallVector MaskV; + MaskV.push_back(DAG.getConstant(BestLoQuad < 0 ? 0 : BestLoQuad, MVT::i64)); + MaskV.push_back(DAG.getConstant(BestHiQuad < 0 ? 1 : BestHiQuad, MVT::i64)); + SDValue Mask = DAG.getBUILD_VECTOR(MVT::v2i64, dl, &MaskV[0], 2); + NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2i64, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), Mask); + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), Mask); NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); - // Now sort high and low parts separately. - BitVector InOrder(8); - if (BestLowQuad != -1) { - // Sort lower half in order using PSHUFLW. - MaskVec.clear(); - bool AnyOutOrder = false; - - for (unsigned i = 0; i != 4; ++i) { - SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) { - MaskVec.push_back(Elt); - InOrder.set(i); - } else { - unsigned EltIdx = cast(Elt)->getZExtValue(); - if (EltIdx != i) - AnyOutOrder = true; - - MaskVec.push_back(DAG.getConstant(EltIdx % 4, MaskEVT)); - - // If this element is in the right place after this shuffle, then - // remember it. - if ((int)(EltIdx / 4) == BestLowQuad) - InOrder.set(i); - } - } - if (AnyOutOrder) { - for (unsigned i = 4; i != 8; ++i) - MaskVec.push_back(DAG.getConstant(i, MaskEVT)); - SDValue Mask = DAG.getBUILD_VECTOR(MaskVT, dl, &MaskVec[0], 8); - NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, - NewV, NewV, Mask); - } - } - - if (BestHighQuad != -1) { - // Sort high half in order using PSHUFHW if possible. - MaskVec.clear(); - - for (unsigned i = 0; i != 4; ++i) - MaskVec.push_back(DAG.getConstant(i, MaskEVT)); - - bool AnyOutOrder = false; - for (unsigned i = 4; i != 8; ++i) { - SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) { - MaskVec.push_back(Elt); - InOrder.set(i); - } else { - unsigned EltIdx = cast(Elt)->getZExtValue(); - if (EltIdx != i) - AnyOutOrder = true; - - MaskVec.push_back(DAG.getConstant((EltIdx % 4) + 4, MaskEVT)); - - // If this element is in the right place after this shuffle, then - // remember it. - if ((int)(EltIdx / 4) == BestHighQuad) - InOrder.set(i); - } - } - - if (AnyOutOrder) { - SDValue Mask = DAG.getBUILD_VECTOR(MaskVT, dl, &MaskVec[0], 8); - NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, - NewV, NewV, Mask); - } - } - - // The other elements are put in the right place using pextrw and pinsrw. + // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the + // source words for the shuffle, to aid later transformations. + bool AllWordsInNewV = true; for (unsigned i = 0; i != 8; ++i) { - if (InOrder[i]) + int idx = MaskVals[i]; + if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) continue; - SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) - continue; - unsigned EltIdx = cast(Elt)->getZExtValue(); - SDValue ExtOp = (EltIdx < 8) - ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, - DAG.getConstant(EltIdx, PtrVT)) - : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, - DAG.getConstant(EltIdx - 8, PtrVT)); - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, - DAG.getConstant(i, PtrVT)); + AllWordsInNewV = false; + break; } - return NewV; - } - - // PSHUF{H|L}W are not used. Lower into extracts and inserts but try to use as - // few as possible. First, let's find out how many elements are already in the - // right order. - unsigned V1InOrder = 0; - unsigned V1FromV1 = 0; - unsigned V2InOrder = 0; - unsigned V2FromV2 = 0; - SmallVector V1Elts; - SmallVector V2Elts; - for (unsigned i = 0; i < 8; ++i) { - SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) { - V1Elts.push_back(Elt); - V2Elts.push_back(Elt); - ++V1InOrder; - ++V2InOrder; - continue; - } - unsigned EltIdx = cast(Elt)->getZExtValue(); - if (EltIdx == i) { - V1Elts.push_back(Elt); - V2Elts.push_back(DAG.getConstant(i+8, MaskEVT)); - ++V1InOrder; - } else if (EltIdx == i+8) { - V1Elts.push_back(Elt); - V2Elts.push_back(DAG.getConstant(i, MaskEVT)); - ++V2InOrder; - } else if (EltIdx < 8) { - V1Elts.push_back(Elt); - V2Elts.push_back(DAG.getConstant(EltIdx+8, MaskEVT)); - ++V1FromV1; - } else { - V1Elts.push_back(Elt); - V2Elts.push_back(DAG.getConstant(EltIdx-8, MaskEVT)); - ++V2FromV2; - } - } - - if (V2InOrder > V1InOrder) { - PermMask = CommuteVectorShuffleMask(PermMask, DAG, dl); - std::swap(V1, V2); - std::swap(V1Elts, V2Elts); - std::swap(V1FromV1, V2FromV2); - } - - if ((V1FromV1 + V1InOrder) != 8) { - // Some elements are from V2. - if (V1FromV1) { - // If there are elements that are from V1 but out of place, - // then first sort them in place - SmallVector MaskVec; - for (unsigned i = 0; i < 8; ++i) { - SDValue Elt = V1Elts[i]; - if (Elt.getOpcode() == ISD::UNDEF) { - MaskVec.push_back(DAG.getUNDEF(MaskEVT)); + bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; + if (AllWordsInNewV) { + for (int i = 0; i != 8; ++i) { + int idx = MaskVals[i]; + if (idx < 0) continue; - } - unsigned EltIdx = cast(Elt)->getZExtValue(); - if (EltIdx >= 8) - MaskVec.push_back(DAG.getUNDEF(MaskEVT)); - else - MaskVec.push_back(DAG.getConstant(EltIdx, MaskEVT)); + idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; + if ((idx != i) && idx < 4) + pshufhw = false; + if ((idx != i) && idx > 3) + pshuflw = false; } - SDValue Mask = DAG.getBUILD_VECTOR(MaskVT, dl, &MaskVec[0], 8); - V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, V1, V1, Mask); + V1 = NewV; + V2Used = false; + BestLoQuad = 0; + BestHiQuad = 1; } - NewV = V1; - for (unsigned i = 0; i < 8; ++i) { - SDValue Elt = V1Elts[i]; - if (Elt.getOpcode() == ISD::UNDEF) - continue; - unsigned EltIdx = cast(Elt)->getZExtValue(); - if (EltIdx < 8) - continue; - SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, - DAG.getConstant(EltIdx - 8, PtrVT)); - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, - DAG.getConstant(i, PtrVT)); + // If we've eliminated the use of V2, and the new mask is a pshuflw or + // pshufhw, that's as cheap as it gets. Return the new shuffle. + if (pshufhw || pshuflw) { + MaskV.clear(); + for (unsigned i = 0; i != 8; ++i) + MaskV.push_back((MaskVals[i] < 0) ? DAG.getUNDEF(MVT::i16) + : DAG.getConstant(MaskVals[i], + MVT::i16)); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV, + DAG.getUNDEF(MVT::v8i16), + DAG.getBUILD_VECTOR(MVT::v8i16, dl, &MaskV[0], 8)); } - return NewV; - } else { - // All elements are from V1. - NewV = V1; - for (unsigned i = 0; i < 8; ++i) { - SDValue Elt = V1Elts[i]; - if (Elt.getOpcode() == ISD::UNDEF) - continue; - unsigned EltIdx = cast(Elt)->getZExtValue(); - SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, - DAG.getConstant(EltIdx, PtrVT)); - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, - DAG.getConstant(i, PtrVT)); - } - return NewV; } + + // If we have SSSE3, and all words of the result are from 1 input vector, + // case 2 is generated, otherwise case 3 is generated. If no SSSE3 + // is present, fall back to case 4. + if (TLI.getSubtarget()->hasSSSE3()) { + SmallVector pshufbMask; + + // If we have elements from both input vectors, set the high bit of the + // shuffle mask element to zero out elements that come from V2 in the V1 + // mask, and elements that come from V1 in the V2 mask, so that the two + // results can be OR'd together. + bool TwoInputs = V1Used && V2Used; + for (unsigned i = 0; i != 8; ++i) { + int EltIdx = MaskVals[i] * 2; + if (TwoInputs && (EltIdx >= 16)) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + continue; + } + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); + } + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); + V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, + DAG.getBUILD_VECTOR(MVT::v16i8, dl, &pshufbMask[0], 16)); + if (!TwoInputs) + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); + + // Calculate the shuffle mask for the second input, shuffle it, and + // OR it with the first shuffled input. + pshufbMask.clear(); + for (unsigned i = 0; i != 8; ++i) { + int EltIdx = MaskVals[i] * 2; + if (EltIdx < 16) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + continue; + } + pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); + } + V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); + V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, + DAG.getBUILD_VECTOR(MVT::v16i8, dl, &pshufbMask[0], 16)); + V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); + } + + // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, + // and update MaskVals with new element order. + BitVector InOrder(8); + if (BestLoQuad >= 0) { + SmallVector MaskV; + for (int i = 0; i != 4; ++i) { + int idx = MaskVals[i]; + if (idx < 0) { + MaskV.push_back(DAG.getUNDEF(MVT::i16)); + InOrder.set(i); + } else if ((idx / 4) == BestLoQuad) { + MaskV.push_back(DAG.getConstant(idx & 3, MVT::i16)); + InOrder.set(i); + } else { + MaskV.push_back(DAG.getUNDEF(MVT::i16)); + } + } + for (unsigned i = 4; i != 8; ++i) + MaskV.push_back(DAG.getConstant(i, MVT::i16)); + NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV, + DAG.getUNDEF(MVT::v8i16), + DAG.getBUILD_VECTOR(MVT::v8i16, dl, &MaskV[0], 8)); + } + + // If BestHi >= 0, generate a pshufhw to put the high elements in order, + // and update MaskVals with the new element order. + if (BestHiQuad >= 0) { + SmallVector MaskV; + for (unsigned i = 0; i != 4; ++i) + MaskV.push_back(DAG.getConstant(i, MVT::i16)); + for (unsigned i = 4; i != 8; ++i) { + int idx = MaskVals[i]; + if (idx < 0) { + MaskV.push_back(DAG.getUNDEF(MVT::i16)); + InOrder.set(i); + } else if ((idx / 4) == BestHiQuad) { + MaskV.push_back(DAG.getConstant((idx & 3) + 4, MVT::i16)); + InOrder.set(i); + } else { + MaskV.push_back(DAG.getUNDEF(MVT::i16)); + } + } + NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV, + DAG.getUNDEF(MVT::v8i16), + DAG.getBUILD_VECTOR(MVT::v8i16, dl, &MaskV[0], 8)); + } + + // In case BestHi & BestLo were both -1, which means each quadword has a word + // from each of the four input quadwords, calculate the InOrder bitvector now + // before falling through to the insert/extract cleanup. + if (BestLoQuad == -1 && BestHiQuad == -1) { + NewV = V1; + for (int i = 0; i != 8; ++i) + if (MaskVals[i] < 0 || MaskVals[i] == i) + InOrder.set(i); + } + + // The other elements are put in the right place using pextrw and pinsrw. + for (unsigned i = 0; i != 8; ++i) { + if (InOrder[i]) + continue; + int EltIdx = MaskVals[i]; + if (EltIdx < 0) + continue; + SDValue ExtOp = (EltIdx < 8) + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, + DAG.getIntPtrConstant(EltIdx)) + : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, + DAG.getIntPtrConstant(EltIdx - 8)); + NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, + DAG.getIntPtrConstant(i)); + } + return NewV; +} + +// v16i8 shuffles - Prefer shuffles in the following order: +// 1. [ssse3] 1 x pshufb +// 2. [ssse3] 2 x pshufb + 1 x por +// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw +static +SDValue LowerVECTOR_SHUFFLEv16i8(SDValue V1, SDValue V2, + SDValue PermMask, SelectionDAG &DAG, + X86TargetLowering &TLI, DebugLoc dl) { + SmallVector MaskElts(PermMask.getNode()->op_begin(), + PermMask.getNode()->op_end()); + SmallVector MaskVals; + + // If we have SSSE3, case 1 is generated when all result bytes come from + // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is + // present, fall back to case 3. + // FIXME: kill V2Only once shuffles are canonizalized by getNode. + bool V1Only = true; + bool V2Only = true; + for (unsigned i = 0; i < 16; ++i) { + SDValue Elt = MaskElts[i]; + int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 : + cast(Elt)->getZExtValue(); + MaskVals.push_back(EltIdx); + if (EltIdx < 0) + continue; + if (EltIdx < 16) + V2Only = false; + else + V1Only = false; + } + + // If SSSE3, use 1 pshufb instruction per vector with elements in the result. + if (TLI.getSubtarget()->hasSSSE3()) { + SmallVector pshufbMask; + + // If all result elements are from one input vector, then only translate + // undef mask values to 0x80 (zero out result) in the pshufb mask. + // + // Otherwise, we have elements from both input vectors, and must zero out + // elements that come from V2 in the first mask, and V1 in the second mask + // so that we can OR them together. + bool TwoInputs = !(V1Only || V2Only); + for (unsigned i = 0; i != 16; ++i) { + int EltIdx = MaskVals[i]; + if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + continue; + } + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); + } + // If all the elements are from V2, assign it to V1 and return after + // building the first pshufb. + if (V2Only) + V1 = V2; + V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, + DAG.getBUILD_VECTOR(MVT::v16i8, dl, &pshufbMask[0], 16)); + if (!TwoInputs) + return V1; + + // Calculate the shuffle mask for the second input, shuffle it, and + // OR it with the first shuffled input. + pshufbMask.clear(); + for (unsigned i = 0; i != 16; ++i) { + int EltIdx = MaskVals[i]; + if (EltIdx < 16) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + continue; + } + pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); + } + V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, + DAG.getBUILD_VECTOR(MVT::v16i8, dl, &pshufbMask[0], 16)); + return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); + } + + // No SSSE3 - Calculate in place words and then fix all out of place words + // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from + // the 16 different words that comprise the two doublequadword input vectors. + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); + V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); + SDValue NewV = V2Only ? V2 : V1; + for (int i = 0; i != 8; ++i) { + int Elt0 = MaskVals[i*2]; + int Elt1 = MaskVals[i*2+1]; + + // This word of the result is all undef, skip it. + if (Elt0 < 0 && Elt1 < 0) + continue; + + // This word of the result is already in the correct place, skip it. + if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) + continue; + if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) + continue; + + SDValue Elt0Src = Elt0 < 16 ? V1 : V2; + SDValue Elt1Src = Elt1 < 16 ? V1 : V2; + SDValue InsElt; + + // If Elt1 is defined, extract it from the appropriate source. If the + // source byte is not also odd, shift the extracted word left 8 bits. + if (Elt1 >= 0) { + InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, + DAG.getIntPtrConstant(Elt1 / 2)); + if ((Elt1 & 1) == 0) + InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, + DAG.getConstant(8, TLI.getShiftAmountTy())); + } + // If Elt0 is defined, extract it from the appropriate source. If the + // source byte is not also even, shift the extracted word right 8 bits. If + // Elt1 was also defined, OR the extracted values together before + // inserting them in the result. + if (Elt0 >= 0) { + SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, + Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); + if ((Elt0 & 1) != 0) + InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, + DAG.getConstant(8, TLI.getShiftAmountTy())); + InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) + : InsElt0; + } + NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, + DAG.getIntPtrConstant(i)); + } + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); } /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide @@ -4078,6 +4166,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { bool V1IsSplat = false; bool V2IsSplat = false; + // FIXME: Check for legal shuffle and return? + if (isUndefShuffle(Op.getNode())) return DAG.getUNDEF(VT); @@ -4239,6 +4329,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return Op; } + // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. // Try PSHUF* first, then SHUFP*. // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented. @@ -4281,6 +4372,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return NewOp; } + if (VT == MVT::v16i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(V1, V2, PermMask, DAG, *this, dl); + if (NewOp.getNode()) + return NewOp; + } + // Handle all 4 wide cases with a number of shuffles except for MMX. if (NumElems == 4 && !isMMX) return LowerVECTOR_SHUFFLE_4wide(V1, V2, PermMask, VT, DAG, dl); @@ -4435,7 +4532,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) && isa(N2)) { unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB - : X86ISD::PINSRW; + : X86ISD::PINSRW; // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. if (N1.getValueType() != MVT::i32) @@ -6830,6 +6927,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; + case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; @@ -6948,12 +7046,14 @@ bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const { bool X86TargetLowering::isShuffleMaskLegal(SDValue Mask, MVT VT) const { // Only do shuffles on 128-bit vector types for now. + // FIXME: pshufb, blends if (VT.getSizeInBits() == 64) return false; return (Mask.getNode()->getNumOperands() <= 4 || isIdentityMask(Mask.getNode()) || isIdentityMask(Mask.getNode(), true) || isSplatMask(Mask.getNode()) || - isPSHUFHW_PSHUFLWMask(Mask.getNode()) || + X86::isPSHUFHWMask(Mask.getNode()) || + X86::isPSHUFLWMask(Mask.getNode()) || X86::isUNPCKLMask(Mask.getNode()) || X86::isUNPCKHMask(Mask.getNode()) || X86::isUNPCKL_v_undef_Mask(Mask.getNode()) || diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 2149847ff9b..09332930a77 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -176,6 +176,9 @@ namespace llvm { /// corresponds to X86::PINSRW. PINSRW, + /// PSHUFB - Shuffle 16 8-bit values within a vector. + PSHUFB, + /// FMAX, FMIN - Floating point max and min. /// FMAX, FMIN, diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 2b685a704c5..407b4f1b063 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -36,6 +36,9 @@ def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86pshufb : SDNode<"X86ISD::PSHUFB", + SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; def X86pextrb : SDNode<"X86ISD::PEXTRB", SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; def X86pextrw : SDNode<"X86ISD::PEXTRW", @@ -2845,6 +2848,11 @@ let Constraints = "$src1 = $dst" in { imm:$src3))]>, OpSize; } +def : Pat<(X86pshufb VR128:$src, VR128:$mask), + (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>; +def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), + (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>; + //===----------------------------------------------------------------------===// // Non-Instruction Patterns //===----------------------------------------------------------------------===// diff --git a/test/CodeGen/X86/vec_shuffle-12.ll b/test/CodeGen/X86/vec_shuffle-12.ll index aad27ea2f8a..98b455a2351 100644 --- a/test/CodeGen/X86/vec_shuffle-12.ll +++ b/test/CodeGen/X86/vec_shuffle-12.ll @@ -1,8 +1,8 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 > %t +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah > %t ; RUN: not grep punpck %t ; RUN: grep pextrw %t | count 4 ; RUN: grep pinsrw %t | count 6 -; RUN: grep pshuflw %t | count 3 +; RUN: grep pshuflw %t | count 1 ; RUN: grep pshufhw %t | count 2 define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { diff --git a/test/CodeGen/X86/vec_shuffle-13.ll b/test/CodeGen/X86/vec_shuffle-13.ll index 4511e9563a5..61cd1286e75 100644 --- a/test/CodeGen/X86/vec_shuffle-13.ll +++ b/test/CodeGen/X86/vec_shuffle-13.ll @@ -1,7 +1,7 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 > %t +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah > %t ; RUN: grep movlhps %t | count 1 -; RUN: grep movss %t | count 1 ; RUN: grep pshufd %t | count 1 +; RUN: grep movss %t | count 1 ; RUN: grep pshuflw %t | count 1 ; RUN: grep pshufhw %t | count 1 diff --git a/test/CodeGen/X86/vec_shuffle-2.ll b/test/CodeGen/X86/vec_shuffle-2.ll index ae698010681..b0495653102 100644 --- a/test/CodeGen/X86/vec_shuffle-2.ll +++ b/test/CodeGen/X86/vec_shuffle-2.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f ; RUN: grep pshufhw %t | count 1 ; RUN: grep pshuflw %t | count 1 ; RUN: grep movhps %t | count 1 diff --git a/test/CodeGen/X86/vec_shuffle-21.ll b/test/CodeGen/X86/vec_shuffle-21.ll index 5409acf2f7e..eec269159f7 100644 --- a/test/CodeGen/X86/vec_shuffle-21.ll +++ b/test/CodeGen/X86/vec_shuffle-21.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f ; RUN: grep pshuflw %t | count 1 ; RUN: grep pextrw %t | count 2 ; RUN: grep pinsrw %t | count 2 diff --git a/test/CodeGen/X86/vec_shuffle-28.ll b/test/CodeGen/X86/vec_shuffle-28.ll index 0c81e77b95e..f7e5001e64d 100644 --- a/test/CodeGen/X86/vec_shuffle-28.ll +++ b/test/CodeGen/X86/vec_shuffle-28.ll @@ -1,8 +1,12 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f -; RUN: grep punpcklwd %t | count 1 -; RUN: grep pextrw %t | count 6 -; RUN: grep pinsrw %t | count 8 +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep movd %t | count 1 +; RUN: grep pshuflw %t | count 1 +; RUN: grep pinsrw %t | count 1 +; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f +; RUN: grep pshufb %t | count 1 +; FIXME: this test has a superfluous punpcklqdq pre-pshufb currently. +; Don't XFAIL it because it's still better than the previous code. ; Pack various elements via shuffles. define <8 x i16> @shuf1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { @@ -10,24 +14,3 @@ entry: %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > ret <8 x i16> %tmp7 } - - -define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > - ret <8 x i16> %tmp8 -} - - -define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > - ret <8 x i16> %tmp9 -} - - -define <8 x i16> @shuf4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef > - ret <8 x i16> %tmp9 -} diff --git a/test/CodeGen/X86/vec_shuffle-29.ll b/test/CodeGen/X86/vec_shuffle-29.ll index aac63c3bbec..49cb0a72e88 100644 --- a/test/CodeGen/X86/vec_shuffle-29.ll +++ b/test/CodeGen/X86/vec_shuffle-29.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -disable-mmx -o %t -f +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41,-ssse3 -disable-mmx -o %t -f ; RUN: not grep pextrw %t ; RUN: grep pinsrw %t diff --git a/test/CodeGen/X86/vec_shuffle-31.ll b/test/CodeGen/X86/vec_shuffle-31.ll new file mode 100644 index 00000000000..0a9dc1fa7ba --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-31.ll @@ -0,0 +1,13 @@ +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep pextrw %t | count 1 +; RUN: grep punpcklqdq %t | count 1 +; RUN: grep pshufhw %t | count 1 +; RUN: grep pinsrw %t | count 1 +; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f +; RUN: grep pshufb %t | count 1 + +define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +entry: + %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > + ret <8 x i16> %tmp9 +} diff --git a/test/CodeGen/X86/vec_shuffle-32.ll b/test/CodeGen/X86/vec_shuffle-32.ll new file mode 100644 index 00000000000..3a819483262 --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-32.ll @@ -0,0 +1,13 @@ +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep punpcklqdq %t | count 1 +; RUN: grep pextrw %t | count 1 +; RUN: grep pshufd %t | count 1 +; RUN: grep pinsrw %t | count 1 +; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f +; RUN: grep pshufb %t | count 1 + +define <8 x i16> @shuf4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +entry: + %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef > + ret <8 x i16> %tmp9 +} diff --git a/test/CodeGen/X86/vec_shuffle-33.ll b/test/CodeGen/X86/vec_shuffle-33.ll new file mode 100644 index 00000000000..e3d630474bf --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-33.ll @@ -0,0 +1,11 @@ +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep punpcklqdq %t | count 1 +; RUN: grep pshufhw %t | count 1 +; RUN: not grep pextrw %t +; RUN: not grep pinsrw %t + +define <8 x i16> @shuf5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +entry: + %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef > + ret <8 x i16> %tmp9 +} diff --git a/test/CodeGen/X86/vec_shuffle-34.ll b/test/CodeGen/X86/vec_shuffle-34.ll new file mode 100644 index 00000000000..99c95d1623e --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-34.ll @@ -0,0 +1,13 @@ +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep pextrw %t | count 1 +; RUN: grep punpcklqdq %t | count 1 +; RUN: grep pshuflw %t | count 1 +; RUN: grep pinsrw %t | count 1 +; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f +; RUN: grep pshufb %t | count 2 + +define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +entry: + %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > + ret <8 x i16> %tmp8 +} diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll new file mode 100644 index 00000000000..f2b241fa486 --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-35.ll @@ -0,0 +1,20 @@ +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep pextrw %t | count 13 +; RUN: grep pinsrw %t | count 14 +; RUN: grep rolw %t | count 13 +; RUN: not grep esp %t +; RUN: not grep ebp %t +; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f +; RUN: grep pshufb %t | count 3 + +define <16 x i8> @shuf1(<16 x i8> %T0) nounwind readnone { +entry: + %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> undef, <16 x i32> < i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 > + ret <16 x i8> %tmp8 +} + +define <16 x i8> @shuf2(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { +entry: + %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> < i32 undef, i32 undef, i32 3, i32 2, i32 17, i32 16, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 > + ret <16 x i8> %tmp8 +}