diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d64b77712cf..c2f284496d5 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19335,7 +19335,9 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); int Ratio = 16 / Mask.size(); for (unsigned i = 0; i < 16; ++i) { - int M = Ratio * Mask[i / Ratio] + i % Ratio; + int M = Mask[i / Ratio] != SM_SentinelZero + ? Ratio * Mask[i / Ratio] + i % Ratio + : 255; PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8)); } Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input); @@ -19384,8 +19386,9 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, /// combine-ordering. To fix this, we should do the redundant instruction /// combining in this recursive walk. static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, - ArrayRef IncomingMask, int Depth, - bool HasPSHUFB, SelectionDAG &DAG, + ArrayRef RootMask, + int Depth, bool HasPSHUFB, + SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { // Bound the depth of our recursive combine because this is ultimately @@ -19421,28 +19424,44 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, assert(VT.getVectorNumElements() == OpMask.size() && "Different mask size from vector size!"); + assert(((RootMask.size() > OpMask.size() && + RootMask.size() % OpMask.size() == 0) || + (OpMask.size() > RootMask.size() && + OpMask.size() % RootMask.size() == 0) || + OpMask.size() == RootMask.size()) && + "The smaller number of elements must divide the larger."); + int RootRatio = std::max(1, OpMask.size() / RootMask.size()); + int OpRatio = std::max(1, RootMask.size() / OpMask.size()); + assert(((RootRatio == 1 && OpRatio == 1) || + (RootRatio == 1) != (OpRatio == 1)) && + "Must not have a ratio for both incoming and op masks!"); SmallVector Mask; - Mask.reserve(std::max(OpMask.size(), IncomingMask.size())); + Mask.reserve(std::max(OpMask.size(), RootMask.size())); - // Merge this shuffle operation's mask into our accumulated mask. This is - // a bit tricky as the shuffle may have a different size from the root. - if (OpMask.size() == IncomingMask.size()) { - for (int M : IncomingMask) - Mask.push_back(OpMask[M]); - } else if (OpMask.size() < IncomingMask.size()) { - assert(IncomingMask.size() % OpMask.size() == 0 && - "The smaller number of elements must divide the larger."); - int Ratio = IncomingMask.size() / OpMask.size(); - for (int M : IncomingMask) - Mask.push_back(Ratio * OpMask[M / Ratio] + M % Ratio); - } else { - assert(OpMask.size() > IncomingMask.size() && "All other cases handled!"); - assert(OpMask.size() % IncomingMask.size() == 0 && - "The smaller number of elements must divide the larger."); - int Ratio = OpMask.size() / IncomingMask.size(); - for (int i = 0, e = OpMask.size(); i < e; ++i) - Mask.push_back(OpMask[Ratio * IncomingMask[i / Ratio] + i % Ratio]); + // Merge this shuffle operation's mask into our accumulated mask. Note that + // this shuffle's mask will be the first applied to the input, followed by the + // root mask to get us all the way to the root value arrangement. The reason + // for this order is that we are recursing up the operation chain. + for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { + int RootIdx = i / RootRatio; + if (RootMask[RootIdx] == SM_SentinelZero) { + // This is a zero-ed lane, we're done. + Mask.push_back(SM_SentinelZero); + continue; + } + + int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; + int OpIdx = RootMaskedIdx / OpRatio; + if (OpMask[OpIdx] == SM_SentinelZero) { + // The incoming lanes are zero, it doesn't matter which ones we are using. + Mask.push_back(SM_SentinelZero); + continue; + } + + // Ok, we have non-zero lanes, map them through. + Mask.push_back(OpMask[OpIdx] * OpRatio + + RootMaskedIdx % OpRatio); } // See if we can recurse into the operand to combine more things. diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index 913389382fa..6f49a03cb8b 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -313,3 +313,15 @@ entry: ret <16 x i8> %s.12.4 } + +define <16 x i8> @PR20540(<8 x i8> %a) { +; SSSE3-LABEL: @PR20540 +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0] +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq + %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle +}