diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index cd343192038..6255cce0d2b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7888,40 +7888,39 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1, // [ 5, 6, 7, zz, zz, zz, zz, zz] // [ -1, 5, 6, 7, zz, zz, zz, zz] // [ 1, 2, -1, -1, -1, -1, zz, zz] - auto MatchByteShift = [&](int Shift) -> SDValue { - bool MatchLeft = true, MatchRight = true; - for (int l = 0; l < NumElts; l += NumLaneElts) { - for (int i = 0; i < Shift; ++i) - MatchLeft &= Zeroable[l + i]; - for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i) - MatchRight &= Zeroable[l + i]; - } - if (!(MatchLeft || MatchRight)) - return SDValue(); - bool MatchV1 = true, MatchV2 = true; + auto CheckZeros = [&](int Shift, bool LeftShift) { + for (int l = 0; l < NumElts; l += NumLaneElts) + for (int i = 0; i < Shift; ++i) + if (!Zeroable[l + i + (LeftShift ? 0 : (NumLaneElts - Shift))]) + return false; + + return true; + }; + + auto MatchByteShift = [&](int Shift, bool LeftShift, SDValue V) { for (int l = 0; l < NumElts; l += NumLaneElts) { - unsigned Pos = MatchLeft ? Shift + l : l; - unsigned Low = MatchLeft ? l : Shift + l; + unsigned Pos = LeftShift ? Shift + l : l; + unsigned Low = LeftShift ? l : Shift + l; unsigned Len = NumLaneElts - Shift; - MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low); - MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts); + if (!isSequentialOrUndefInRange(Mask, Pos, Len, + Low + (V == V1 ? 0 : NumElts))) + return SDValue(); } - if (!(MatchV1 || MatchV2)) - return SDValue(); int ByteShift = Shift * Scale; - unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ; - SDValue V = MatchV1 ? V1 : V2; + unsigned Op = LeftShift ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V); - V = DAG.getNode(Op, DL, ShiftVT, V, - DAG.getConstant(ByteShift, MVT::i8)); + V = DAG.getNode(Op, DL, ShiftVT, V, DAG.getConstant(ByteShift, MVT::i8)); return DAG.getNode(ISD::BITCAST, DL, VT, V); }; for (int Shift = 1; Shift < NumLaneElts; ++Shift) - if (SDValue S = MatchByteShift(Shift)) - return S; + for (bool LeftShift : {true, false}) + if (CheckZeros(Shift, LeftShift)) + for (SDValue V : {V1, V2}) + if (SDValue S = MatchByteShift(Shift, LeftShift, V)) + return S; // no match return SDValue(); diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 9918d5e1d86..2d833467918 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -2146,3 +2146,35 @@ define <8 x i16> @shuffle_v8i16_0123456z(<8 x i16> %a) { %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } + +define <8 x i16> @shuffle_v8i16_fu3ucc5u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_fu3ucc5u: +; SSE2: # BB#0: +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_fu3ucc5u: +; SSSE3: # BB#0: +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_fu3ucc5u: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_fu3ucc5u: +; AVX: # BB#0: +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> + ret <8 x i16> %shuffle +}