diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5145731f623..3bde9918793 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11347,7 +11347,8 @@ static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0, return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask()); } -// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat. +// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat, +// or turn a shuffle of a single concat into simpler shuffle then concat. static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -11361,6 +11362,18 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements(); unsigned NumConcats = NumElts / NumElemsPerConcat; + // Special case: shuffle(concat(A,B)) can be more efficiently represented + // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high + // half vector elements. + if (NumElemsPerConcat * 2 == NumElts && N1.getOpcode() == ISD::UNDEF && + std::all_of(SVN->getMask().begin() + NumElemsPerConcat, + SVN->getMask().end(), [](int i) { return i == -1; })) { + N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1), + ArrayRef<int>(SVN->getMask().begin(), NumElemsPerConcat)); + N1 = DAG.getUNDEF(ConcatVT); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1); + } + // Look at every vector that's inserted. 
We're looking for exact // subvector-sized copies from a concatenated vector for (unsigned I = 0; I != NumConcats; ++I) { diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index 77903da3558..e4bd4c4f817 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1849,3 +1849,45 @@ define <8 x float> @splat_v8f32(<4 x float> %r) { %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer ret <8 x float> %1 } + +define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) { +; ALL-LABEL: concat_v2f32_1: +; ALL: # BB#0: # %entry +; ALL-NEXT: vmovq (%rdi), %xmm0 +; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp74 = load <2 x float>* %tmp65, align 8 + %tmp72 = load <2 x float>* %tmp64, align 8 + %tmp73 = shufflevector <2 x float> %tmp72, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %tmp75 = shufflevector <2 x float> %tmp74, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %tmp76 = shufflevector <8 x float> %tmp73, <8 x float> %tmp75, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x float> %tmp76 +} + +define <8x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) { +; ALL-LABEL: concat_v2f32_2: +; ALL: # BB#0: # %entry +; ALL-NEXT: vmovq (%rdi), %xmm0 +; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp74 = load <2 x float>* %tmp65, align 8 + %tmp72 = load <2 x float>* %tmp64, align 8 + %tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x float> %tmp76 +} + +define <8x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) { +; ALL-LABEL: concat_v2f32_3: +; ALL: # BB#0: # %entry +; ALL-NEXT: vmovq (%rdi), %xmm0 +; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp74 = load <2 x float>* %tmp65, align 8 + %tmp72 = load <2 x float>* %tmp64, align 8 + %tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x float> %tmp76, <4 x float> 
undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x float> %res +}