Improve DAG combine pass on certain IR vector patterns

Loading two 2x32-bit float vectors into the bottom half of a 256-bit vector
produced suboptimal code in AVX2 mode with certain IR combinations.

In particular, the IR optimizer folded the two-step pattern 2f32 + 2f32 -> 4f32,
4f32 + 4f32 (undef) -> 8f32 into a single 2f32 + 2f32 -> 8f32 shuffle, which
seems more canonical, but the backend then generated surprisingly bad code for
it: the expected movq/movhpd combination no longer matched.
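
The fold itself is sound. As a minimal standalone sketch (the helper name and
mask values below are purely illustrative, not compiler code), composing the two
shuffle masks shows why the two-step and one-step forms describe the same result:

// Compose two shufflevector masks under the assumption that the outer
// shuffle's second operand is undef, so any index that reaches past the
// inner shuffle's result maps to undef (-1). This mirrors how the
// 2f32 + 2f32 -> 4f32 followed by 4f32 + 4f32 (undef) -> 8f32 pair folds
// into a single 2f32 + 2f32 -> 8f32 shuffle.
#include <cassert>
#include <vector>

static std::vector<int> composeMasks(const std::vector<int> &Inner,
                                     const std::vector<int> &Outer) {
  std::vector<int> Folded;
  for (int Idx : Outer)
    Folded.push_back((Idx < 0 || Idx >= (int)Inner.size()) ? -1 : Inner[Idx]);
  return Folded;
}

int main() {
  std::vector<int> Inner = {0, 1, 2, 3};                 // 2f32 + 2f32 -> 4f32
  std::vector<int> Outer = {0, 1, 2, 3, -1, -1, -1, -1}; // 4f32 + 4f32 (undef) -> 8f32
  // The folded mask selects the same lanes directly from the two 2f32 inputs.
  assert(composeMasks(Inner, Outer) ==
         std::vector<int>({0, 1, 2, 3, -1, -1, -1, -1})); // 2f32 + 2f32 -> 8f32
  return 0;
}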

The problem lay in the BUILD_VECTOR optimization path. The 2f32 inputs
would get promoted to 4f32 by the type legalizer, eventually resulting
in a BUILD_VECTOR of two 4f32 values into an 8f32. The BUILD_VECTOR
combine, recognizing that each input was half the output size, concatenated
them and then produced a shuffle. However, the resulting concat + shuffle
was more complex than it needed to be: in the case where the upper half of
the output is undef, we want to generate shuffle + concat instead.
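
Concretely, the rewrite only applies when no shuffle mask element populates the
upper half of the wide result. Here is a minimal standalone sketch of that
eligibility test (the function name and masks are made up for illustration; it
mirrors the std::all_of check added in the diff below):

// Standalone sketch of the eligibility test used by the combine below:
// shuffle(concat(A,B), undef) can become concat(shuffle(A,B), undef) only
// when every mask element for the upper half of the wide result is undef (-1).
#include <algorithm>
#include <cassert>
#include <vector>

static bool upperHalfIsUndef(const std::vector<int> &Mask,
                             unsigned NumElemsPerConcat) {
  assert(Mask.size() == 2 * NumElemsPerConcat && "expected a full-width mask");
  return std::all_of(Mask.begin() + NumElemsPerConcat, Mask.end(),
                     [](int M) { return M == -1; });
}

int main() {
  // v8f32 shuffle of concat(v4f32 A, v4f32 B): result lanes 0-1 come from A,
  // lanes 2-3 from B, and the upper four lanes are undef -- eligible for the
  // shuffle + concat form.
  assert(upperHalfIsUndef({0, 1, 4, 5, -1, -1, -1, -1}, 4));
  // A mask that writes into the upper half is not eligible.
  assert(!upperHalfIsUndef({0, 1, 4, 5, 0, 1, 4, 5}, 4));
  return 0;
}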

This enhancement causes the vector_shuffle combine step to recognize this
suboptimal pattern and correct it. I included it there instead of in BUILD_VECTOR
in case the same suboptimal pattern occurs for other reasons.

This results in the backend producing the optimal movq + movhpd sequence
for all three variations of this IR, even with AVX2.

I've included test cases for all three variations.

Radar link: rdar://problem/19287012
Fix for PR 21943.

From: Fiona Glaser <fglaser@apple.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226360 91177308-0d34-0410-b5e6-96231b3b80d8

@@ -11347,7 +11347,8 @@ static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
   return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());
 }
 
-// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat.
+// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
+// or turn a shuffle of a single concat into simpler shuffle then concat.
 static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
@@ -11361,6 +11362,18 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
   unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
   unsigned NumConcats = NumElts / NumElemsPerConcat;
 
+  // Special case: shuffle(concat(A,B)) can be more efficiently represented
+  // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
+  // half vector elements.
+  if (NumElemsPerConcat * 2 == NumElts && N1.getOpcode() == ISD::UNDEF &&
+      std::all_of(SVN->getMask().begin() + NumElemsPerConcat,
+                  SVN->getMask().end(), [](int i) { return i == -1; })) {
+    N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1),
+                              ArrayRef<int>(SVN->getMask().begin(), NumElemsPerConcat));
+    N1 = DAG.getUNDEF(ConcatVT);
+    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
+  }
+
   // Look at every vector that's inserted. We're looking for exact
   // subvector-sized copies from a concatenated vector
   for (unsigned I = 0; I != NumConcats; ++I) {

@@ -1849,3 +1849,45 @@ define <8 x float> @splat_v8f32(<4 x float> %r) {
   %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
   ret <8 x float> %1
 }
+
+define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
+; ALL-LABEL: concat_v2f32_1:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: vmovq (%rdi), %xmm0
+; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT: retq
+entry:
+  %tmp74 = load <2 x float>* %tmp65, align 8
+  %tmp72 = load <2 x float>* %tmp64, align 8
+  %tmp73 = shufflevector <2 x float> %tmp72, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp75 = shufflevector <2 x float> %tmp74, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp76 = shufflevector <8 x float> %tmp73, <8 x float> %tmp75, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %tmp76
+}
+
+define <8x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) {
+; ALL-LABEL: concat_v2f32_2:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: vmovq (%rdi), %xmm0
+; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT: retq
+entry:
+  %tmp74 = load <2 x float>* %tmp65, align 8
+  %tmp72 = load <2 x float>* %tmp64, align 8
+  %tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %tmp76
+}
+
+define <8x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) {
+; ALL-LABEL: concat_v2f32_3:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: vmovq (%rdi), %xmm0
+; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT: retq
+entry:
+  %tmp74 = load <2 x float>* %tmp65, align 8
+  %tmp72 = load <2 x float>* %tmp64, align 8
+  %tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res = shufflevector <4 x float> %tmp76, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %res
+}