[DAG] Further improve the logic in DAGCombiner that folds a pair of shuffles into a single shuffle if the resulting mask is legal.

This patch teaches the DAGCombiner how to fold shuffles according to the following new rules: 1. shuffle(shuffle(x, y), undef) -> x 2. shuffle(shuffle(x, y), undef) -> y 3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef) 4. shuffle(shuffle(x, y), undef) -> shuffle(y, undef) The backend avoids to combine shuffles according to rules 3. and 4. if the resulting shuffle does not have a legal mask. This is to avoid introducing illegal shuffles that are potentially expanded into a sub-optimal sequence of target specific dag nodes during vector legalization. Added test case combine-vec-shuffle-2.ll to verify that we correctly triggers the new rules when combining shuffles. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@212748 91177308-0d34-0410-b5e6-96231b3b80d8
2025-03-28 21:38:44 +00:00 · 2014-07-10 18:04:55 +00:00 · 2014-07-10 18:04:55 +00:00 · 3996e5290a
commit 3996e5290a
parent 60079c18bc
3 changed files with 217 additions and 14 deletions
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@ -10677,17 +10677,12 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
  }

  // If this shuffle node is simply a swizzle of another shuffle node,
-  // and it reverses the swizzle of the previous shuffle then we can
-  // optimize shuffle(shuffle(x, undef), undef) -> x.
+  // then try to simplify it.
  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
      N1.getOpcode() == ISD::UNDEF) {

    ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);

-    // Shuffle nodes can only reverse shuffles with a single non-undef value.
-    if (N0.getOperand(1).getOpcode() != ISD::UNDEF)
-      return SDValue();
-
    // The incoming shuffle must be of the same type as the result of the
    // current shuffle.
    assert(OtherSV->getOperand(0).getValueType() == VT &&
@ -10704,27 +10699,69 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
        Idx = OtherSV->getMaskElt(Idx);
      Mask.push_back(Idx);
    }
+    
+    bool CommuteOperands = false;
+    if (N0.getOperand(1).getOpcode() != ISD::UNDEF) {
+      // To be valid, the combine shuffle mask should only reference elements
+      // from one of the two vectors in input to the inner shufflevector.
+      bool IsValidMask = true;
+      for (unsigned i = 0; i != NumElts && IsValidMask; ++i)
+        // See if the combined mask only reference undefs or elements coming
+        // from the first shufflevector operand.
+        IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] < NumElts;

+      if (!IsValidMask) {
+        IsValidMask = true;
+        for (unsigned i = 0; i != NumElts && IsValidMask; ++i)
+          // Check that all the elements come from the second shuffle operand.
+          IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] >= NumElts;
+        CommuteOperands = IsValidMask;
+      }
+
+      // Early exit if the combined shuffle mask is not valid.
+      if (!IsValidMask)
+        return SDValue();
+    }
+
+    // See if this pair of shuffles can be safely folded according to either
+    // of the following rules:
+    //   shuffle(shuffle(x, y), undef) -> x
+    //   shuffle(shuffle(x, undef), undef) -> x
+    //   shuffle(shuffle(x, y), undef) -> y
    bool IsIdentityMask = true;
+    unsigned BaseMaskIndex = CommuteOperands ? NumElts : 0;
    for (unsigned i = 0; i != NumElts && IsIdentityMask; ++i) {
      // Skip Undefs.
      if (Mask[i] < 0)
        continue;

      // The combined shuffle must map each index to itself.
-      IsIdentityMask = (unsigned)Mask[i] == i;
+      IsIdentityMask = (unsigned)Mask[i] == i + BaseMaskIndex;
    }
-
-    if (IsIdentityMask)
-      // optimize shuffle(shuffle(x, undef), undef) -> x.
+    
+    if (IsIdentityMask) {
+      if (CommuteOperands)
+        // optimize shuffle(shuffle(x, y), undef) -> y.
+        return OtherSV->getOperand(1);
+      
+      // optimize shuffle(shuffle(x, undef), undef) -> x
+      // optimize shuffle(shuffle(x, y), undef) -> x
      return OtherSV->getOperand(0);
+    }

    // It may still be beneficial to combine the two shuffles if the
    // resulting shuffle is legal.
-    //   shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3).
-    if (TLI.isShuffleMaskLegal(Mask, VT))
-      return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1,
+    if (TLI.isShuffleMaskLegal(Mask, VT)) {
+      if (!CommuteOperands)
+        // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3).
+        // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3)
+        return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1,
+                                    &Mask[0]);
+      
+      //   shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(undef, y, M3)
+      return DAG.getVectorShuffle(VT, SDLoc(N), N1, N0->getOperand(1),
                                  &Mask[0]);
+    }
  }

  return SDValue();
--- a/test/CodeGen/X86/combine-vec-shuffle-2.ll
+++ b/test/CodeGen/X86/combine-vec-shuffle-2.ll
@ -0,0 +1,164 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+; Check that DAGCombiner correctly folds the following pairs of shuffles
+; using the following rules:
+;  1. shuffle(shuffle(x, y), undef) -> x
+;  2. shuffle(shuffle(x, y), undef) -> y
+;  3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef)
+;  4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y)
+;
+; Rules 3. and 4. are used only if the resulting shuffle mask is legal.
+
+define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test1
+; Mask: [3,0,0,1]
+; CHECK: pshufd $67
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test2
+; Mask: [2,0,0,3]
+; CHECK: pshufd $-62
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test3
+; Mask: [2,0,0,3]
+; CHECK: pshufd $-62
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test4
+; Mask: [0,0,0,1]
+; CHECK: pshufd $64
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test5
+; Mask: [1,1]
+; CHECK: movhlps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test6
+; Mask: [2,0,0,0]
+; CHECK: pshufd $2
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test7
+; Mask: [0,2,0,2]
+; CHECK: pshufd $-120
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test8
+; Mask: [1,0,3,0]
+; CHECK: pshufd $49
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test9
+; Mask: [1,3,0,2]
+; CHECK: pshufd $-115
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test10
+; Mask: [1,0,1,0]
+; CHECK: pshufd $17
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test11
+; Mask: [1,0,2,1]
+; CHECK: pshufd $97
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test12
+; Mask: [0,0,0,0]
+; CHECK: pshufd $0
+; CHECK-NEXT: ret
+
+
+; The following pair of shuffles is folded into vector %A.
+define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: pshufd
+; CHECK: ret
+
+
+; The following pair of shuffles is folded into vector %B.
+define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: pshufd
+; CHECK: ret
+
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@ -33,7 +33,9 @@ entry:
 define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
 entry:
 ; CHECK-LABEL: shuf3:
-; CHECK: shufps
+; CHECK-NOT: movlhps
+; CHECK-NOT: shufps
+; CHECK: pshufd
  %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 
  %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>