[x86] Teach the new vector shuffle lowering to re-use the SHUFPS
lowering when it can use a symmetric SHUFPS across both 128-bit lanes.

This required making the SHUFPS lowering tolerant of other vector types,
and adjusting our canonicalization to canonicalize harder.

This is the last of the clever uses of symmetry I've thought of for
v8f32. The rest of the tricks I'm aware of here are to work around
asymmetry in the mask.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218216 91177308-0d34-0410-b5e6-96231b3b80d8
Chandler Carruth, 2014-09-21 13:35:14 +00:00
parent 38e181630a, commit 974542d7d8
2 changed files with 28 additions and 15 deletions
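The "symmetric SHUFPS" case the message describes is an 8-element v8f32 mask that repeats the same 4-element pattern in both 128-bit lanes; shuffle_v8f32_08084c4c in the test changes below, with mask <0, 8, 0, 8, 4, 12, 4, 12>, has exactly that shape. A minimal standalone sketch of the symmetry check, assuming the usual convention that mask entries 0-7 name elements of V1, 8-15 name elements of V2, and -1 marks an undef element (the helper name is illustrative, not LLVM's):

#include <array>
#include <cstdio>

// Returns true when the high 128-bit lane of an 8-element shuffle mask
// repeats the low lane's pattern, i.e. Mask[i + 4] picks the same relative
// element (from the same source) as Mask[i].  -1 marks an undef element.
static bool isLaneSymmetric(const std::array<int, 8> &Mask) {
  for (int i = 0; i < 4; ++i) {
    if (Mask[i] < 0 || Mask[i + 4] < 0)
      continue; // undef matches anything
    if (Mask[i + 4] != Mask[i] + 4)
      return false;
  }
  return true;
}

int main() {
  // Mask of shuffle_v8f32_08084c4c: both lanes interleave element 0 of each
  // source, so one vshufps immediate can serve the whole ymm register.
  std::array<int, 8> Symmetric = {0, 8, 0, 8, 4, 12, 4, 12};
  // Mask of shuffle_v8f32_08080808 (untouched by this commit) is not
  // lane-symmetric: its high lane still wants elements 0 and 8 rather than
  // the corresponding high-lane elements 4 and 12.
  std::array<int, 8> Asymmetric = {0, 8, 0, 8, 0, 8, 0, 8};
  std::printf("%d %d\n", isLaneSymmetric(Symmetric), isLaneSymmetric(Asymmetric));
  return 0;
}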


@@ -7800,7 +7800,7 @@ static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
       // To make this work, blend them together as the first step.
       int V1Index = V2AdjIndex;
       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
-      V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
+      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
 
       // Now proceed to reconstruct the final blend as we have the necessary
@@ -7831,7 +7831,7 @@ static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
                           Mask[2] < 4 ? Mask[2] : Mask[3],
                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
-      V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
+      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
 
       // Now we do a normal shuffle of V1 by giving V1 as both operands to
@@ -7843,7 +7843,7 @@ static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
       NewMask[3] = Mask[2] < 4 ? 3 : 1;
     }
   }
-  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
+  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
 }
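These three one-line changes thread the caller's VT through instead of hard-coding MVT::v4f32, which is what lets the v8f32 path reuse this routine: VSHUFPS on a 256-bit type applies the same 8-bit immediate independently to each 128-bit lane. A rough scalar model of that per-lane behaviour (illustrative only, not the DAG lowering; the function name is made up):

#include <cstdio>

// Scalar model of 256-bit VSHUFPS, for illustration.  The 8-bit immediate is
// four 2-bit selectors; within each 128-bit lane the low two result elements
// come from Src1 and the high two from Src2, and the same immediate is
// replayed in both lanes, which is why a lane-symmetric v8f32 mask can be
// served by the 4-element SHUFPS lowering's immediates.
static void vshufps256(const float Src1[8], const float Src2[8], unsigned Imm,
                       float Dst[8]) {
  for (int Lane = 0; Lane < 8; Lane += 4) {
    Dst[Lane + 0] = Src1[Lane + ((Imm >> 0) & 3)];
    Dst[Lane + 1] = Src1[Lane + ((Imm >> 2) & 3)];
    Dst[Lane + 2] = Src2[Lane + ((Imm >> 4) & 3)];
    Dst[Lane + 3] = Src2[Lane + ((Imm >> 6) & 3)];
  }
}

int main() {
  float A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float B[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  float R[8];
  // Immediate 0 selects element 0 of each source in each lane, matching the
  // "ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]" pattern checked in the test
  // changes below.  Prints "0 0 10 10 4 4 14 14".
  vshufps256(A, B, 0x00, R);
  for (float F : R)
    std::printf("%g ", F);
  std::printf("\n");
  return 0;
}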
@@ -9385,6 +9385,14 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
     if (isShuffleEquivalent(LoMask, 2, 10, 3, 11))
       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
+
+    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
+    // have already handled any direct blends.
+    int SHUFPSMask[] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+    for (int &M : SHUFPSMask)
+      if (M >= 8)
+        M -= 4;
+    return lowerVectorShuffleWithSHUPFS(DL, MVT::v8f32, SHUFPSMask, V1, V2, DAG);
   }
 
   if (isSingleInputShuffleMask(Mask))
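The new fallback takes the low-lane quarter of the (lane-symmetric) mask and folds V2 references from the 8-wide index space, where entries 8-15 name V2, into the 4-element space the SHUFPS routine expects, where 0-3 name V1 and 4-7 name V2. A small worked example using the low lane of the shuffle_v8f32_08084c4c mask (a standalone sketch, not the LLVM code):

#include <cstdio>

int main() {
  // Low 128-bit lane of the shuffle_v8f32_08084c4c mask <0,8,0,8,4,12,4,12>.
  // In the 8-wide mask, 0-7 name elements of V1 and 8-15 name elements of V2.
  int SHUFPSMask[4] = {0, 8, 0, 8};
  // The 4-element SHUFPS lowering instead expects 0-3 for V1 and 4-7 for V2,
  // so fold the V2 indices down by 4, just as the hunk above does.
  for (int &M : SHUFPSMask)
    if (M >= 8)
      M -= 4;
  // Prints "0 4 0 4": element 0 of V1 and element 0 of V2, interleaved.
  std::printf("%d %d %d %d\n", SHUFPSMask[0], SHUFPSMask[1], SHUFPSMask[2],
              SHUFPSMask[3]);
  return 0;
}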
@@ -9531,7 +9539,9 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
     return DAG.getCommutedVectorShuffle(*SVOp);
 
   // When the number of V1 and V2 elements are the same, try to minimize the
-  // number of uses of V2 in the low half of the vector.
+  // number of uses of V2 in the low half of the vector. When that is tied,
+  // ensure that the sum of indices for V1 is equal to or lower than the sum
+  // indices for V2.
   if (NumV1Elements == NumV2Elements) {
     int LowV1Elements = 0, LowV2Elements = 0;
     for (int M : SVOp->getMask().slice(0, NumElements / 2))
@@ -9541,6 +9551,15 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
         ++LowV1Elements;
     if (LowV2Elements > LowV1Elements)
       return DAG.getCommutedVectorShuffle(*SVOp);
+
+    int SumV1Indices = 0, SumV2Indices = 0;
+    for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+      if (SVOp->getMask()[i] >= NumElements)
+        SumV2Indices += i;
+      else if (SVOp->getMask()[i] >= 0)
+        SumV1Indices += i;
+
+    if (SumV2Indices < SumV1Indices)
+      return DAG.getCommutedVectorShuffle(*SVOp);
   }
 
   // For each vector width, delegate to a specialized lowering routine.
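The stronger canonicalization only runs when V1 and V2 are referenced equally often. It first prefers fewer uses of V2 in the low half, and with this change it additionally commutes the operands when V2's elements occupy lower positions overall, that is, when the sum of their positions is smaller. A standalone sketch of both tie-breaks for the shuffle_v8f32_8823cc67 mask (illustrative, not the LLVM code):

#include <array>
#include <cstdio>

int main() {
  // Mask of shuffle_v8f32_8823cc67: <8,8,2,3,12,12,6,7> with NumElements = 8,
  // so entries >= 8 refer to V2 (%b) and entries 0-7 refer to V1 (%a).
  const std::array<int, 8> Mask = {8, 8, 2, 3, 12, 12, 6, 7};
  const int NumElements = 8;

  // First tie-break: count uses of each input in the low half of the vector.
  int LowV1Elements = 0, LowV2Elements = 0;
  for (int i = 0; i < NumElements / 2; ++i)
    if (Mask[i] >= NumElements)
      ++LowV2Elements;
    else if (Mask[i] >= 0)
      ++LowV1Elements;

  // Second tie-break (new in this commit): sum the positions at which each
  // input is used; if V2's elements sit at lower positions, commute V1/V2.
  int SumV1Indices = 0, SumV2Indices = 0;
  for (int i = 0; i < NumElements; ++i)
    if (Mask[i] >= NumElements)
      SumV2Indices += i;
    else if (Mask[i] >= 0)
      SumV1Indices += i;

  // Prints "low: 2 vs 2, sums: 18 vs 10 -> commute".
  std::printf("low: %d vs %d, sums: %d vs %d -> %s\n", LowV1Elements,
              LowV2Elements, SumV1Indices, SumV2Indices,
              SumV2Indices < SumV1Indices ? "commute" : "keep");
  return 0;
}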


@@ -153,9 +153,8 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_08084c4c
 ; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[0,0,2,0,4,4,6,4]
-; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,0,3,4,5,4,7]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
 ret <8 x float> %shuffle
@@ -164,8 +163,7 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_8823cc67
 ; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[0,0,2,3,4,4,6,7]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
 ret <8 x float> %shuffle
@@ -174,9 +172,7 @@ define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_9832dc76
 ; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[1,0,2,3,5,4,6,7]
-; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,3,2,4,5,7,6]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
 ret <8 x float> %shuffle
@@ -185,9 +181,7 @@ define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_9810dc54
 ; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[1,0,2,3,5,4,6,7]
-; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,1,0,4,5,5,4]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
 ; ALL-NEXT: retq
 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
 ret <8 x float> %shuffle