Mirror of https://github.com/c64scene-ar/llvm-6502.git
[x86] Teach the new vector shuffle lowering the first step toward more
actual support for complex AVX shuffling tricks.

We can do independent blends of the low and high 128-bit lanes of an AVX
vector, so shuffle the inputs into place and then do the blend at 256 bits.
This will in many cases remove one blend instruction.

The next step is to permute the low and high halves in-place rather than
extracting them and re-inserting them.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218202 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
31b080d57f
commit
291140b112
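As context for the diff below, here is a small standalone C++ sketch (an illustration only, not code from this commit) of the decomposition the new lowerV8F32VectorShuffle performs: each lane of the 8-lane mask is routed into a shuffle mask for one of the two inputs, and every lane taken from the second input sets a bit in the immediate of a single 256-bit VBLENDPS. The example mask is the one from the shuffle_v8f32_81a3c5e7 test updated below.

// Standalone sketch (not LLVM code) of the mask split done by the new
// lowerV8F32VectorShuffle: one 8-lane shuffle mask becomes two single-input
// masks plus an immediate for a 256-bit blend.
#include <array>
#include <cstdio>

int main() {
  // Mask from the shuffle_v8f32_81a3c5e7 test: <8, 1, 10, 3, 12, 5, 14, 7>.
  std::array<int, 8> Mask = {8, 1, 10, 3, 12, 5, 14, 7};

  std::array<int, 8> V1Mask, V2Mask;
  V1Mask.fill(-1); // -1 plays the role of an undef lane
  V2Mask.fill(-1);
  unsigned BlendMask = 0;

  for (int i = 0; i < 8; ++i) {
    if (Mask[i] >= 0 && Mask[i] < 8) {
      V1Mask[i] = Mask[i];     // lane sourced from the first input
    } else if (Mask[i] >= 8) {
      V2Mask[i] = Mask[i] - 8; // lane sourced from the second input
      BlendMask |= 1u << i;    // select the second input in VBLENDPS
    }
  }

  // For this mask both per-input shuffles are identities (every defined
  // V1Mask[i] == i and V2Mask[i] == i), so the whole shuffle collapses to a
  // single 256-bit blend: BlendMask = 0b01010101 = 0x55 = 85, matching the
  // "vblendps $85" check in the test diff below.
  std::printf("BlendMask = 0x%X (%u)\n", BlendMask, BlendMask);
  return 0;
}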
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9292,6 +9292,44 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG));
 }
 
+/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+  if (isHalfCrossingShuffleMask(Mask) ||
+      isSingleInputShuffleMask(Mask))
+    return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+  // Shuffle the input elements into the desired positions in V1 and V2 and
+  // blend them together.
+  int V1Mask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  int V2Mask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  unsigned BlendMask = 0;
+  for (int i = 0; i < 8; ++i)
+    if (Mask[i] >= 0 && Mask[i] < 8) {
+      V1Mask[i] = Mask[i];
+    } else if (Mask[i] >= 8) {
+      V2Mask[i] = Mask[i] - 8;
+      BlendMask |= 1 << i;
+    }
+
+  V1 = DAG.getVectorShuffle(MVT::v8f32, DL, V1, DAG.getUNDEF(MVT::v8f32), V1Mask);
+  V2 = DAG.getVectorShuffle(MVT::v8f32, DL, V2, DAG.getUNDEF(MVT::v8f32), V2Mask);
+
+  return DAG.getNode(X86ISD::BLENDI, DL, MVT::v8f32, V1, V2,
+                     DAG.getConstant(BlendMask, MVT::i8));
+}
+
 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
 ///
 /// This routine either breaks down the specific type of a 256-bit x86 vector
@@ -9305,8 +9343,9 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   case MVT::v4i64:
     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v8i32:
   case MVT::v8f32:
-  case MVT::v8i32:
+    return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   case MVT::v16i16:
   case MVT::v32i8:
     // Fall back to the basic pattern of extracting the high half and forming
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -135,11 +135,7 @@ define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_81a3c5e7
 ; ALL: # BB#0:
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm2
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm3
-; ALL-NEXT: vblendps $5, %xmm2, %xmm3, %xmm2 # xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
-; ALL-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 # xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vblendps $85, %ymm1, %ymm0, %ymm0 # ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
   ret <8 x float> %shuffle
@@ -160,15 +156,15 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_08084c4c
 ; ALL: # BB#0:
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm2
-; ALL-NEXT: vpermilps {{.*}} # xmm2 = xmm2[0,0,2,0]
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm3
-; ALL-NEXT: vpermilps {{.*}} # xmm3 = xmm3[0,1,0,3]
-; ALL-NEXT: vblendps $10, %xmm2, %xmm3, %xmm2 # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
+; ALL-NEXT: vpermilps {{.*}} # xmm2 = xmm1[0,0,2,0]
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; ALL-NEXT: vpermilps {{.*}} # xmm1 = xmm1[0,0,2,0]
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; ALL-NEXT: vpermilps {{.*}} # xmm2 = xmm0[0,1,0,3]
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; ALL-NEXT: vpermilps {{.*}} # xmm0 = xmm0[0,1,0,3]
-; ALL-NEXT: vblendps $10, %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; ALL-NEXT: vblendps $-86, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
   ret <8 x float> %shuffle
@@ -177,13 +173,11 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_8823cc67
 ; ALL: # BB#0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm3
-; ALL-NEXT: vpermilps {{.*}} # xmm3 = xmm3[0,0,2,3]
-; ALL-NEXT: vblendps {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3]
+; ALL-NEXT: vpermilps {{.*}} # xmm2 = xmm1[0,0,2,3]
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; ALL-NEXT: vpermilps {{.*}} # xmm1 = xmm1[0,0,2,3]
-; ALL-NEXT: vblendps {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
   ret <8 x float> %shuffle
@@ -192,15 +186,15 @@ define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_9832dc76
 ; ALL: # BB#0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; ALL-NEXT: vpermilps {{.*}} # xmm2 = xmm2[0,1,3,2]
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm3
-; ALL-NEXT: vpermilps {{.*}} # xmm3 = xmm3[1,0,2,3]
-; ALL-NEXT: vblendps {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3]
-; ALL-NEXT: vpermilps {{.*}} # xmm0 = xmm0[0,1,3,2]
+; ALL-NEXT: vpermilps {{.*}} # xmm2 = xmm1[1,0,2,3]
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; ALL-NEXT: vpermilps {{.*}} # xmm1 = xmm1[1,0,2,3]
-; ALL-NEXT: vblendps {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; ALL-NEXT: vpermilps {{.*}} # xmm2 = xmm0[0,1,3,2]
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vpermilps {{.*}} # xmm0 = xmm0[0,1,3,2]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
   ret <8 x float> %shuffle
@@ -209,15 +203,15 @@ define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_9810dc54
 ; ALL: # BB#0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; ALL-NEXT: vpermilps {{.*}} # xmm2 = xmm2[0,1,1,0]
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm3
-; ALL-NEXT: vpermilps {{.*}} # xmm3 = xmm3[1,0,2,3]
-; ALL-NEXT: vblendps {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3]
-; ALL-NEXT: vpermilps {{.*}} # xmm0 = xmm0[0,1,1,0]
+; ALL-NEXT: vpermilps {{.*}} # xmm2 = xmm1[1,0,2,3]
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; ALL-NEXT: vpermilps {{.*}} # xmm1 = xmm1[1,0,2,3]
-; ALL-NEXT: vblendps {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; ALL-NEXT: vpermilps {{.*}} # xmm2 = xmm0[0,1,1,0]
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vpermilps {{.*}} # xmm0 = xmm0[0,1,1,0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
   ret <8 x float> %shuffle
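A note on the blend immediates in the updated checks: VBLENDPS selects lane i from the second source whenever bit i of its 8-bit immediate is set. In shuffle_v8f32_08084c4c the odd lanes come from %b, so the immediate is 0b10101010 = 0xAA = 170, which the assembly printer renders as the signed 8-bit value -86; that is why the check expects "vblendps $-86". The same encoding yields the "vblendps $85" (0b01010101) in shuffle_v8f32_81a3c5e7, where the even lanes come from %b.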