From da8bb20158469544bab61b24a5123639d8ee3e09 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Sat, 30 May 2015 09:46:16 +0000
Subject: [PATCH] [x86] Split out the horizontal byte sum lowering component of
 the LUT lowering into a helper function.

NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@238650 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 214 ++++++++++++++++-------------
 1 file changed, 118 insertions(+), 96 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6c34efc49f7..ef409d3d453 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17290,11 +17290,124 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
   return SDValue();
 }
 
+/// Compute the horizontal sum of bytes in V for the elements of VT.
+///
+/// Requires V to be a byte vector and VT to be an integer vector type with
+/// wider elements than V's type. The width of the elements of VT determines
+/// how many bytes of V are summed horizontally to produce each element of the
+/// result.
+static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
+                                      const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  SDLoc DL(V);
+  MVT ByteVecVT = V.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  int NumElts = VT.getVectorNumElements();
+  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
+         "Expected value to have byte element type.");
+  assert(EltVT != MVT::i8 &&
+         "Horizontal byte sum only makes sense for wider elements!");
+  unsigned VecSize = VT.getSizeInBits();
+  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
+
+  // The PSADBW instruction horizontally adds the bytes and leaves the results
+  // in i64 chunks, so it directly computes the byte sum for i64 elements.
+  if (EltVT == MVT::i64) {
+    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros);
+    return DAG.getBitcast(VT, V);
+  }
+
+  if (EltVT == MVT::i32) {
+    // We unpack the low half and high half into i32s interleaved with zeros so
+    // that we can use PSADBW to horizontally sum them. This also lines up the
+    // results of the two PSADBW instructions as two v2i64 vectors which, when
+    // concatenated, hold the 4 sums; PACKUSWB then shrinks and merges them into
+    // a v4i32. NOTE(review): V is unpacked as VT without a bitcast -- confirm.
+    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
+    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
+    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
+
+    // Do the horizontal sums into two v2i64s.
+    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+                      DAG.getBitcast(ByteVecVT, Low), Zeros);
+    High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+                       DAG.getBitcast(ByteVecVT, High), Zeros);
+
+    // Merge them together.
+    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
+    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
+                    DAG.getBitcast(ShortVecVT, Low),
+                    DAG.getBitcast(ShortVecVT, High));
+
+    return DAG.getBitcast(VT, V);
+  }
+
+  // To obtain the sum for each i16 element, shuffle the bytes to get even and
+  // odd elements into distinct vectors, add them and zero-extend each i8
+  // element into i16, i.e.:
+  //
+  //  B -> input bytes (V)
+  //  W -> byte sum per i16
+  //
+  //  Y = shuffle B, undef <0, 2, ...>
+  //  Z = shuffle B, undef <1, 3, ...>
+  //  W = zext <... x i8> to <... x i16> (Y + Z)
+  //
+  // Use a byte shuffle mask that matches PSHUFB.
+  //
+  assert(EltVT == MVT::i16 && "Unknown how to handle type");
+  SDValue Undef = DAG.getUNDEF(ByteVecVT);
+  SmallVector<int, 32> MaskA, MaskB;
+
+  // We can't use PSHUFB across lanes, so do the shuffle and sum inside each
+  // 128-bit lane, and then collapse the result.
+  int NumLanes = VecSize / 128;
+  assert(VecSize % 128 == 0 && "Must have 16-byte multiple vectors!");
+  for (int i = 0; i < NumLanes; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      MaskA.push_back(i * 16 + j * 2);
+      MaskB.push_back(i * 16 + (j * 2) + 1);
+    }
+    MaskA.append((size_t)8, -1);
+    MaskB.append((size_t)8, -1);
+  }
+
+  SDValue ShuffA = DAG.getVectorShuffle(ByteVecVT, DL, V, Undef, MaskA);
+  SDValue ShuffB = DAG.getVectorShuffle(ByteVecVT, DL, V, Undef, MaskB);
+  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, ShuffA, ShuffB);
+
+  SmallVector<int, 4> Mask;
+  for (int i = 0; i < NumLanes; ++i)
+    Mask.push_back(2 * i);
+  Mask.append((size_t)NumLanes, -1);
+
+  int NumI64Elts = VecSize / 64;
+  MVT VecI64VT = MVT::getVectorVT(MVT::i64, NumI64Elts);
+
+  V = DAG.getBitcast(VecI64VT, V);
+  V = DAG.getVectorShuffle(VecI64VT, DL, V, DAG.getUNDEF(VecI64VT), Mask);
+  V = DAG.getBitcast(ByteVecVT, V);
+
+  // Zero extend i8s into i16 elts by interleaving them with zero bytes.
+  SmallVector<int, 16> ZExtInRegMask;
+  for (int i = 0; i < NumElts; ++i) {
+    ZExtInRegMask.push_back(i);
+    ZExtInRegMask.push_back(2 * NumElts);
+  }
+
+  return DAG.getBitcast(
+      VT, DAG.getVectorShuffle(ByteVecVT, DL, V,
+                               getZeroVector(ByteVecVT, Subtarget, DAG, DL),
+                               ZExtInRegMask));
+}
+
 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
-                                  const X86Subtarget *Subtarget,
-                                  SelectionDAG &DAG) {
-  EVT VT = Op.getValueType();
-  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
   unsigned VecSize = VT.getSizeInBits();
 
   // Implement a lookup table in register by using an algorithm based on:
@@ -17347,98 +17460,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
   if (EltVT == MVT::i8)
     return PopCnt;
 
-  // PSADBW instruction horizontally add all bytes and leave the result in i64
-  // chunks, thus directly computes the pop count for v2i64 and v4i64.
-  if (EltVT == MVT::i64) {
-    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
-    PopCnt = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, PopCnt, Zeros);
-    return DAG.getBitcast(VT, PopCnt);
-  }
-
-  int NumI64Elts = VecSize / 64;
-  MVT VecI64VT = MVT::getVectorVT(MVT::i64, NumI64Elts);
-
-  if (EltVT == MVT::i32) {
-    // We unpack the low half and high half into i32s interleaved with zeros so
-    // that we can use PSADBW to horizontally sum them. The most useful part of
-    // this is that it lines up the results of two PSADBW instructions to be
-    // two v2i64 vectors which concatenated are the 4 population counts. We can
-    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
-    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
-    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, PopCnt, Zeros);
-    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, PopCnt, Zeros);
-
-    // Do the horizontal sums into two v2i64s.
-    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
-    Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
-                      DAG.getBitcast(ByteVecVT, Low), Zeros);
-    High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
-                       DAG.getBitcast(ByteVecVT, High), Zeros);
-
-    // Merge them together.
-    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
-    PopCnt = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
-                         DAG.getBitcast(ShortVecVT, Low),
-                         DAG.getBitcast(ShortVecVT, High));
-
-    return DAG.getBitcast(VT, PopCnt);
-  }
-
-  // To obtain pop count for each i16 element, shuffle the byte pop count to get
-  // even and odd elements into distinct vectors, add them and zero-extend each
-  // i8 elemento into i16, i.e.:
-  //
-  //  B -> pop count per i8
-  //  W -> pop count per i16
-  //
-  //  Y = shuffle B, undef <0, 2, ...>
-  //  Z = shuffle B, undef <1, 3, ...>
-  //  W = zext <... x i8> to <... x i16> (Y + Z)
-  //
-  // Use a byte shuffle mask that matches PSHUFB.
-  //
-  assert(EltVT == MVT::i16 && "Unknown how to handle type");
-  SDValue Undef = DAG.getUNDEF(ByteVecVT);
-  SmallVector<int, 32> MaskA, MaskB;
-
-  // We can't use PSHUFB across lanes, so do the shuffle and sum inside each
-  // 128-bit lane, and then collapse the result.
-  int NumLanes = NumByteElts / 16;
-  assert(NumByteElts % 16 == 0 && "Must have 16-byte multiple vectors!");
-  for (int i = 0; i < NumLanes; ++i) {
-    for (int j = 0; j < 8; ++j) {
-      MaskA.push_back(i * 16 + j * 2);
-      MaskB.push_back(i * 16 + (j * 2) + 1);
-    }
-    MaskA.append((size_t)8, -1);
-    MaskB.append((size_t)8, -1);
-  }
-
-  SDValue ShuffA = DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, MaskA);
-  SDValue ShuffB = DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, MaskB);
-  PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, ShuffA, ShuffB);
-
-  SmallVector<int, 4> Mask;
-  for (int i = 0; i < NumLanes; ++i)
-    Mask.push_back(2 * i);
-  Mask.append((size_t)NumLanes, -1);
-
-  PopCnt = DAG.getBitcast(VecI64VT, PopCnt);
-  PopCnt =
-      DAG.getVectorShuffle(VecI64VT, DL, PopCnt, DAG.getUNDEF(VecI64VT), Mask);
-  PopCnt = DAG.getBitcast(ByteVecVT, PopCnt);
-
-  // Zero extend i8s into i16 elts
-  SmallVector<int, 16> ZExtInRegMask;
-  for (int i = 0; i < NumByteElts / 2; ++i) {
-    ZExtInRegMask.push_back(i);
-    ZExtInRegMask.push_back(NumByteElts);
-  }
-
-  return DAG.getBitcast(
-      VT, DAG.getVectorShuffle(ByteVecVT, DL, PopCnt,
-                               getZeroVector(ByteVecVT, Subtarget, DAG, DL),
-                               ZExtInRegMask));
+  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
 }
 
 static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,