[ARM] Improve the instruction selection of vector loads.

In the ARM back-end, build_vector nodes are lowered to a target specific build_vector that uses floating point type. This works well, unless the inserted bitcasts survive until instruction selection. In that case, they incur moves between integer unit and floating point unit that may result in inefficient code. In other words, this conversion may introduce artificial dependencies when the code leading to the build vector cannot be completed with a floating point type. In particular, this happens when loads are not aligned. Before this patch, in that case, the compiler generates general purpose loads and creates the floating point vector from them, instead of directly using the vector unit. The patch uses a vector friendly sequence of code when the inserted bitcasts to floating point survived DAGCombine. This is done by a target specific DAGCombine that changes the target specific build_vector into a sequence of insert_vector_elt that get rid of the bitcasts. <rdar://problem/14170854> git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185587 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-05 17:39:16 +00:00 · 2013-07-03 21:42:57 +00:00 · 2013-07-03 21:42:57 +00:00 · 8e2e5ff024
commit 8e2e5ff024
parent 4e976457eb
3 changed files with 122 additions and 1 deletions
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@ -8773,6 +8773,98 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
 }

+/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+static SDValue
+PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
+  // At that time, we may have inserted bitcasts from integer to float.
+  // If these bitcasts have survived DAGCombine, change the lowering of this
+  // BUILD_VECTOR in something more vector friendly, i.e., that does not
+  // force to use floating point types.
+
+  // Make sure we can change the type of the vector.
+  // This is possible iff:
+  // 1. The vector is only used in a bitcast to a integer type. I.e.,
+  //    1.1. Vector is used only once.
+  //    1.2. Use is a bit convert to an integer type.
+  // 2. The size of its operands are 32-bits (64-bits are not legal).
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+
+  // Check 1.1. and 2.
+  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
+    return SDValue();
+
+  // By construction, the input type must be float.
+  assert(EltVT == MVT::f32 && "Unexpected type!");
+
+  // Check 1.2.
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() != ISD::BITCAST ||
+      Use->getValueType(0).isFloatingPoint())
+    return SDValue();
+
+  // Check profitability.
+  // Model is, if more than half of the relevant operands are bitcast from
+  // i32, turn the build_vector into a sequence of insert_vector_elt.
+  // Relevant operands are everything that is not statically
+  // (i.e., at compile time) bitcasted.
+  unsigned NumOfBitCastedElts = 0;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumOfRelevantElts = NumElts;
+  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+    SDValue Elt = N->getOperand(Idx);
+    if (Elt->getOpcode() == ISD::BITCAST) {
+      // Assume only bit cast to i32 will go away.
+      if (Elt->getOperand(0).getValueType() == MVT::i32)
+        ++NumOfBitCastedElts;
+    } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
+      // Constants are statically casted, thus do not count them as
+      // relevant operands.
+      --NumOfRelevantElts;
+  }
+
+  // Check if more than half of the elements require a non-free bitcast.
+  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  // Create the new vector type.
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+  // Check if the type is legal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(VecVT))
+    return SDValue();
+
+  // Combine:
+  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
+  // => BITCAST INSERT_VECTOR_ELT
+  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
+  //                      (BITCAST EN), N.
+  SDValue Vec = DAG.getUNDEF(VecVT);
+  SDLoc dl(N);
+  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
+    SDValue V = N->getOperand(Idx);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (V.getOpcode() == ISD::BITCAST &&
+        V->getOperand(0).getValueType() == MVT::i32)
+      // Fold obvious case.
+      V = V.getOperand(0);
+    else {
+      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 
+      // Make the DAGCombiner fold the bitcasts.
+      DCI.AddToWorklist(V.getNode());
+    }
+    SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32);
+    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
+  }
+  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  return Vec;
+}
+
 /// PerformInsertEltCombine - Target-specific dag combine xforms for
 /// ISD::INSERT_VECTOR_ELT.
 static SDValue PerformInsertEltCombine(SDNode *N,
@ -9709,6 +9801,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
    return CombineBaseUpdate(N, DCI);
+  case ARMISD::BUILD_VECTOR:
+    return PerformARMBUILD_VECTORCombine(N, DCI);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
--- a/test/CodeGen/ARM/2012-05-04-vmov.ll
+++ b/test/CodeGen/ARM/2012-05-04-vmov.ll
@ -7,5 +7,8 @@ entry:
  %div = udiv <2 x i32> %A, %B
  ret <2 x i32> %div
 ; A9-CHECK: vmov.32
-; SWIFT-CHECK-NOT: vmov.32
+; vmov.32 should not be used to get a lane:
+; vmov.32 <dst>, <src>[<lane>].
+; but vmov.32 <dst>[<lane>], <src> is fine.
+; SWIFT-CHECK-NOT: vmov.32 {{r[0-9]+}}, {{d[0-9]\[[0-9]+\]}}
 }
--- a/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/test/CodeGen/ARM/vector-DAGCombine.ll
@ -160,3 +160,27 @@ define void @reverse_v16i8(<16 x i8>* %loadaddr, <16 x i8>* %storeaddr) {
  store <16 x i8> %v1, <16 x i8>* %storeaddr
  ret void
 }
+
+; <rdar://problem/14170854>.
+; vldr cannot handle unaligned loads.
+; Fall back to vld1.32, which can, instead of using the general purpose loads
+; followed by a costly sequence of instructions to build the vector register.
+; CHECK: t3
+; CHECK: vld1.32 {[[REG:d[0-9]+]][0]}
+; CHECK: vld1.32 {[[REG]][1]}
+; CHECK: vmull.u8 q{{[0-9]+}}, [[REG]], [[REG]]
+define <8 x i16> @t3(i8 zeroext %xf, i8* nocapture %sp0, i8* nocapture %sp1, i32* nocapture %outp) {
+entry:
+  %pix_sp0.0.cast = bitcast i8* %sp0 to i32*
+  %pix_sp0.0.copyload = load i32* %pix_sp0.0.cast, align 1
+  %pix_sp1.0.cast = bitcast i8* %sp1 to i32*
+  %pix_sp1.0.copyload = load i32* %pix_sp1.0.cast, align 1
+  %vecinit = insertelement <2 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+  %vecinit1 = insertelement <2 x i32> %vecinit, i32 %pix_sp1.0.copyload, i32 1
+  %0 = bitcast <2 x i32> %vecinit1 to <8 x i8>
+  %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %0, <8 x i8> %0)
+  ret <8 x i16> %vmull.i
+}
+
+; Function Attrs: nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)