Implement vector constants which are splat of

integers with mov + vdup. 8003375. This is currently disabled by default because LICM will not hoist a VDUP, so it pessimizes the code if the construct occurs inside a loop (8248029). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@109799 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-21 06:30:16 +00:00 · 2010-07-29 20:10:08 +00:00 · 2010-07-29 20:10:08 +00:00 · f630c712b1
commit f630c712b1
parent fb13b95162
2 changed files with 100 additions and 8 deletions
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@ -57,6 +57,13 @@ EnableARMTailCalls("arm-tail-calls", cl::Hidden,
  cl::desc("Generate tail calls (TEMPORARY OPTION)."),
  cl::init(true));

+// This option should go away when Machine LICM is smart enough to hoist a 
+// reg-to-reg VDUP.
+static cl::opt<bool>
+EnableARMVDUPsplat("arm-vdup-splat", cl::Hidden,
+  cl::desc("Generate VDUP for integer constant splats (TEMPORARY OPTION)."),
+  cl::init(false));
+
 static cl::opt<bool>
 EnableARMLongCalls("arm-long-calls", cl::Hidden,
  cl::desc("Generate calls via indirect call instructions"),
@ -3257,9 +3264,30 @@ static bool isVZIP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
  return true;
 }

+// If N is an integer constant that can be moved into a register in one
+// instruction, return an SDValue of such a constant (will become a MOV
+// instruction).  Otherwise return null.
+static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
+                                     const ARMSubtarget *ST, DebugLoc dl) {
+  uint64_t Val;
+  if (!isa<ConstantSDNode>(N))
+    return SDValue();
+  Val = cast<ConstantSDNode>(N)->getZExtValue();
+
+  if (ST->isThumb1Only()) {
+    if (Val <= 255 || ~Val <= 255)
+      return DAG.getConstant(Val, MVT::i32);
+  } else {
+    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
+      return DAG.getConstant(Val, MVT::i32);
+  }
+  return SDValue();
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.
-static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 
+                                 const ARMSubtarget *ST) {
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
@ -3319,15 +3347,41 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  if (isOnlyLowElement)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

-  // If all elements are constants, fall back to the default expansion, which
-  // will generate a load from the constant pool.
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+
+  if (EnableARMVDUPsplat) {
+    // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
+    // i32 and try again.
+    if (usesOnlyOneValue && EltSize <= 32) {
+      if (!isConstant)
+        return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+      if (VT.getVectorElementType().isFloatingPoint()) {
+        SmallVector<SDValue, 8> Ops;
+        for (unsigned i = 0; i < NumElts; ++i)
+          Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 
+                                    Op.getOperand(i)));
+        SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &Ops[0],
+                                  NumElts);
+        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 
+                           LowerBUILD_VECTOR(Val, DAG, ST));
+      }
+      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+      if (Val.getNode())
+        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+    }
+  }
+
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
  if (isConstant)
    return SDValue();

-  // Use VDUP for non-constant splats.
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-  if (usesOnlyOneValue && EltSize <= 32)
-    return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+  if (!EnableARMVDUPsplat) {
+    // Use VDUP for non-constant splats.
+    if (usesOnlyOneValue && EltSize <= 32)
+      return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+  }

  // Vectors with 32- or 64-bit elements can be built by directly assigning
  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
@ -3647,7 +3701,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
  case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
  case ISD::VSETCC:        return LowerVSETCC(Op, DAG);
-  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
--- a/test/CodeGen/Thumb2/machine-licm-vdup.ll
+++ b/test/CodeGen/Thumb2/machine-licm-vdup.ll
@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -disable-fp-elim                -arm-vdup-splat | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim -arm-vdup-splat | FileCheck %s 
+; Modified version of machine-licm.ll with -arm-vdup-splat turned on, 8003375.
+; Eventually this should become the default and be moved into machine-licm.ll.
+; FIXME: the vdup should be hoisted out of the loop, 8248029.
+
+define void @t2(i8* %ptr1, i8* %ptr2) nounwind {
+entry:
+; CHECK: t2:
+; CHECK: mov.w r3, #1065353216
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+; CHECK-NEXT: %bb1
+; CHECK: vdup.32 q1, r3
+  %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ]
+  %tmp1 = shl i32 %indvar, 2
+  %gep1 = getelementptr i8* %ptr1, i32 %tmp1
+  %tmp2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %gep1)
+  %tmp3 = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> %tmp2)
+  %gep2 = getelementptr i8* %ptr2, i32 %tmp1
+  call void @llvm.arm.neon.vst1.v4f32(i8* %gep2, <4 x float> %tmp3)
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp eq i32 %indvar.next, 10
+  br i1 %cond, label %bb2, label %bb1
+
+bb2:
+  ret void
+}
+
+; CHECK-NOT: LCPI1_0:
+; CHECK: .subsections_via_symbols
+
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly
+
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind
+
+declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone