diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index cc09754ef70..9b68590f856 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -8773,6 +8773,98 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
 }
 
+/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+static SDValue
+PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
+  // At that time, we may have inserted bitcasts from integer to float.
+  // If these bitcasts have survived DAGCombine, change the lowering of this
+  // BUILD_VECTOR into something more vector friendly, i.e., that does not
+  // force the use of floating point types.
+
+  // Make sure we can change the type of the vector.
+  // This is possible iff:
+  // 1. The vector is only used in a bitcast to an integer type. I.e.,
+  //    1.1. Vector is used only once.
+  //    1.2. Use is a bit convert to an integer type.
+  // 2. The size of its operands is 32 bits (64 bits are not legal).
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+
+  // Check 1.1. and 2.
+  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
+    return SDValue();
+
+  // By construction, the input type must be float.
+  assert(EltVT == MVT::f32 && "Unexpected type!");
+
+  // Check 1.2.
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() != ISD::BITCAST ||
+      Use->getValueType(0).isFloatingPoint())
+    return SDValue();
+
+  // Check profitability.
+  // The model is: if more than half of the relevant operands are bitcast from
+  // i32, turn the build_vector into a sequence of insert_vector_elt.
+  // Relevant operands are everything that is not statically
+  // (i.e., at compile time) bitcasted.
+  unsigned NumOfBitCastedElts = 0;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumOfRelevantElts = NumElts;
+  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+    SDValue Elt = N->getOperand(Idx);
+    if (Elt->getOpcode() == ISD::BITCAST) {
+      // Assume only bit casts to i32 will go away.
+      if (Elt->getOperand(0).getValueType() == MVT::i32)
+        ++NumOfBitCastedElts;
+    } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
+      // Constants are statically casted, thus do not count them as
+      // relevant operands.
+      --NumOfRelevantElts;
+  }
+
+  // Check if more than half of the elements require a non-free bitcast.
+  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  // Create the new vector type.
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+  // Check if the type is legal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(VecVT))
+    return SDValue();
+
+  // Combine:
+  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
+  // => BITCAST INSERT_VECTOR_ELT
+  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
+  //                      (BITCAST EN), N.
+  SDValue Vec = DAG.getUNDEF(VecVT);
+  SDLoc dl(N);
+  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
+    SDValue V = N->getOperand(Idx);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (V.getOpcode() == ISD::BITCAST &&
+        V->getOperand(0).getValueType() == MVT::i32)
+      // Fold obvious case.
+      V = V.getOperand(0);
+    else {
+      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
+      // Make the DAGCombiner fold the bitcasts.
+      DCI.AddToWorklist(V.getNode());
+    }
+    SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32);
+    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
+  }
+  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  return Vec;
+}
+
 /// PerformInsertEltCombine - Target-specific dag combine xforms for
 /// ISD::INSERT_VECTOR_ELT.
 static SDValue PerformInsertEltCombine(SDNode *N,
@@ -9709,6 +9801,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ARMISD::VLD3DUP:
   case ARMISD::VLD4DUP:
     return CombineBaseUpdate(N, DCI);
+  case ARMISD::BUILD_VECTOR:
+    return PerformARMBUILD_VECTORCombine(N, DCI);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
diff --git a/test/CodeGen/ARM/2012-05-04-vmov.ll b/test/CodeGen/ARM/2012-05-04-vmov.ll
index d52ef2cc5a1..14dbf7ff4ac 100644
--- a/test/CodeGen/ARM/2012-05-04-vmov.ll
+++ b/test/CodeGen/ARM/2012-05-04-vmov.ll
@@ -7,5 +7,8 @@ entry:
   %div = udiv <2 x i32> %A, %B
   ret <2 x i32> %div
 ; A9-CHECK: vmov.32
-; SWIFT-CHECK-NOT: vmov.32
+; vmov.32 should not be used to get a lane:
+; vmov.32 <dst>, <src>[<lane>].
+; but vmov.32 <dst>[<lane>], <src> is fine.
+; SWIFT-CHECK-NOT: vmov.32 {{r[0-9]+}}, {{d[0-9]\[[0-9]+\]}}
 }
diff --git a/test/CodeGen/ARM/vector-DAGCombine.ll b/test/CodeGen/ARM/vector-DAGCombine.ll
index 42964deb0b5..6d586f24264 100644
--- a/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -160,3 +160,27 @@ define void @reverse_v16i8(<16 x i8>* %loadaddr, <16 x i8>* %storeaddr) {
   store <16 x i8> %v1, <16 x i8>* %storeaddr
   ret void
 }
+
+; .
+; vldr cannot handle unaligned loads.
+; Fall back to vld1.32, which can, instead of using the general purpose loads
+; followed by a costly sequence of instructions to build the vector register.
+; CHECK: t3
+; CHECK: vld1.32 {[[REG:d[0-9]+]][0]}
+; CHECK: vld1.32 {[[REG]][1]}
+; CHECK: vmull.u8 q{{[0-9]+}}, [[REG]], [[REG]]
+define <8 x i16> @t3(i8 zeroext %xf, i8* nocapture %sp0, i8* nocapture %sp1, i32* nocapture %outp) {
+entry:
+  %pix_sp0.0.cast = bitcast i8* %sp0 to i32*
+  %pix_sp0.0.copyload = load i32* %pix_sp0.0.cast, align 1
+  %pix_sp1.0.cast = bitcast i8* %sp1 to i32*
+  %pix_sp1.0.copyload = load i32* %pix_sp1.0.cast, align 1
+  %vecinit = insertelement <2 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+  %vecinit1 = insertelement <2 x i32> %vecinit, i32 %pix_sp1.0.copyload, i32 1
+  %0 = bitcast <2 x i32> %vecinit1 to <8 x i8>
+  %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %0, <8 x i8> %0)
+  ret <8 x i16> %vmull.i
+}
+
+; Function Attrs: nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)
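
Note (not part of the patch): the new combine only fires when the ARMISD::BUILD_VECTOR
has 32-bit (f32) lanes, is used exactly once, that single use is a bitcast to an integer
type, and more than half of the non-constant lanes are themselves bitcasts from i32. A
reduced, hypothetical IR fragment with that shape is sketched below; the function and
value names are illustrative only, and whether such IR actually reaches this combine
depends on which bitcasts earlier DAGCombine runs have already folded.

  define <2 x i32> @build_from_ints(i32 %a, i32 %b) {
  entry:
    ; Each lane is an i32 reinterpreted as float.
    %fa = bitcast i32 %a to float
    %fb = bitcast i32 %b to float
    ; Build a <2 x float> whose only use is a bitcast to <2 x i32>.
    %v0 = insertelement <2 x float> undef, float %fa, i32 0
    %v1 = insertelement <2 x float> %v0, float %fb, i32 1
    %res = bitcast <2 x float> %v1 to <2 x i32>
    ret <2 x i32> %res
  }

Here both relevant lanes (2 of 2) come from i32 bitcasts, so NumOfBitCastedElts (2)
exceeds NumOfRelevantElts / 2 (1) and the profitability check passes: the build_vector
is rewritten as i32 INSERT_VECTOR_ELT nodes followed by a single BITCAST, avoiding the
round trip through floating point registers.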