diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 21e7e98c85f..6ae44eb3537 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -4230,9 +4230,26 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // If we are VDUPing a value that comes directly from a vector, that will // cause an unnecessary move to and from a GPR, where instead we could // just use VDUPLANE. - if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) - N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, + if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + // We need to create a new undef vector to use for the VDUPLANE if the + // size of the vector from which we get the value is different than the + // size of the vector that we need to create. We will insert the element + // such that the register coalescer will remove unnecessary copies. + if (VT != Value->getOperand(0).getValueType()) { + ConstantSDNode *constIndex; + constIndex = dyn_cast(Value->getOperand(1)); + assert(constIndex && "The index is not a constant!"); + unsigned index = constIndex->getAPIntValue().getLimitedValue() % + VT.getVectorNumElements(); + N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), + Value, DAG.getConstant(index, MVT::i32)), + DAG.getConstant(index, MVT::i32)); + } else { + N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, Value->getOperand(0), Value->getOperand(1)); + } + } else N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll index a8c224b4385..2cf94d63ca1 100644 --- a/test/CodeGen/ARM/vdup.ll +++ b/test/CodeGen/ARM/vdup.ll @@ -295,3 +295,39 @@ define <4 x i32> @tduplane(<4 x i32> %invec) { %4 = insertelement <4 x i32> %3, i32 255, i32 3 ret <4 x i32> %4 } + +define <2 x float> @check_f32(<4 x float> %v) nounwind { +;CHECK: check_f32: +;CHECK: vdup.32 {{.*}}, d{{..}}[1] + %x = extractelement <4 x float> %v, i32 3 + %1 = insertelement <2 x float> undef, float %x, i32 0 + %2 = insertelement <2 x float> %1, float %x, i32 1 + ret <2 x float> %2 +} + +define <2 x i32> @check_i32(<4 x i32> %v) nounwind { +;CHECK: check_i32: +;CHECK: vdup.32 {{.*}}, d{{..}}[1] + %x = extractelement <4 x i32> %v, i32 3 + %1 = insertelement <2 x i32> undef, i32 %x, i32 0 + %2 = insertelement <2 x i32> %1, i32 %x, i32 1 + ret <2 x i32> %2 +} + +define <4 x i16> @check_i16(<8 x i16> %v) nounwind { +;CHECK: check_i16: +;CHECK: vdup.16 {{.*}}, d{{..}}[3] + %x = extractelement <8 x i16> %v, i32 3 + %1 = insertelement <4 x i16> undef, i16 %x, i32 0 + %2 = insertelement <4 x i16> %1, i16 %x, i32 1 + ret <4 x i16> %2 +} + +define <8 x i8> @check_i8(<16 x i8> %v) nounwind { +;CHECK: check_i8: +;CHECK: vdup.8 {{.*}}, d{{..}}[3] + %x = extractelement <16 x i8> %v, i32 3 + %1 = insertelement <8 x i8> undef, i8 %x, i32 0 + %2 = insertelement <8 x i8> %1, i8 %x, i32 1 + ret <8 x i8> %2 +}