diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 569beadc60e..ea13c88be82 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -4674,7 +4674,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   if (ValueCounts.size() == 0)
     return DAG.getUNDEF(VT);
 
-  if (isOnlyLowElement)
+  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
+  // Keep going if we hit this case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
 
   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
diff --git a/test/CodeGen/ARM/vector-DAGCombine.ll b/test/CodeGen/ARM/vector-DAGCombine.ll
index 6d586f24264..3e138199e6f 100644
--- a/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -184,3 +184,17 @@ entry:
 
 ; Function Attrs: nounwind readnone
 declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)
+
+; Check that (insert_vector_elt (load)) => (vector_load).
+; Thus, check that scalar_to_vector does not interfere with that.
+define <8 x i16> @t4(i8* nocapture %sp0) {
+; CHECK: t4
+; CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r0]
+entry:
+  %pix_sp0.0.cast = bitcast i8* %sp0 to i32*
+  %pix_sp0.0.copyload = load i32* %pix_sp0.0.cast, align 1
+  %vec = insertelement <2 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+  %0 = bitcast <2 x i32> %vec to <8 x i8>
+  %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %0, <8 x i8> %0)
+  ret <8 x i16> %vmull.i
+}
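
Background on the change, for reviewers: when a BUILD_VECTOR's only defined element is the low one, lowering it to SCALAR_TO_VECTOR forces a loaded value through a core register first (roughly an ldr followed by a vmov into the NEON register file). Keeping the insert_vector_elt form instead lets the DAG combiner fold the load directly into a single-lane vld1, which is what the new CHECK line pins down.

The guard skips SCALAR_TO_VECTOR only for plain loads. As a hedged sketch (not the upstream source, but roughly what ISD::isNormalLoad in llvm/CodeGen/SelectionDAGNodes.h amounts to), a "normal" load carries no extension and no pre/post-indexed addressing:

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    // Sketch of the predicate used by the guard above: reject
    // sext/zext/anyext loads and pre/post-indexed loads, since only
    // a plain unindexed load can be folded into a vld1 lane load.
    static bool isNormalLoadSketch(const SDNode *N) {
      const LoadSDNode *Ld = dyn_cast<LoadSDNode>(N);
      return Ld && Ld->getExtensionType() == ISD::NON_EXTLOAD &&
             Ld->getAddressingMode() == ISD::UNINDEXED;
    }

Extending and indexed loads stay on the SCALAR_TO_VECTOR path, since they cannot be folded into a lane load as-is.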