diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 822950c5283..d22ec353fd0 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -2841,13 +2841,16 @@ def VMULfd_sfp : N3VDs<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul,1>; def : N3VDsPat; // Vector Multiply-Accumulate/Subtract used for single-precision FP -let neverHasSideEffects = 1 in -def VMLAfd_sfp : N3VDMulOps<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla.f32", v2f32,fmul,fadd>; -def : N3VDMulOpsPat; +// vml[as].f32 can cause 4-8 cycle stalls in following ASIMD instructions, so +// we want to avoid them for now. e.g., alternating vmla/vadd instructions. -let neverHasSideEffects = 1 in -def VMLSfd_sfp : N3VDMulOps<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls.f32", v2f32,fmul,fsub>; -def : N3VDMulOpsPat; +//let neverHasSideEffects = 1 in +//def VMLAfd_sfp : N3VDMulOps<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla.f32", v2f32,fmul,fadd>; +//def : N3VDMulOpsPat; + +//let neverHasSideEffects = 1 in +//def VMLSfd_sfp : N3VDMulOps<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls.f32", v2f32,fmul,fsub>; +//def : N3VDMulOpsPat; // Vector Absolute used for single-precision FP let neverHasSideEffects = 1 in diff --git a/test/CodeGen/ARM/fmacs.ll b/test/CodeGen/ARM/fmacs.ll index 1a1cd0747b4..5c31ea641de 100644 --- a/test/CodeGen/ARM/fmacs.ll +++ b/test/CodeGen/ARM/fmacs.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=arm -mattr=+vfp2 | grep -E {fmacs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1 -; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {vmla.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1 +; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {vmul.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1 ; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | grep -E {fmacs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1 -; RUN: llc < %s -march=arm -mcpu=cortex-a8 | grep -E {vmla.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1 +; RUN: llc < %s -march=arm -mcpu=cortex-a8 | grep -E {vmul.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | grep -E {fmacs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1 define float @test(float %acc, float %a, float %b) { diff --git a/test/CodeGen/ARM/fnmacs.ll b/test/CodeGen/ARM/fnmacs.ll index 969eb727467..8fc13e78bc3 100644 --- a/test/CodeGen/ARM/fnmacs.ll +++ b/test/CodeGen/ARM/fnmacs.ll @@ -7,8 +7,10 @@ entry: ; VFP2: fnmacs ; NEON: fnmacs -; NEONFP: vmls +; NEONFP-NOT: vmls ; NEONFP-NOT: fcpys +; NEONFP: vmul.f32 +; NEONFP: vsub.f32 ; NEONFP: fmrs %0 = fmul float %a, %b diff --git a/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll b/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll index ea88a2680f3..4320328e9c1 100644 --- a/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll +++ b/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 | grep fcpys | count 5 +; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 | grep fcpys | count 4 define arm_apcscc void @fht(float* nocapture %fz, i16 signext %n) nounwind { entry: