From 4b4e62219be91839091f9e35d8accf877f925d81 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Sun, 22 Jan 2012 12:07:33 +0000 Subject: [PATCH] Add fused multiple+add instructions from VFPv4. Patch by Ana Pazos! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@148658 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARM.td | 6 ++ lib/Target/ARM/ARMAsmPrinter.cpp | 16 +++- lib/Target/ARM/ARMInstrInfo.td | 6 ++ lib/Target/ARM/ARMInstrNEON.td | 34 ++++++-- lib/Target/ARM/ARMInstrVFP.td | 137 +++++++++++++++++++++++++++---- lib/Target/ARM/ARMSchedule.td | 4 + lib/Target/ARM/ARMSubtarget.cpp | 2 + lib/Target/ARM/ARMSubtarget.h | 8 +- test/CodeGen/ARM/fusedMAC.ll | 68 +++++++++++++++ 9 files changed, 254 insertions(+), 27 deletions(-) create mode 100644 test/CodeGen/ARM/fusedMAC.ll diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 86aeeb21a79..dd9c4bd793e 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -32,9 +32,15 @@ def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", "Enable VFP3 instructions", [FeatureVFP2]>; +def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", + "Enable VFP4 instructions", + [FeatureVFP3]>; def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable NEON instructions", [FeatureVFP3]>; +def FeatureNEONVFP4 : SubtargetFeature<"neon-vfpv4", "HasNEONVFPv4", "true", + "Enable NEON-VFP4 instructions", + [FeatureVFP4, FeatureNEON]>; def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", "Enable Thumb2 instructions">; def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 270d9d87353..288b7f14e4c 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -719,15 +719,25 @@ void ARMAsmPrinter::emitAttributes() { if (Subtarget->hasNEON() && emitFPU) { /* NEON is not exactly a VFP architecture, but GAS emit one of - * neon/vfpv3/vfpv2 for .fpu parameters */ - AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon"); + * neon/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */ + if (Subtarget->hasNEONVFP4()) + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon-vfpv4"); + else + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon"); /* If emitted for NEON, omit from VFP below, since you can have both * NEON and VFP in build attributes but only one .fpu */ emitFPU = false; } + /* VFPv4 + .fpu */ + if (Subtarget->hasVFP4()) { + AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, + ARMBuildAttrs::AllowFPv4A); + if (emitFPU) + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv4"); + /* VFPv3 + .fpu */ - if (Subtarget->hasVFP3()) { + } else if (Subtarget->hasVFP3()) { AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, ARMBuildAttrs::AllowFPv3A); if (emitFPU) diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index dabd187069f..99d180a5695 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -179,8 +179,14 @@ def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate<"FeatureVFP2">; def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, AssemblerPredicate<"FeatureVFP3">; +def HasVFP4 : Predicate<"Subtarget->hasVFP4()">, + AssemblerPredicate<"FeatureVFP4">; +def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">; def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON">; +def HasNEONVFP4 : Predicate<"Subtarget->hasNEONVFP4()">, + AssemblerPredicate<"FeatureNEONVFP4">; +def NoNEONVFP4 : Predicate<"!Subtarget->hasNEONVFP4()">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16">; def HasDivide : Predicate<"Subtarget->hasDivide()">, diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index a2df5bde880..f9626fef00b 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -3897,10 +3897,10 @@ defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", v2f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx]>; + Requires<[HasNEON, UseFPVMLx, NoNEONVFP4]>; def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", v4f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx]>; + Requires<[HasNEON, UseFPVMLx, NoNEONVFP4]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", @@ -3955,10 +3955,10 @@ defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", v2f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx]>; + Requires<[HasNEON, UseFPVMLx, NoNEONVFP4]>; def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", v4f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx]>; + Requires<[HasNEON, UseFPVMLx, NoNEONVFP4]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", @@ -4007,6 +4007,24 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; + +// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations. +def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", + v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEONVFP4]>; + +def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32", + v4f32, fmul_su, fadd_mlx>, + Requires<[HasNEONVFP4]>; + +// Fused Vector Multiply Subtract (floating-point) +def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", + v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEONVFP4]>; +def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", + v4f32, fmul_su, fsub_mlx>, + Requires<[HasNEONVFP4]>; + // Vector Subtract Operations. // VSUB : Vector Subtract (integer and floating-point) @@ -5358,9 +5376,13 @@ def : N3VSPat; def : N3VSPat; def : N3VSPat; def : N3VSMulOpPat, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEONVFP4]>; def : N3VSMulOpPat, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEONVFP4]>; +def : N3VSMulOpPat, + Requires<[HasNEONVFP4, UseNEONForFP]>; +def : N3VSMulOpPat, + Requires<[HasNEONVFP4, UseNEONForFP]>; def : N2VSPat; def : N2VSPat; def : N3VSPat; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 81c8c7e45fc..1c07394f398 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -920,7 +920,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0, [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseFPVMLx]>; + Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; def VMLAS : ASbIn<0b11100, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -928,7 +928,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -936,10 +936,10 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseFPVMLx]>; + Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,NoVFP4]>; def VMLSD : ADbI<0b11100, 0b00, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -947,7 +947,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0, [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseFPVMLx]>; + Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; def VMLSS : ASbIn<0b11100, 0b00, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -955,7 +955,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -963,10 +963,10 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseFPVMLx]>; + Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]>; def VNMLAD : ADbI<0b11100, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -974,7 +974,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0, [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseFPVMLx]>; + Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; def VNMLAS : ASbI<0b11100, 0b01, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -982,7 +982,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -990,10 +990,10 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseFPVMLx]>; + Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]>; def VNMLSD : ADbI<0b11100, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1001,14 +1001,14 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0, [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseFPVMLx]>; + Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; def VNMLSS : ASbI<0b11100, 0b01, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -1016,11 +1016,116 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseFPVMLx]>; + Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]>; +//===----------------------------------------------------------------------===// +// Fused FP Multiply-Accumulate Operations. +// +def VFMAD : ADbI<0b11101, 0b10, 0, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpFMAC64, "vfma", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP4]>; + +def VFMAS : ASbIn<0b11101, 0b10, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpFMAC32, "vfma", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP4,DontUseNEONForFP]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. +} + +def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), + (VFMAD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP4]>; +def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), + (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP4,DontUseNEONForFP]>; + +def VFMSD : ADbI<0b11101, 0b10, 1, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpFMAC64, "vfms", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP4]>; + +def VFMSS : ASbIn<0b11101, 0b10, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpFMAC32, "vfms", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP4,DontUseNEONForFP]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. +} + +def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), + (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP4]>; +def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), + (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP4,DontUseNEONForFP]>; + +def VFNMAD : ADbI<0b11101, 0b01, 1, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpFMAC64, "vfnma", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP4]>; + +def VFNMAS : ASbI<0b11101, 0b01, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpFMAC32, "vfnma", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP4,DontUseNEONForFP]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. +} + +def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), + (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP4]>; +def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), + (VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP4,DontUseNEONForFP]>; + +def VFNMSD : ADbI<0b11101, 0b01, 0, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpFMAC64, "vfnms", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP4]>; + +def VFNMSS : ASbI<0b11101, 0b01, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP4,DontUseNEONForFP]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. +} + +def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), + (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP4]>; +def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), + (VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP4,DontUseNEONForFP]>; //===----------------------------------------------------------------------===// // FP Conditional moves. diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 958c5c64701..86caa403f9b 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -118,6 +118,8 @@ def IIC_fpMUL32 : InstrItinClass; def IIC_fpMUL64 : InstrItinClass; def IIC_fpMAC32 : InstrItinClass; def IIC_fpMAC64 : InstrItinClass; +def IIC_fpFMAC32 : InstrItinClass; +def IIC_fpFMAC64 : InstrItinClass; def IIC_fpDIV32 : InstrItinClass; def IIC_fpDIV64 : InstrItinClass; def IIC_fpSQRT32 : InstrItinClass; @@ -208,6 +210,8 @@ def IIC_VPERMQ : InstrItinClass; def IIC_VPERMQ3 : InstrItinClass; def IIC_VMACD : InstrItinClass; def IIC_VMACQ : InstrItinClass; +def IIC_VFMACD : InstrItinClass; +def IIC_VFMACQ : InstrItinClass; def IIC_VRECSD : InstrItinClass; def IIC_VRECSQ : InstrItinClass; def IIC_VCNTiD : InstrItinClass; diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 074cd4c0d7d..bb77a3c48ca 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -47,7 +47,9 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, , HasV7Ops(false) , HasVFPv2(false) , HasVFPv3(false) + , HasVFPv4(false) , HasNEON(false) + , HasNEONVFPv4(false) , UseNEONForSinglePrecisionFP(false) , SlowFPVMLx(false) , HasVMLxForwarding(false) diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 85479a5e1c9..b6c40d32cf2 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -45,11 +45,13 @@ protected: bool HasV6T2Ops; bool HasV7Ops; - /// HasVFPv2, HasVFPv3, HasNEON - Specify what floating point ISAs are - /// supported. + /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEONVFPv4 - Specify what + /// floating point ISAs are supported. bool HasVFPv2; bool HasVFPv3; + bool HasVFPv4; bool HasNEON; + bool HasNEONVFPv4; /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been /// specified. Use the method useNEONForSinglePrecisionFP() to @@ -197,7 +199,9 @@ protected: bool hasVFP2() const { return HasVFPv2; } bool hasVFP3() const { return HasVFPv3; } + bool hasVFP4() const { return HasVFPv4; } bool hasNEON() const { return HasNEON; } + bool hasNEONVFP4() const { return HasNEONVFPv4; } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; } diff --git a/test/CodeGen/ARM/fusedMAC.ll b/test/CodeGen/ARM/fusedMAC.ll new file mode 100644 index 00000000000..d7b8ed165b4 --- /dev/null +++ b/test/CodeGen/ARM/fusedMAC.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s -march=arm -mattr=+neon-vfpv4 | FileCheck %s +; Check generated fused MAC and MLS. + +define double @fusedMACTest1(double %d1, double %d2, double %d3) nounwind readnone noinline { +;CHECK: fusedMACTest1: +;CHECK: vfma.f64 + %1 = fmul double %d1, %d2 + %2 = fadd double %1, %d3 + ret double %2 +} + +define float @fusedMACTest2(float %f1, float %f2, float %f3) nounwind readnone noinline { +;CHECK: fusedMACTest2: +;CHECK: vfma.f32 + %1 = fmul float %f1, %f2 + %2 = fadd float %1, %f3 + ret float %2 +} + +define double @fusedMACTest3(double %d1, double %d2, double %d3) nounwind readnone noinline { +;CHECK: fusedMACTest3: +;CHECK: vfms.f64 + %1 = fmul double %d2, %d3 + %2 = fsub double %d1, %1 + ret double %2 +} + +define float @fusedMACTest4(float %f1, float %f2, float %f3) nounwind readnone noinline { +;CHECK: fusedMACTest4: +;CHECK: vfms.f32 + %1 = fmul float %f2, %f3 + %2 = fsub float %f1, %1 + ret float %2 +} + +define double @fusedMACTest5(double %d1, double %d2, double %d3) nounwind readnone noinline { +;CHECK: fusedMACTest5: +;CHECK: vfnma.f64 + %1 = fmul double %d1, %d2 + %2 = fsub double -0.0, %1 + %3 = fsub double %2, %d3 + ret double %3 +} + +define float @fusedMACTest6(float %f1, float %f2, float %f3) nounwind { +;CHECK: fusedMACTest6: +;CHECK: vfnma.f32 + %1 = fmul float %f1, %f2 + %2 = fsub float -0.0, %1 + %3 = fsub float %2, %f3 + ret float %3 +} + +define double @fusedMACTest7(double %d1, double %d2, double %d3) nounwind { +;CHECK: fusedMACTest7: +;CHECK: vfnms.f64 + %1 = fmul double %d1, %d2 + %2 = fsub double %1, %d3 + ret double %2 +} + +define float @fusedMACTest8(float %f1, float %f2, float %f3) nounwind { +;CHECK: fusedMACTest8: +;CHECK: vfnms.f32 + %1 = fmul float %f1, %f2 + %2 = fsub float %1, %f3 + ret float %2 +}