diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index b05fe629b74..85c41fc75d4 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -76,8 +76,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", "true", "Use NEON for single precision FP">; -// Allow more precision in FP computation -def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">; // Disable 32-bit to 16-bit narrowing for experimentation. def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 6b8f4cc4327..37284f979d4 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -181,11 +181,11 @@ def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, AssemblerPredicate<"FeatureVFP3">; def HasVFP4 : Predicate<"Subtarget->hasVFP4()">, AssemblerPredicate<"FeatureVFP4">; -def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">; +def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">; def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON">; def HasNEON2 : Predicate<"Subtarget->hasNEON2()">, - AssemblerPredicate<"FeatureNEON2">; + AssemblerPredicate<"FeatureNEON,FeatureVFP4">; def NoNEON2 : Predicate<"!Subtarget->hasNEON2()">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16">; @@ -221,6 +221,9 @@ def UseMovt : Predicate<"Subtarget->useMovt()">; def DontUseMovt : Predicate<"!Subtarget->useMovt()">; def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; +// Allow more precision in FP computation +def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 99dbb95431a..501cc8f4db9 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -4115,7 +4115,6 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; - // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations. def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", v2f32, fmul_su, fadd_mlx>, @@ -4136,10 +4135,10 @@ def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", // Match @llvm.fma.* intrinsics def : Pat<(fma (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm)), (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, - Requires<[HasNEON, HasVFP4]>; + Requires<[HasNEON2]>; def : Pat<(fma (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm)), (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, - Requires<[HasNEON, HasVFP4]>; + Requires<[HasNEON2]>; // Vector Subtract Operations. @@ -5497,9 +5496,9 @@ def : N3VSMulOpPat, def : N3VSMulOpPat, Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>; def : N3VSMulOpPat, - Requires<[HasNEON2, UseNEONForFP,FPContractions]>; + Requires<[HasNEON2, UseNEONForFP, FPContractions]>; def : N3VSMulOpPat, - Requires<[HasNEON2, UseNEONForFP,FPContractions]>; + Requires<[HasNEON2, UseNEONForFP, FPContractions]>; def : N2VSPat; def : N2VSPat; def : N3VSPat; diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index 8d86c01dc74..8b1fb9386ad 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -324,6 +324,15 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<19, [A8_NPipe], 0>, InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, // + // Single-precision Fused FP MAC + InstrItinData, + InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>, + // + // Double-precision Fused FP MAC + InstrItinData, + InstrStage<19, [A8_NPipe], 0>, + InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, + // // Single-precision FP DIV InstrItinData, InstrStage<20, [A8_NPipe], 0>, @@ -860,6 +869,16 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData, InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>, // + // Double-register Fused FP Multiple-Accumulate + InstrItinData, + InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>, + // + // Quad-register Fused FP Multiple-Accumulate + // Result written in N9, but that is relative to the last cycle of multicycle, + // so we use 10 for those cases + InstrItinData, + InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>, + // // Double-register Reciprical Step InstrItinData, InstrStage<1, [A8_NPipe]>], [9, 2, 2]>, diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 49fedf63f8b..0d710cc1ace 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -604,6 +604,22 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_NPipe]>], [9, 1, 1, 1]>, // + // Single-precision Fused FP MAC + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<9, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [8, 1, 1, 1]>, + // + // Double-precision Fused FP MAC + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<10, [A9_DRegsN], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [9, 1, 1, 1]>, + // // Single-precision FP DIV InstrItinData, InstrStage<1, [A9_MUX0], 0>, @@ -1697,6 +1713,26 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>, // + // Double-register Fused FP Multiple-Accumulate + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, + // + // Quad-register Fused FP Multiple-Accumulate + // Result written in N9, but that is relative to the last cycle of multicycle, + // so we use 10 for those cases + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe]>], + [8, 4, 2, 1]>, + // // Double-register Reciprical Step InstrItinData, InstrStage<1, [A9_MUX0], 0>, diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td index 4d959f565e0..0ace9bc1796 100644 --- a/lib/Target/ARM/ARMScheduleV6.td +++ b/lib/Target/ARM/ARMScheduleV6.td @@ -243,6 +243,12 @@ def ARMV6Itineraries : ProcessorItineraries< // Double-precision FP MAC InstrItinData], [9, 2, 2, 2]>, // + // Single-precision Fused FP MAC + InstrItinData], [9, 2, 2, 2]>, + // + // Double-precision Fused FP MAC + InstrItinData], [9, 2, 2, 2]>, + // // Single-precision FP DIV InstrItinData], [20, 2, 2]>, // diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 3d9c03d5dd2..5cf54b94a8a 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -45,7 +45,7 @@ protected: bool HasV6T2Ops; bool HasV7Ops; - /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEONVFPv4 - Specify what + /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEON2 - Specify what /// floating point ISAs are supported. bool HasVFPv2; bool HasVFPv3; diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 34dadf88238..8fa7378ffff 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -4659,6 +4659,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" || Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" || Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" || + Mnemonic == "vfms" || Mnemonic == "vfnms" || (Mnemonic == "movs" && isThumb()))) { Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1); CarrySetting = true; @@ -4702,6 +4703,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet, Mnemonic == "orr" || Mnemonic == "mvn" || Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" || Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" || + Mnemonic == "vfm" || Mnemonic == "vfnm" || (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" || Mnemonic == "mla" || Mnemonic == "smlal" || Mnemonic == "umlal" || Mnemonic == "umull"))) { diff --git a/test/MC/ARM/vfp4.s b/test/MC/ARM/vfp4.s new file mode 100644 index 00000000000..009d31d6433 --- /dev/null +++ b/test/MC/ARM/vfp4.s @@ -0,0 +1,50 @@ +@ RUN: llvm-mc < %s -triple armv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=ARM +@ RUN: llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=THUMB + + @ ARM: vfma.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xe2,0xee] +@ THUMB: vfma.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xa1,0x0b] +vfma.f64 d16, d18, d17 + +@ ARM: vfma.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0xa2,0xee] +@ THUMB: vfma.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x00,0x1a] +vfma.f32 s2, s4, s0 + +@ ARM: vfma.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x42,0xf2] +@ THUMB: vfma.f32 d16, d18, d17 @ encoding: [0x42,0xef,0xb1,0x0c] +vfma.f32 d16, d18, d17 + +@ ARM: vfma.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x08,0xf2] +@ THUMB: vfma.f32 q2, q4, q0 @ encoding: [0x08,0xef,0x50,0x4c] +vfma.f32 q2, q4, q0 + +@ ARM: vfnma.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xd2,0xee] +@ THUMB: vfnma.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xe1,0x0b] +vfnma.f64 d16, d18, d17 + +@ ARM: vfnma.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0x92,0xee] +@ THUMB: vfnma.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x40,0x1a] +vfnma.f32 s2, s4, s0 + +@ ARM: vfms.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xe2,0xee] +@ THUMB: vfms.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xe1,0x0b] +vfms.f64 d16, d18, d17 + +@ ARM: vfms.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0xa2,0xee] +@ THUMB: vfms.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x40,0x1a] +vfms.f32 s2, s4, s0 + +@ ARM: vfms.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x62,0xf2] +@ THUMB: vfms.f32 d16, d18, d17 @ encoding: [0x62,0xef,0xb1,0x0c] +vfms.f32 d16, d18, d17 + +@ ARM: vfms.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x28,0xf2] +@ THUMB: vfms.f32 q2, q4, q0 @ encoding: [0x28,0xef,0x50,0x4c] +vfms.f32 q2, q4, q0 + +@ ARM: vfnms.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xd2,0xee] +@ THUMB: vfnms.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xa1,0x0b] +vfnms.f64 d16, d18, d17 + +@ ARM: vfnms.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0x92,0xee] +@ THUMB: vfnms.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x00,0x1a] +vfnms.f32 s2, s4, s0 diff --git a/test/MC/Disassembler/ARM/vfp4.txt b/test/MC/Disassembler/ARM/vfp4.txt new file mode 100644 index 00000000000..4f2c7321183 --- /dev/null +++ b/test/MC/Disassembler/ARM/vfp4.txt @@ -0,0 +1,37 @@ +# RUN: llvm-mc < %s -triple thumbv7-unknown-unknown --disassemble -mattr=+neon,+vfp4 | FileCheck %s + +# CHECK: vfma.f64 d16, d18, d17 +0xe2 0xee 0xa1 0x0b + +# CHECK: vfma.f32 s2, s4, s0 +0xa2 0xee 0x00 0x1a + +# CHECK: vfma.f32 d16, d18, d17 +0x42 0xef 0xb1 0x0c + +# CHECK: vfma.f32 q2, q4, q0 +0x08 0xef 0x50 0x4c + +# CHECK: vfnms.f64 d16, d18, d17 +0xd2 0xee 0xa1 0x0b + +# CHECK: vfnms.f32 s2, s4, s0 +0x92 0xee 0x00 0x1a + +# CHECK: vfms.f64 d16, d18, d17 +0xe2 0xee 0xe1 0x0b + +# CHECK: vfms.f32 s2, s4, s0 +0xa2 0xee 0x40 0x1a + +# CHECK: vfms.f32 d16, d18, d17 +0x62 0xef 0xb1 0x0c + +# CHECK: vfms.f32 q2, q4, q0 +0x28 0xef 0x50 0x4c + +# CHECK: vfnma.f64 d16, d18, d17 +0xd2 0xee 0xe1 0x0b + +# CHECK: vfnma.f32 s2, s4, s0 +0x92 0xee 0x40 0x1a