More ARM scheduling itinerary fixes.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@116266 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-04 10:30:01 +00:00 · 2010-10-11 23:41:41 +00:00 · 2010-10-11 23:41:41 +00:00 · 08cec1ef27
commit 08cec1ef27
parent 633e702317
4 changed files with 460 additions and 377 deletions
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@ -2549,9 +2549,9 @@ def  VMULpd   : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul",
                        "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>;
 def  VMULpq   : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul",
                        "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>;
-def  VMULfd   : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul", "f32",
+def  VMULfd   : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32",
                     v2f32, v2f32, fmul, 1>;
-def  VMULfq   : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul", "f32",
+def  VMULfq   : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32",
                     v4f32, v4f32, fmul, 1>;
 defm VMULsl   : N3VSL_HS<0b1000, "vmul", "i", mul>;
 def  VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>;
@ -3046,7 +3046,7 @@ def  VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
                        "vpadd", "i32",
                        v2i32, v2i32, int_arm_neon_vpadd, 0>;
 def  VPADDf   : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, 
-                        IIC_VBIND, "vpadd", "f32",
+                        IIC_VPBIND, "vpadd", "f32",
                        v2f32, v2f32, int_arm_neon_vpadd, 0>;

 //   VPADDL   : Vector Pairwise Add Long
@ -3074,7 +3074,7 @@ def  VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
                        "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>;
 def  VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
                        "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>;
-def  VPMAXf   : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+def  VPMAXf   : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax",
                        "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>;

 //   VPMIN    : Vector Pairwise Minimum
@ -3090,7 +3090,7 @@ def  VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
                        "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>;
 def  VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
                        "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>;
-def  VPMINf   : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VSUBi4D, "vpmin",
+def  VPMINf   : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin",
                        "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>;

 // Vector Reciprocal and Reciprocal Square Root Estimate and Step.
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@ -174,6 +174,9 @@ def IIC_VUNAD      : InstrItinClass;
 def IIC_VUNAQ      : InstrItinClass;
 def IIC_VBIND      : InstrItinClass;
 def IIC_VBINQ      : InstrItinClass;
+def IIC_VPBIND     : InstrItinClass;
+def IIC_VFMULD     : InstrItinClass;
+def IIC_VFMULQ     : InstrItinClass;
 def IIC_VMOV       : InstrItinClass;
 def IIC_VMOVImm    : InstrItinClass;
 def IIC_VMOVD      : InstrItinClass;
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@ -665,12 +665,25 @@ def CortexA8Itineraries : ProcessorItineraries<
  InstrItinData<IIC_VBIND,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                               InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 1]>,
+
+  //
  // Quad-register FP Binary
  // Result written in N5, but that is relative to the last cycle of multicycle,
  // so we use 6 for those cases
  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                               InstrStage<2, [A8_NPipe]>], [6, 2, 2]>,
  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_NPipe]>], [6, 2, 1]>,
+  //
  // Move
  InstrItinData<IIC_VMOV,     [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                               InstrStage<1, [A8_NPipe]>], [1, 1]>,
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td