From 84f69e8436d522cb3a772766ba67a1d7658dfdf5 Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Sat, 9 Oct 2010 01:45:34 +0000 Subject: [PATCH] Finish vld3 and vld4. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@116140 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMInstrNEON.td | 44 ++++++++++++++++----------------- lib/Target/ARM/ARMSchedule.td | 3 +++ lib/Target/ARM/ARMScheduleA8.td | 33 +++++++++++++++++++------ lib/Target/ARM/ARMScheduleA9.td | 32 ++++++++++++++++++++---- 4 files changed, 78 insertions(+), 34 deletions(-) diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 24563a1a5e2..4340a7d3c74 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -344,7 +344,7 @@ def VLD3d32Pseudo : VLDQQPseudo; class VLD3DWB op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD3, + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD3u, "vld3", Dt, "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", []>; @@ -352,9 +352,9 @@ def VLD3d8_UPD : VLD3DWB<0b0100, 0b0000, "8">; def VLD3d16_UPD : VLD3DWB<0b0100, 0b0100, "16">; def VLD3d32_UPD : VLD3DWB<0b0100, 0b1000, "32">; -def VLD3d8Pseudo_UPD : VLDQQWBPseudo; -def VLD3d16Pseudo_UPD : VLDQQWBPseudo; -def VLD3d32Pseudo_UPD : VLDQQWBPseudo; +def VLD3d8Pseudo_UPD : VLDQQWBPseudo; +def VLD3d16Pseudo_UPD : VLDQQWBPseudo; +def VLD3d32Pseudo_UPD : VLDQQWBPseudo; // ...with double-spaced registers (non-updating versions for disassembly only): def VLD3q8 : VLD3D<0b0101, 0b0000, "8">; @@ -364,14 +364,14 @@ def VLD3q8_UPD : VLD3DWB<0b0101, 0b0000, "8">; def VLD3q16_UPD : VLD3DWB<0b0101, 0b0100, "16">; def VLD3q32_UPD : VLD3DWB<0b0101, 0b1000, "32">; -def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo; -def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo; -def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo; +def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo; +def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo; +def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo; // ...alternate versions to be allocated odd register numbers: -def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo; // VLD4 : Vector Load (multiple 4-element structures) class VLD4D op11_8, bits<4> op7_4, string Dt> @@ -499,7 +499,7 @@ def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo; class VLD3LN op11_8, bits<4> op7_4, string Dt> : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, - nohash_imm:$lane), IIC_VLD3, "vld3", Dt, + nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>; @@ -507,16 +507,16 @@ def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8">; def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16">; def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32">; -def VLD3LNd8Pseudo : VLDQQLNPseudo; -def VLD3LNd16Pseudo : VLDQQLNPseudo; -def VLD3LNd32Pseudo : VLDQQLNPseudo; +def VLD3LNd8Pseudo : VLDQQLNPseudo; +def VLD3LNd16Pseudo : VLDQQLNPseudo; +def VLD3LNd32Pseudo : VLDQQLNPseudo; // ...with double-spaced registers: def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16">; def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32">; -def VLD3LNq16Pseudo : VLDQQQQLNPseudo; -def VLD3LNq32Pseudo : VLDQQQQLNPseudo; +def VLD3LNq16Pseudo : VLDQQQQLNPseudo; +def VLD3LNq32Pseudo : VLDQQQQLNPseudo; // ...with address register writeback: class VLD3LNWB op11_8, bits<4> op7_4, string Dt> @@ -524,7 +524,7 @@ class VLD3LNWB op11_8, bits<4> op7_4, string Dt> (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), - IIC_VLD3, "vld3", Dt, + IIC_VLD3lnu, "vld3", Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr$offset", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $addr.addr = $wb", []>; @@ -533,15 +533,15 @@ def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8">; def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16">; def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32">; -def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo; -def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo; -def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo; +def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo; +def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo; +def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo; def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16">; def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32">; -def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo; -def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo; +def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo; +def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo; // VLD4LN : Vector Load (single 4-element structure to one lane) class VLD4LN op11_8, bits<4> op7_4, string Dt> diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 521faa196d0..73c677e0d4a 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -141,6 +141,9 @@ def IIC_VLD2x2u : InstrItinClass; def IIC_VLD2ln : InstrItinClass; def IIC_VLD2lnu : InstrItinClass; def IIC_VLD3 : InstrItinClass; +def IIC_VLD3ln : InstrItinClass; +def IIC_VLD3u : InstrItinClass; +def IIC_VLD3lnu : InstrItinClass; def IIC_VLD4 : InstrItinClass; def IIC_VST : InstrItinClass; def IIC_VUNAD : InstrItinClass; diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index e7118398451..6c4cf8f122d 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -441,39 +441,58 @@ def CortexA8Itineraries : ProcessorItineraries< [2, 2, 1]>, // // VLD2x2 - InstrItinData, + InstrItinData, InstrStage<3, [A8_NLSPipe], 1>, InstrStage<3, [A8_LSPipe]>], [2, 2, 3, 3, 1]>, // // VLD2ln - InstrItinData, + InstrItinData, InstrStage<3, [A8_NLSPipe], 1>, InstrStage<3, [A8_LSPipe]>], [3, 3, 1, 1, 1, 1]>, // // VLD2u - InstrItinData, + InstrItinData, InstrStage<1, [A8_NLSPipe], 1>, InstrStage<1, [A8_LSPipe]>], [2, 2, 2, 1, 1, 1]>, // // VLD2x2u - InstrItinData, + InstrItinData, InstrStage<3, [A8_NLSPipe], 1>, InstrStage<3, [A8_LSPipe]>], [2, 2, 3, 3, 2, 1]>, // // VLD2lnu - InstrItinData, + InstrItinData, InstrStage<3, [A8_NLSPipe], 1>, InstrStage<3, [A8_LSPipe]>], [3, 3, 2, 1, 1, 1, 1, 1]>, // // VLD3 InstrItinData, - InstrStage<1, [A8_NLSPipe]>, - InstrStage<1, [A8_LSPipe]>], [2, 2, 2, 1]>, + InstrStage<4, [A8_NLSPipe], 1>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 1]>, + // + // VLD3ln + InstrItinData, + InstrStage<5, [A8_NLSPipe], 1>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 1, 1, 1, 1, 2]>, + // + // VLD3u + InstrItinData, + InstrStage<4, [A8_NLSPipe], 1>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 2, 1]>, + // + // VLD3lnu + InstrItinData, + InstrStage<5, [A8_NLSPipe], 1>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>, // // VLD4 InstrItinData, diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index c199ef7f2b2..2d2bc370f52 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -822,14 +822,36 @@ def CortexA9Itineraries : ProcessorItineraries< [4, 4, 2, 1, 1, 1, 1, 1]>, // // VLD3 - // FIXME: We don't model this instruction properly InstrItinData, - // Extra latency cycles since wbck is 6 cycles - InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, - InstrStage<1, [A9_NPipe]>], - [2, 2, 2, 1]>, + InstrStage<4, [A9_NPipe]>], + [4, 4, 5, 1]>, + // + // VLD3ln + InstrItinData, + InstrStage<11, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<5, [A9_NPipe]>], + [5, 5, 6, 1, 1, 1, 1, 2]>, + // + // VLD3u + InstrItinData, + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<4, [A9_NPipe]>], + [4, 4, 5, 2, 1]>, + // + // VLD3lnu + InstrItinData, + InstrStage<11, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<5, [A9_NPipe]>], + [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>, // // VLD4 // FIXME: We don't model this instruction properly