mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-10-25 10:27:04 +00:00
Model operand cycles of vldm / vstm; also fixes scheduling itineraries of vldr / vstr, etc.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@115898 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
@@ -1667,6 +1667,41 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
|
||||
default:
|
||||
DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
|
||||
break;
|
||||
case ARM::VLDMD:
case ARM::VLDMS:
case ARM::VLDMD_UPD:
case ARM::VLDMS_UPD: {
  // Def cycle for a register written by a VFP load-multiple (vldm).
  //
  // RegNo is 1 for the operand at index getNumOperands() - 1 of the defining
  // instruction's descriptor and grows by one for each operand after it.
  // A value <= 0 means DefIdx refers to an earlier operand.
  int RegNo = (int)(DefIdx+1) - DefTID.getNumOperands() + 1;
  if (RegNo <= 0) {
    // Def is the address writeback; take its cycle from the itinerary.
    DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
    break;
  }
  if (Subtarget.isCortexA8()) {
    // (regno / 2) + (regno % 2) + 1
    DefCycle = RegNo / 2 + 1;
    if (RegNo % 2)
      ++DefCycle;
  } else if (Subtarget.isCortexA9()) {
    DefCycle = RegNo;
    // Whether this is an 'S' register load is a property of the defining
    // instruction, so inspect DefTID here (the use may be any instruction).
    bool isSLoad = false;
    switch (DefTID.getOpcode()) {
    default: break;
    case ARM::VLDMS:
    case ARM::VLDMS_UPD:
      isSLoad = true;
      break;
    }
    // If there is an odd number of 'S' registers or if it's not 64-bit
    // aligned, then it takes an extra cycle.
    if ((isSLoad && (RegNo % 2)) || DefAlign < 8)
      ++DefCycle;
  } else {
    // Assume the worst.
    DefCycle = RegNo + 2;
  }
  break;
}
|
||||
case ARM::LDM_RET:
|
||||
case ARM::LDM:
|
||||
case ARM::LDM_UPD:
|
||||
@@ -1677,7 +1712,12 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
|
||||
case ARM::t2LDM:
|
||||
case ARM::t2LDM_UPD: {
|
||||
LdmBypass = 1;
|
||||
unsigned RegNo = (DefIdx+1) - DefTID.getNumOperands() + 1;
|
||||
int RegNo = (int)(DefIdx+1) - DefTID.getNumOperands() + 1;
|
||||
if (RegNo <= 0) {
|
||||
// Def is the address writeback.
|
||||
DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
|
||||
break;
|
||||
}
|
||||
if (Subtarget.isCortexA8()) {
|
||||
// 4 registers would be issued: 1, 2, 1.
|
||||
// 5 registers would be issued: 1, 2, 2.
|
||||
@@ -1710,6 +1750,40 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
|
||||
default:
|
||||
UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
|
||||
break;
|
||||
case ARM::VSTMD:
case ARM::VSTMS:
case ARM::VSTMD_UPD:
case ARM::VSTMS_UPD: {
  // Use cycle for a register read by a VFP store-multiple (vstm).
  //
  // StoreRegNo is 1 for the operand at index getNumOperands() - 1 of the
  // using instruction's descriptor and grows by one for each operand after it.
  int StoreRegNo = (int)(UseIdx+1) - UseTID.getNumOperands() + 1;
  if (StoreRegNo <= 0) {
    // Not one of the trailing register operands: use the itinerary value.
    UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
    break;
  }

  if (Subtarget.isCortexA8()) {
    // (regno / 2) + (regno % 2) + 1; StoreRegNo is positive here, so the
    // remainder is 0 or 1.
    UseCycle = StoreRegNo / 2 + StoreRegNo % 2 + 1;
    break;
  }

  if (!Subtarget.isCortexA9()) {
    // Assume the worst.
    UseCycle = StoreRegNo + 2;
    break;
  }

  // Cortex-A9: one cycle per register, plus one extra cycle when an odd
  // number of 'S' registers is involved or the access is not 64-bit aligned.
  const bool StoresSRegs = UseTID.getOpcode() == ARM::VSTMS ||
                           UseTID.getOpcode() == ARM::VSTMS_UPD;
  UseCycle = StoreRegNo;
  if (UseAlign < 8 || (StoresSRegs && StoreRegNo % 2 != 0))
    ++UseCycle;
  break;
}
|
||||
case ARM::STM:
|
||||
case ARM::STM_UPD:
|
||||
case ARM::tSTM_UPD:
|
||||
@@ -1717,14 +1791,16 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
|
||||
case ARM::tPOP:
|
||||
case ARM::t2STM:
|
||||
case ARM::t2STM_UPD: {
|
||||
unsigned RegNo = UseIdx - UseTID.getNumOperands() + 1;
|
||||
int RegNo = (int)(UseIdx+1) - UseTID.getNumOperands() + 1;
|
||||
if (RegNo <= 0) {
|
||||
UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
|
||||
break;
|
||||
}
|
||||
if (Subtarget.isCortexA8()) {
|
||||
// 4 registers would be issued: 1, 2, 1.
|
||||
// 5 registers would be issued: 1, 2, 2.
|
||||
UseCycle = RegNo / 2;
|
||||
if (UseCycle < 2)
|
||||
UseCycle = 2;
|
||||
// Result latency is issue cycle + 2: E2.
|
||||
// Read in E3.
|
||||
UseCycle += 2;
|
||||
} else if (Subtarget.isCortexA9()) {
|
||||
UseCycle = (RegNo / 2);
|
||||
@@ -1732,12 +1808,11 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
|
||||
// then it takes an extra AGU (Address Generation Unit) cycle.
|
||||
if ((RegNo % 2) || UseAlign < 8)
|
||||
++UseCycle;
|
||||
// Result latency is AGU cycles + 2.
|
||||
UseCycle += 2;
|
||||
} else {
|
||||
// Assume the worst.
|
||||
UseCycle = RegNo + 2;
|
||||
UseCycle = 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -132,13 +132,13 @@ def nModImm : Operand<i32> {
|
||||
// Use VLDM to load a Q register as a D register pair.
|
||||
// This is a pseudo instruction that is expanded to VLDMD after reg alloc.
|
||||
def VLDMQ
|
||||
: PseudoVFPLdStM<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoadm, "",
|
||||
: PseudoVFPLdStM<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoad_m, "",
|
||||
[(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]>;
|
||||
|
||||
// Use VSTM to store a Q register as a D register pair.
|
||||
// This is a pseudo instruction that is expanded to VSTMD after reg alloc.
|
||||
def VSTMQ
|
||||
: PseudoVFPLdStM<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStorem, "",
|
||||
: PseudoVFPLdStM<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStore_m, "",
|
||||
[(store (v2f64 QPR:$src), addrmode4:$addr)]>;
|
||||
|
||||
let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
|
||||
|
||||
@@ -78,20 +78,20 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr),
|
||||
|
||||
let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
|
||||
def VLDMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts,
|
||||
variable_ops), IndexModeNone, IIC_fpLoadm,
|
||||
variable_ops), IndexModeNone, IIC_fpLoad_m,
|
||||
"vldm${addr:submode}${p}\t$addr, $dsts", "", []> {
|
||||
let Inst{20} = 1;
|
||||
}
|
||||
|
||||
def VLDMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts,
|
||||
variable_ops), IndexModeNone, IIC_fpLoadm,
|
||||
variable_ops), IndexModeNone, IIC_fpLoad_m,
|
||||
"vldm${addr:submode}${p}\t$addr, $dsts", "", []> {
|
||||
let Inst{20} = 1;
|
||||
}
|
||||
|
||||
def VLDMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
|
||||
reglist:$dsts, variable_ops),
|
||||
IndexModeUpd, IIC_fpLoadm,
|
||||
IndexModeUpd, IIC_fpLoad_mu,
|
||||
"vldm${addr:submode}${p}\t$addr!, $dsts",
|
||||
"$addr.addr = $wb", []> {
|
||||
let Inst{20} = 1;
|
||||
@@ -99,7 +99,7 @@ def VLDMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
|
||||
|
||||
def VLDMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
|
||||
reglist:$dsts, variable_ops),
|
||||
IndexModeUpd, IIC_fpLoadm,
|
||||
IndexModeUpd, IIC_fpLoad_mu,
|
||||
"vldm${addr:submode}${p}\t$addr!, $dsts",
|
||||
"$addr.addr = $wb", []> {
|
||||
let Inst{20} = 1;
|
||||
@@ -108,20 +108,20 @@ def VLDMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
|
||||
|
||||
let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
|
||||
def VSTMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs,
|
||||
variable_ops), IndexModeNone, IIC_fpStorem,
|
||||
variable_ops), IndexModeNone, IIC_fpStore_m,
|
||||
"vstm${addr:submode}${p}\t$addr, $srcs", "", []> {
|
||||
let Inst{20} = 0;
|
||||
}
|
||||
|
||||
def VSTMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs,
|
||||
variable_ops), IndexModeNone, IIC_fpStorem,
|
||||
variable_ops), IndexModeNone, IIC_fpStore_m,
|
||||
"vstm${addr:submode}${p}\t$addr, $srcs", "", []> {
|
||||
let Inst{20} = 0;
|
||||
}
|
||||
|
||||
def VSTMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
|
||||
reglist:$srcs, variable_ops),
|
||||
IndexModeUpd, IIC_fpStorem,
|
||||
IndexModeUpd, IIC_fpStore_mu,
|
||||
"vstm${addr:submode}${p}\t$addr!, $srcs",
|
||||
"$addr.addr = $wb", []> {
|
||||
let Inst{20} = 0;
|
||||
@@ -129,7 +129,7 @@ def VSTMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
|
||||
|
||||
def VSTMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
|
||||
reglist:$srcs, variable_ops),
|
||||
IndexModeUpd, IIC_fpStorem,
|
||||
IndexModeUpd, IIC_fpStore_mu,
|
||||
"vstm${addr:submode}${p}\t$addr!, $srcs",
|
||||
"$addr.addr = $wb", []> {
|
||||
let Inst{20} = 0;
|
||||
|
||||
@@ -120,10 +120,12 @@ def IIC_fpSQRT32 : InstrItinClass;
|
||||
def IIC_fpSQRT64 : InstrItinClass;
|
||||
def IIC_fpLoad32 : InstrItinClass;
|
||||
def IIC_fpLoad64 : InstrItinClass;
|
||||
def IIC_fpLoadm : InstrItinClass<0>; // micro-coded
|
||||
def IIC_fpLoad_m : InstrItinClass<0>; // micro-coded
|
||||
def IIC_fpLoad_mu : InstrItinClass<0>; // micro-coded
|
||||
def IIC_fpStore32 : InstrItinClass;
|
||||
def IIC_fpStore64 : InstrItinClass;
|
||||
def IIC_fpStorem : InstrItinClass<0>; // micro-coded
|
||||
def IIC_fpStore_m : InstrItinClass<0>; // micro-coded
|
||||
def IIC_fpStore_mu : InstrItinClass<0>; // micro-coded
|
||||
def IIC_VLD1 : InstrItinClass;
|
||||
def IIC_VLD2 : InstrItinClass;
|
||||
def IIC_VLD3 : InstrItinClass;
|
||||
|
||||
@@ -414,54 +414,58 @@ def CortexA8Itineraries : ProcessorItineraries<
|
||||
InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Issue], 0>,
|
||||
InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<1, [A8_LdSt0], 0>,
|
||||
InstrStage<1, [A8_NLSPipe]>],
|
||||
InstrStage<2, [A8_NLSPipe]>],
|
||||
[2, 1]>,
|
||||
//
|
||||
// Double-precision FP Load
|
||||
// use A8_Issue to enforce the 1 load/store per cycle limit
|
||||
InstrItinData<IIC_fpLoad64, [InstrStage<2, [A8_Issue], 0>,
|
||||
InstrStage<1, [A8_Pipe0], 0>,
|
||||
InstrStage<1, [A8_Pipe1]>,
|
||||
InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<1, [A8_LdSt0], 0>,
|
||||
InstrStage<1, [A8_NLSPipe]>],
|
||||
InstrStage<2, [A8_NLSPipe]>],
|
||||
[2, 1]>,
|
||||
//
|
||||
// FP Load Multiple
|
||||
// use A8_Issue to enforce the 1 load/store per cycle limit
|
||||
InstrItinData<IIC_fpLoadm, [InstrStage<3, [A8_Issue], 0>,
|
||||
InstrStage<2, [A8_Pipe0], 0>,
|
||||
InstrStage<2, [A8_Pipe1]>,
|
||||
InstrItinData<IIC_fpLoad_m, [InstrStage<3, [A8_Issue], 0>,
|
||||
InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<1, [A8_LdSt0], 0>,
|
||||
InstrStage<1, [A8_NLSPipe]>]>,
|
||||
InstrStage<1, [A8_NLSPipe]>], [1, 1, 1, 2]>,
|
||||
//
|
||||
// FP Load Multiple + update
|
||||
InstrItinData<IIC_fpLoad_mu,[InstrStage<3, [A8_Issue], 0>,
|
||||
InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<1, [A8_LdSt0], 0>,
|
||||
InstrStage<1, [A8_NLSPipe]>], [2, 1, 1, 1, 2]>,
|
||||
//
|
||||
// Single-precision FP Store
|
||||
// use A8_Issue to enforce the 1 load/store per cycle limit
|
||||
InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Issue], 0>,
|
||||
InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<1, [A8_LdSt0], 0>,
|
||||
InstrStage<1, [A8_NLSPipe]>],
|
||||
InstrStage<2, [A8_NLSPipe]>],
|
||||
[1, 1]>,
|
||||
//
|
||||
// Double-precision FP Store
|
||||
// use A8_Issue to enforce the 1 load/store per cycle limit
|
||||
InstrItinData<IIC_fpStore64,[InstrStage<2, [A8_Issue], 0>,
|
||||
InstrStage<1, [A8_Pipe0], 0>,
|
||||
InstrStage<1, [A8_Pipe1]>,
|
||||
InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<1, [A8_LdSt0], 0>,
|
||||
InstrStage<1, [A8_NLSPipe]>],
|
||||
InstrStage<2, [A8_NLSPipe]>],
|
||||
[1, 1]>,
|
||||
//
|
||||
// FP Store Multiple
|
||||
// use A8_Issue to enforce the 1 load/store per cycle limit
|
||||
InstrItinData<IIC_fpStorem, [InstrStage<3, [A8_Issue], 0>,
|
||||
InstrStage<2, [A8_Pipe0], 0>,
|
||||
InstrStage<2, [A8_Pipe1]>,
|
||||
InstrItinData<IIC_fpStore_m,[InstrStage<3, [A8_Issue], 0>,
|
||||
InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<1, [A8_LdSt0], 0>,
|
||||
InstrStage<1, [A8_NLSPipe]>]>,
|
||||
InstrStage<1, [A8_NLSPipe]>], [1, 1, 1, 1]>,
|
||||
//
|
||||
// FP Store Multiple + update
|
||||
InstrItinData<IIC_fpStore_mu,[InstrStage<3, [A8_Issue], 0>,
|
||||
InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<1, [A8_LdSt0], 0>,
|
||||
InstrStage<1, [A8_NLSPipe]>], [2, 1, 1, 1, 1]>,
|
||||
|
||||
// NEON
|
||||
// Issue through integer pipeline, and execute in NEON unit.
|
||||
|
||||
@@ -629,11 +629,18 @@ def CortexA9Itineraries : ProcessorItineraries<
|
||||
[2, 1]>,
|
||||
//
|
||||
// FP Load Multiple
|
||||
InstrItinData<IIC_fpLoadm, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
|
||||
InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
|
||||
InstrStage<2, [A9_DRegsN], 0, Reserved>,
|
||||
InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
InstrStage<1, [A9_MUX0], 0>,
|
||||
InstrStage<1, [A9_NPipe]>]>,
|
||||
InstrStage<1, [A9_NPipe]>], [1, 1, 1, 1]>,
|
||||
//
|
||||
// FP Load Multiple + update
|
||||
InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
|
||||
InstrStage<2, [A9_DRegsN], 0, Reserved>,
|
||||
InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
InstrStage<1, [A9_MUX0], 0>,
|
||||
InstrStage<1, [A9_NPipe]>], [2, 1, 1, 1]>,
|
||||
//
|
||||
// Single-precision FP Store
|
||||
InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
|
||||
@@ -652,11 +659,18 @@ def CortexA9Itineraries : ProcessorItineraries<
|
||||
[1, 1]>,
|
||||
//
|
||||
// FP Store Multiple
|
||||
InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
|
||||
InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
|
||||
InstrStage<2, [A9_DRegsN], 0, Reserved>,
|
||||
InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
InstrStage<1, [A9_MUX0], 0>,
|
||||
InstrStage<1, [A9_NPipe]>]>,
|
||||
InstrStage<1, [A9_NPipe]>], [1, 1, 1, 1]>,
|
||||
//
|
||||
// FP Store Multiple + update
|
||||
InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
|
||||
InstrStage<2, [A9_DRegsN], 0, Reserved>,
|
||||
InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
InstrStage<1, [A9_MUX0], 0>,
|
||||
InstrStage<1, [A9_NPipe]>], [2, 1, 1, 1]>,
|
||||
// NEON
|
||||
// Issue through integer pipeline, and execute in NEON unit.
|
||||
// VLD1
|
||||
|
||||
@@ -254,7 +254,10 @@ def ARMV6Itineraries : ProcessorItineraries<
|
||||
InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
|
||||
//
|
||||
// FP Load Multiple
|
||||
InstrItinData<IIC_fpLoadm , [InstrStage<3, [V6_Pipe]>]>,
|
||||
InstrItinData<IIC_fpLoad_m , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>,
|
||||
//
|
||||
// FP Load Multiple + update
|
||||
InstrItinData<IIC_fpLoad_mu, [InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>,
|
||||
//
|
||||
// Single-precision FP Store
|
||||
InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
|
||||
@@ -264,5 +267,8 @@ def ARMV6Itineraries : ProcessorItineraries<
|
||||
InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
|
||||
//
|
||||
// FP Store Multiple
|
||||
InstrItinData<IIC_fpStorem , [InstrStage<3, [V6_Pipe]>]>
|
||||
InstrItinData<IIC_fpStore_m, [InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>,
|
||||
//
|
||||
// FP Store Multiple + update
|
||||
InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]>
|
||||
]>;
|
||||
|
||||
Reference in New Issue
Block a user