mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-02-21 21:29:41 +00:00
Fix itins for VABA
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@100657 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
fc2b08438c
commit
0a3e2b591c
@ -1711,21 +1711,22 @@ multiclass N3VMulOpSL_HS<bits<4> op11_8,
|
||||
// Neon 3-argument intrinsics,
|
||||
// element sizes of 8, 16 and 32 bits:
|
||||
multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
|
||||
InstrItinClass itinD, InstrItinClass itinQ,
|
||||
string OpcodeStr, string Dt, Intrinsic IntOp> {
|
||||
// 64-bit vector types.
|
||||
def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D,
|
||||
def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD,
|
||||
OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
|
||||
def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
|
||||
def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD,
|
||||
OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>;
|
||||
def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32D,
|
||||
def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD,
|
||||
OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>;
|
||||
|
||||
// 128-bit vector types.
|
||||
def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16Q,
|
||||
def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ,
|
||||
OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>;
|
||||
def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16Q,
|
||||
def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ,
|
||||
OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>;
|
||||
def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32Q,
|
||||
def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ,
|
||||
OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>;
|
||||
}
|
||||
|
||||
@ -1734,10 +1735,11 @@ multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
|
||||
|
||||
// First with only element sizes of 16 and 32 bits:
|
||||
multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
|
||||
InstrItinClass itin,
|
||||
string OpcodeStr, string Dt, Intrinsic IntOp> {
|
||||
def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
|
||||
def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin,
|
||||
OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
|
||||
def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi16D,
|
||||
def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin,
|
||||
OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
|
||||
}
|
||||
|
||||
@ -1751,9 +1753,10 @@ multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
|
||||
|
||||
// ...then also with element size of 8 bits:
|
||||
multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
|
||||
InstrItinClass itin,
|
||||
string OpcodeStr, string Dt, Intrinsic IntOp>
|
||||
: N3VLInt3_HS<op24, op23, op11_8, op4, OpcodeStr, Dt, IntOp> {
|
||||
def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D,
|
||||
: N3VLInt3_HS<op24, op23, op11_8, op4, itin, OpcodeStr, Dt, IntOp> {
|
||||
def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin,
|
||||
OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
|
||||
}
|
||||
|
||||
@ -2177,15 +2180,17 @@ def : Pat<(v4f32 (fadd (v4f32 QPR:$src1),
|
||||
(SubReg_i32_lane imm:$lane)))>;
|
||||
|
||||
// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
|
||||
defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, "vmlal", "s", int_arm_neon_vmlals>;
|
||||
defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, "vmlal", "u", int_arm_neon_vmlalu>;
|
||||
defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, IIC_VMACi16D,
|
||||
"vmlal", "s", int_arm_neon_vmlals>;
|
||||
defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, IIC_VMACi16D,
|
||||
"vmlal", "u", int_arm_neon_vmlalu>;
|
||||
|
||||
defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal", "s", int_arm_neon_vmlals>;
|
||||
defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal", "u", int_arm_neon_vmlalu>;
|
||||
|
||||
// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
|
||||
defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal", "s",
|
||||
int_arm_neon_vqdmlal>;
|
||||
defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D,
|
||||
"vqdmlal", "s", int_arm_neon_vqdmlal>;
|
||||
defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>;
|
||||
|
||||
// VMLS : Vector Multiply Subtract (integer and floating-point)
|
||||
@ -2227,15 +2232,17 @@ def : Pat<(v4f32 (fsub (v4f32 QPR:$src1),
|
||||
(SubReg_i32_lane imm:$lane)))>;
|
||||
|
||||
// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
|
||||
defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl", "s", int_arm_neon_vmlsls>;
|
||||
defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl", "u", int_arm_neon_vmlslu>;
|
||||
defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, IIC_VMACi16D,
|
||||
"vmlsl", "s", int_arm_neon_vmlsls>;
|
||||
defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, IIC_VMACi16D,
|
||||
"vmlsl", "u", int_arm_neon_vmlslu>;
|
||||
|
||||
defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl", "s", int_arm_neon_vmlsls>;
|
||||
defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl", "u", int_arm_neon_vmlslu>;
|
||||
|
||||
// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
|
||||
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl", "s",
|
||||
int_arm_neon_vqdmlsl>;
|
||||
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D,
|
||||
"vqdmlsl", "s", int_arm_neon_vqdmlsl>;
|
||||
defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
|
||||
|
||||
// Vector Subtract Operations.
|
||||
@ -2464,12 +2471,16 @@ defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
|
||||
"vabdl", "u", int_arm_neon_vabdlu, 0>;
|
||||
|
||||
// VABA : Vector Absolute Difference and Accumulate
|
||||
defm VABAs : N3VInt3_QHS<0,0,0b0111,1, "vaba", "s", int_arm_neon_vabas>;
|
||||
defm VABAu : N3VInt3_QHS<1,0,0b0111,1, "vaba", "u", int_arm_neon_vabau>;
|
||||
defm VABAs : N3VInt3_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
|
||||
"vaba", "s", int_arm_neon_vabas>;
|
||||
defm VABAu : N3VInt3_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
|
||||
"vaba", "u", int_arm_neon_vabau>;
|
||||
|
||||
// VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
|
||||
defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, "vabal", "s", int_arm_neon_vabals>;
|
||||
defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal", "u", int_arm_neon_vabalu>;
|
||||
defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, IIC_VABAD,
|
||||
"vabal", "s", int_arm_neon_vabals>;
|
||||
defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, IIC_VABAD,
|
||||
"vabal", "u", int_arm_neon_vabalu>;
|
||||
|
||||
// Vector Maximum and Minimum.
|
||||
|
||||
|
@ -135,6 +135,8 @@ def IIC_VBINi4D : InstrItinClass;
|
||||
def IIC_VBINi4Q : InstrItinClass;
|
||||
def IIC_VSUBi4D : InstrItinClass;
|
||||
def IIC_VSUBi4Q : InstrItinClass;
|
||||
def IIC_VABAD : InstrItinClass;
|
||||
def IIC_VABAQ : InstrItinClass;
|
||||
def IIC_VSHLiD : InstrItinClass;
|
||||
def IIC_VSHLiQ : InstrItinClass;
|
||||
def IIC_VSHLi4D : InstrItinClass;
|
||||
|
@ -522,6 +522,15 @@ def CortexA8Itineraries : ProcessorItineraries<[
|
||||
InstrItinData<IIC_VPALiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
|
||||
InstrStage<2, [FU_NPipe]>], [7, 3, 2, 1]>,
|
||||
//
|
||||
// Double-register Absolute Difference and Accumulate
|
||||
InstrItinData<IIC_VABAD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
|
||||
InstrStage<1, [FU_NPipe]>], [6, 3, 2, 1]>,
|
||||
//
|
||||
// Quad-register Absolute Difference and Accumulate
|
||||
InstrItinData<IIC_VABAQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
|
||||
InstrStage<2, [FU_NPipe]>], [6, 3, 2, 1]>,
|
||||
|
||||
//
|
||||
// Double-register Integer Multiply (.8, .16)
|
||||
InstrItinData<IIC_VMULi16D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
|
||||
InstrStage<1, [FU_NPipe]>], [6, 2, 2]>,
|
||||
@ -883,7 +892,38 @@ def CortexA9Itineraries : ProcessorItineraries<[
|
||||
// Extra 3 latency cycles since wbck is 6 cycles
|
||||
InstrStage2<7, [FU_DRegsVFP], 0, Reserved>,
|
||||
InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
|
||||
InstrStage<1, [FU_NPipe]>], [4, 2, 1]>
|
||||
InstrStage<1, [FU_NPipe]>], [4, 2, 1]>,
|
||||
|
||||
//
|
||||
// Double-register Integer Count
|
||||
InstrItinData<IIC_VCNTiD, [InstrStage2<1, [FU_DRegsN], 0, Required>,
|
||||
// Extra 3 latency cycles since wbck is 6 cycles
|
||||
InstrStage2<7, [FU_DRegsVFP], 0, Reserved>,
|
||||
InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
|
||||
InstrStage<1, [FU_NPipe]>], [3, 2, 2]>,
|
||||
//
|
||||
// Quad-register Integer Count
|
||||
// Result written in N3, but that is relative to the last cycle of a multicycle instruction,
|
||||
// so we use 4 for those cases
|
||||
InstrItinData<IIC_VCNTiQ, [InstrStage2<1, [FU_DRegsN], 0, Required>,
|
||||
// Extra 3 latency cycles since wbck is 7 cycles
|
||||
InstrStage2<8, [FU_DRegsVFP], 0, Reserved>,
|
||||
InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
|
||||
InstrStage<2, [FU_NPipe]>], [4, 2, 2]>,
|
||||
//
|
||||
// Double-register Absolute Difference and Accumulate
|
||||
InstrItinData<IIC_VABAD, [InstrStage2<1, [FU_DRegsN], 0, Required>,
|
||||
// Extra 3 latency cycles since wbck is 6 cycles
|
||||
InstrStage2<7, [FU_DRegsVFP], 0, Reserved>,
|
||||
InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
|
||||
InstrStage<1, [FU_NPipe]>], [6, 3, 2, 1]>,
|
||||
//
|
||||
// Quad-register Absolute Difference and Accumulate
|
||||
InstrItinData<IIC_VABAQ, [InstrStage2<1, [FU_DRegsN], 0, Required>,
|
||||
// Extra 3 latency cycles since wbck is 6 cycles
|
||||
InstrStage2<7, [FU_DRegsVFP], 0, Reserved>,
|
||||
InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
|
||||
InstrStage<2, [FU_NPipe]>], [6, 3, 2, 1]>
|
||||
]>;
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user