mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-11-04 22:07:27 +00:00
[X86][SchedModel] SSE reciprocal square root instruction latencies.
The SSE rsqrt instruction (a fast reciprocal square root estimate) was grouped in the same IIC_SSE_SQRT* scheduling classes as the accurate (but very slow) SSE sqrt instruction. For code that uses rsqrt (possibly with Newton-Raphson refinement iterations), this poor scheduling hurt performance. This patch splits the rsqrt instruction off from the sqrt instruction scheduling classes and creates new IIC_SSE_RSQRT* classes with latency values based on Agner's table. Differential Revision: http://reviews.llvm.org/D5370 Patch by Simon Pilgrim. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218517 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
a0d5d7aed8
commit
a5ab9baf83
@ -3344,6 +3344,16 @@ def SSE_SQRTSD : OpndItins<
|
|||||||
>;
|
>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let Sched = WriteFRsqrt in {
|
||||||
|
def SSE_RSQRTPS : OpndItins<
|
||||||
|
IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
|
||||||
|
>;
|
||||||
|
|
||||||
|
def SSE_RSQRTSS : OpndItins<
|
||||||
|
IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
|
||||||
|
>;
|
||||||
|
}
|
||||||
|
|
||||||
let Sched = WriteFRcp in {
|
let Sched = WriteFRcp in {
|
||||||
def SSE_RCPP : OpndItins<
|
def SSE_RCPP : OpndItins<
|
||||||
IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
|
IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
|
||||||
@ -3622,10 +3632,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
|
|||||||
|
|
||||||
// Reciprocal approximations. Note that these typically require refinement
|
// Reciprocal approximations. Note that these typically require refinement
|
||||||
// in order to obtain suitable precision.
|
// in order to obtain suitable precision.
|
||||||
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
|
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
|
||||||
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
|
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
|
||||||
sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
|
sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
|
||||||
int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
|
int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
|
||||||
defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
|
defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
|
||||||
sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
|
sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
|
||||||
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
|
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
|
||||||
|
@ -129,6 +129,7 @@ defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
|
|||||||
defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
|
defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
|
||||||
defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
|
defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
|
||||||
defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
|
defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
|
||||||
|
defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
|
||||||
defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
|
defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
|
||||||
defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
|
defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
|
||||||
defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
|
defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
|
||||||
|
@ -117,6 +117,7 @@ defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
|
|||||||
defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
|
defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
|
||||||
defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
|
defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
|
||||||
defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
|
defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
|
||||||
|
defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
|
||||||
defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
|
defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
|
||||||
defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
|
defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
|
||||||
defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
|
defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
|
||||||
|
@ -63,12 +63,13 @@ def WriteZero : SchedWrite;
|
|||||||
defm WriteJump : X86SchedWritePair;
|
defm WriteJump : X86SchedWritePair;
|
||||||
|
|
||||||
// Floating point. This covers both scalar and vector operations.
|
// Floating point. This covers both scalar and vector operations.
|
||||||
defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
|
defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
|
||||||
defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
|
defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
|
||||||
defm WriteFDiv : X86SchedWritePair; // Floating point division.
|
defm WriteFDiv : X86SchedWritePair; // Floating point division.
|
||||||
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
|
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
|
||||||
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal.
|
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
|
||||||
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
|
defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
|
||||||
|
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
|
||||||
defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
|
defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
|
||||||
defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
|
defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
|
||||||
defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
|
defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
|
||||||
@ -314,6 +315,11 @@ def IIC_SSE_SQRTPD_RM : InstrItinClass;
|
|||||||
def IIC_SSE_SQRTSD_RR : InstrItinClass;
|
def IIC_SSE_SQRTSD_RR : InstrItinClass;
|
||||||
def IIC_SSE_SQRTSD_RM : InstrItinClass;
|
def IIC_SSE_SQRTSD_RM : InstrItinClass;
|
||||||
|
|
||||||
|
def IIC_SSE_RSQRTPS_RR : InstrItinClass;
|
||||||
|
def IIC_SSE_RSQRTPS_RM : InstrItinClass;
|
||||||
|
def IIC_SSE_RSQRTSS_RR : InstrItinClass;
|
||||||
|
def IIC_SSE_RSQRTSS_RM : InstrItinClass;
|
||||||
|
|
||||||
def IIC_SSE_RCPP_RR : InstrItinClass;
|
def IIC_SSE_RCPP_RR : InstrItinClass;
|
||||||
def IIC_SSE_RCPP_RM : InstrItinClass;
|
def IIC_SSE_RCPP_RM : InstrItinClass;
|
||||||
def IIC_SSE_RCPS_RR : InstrItinClass;
|
def IIC_SSE_RCPS_RR : InstrItinClass;
|
||||||
|
@ -224,6 +224,11 @@ def AtomItineraries : ProcessorItineraries<
|
|||||||
InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
|
InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
|
||||||
InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
|
InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
|
||||||
|
|
||||||
|
InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
|
||||||
|
InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
|
||||||
|
InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
|
||||||
|
InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
|
||||||
|
|
||||||
InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
|
InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
|
||||||
InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
|
InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
|
||||||
InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
|
InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
|
||||||
|
@ -163,15 +163,15 @@ defm : JWriteResIntPair<WriteJump, JALU01, 1>;
|
|||||||
// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
|
// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
|
||||||
// FIXME: Double precision latencies
|
// FIXME: Double precision latencies
|
||||||
// FIXME: SS vs PS latencies
|
// FIXME: SS vs PS latencies
|
||||||
// FIXME: RSQRT latencies
|
|
||||||
// FIXME: ymm latencies
|
// FIXME: ymm latencies
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
|
defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
|
||||||
defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
|
defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
|
||||||
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
|
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
|
||||||
defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
|
defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
|
||||||
defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
|
defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
|
||||||
|
defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
|
||||||
defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
|
defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
|
||||||
|
|
||||||
def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {
|
def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {
|
||||||
|
@ -101,6 +101,7 @@ def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
|
|||||||
// Scalar and vector floating point.
|
// Scalar and vector floating point.
|
||||||
defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
|
defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
|
||||||
defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
|
defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
|
||||||
|
defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
|
||||||
defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
|
defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
|
||||||
defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
|
defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
|
||||||
defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
|
defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
|
||||||
|
Loading…
Reference in New Issue
Block a user