[X86][SchedModel] SSE reciprocal square root instruction latencies.

The SSE rsqrt instruction (a fast reciprocal square root estimate) was
grouped in the same scheduling IIC_SSE_SQRT* class as the accurate (but very
slow) SSE sqrt instruction. For code which uses rsqrt (possibly with
newton-raphson iterations) this poor scheduling was affecting performances.

This patch splits off the rsqrt instruction from the sqrt instruction scheduling
classes and creates new IIC_SSE_RSQER* classes with latency values based on
Agner's table.

Differential Revision: http://reviews.llvm.org/D5370

Patch by Simon Pilgrim.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218517 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Andrea Di Biagio 2014-09-26 12:56:44 +00:00
parent a0d5d7aed8
commit a5ab9baf83
7 changed files with 39 additions and 15 deletions

View File

@ -3344,6 +3344,16 @@ def SSE_SQRTSD : OpndItins<
>; >;
} }
let Sched = WriteFRsqrt in {
def SSE_RSQRTPS : OpndItins<
IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
>;
def SSE_RSQRTSS : OpndItins<
IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
>;
}
let Sched = WriteFRcp in { let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins< def SSE_RCPP : OpndItins<
IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
@ -3622,10 +3632,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
// Reciprocal approximations. Note that these typically require refinement // Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision. // in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>, defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>, sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>; int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,

View File

@ -129,6 +129,7 @@ defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
defm : HWWriteResPair<WriteFMul, HWPort0, 5>; defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles. defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
defm : HWWriteResPair<WriteFRcp, HWPort0, 5>; defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>; defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>; defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>; defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;

View File

@ -117,6 +117,7 @@ defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
defm : SBWriteResPair<WriteFMul, SBPort0, 5>; defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles. defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
defm : SBWriteResPair<WriteFRcp, SBPort0, 5>; defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>; defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>; defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>; defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;

View File

@ -63,12 +63,13 @@ def WriteZero : SchedWrite;
defm WriteJump : X86SchedWritePair; defm WriteJump : X86SchedWritePair;
// Floating point. This covers both scalar and vector operations. // Floating point. This covers both scalar and vector operations.
defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare. defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
defm WriteFMul : X86SchedWritePair; // Floating point multiplication. defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
defm WriteFDiv : X86SchedWritePair; // Floating point division. defm WriteFDiv : X86SchedWritePair; // Floating point division.
defm WriteFSqrt : X86SchedWritePair; // Floating point square root. defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal. defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles. defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
defm WriteFBlend : X86SchedWritePair; // Floating point vector blends. defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends. defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
@ -314,6 +315,11 @@ def IIC_SSE_SQRTPD_RM : InstrItinClass;
def IIC_SSE_SQRTSD_RR : InstrItinClass; def IIC_SSE_SQRTSD_RR : InstrItinClass;
def IIC_SSE_SQRTSD_RM : InstrItinClass; def IIC_SSE_SQRTSD_RM : InstrItinClass;
def IIC_SSE_RSQRTPS_RR : InstrItinClass;
def IIC_SSE_RSQRTPS_RM : InstrItinClass;
def IIC_SSE_RSQRTSS_RR : InstrItinClass;
def IIC_SSE_RSQRTSS_RM : InstrItinClass;
def IIC_SSE_RCPP_RR : InstrItinClass; def IIC_SSE_RCPP_RR : InstrItinClass;
def IIC_SSE_RCPP_RM : InstrItinClass; def IIC_SSE_RCPP_RM : InstrItinClass;
def IIC_SSE_RCPS_RR : InstrItinClass; def IIC_SSE_RCPS_RR : InstrItinClass;

View File

@ -224,6 +224,11 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >, InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >, InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >, InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >, InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >, InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,

View File

@ -163,15 +163,15 @@ defm : JWriteResIntPair<WriteJump, JALU01, 1>;
// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions? // FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
// FIXME: Double precision latencies // FIXME: Double precision latencies
// FIXME: SS vs PS latencies // FIXME: SS vs PS latencies
// FIXME: RSQRT latencies
// FIXME: ymm latencies // FIXME: ymm latencies
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>; defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>; defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>; defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>; defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>; defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>; defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> { def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {

View File

@ -101,6 +101,7 @@ def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
// Scalar and vector floating point. // Scalar and vector floating point.
defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>; defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>; defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>; defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>; defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>; defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;