diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 0d6a9f0f692..d6849767705 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3123,10 +3123,9 @@ let isCodeGenOnly = 1 in { } // Patterns used to select SSE scalar fp arithmetic instructions from -// a scalar fp operation followed by a blend. +// either: // -// These patterns know, for example, how to select an ADDSS from a -// float add plus vector insert. +// (1) a scalar fp operation followed by a blend // // The effect is that the backend no longer emits unnecessary vector // insert instructions immediately after SSE scalar fp instructions @@ -3138,218 +3137,14 @@ let isCodeGenOnly = 1 in { // return A; // } // -// previously we generated: +// Previously we generated: // addss %xmm0, %xmm1 // movss %xmm1, %xmm0 // -// we now generate: +// We now generate: // addss %xmm1, %xmm0 - -let Predicates = [UseSSE1] in { - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; -} - -let Predicates = [UseSSE2] in { - // SSE2 patterns to select scalar double-precision fp arithmetic instructions - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; -} - -let Predicates = [UseSSE41] in { - // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is - // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When - // selecting SSE scalar single-precision fp arithmetic instructions, make - // sure that we correctly match them. 
- - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 
0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; -} - -let Predicates = [HasAVX] in { - // The following patterns select AVX Scalar single/double precision fp - // arithmetic instructions. - - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract 
(v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (i8 1))),
-            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (i8 1))),
-            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-}
-
-// Patterns used to select SSE scalar fp arithmetic instructions from
-// a vector packed single/double fp operation followed by a vector insert.
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
 //
 // The effect is that the backend converts the packed fp instruction
 // followed by a vector insert into a single SSE scalar fp instruction.
@@ -3360,159 +3155,171 @@ let Predicates = [HasAVX] in {
 //   return (__m128) {c[0], a[1], a[2], a[3]};
 // }
 //
-// previously we generated:
+// Previously we generated:
 //   addps %xmm0, %xmm1
 //   movss %xmm1, %xmm0
 //
-// we now generate:
+// We now generate:
 //   addss %xmm1, %xmm0
-
-let Predicates = [UseSSE1] in {
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
-}
+// TODO: Some canonicalization in lowering would simplify the number of
+// patterns we have to try to match. In particular, the reversed order blends
+// seem unnecessary.
+multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
+  let Predicates = [UseSSE1] in {
+    // extracted scalar math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))))),
+      (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // vector math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+      (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+  }
+
+  // With SSE 4.1, insertps/blendi are preferred to movss, so match those too.
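+  //
+  // As a hedged illustration (mirroring the blend_*_ss tests added to
+  // test/CodeGen/X86/sse-scalar-fp-arith.ll in this patch), the IR shape
+  // these SSE4.1 patterns cover is:
+  //   %ext  = extractelement <4 x float> %a, i32 0
+  //   %op   = fadd float %ext, %b
+  //   %ins  = insertelement <4 x float> undef, float %op, i32 0
+  //   %shuf = shufflevector <4 x float> %ins, <4 x float> %a,
+  //                         <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  // where the final lane-0 insert is lowered to X86insertps or X86Blendi
+  // rather than X86Movss.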
+  let Predicates = [UseSSE41] in {
+    // extracted scalar math op with insert via insertps
+    def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))), (iPTR 0))),
+      (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // extracted scalar math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))), (i8 1))),
+      (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // vector math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+      (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+  }
+
+  // Repeat everything for AVX, except for the movss + scalar combo...
+  // because that one shouldn't occur with AVX codegen?
+  let Predicates = [HasAVX] in {
+    // extracted scalar math op with insert via insertps
+    def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))), (iPTR 0))),
+      (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // extracted scalar math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))), (i8 1))),
+      (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // vector math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+      (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+
+    // vector math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+      (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+  }
+}
 
-let Predicates = [UseSSE2] in {
-  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
-  // from a packed double-precision fp instruction plus movsd.
+defm : scalar_math_f32_patterns<fadd, "ADD">;
+defm : scalar_math_f32_patterns<fsub, "SUB">;
+defm : scalar_math_f32_patterns<fmul, "MUL">;
+defm : scalar_math_f32_patterns<fdiv, "DIV">;
 
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-}
+multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
+  let Predicates = [UseSSE2] in {
+    // extracted scalar math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))))),
+      (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // vector math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+      (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+  }
+
+  // With SSE 4.1, blendi is preferred to movsd, so match those too.
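+  //
+  // As a hedged illustration (mirroring the blend_*_sd tests added in this
+  // patch), the IR shape covered here is:
+  //   %ext  = extractelement <2 x double> %a, i32 0
+  //   %op   = fadd double %ext, %b
+  //   %ins  = insertelement <2 x double> undef, double %op, i32 0
+  //   %shuf = shufflevector <2 x double> %ins, <2 x double> %a,
+  //                         <2 x i32> <i32 0, i32 3>
+  // where the lane-0 insert is lowered to X86Blendi rather than X86Movsd.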
+  let Predicates = [UseSSE41] in {
+    // extracted scalar math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))), (i8 1))),
+      (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // vector math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+      (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+
+    // vector math op with insert via blend (reversed order)
+    def : Pat<(v2f64 (X86Blendi
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+          (v2f64 VR128:$dst), (i8 2))),
+      (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+  }
+
+  // Repeat everything for AVX and add one more pattern
+  // (the scalar + blend reversed order) for good measure.
+  let Predicates = [HasAVX] in {
+    // extracted scalar math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))))),
+      (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // extracted scalar math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))), (i8 1))),
+      (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // extracted scalar math op with insert via blend (reversed order)
+    def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector
+          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+      (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // vector math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+      (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+
+    // vector math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+      (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+
+    // vector math op with insert via blend (reversed order)
+    def : Pat<(v2f64 (X86Blendi
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+          (v2f64 VR128:$dst), (i8 2))),
+      (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+  }
+}
 
-let Predicates = [UseSSE41] in {
-  // With SSE4.1 we may see these operations using X86Blendi rather than
-  // X86Movs{s,d}.
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
-                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
-                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
-                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
-                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+defm : scalar_math_f64_patterns<fadd, "ADD">;
+defm : scalar_math_f64_patterns<fsub, "SUB">;
+defm : scalar_math_f64_patterns<fmul, "MUL">;
+defm : scalar_math_f64_patterns<fdiv, "DIV">;
 
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-
-  def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                   (v2f64 VR128:$dst), (i8 2))),
-            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                   (v2f64 VR128:$dst), (i8 2))),
-            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                   (v2f64 VR128:$dst), (i8 2))),
-            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                   (v2f64 VR128:$dst), (i8 2))),
-            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-}
-
-let Predicates = [HasAVX] in {
-  // The following patterns select AVX Scalar single/double precision fp
-  // arithmetic instructions from a packed single precision fp instruction
-  // plus movss/movsd.
-
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-
-  // Also handle X86Blendi-based patterns.
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; - - def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; -} /// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll index b122ef67544..8b1c6d0c882 100644 --- a/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -370,8 +370,155 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } +; With SSE4.1 or greater, the shuffles in the following tests may +; be lowered to X86Blendi nodes. 
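+;
+; As a rough sketch (illustrative only; this C snippet is an assumption for
+; exposition, not part of the tests), each blend_* function below is the IR
+; for source like:
+;   __m128 blend_add_ss(__m128 a, float b) {
+;     a[0] += b;
+;     return a;
+;   }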
+
+define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <4 x float> %a, i32 0
+  %op = fadd float %b, %ext
+  %ins = insertelement <4 x float> undef, float %op, i32 0
+  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuf
+}
+
+define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <4 x float> %a, i32 0
+  %op = fsub float %ext, %b
+  %ins = insertelement <4 x float> undef, float %op, i32 0
+  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuf
+}
+
+define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <4 x float> %a, i32 0
+  %op = fmul float %b, %ext
+  %ins = insertelement <4 x float> undef, float %op, i32 0
+  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuf
+}
+
+define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <4 x float> %a, i32 0
+  %op = fdiv float %ext, %b
+  %ins = insertelement <4 x float> undef, float %op, i32 0
+  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuf
+}
+
+define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <2 x double> %a, i32 0
+  %op = fadd double %b, %ext
+  %ins = insertelement <2 x double> undef, double %op, i32 0
+  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuf
+}
+
+define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <2 x double> %a, i32 0
+  %op = fsub double %ext, %b
+  %ins = insertelement <2 x double> undef, double %op, i32 0
+  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuf
+}
+
+define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <2 x double> %a, i32 0
+  %op = fmul double %b, %ext
+  %ins = insertelement <2 x double> undef, double %op, i32 0
+  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuf
+}
+
+define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <2 x double> %a, i32 0
+  %op = fdiv double %ext, %b
+  %ins = insertelement <2 x double> undef, double %op, i32 0
+  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuf
+}
+
 ; Ensure that the backend selects SSE/AVX scalar fp instructions
-; from a packed fp instrution plus a vector insert.
+; from a packed fp instruction plus a vector insert.
 
 define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
 ; SSE-LABEL: insert_test_add_ss: