diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td index ea7597ca2ca..2d5d9ff0d27 100644 --- a/include/llvm/IntrinsicsX86.td +++ b/include/llvm/IntrinsicsX86.td @@ -1825,10 +1825,138 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // FMA4 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". + def int_x86_fma4_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; def int_x86_fma4_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; + def int_x86_fma4_vfmadd_ps : GCCBuiltin<"__builtin_ia32_vfmaddps">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmadd_pd : GCCBuiltin<"__builtin_ia32_vfmaddpd">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddps256">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddpd256">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmsub_ss : GCCBuiltin<"__builtin_ia32_vfmsubss">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmsub_sd : GCCBuiltin<"__builtin_ia32_vfmsubsd">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmsub_ps : GCCBuiltin<"__builtin_ia32_vfmsubps">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmsub_pd : GCCBuiltin<"__builtin_ia32_vfmsubpd">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], + 
[IntrNoMem]>; + def int_x86_fma4_vfmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmsubps256">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmsubpd256">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmadd_ss : GCCBuiltin<"__builtin_ia32_vfnmaddss">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmadd_sd : GCCBuiltin<"__builtin_ia32_vfnmaddsd">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmadd_ps : GCCBuiltin<"__builtin_ia32_vfnmaddps">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmadd_pd : GCCBuiltin<"__builtin_ia32_vfnmaddpd">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmaddps256">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmaddpd256">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmsub_ss : GCCBuiltin<"__builtin_ia32_vfnmsubss">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmsub_sd : GCCBuiltin<"__builtin_ia32_vfnmsubsd">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmsub_ps : GCCBuiltin<"__builtin_ia32_vfnmsubps">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmsub_pd : GCCBuiltin<"__builtin_ia32_vfnmsubpd">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, 
llvm_v2f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmsubps256">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfnmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmsubpd256">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmaddsub_pd : GCCBuiltin<"__builtin_ia32_vfmaddsubpd">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmaddsub_ps_256 : + GCCBuiltin<"__builtin_ia32_vfmaddsubps256">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmaddsub_pd_256 : + GCCBuiltin<"__builtin_ia32_vfmaddsubpd256">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmsubadd_ps : GCCBuiltin<"__builtin_ia32_vfmsubaddps">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmsubadd_pd : GCCBuiltin<"__builtin_ia32_vfmsubaddpd">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmsubadd_ps_256 : + GCCBuiltin<"__builtin_ia32_vfmsubaddps256">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], + [IntrNoMem]>; + def int_x86_fma4_vfmsubadd_pd_256 : + GCCBuiltin<"__builtin_ia32_vfmsubaddpd256">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], + [IntrNoMem]>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index bdf797d5e19..015b01ecffd 100644 --- 
a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -83,12 +83,74 @@ multiclass fma4s opc, string OpcodeStr> { } +multiclass fma4p opc, string OpcodeStr> { + def rr : FMA4, XOP_W; + def rm : FMA4, XOP_W; + def mr : FMA4; + def rrY : FMA4, XOP_W; + def rmY : FMA4, XOP_W; + def mrY : FMA4; +} + let isAsmParserOnly = 1 in { + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss">; defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd">; + defm VFMADDPS4 : fma4p<0x68, "vfmaddps">; + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd">; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss">; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd">; + defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps">; + defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd">; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss">; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd">; + defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps">; + defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd">; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss">; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd">; + defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps">; + defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd">; + defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps">; + defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd">; + defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps">; + defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd">; } // FMA4 Intrinsics patterns +// VFMADD +def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMADDSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMADDSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2, VR128:$src3), (VFMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2, @@ -97,3 +159,290 @@ def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, 
VR128:$src2, def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2), VR128:$src3), (VFMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFMADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFMSUB +def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, VR128:$src2, 
VR128:$src3), + (VFMSUBSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMSUBSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMSUBSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMSUBSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMSUBSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : 
Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFMSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFNMADD +def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMADDSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMADDSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMADDSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, VR128:$src2, 
VR128:$src3), + (VFNMADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFNMADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFNMADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFNMADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFNMADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFNMSUB +def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMSUBSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMSUBSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMSUBSD4rm VR128:$src1, 
VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMSUBSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFNMSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFNMSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFNMSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFNMSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFMADDSUB +def : 
Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMADDSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMADDSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMADDSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMADDSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMADDSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMADDSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMADDSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFMADDSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFMSUBADD +def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : 
Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMSUBADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMSUBADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMSUBADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMSUBADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMSUBADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMSUBADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMSUBADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFMSUBADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll index 39c2311eb52..bd94c134ce2 100644 --- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll +++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll @@ -1,4 +1,12 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-march=x86-64 -mattr=fma4 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s + +; VFMADD +define < 4 x float > @test_x86_fma4_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { + ; CHECK: vfmaddss + %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; [#uses=1] + ret < 4 x float > %res +} +declare < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma4_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { ; CHECK: vfmaddsd @@ -7,3 +15,229 @@ define < 2 x double > @test_x86_fma4_vfmadd_sd(< 2 x double > %a0, < 2 x double } declare < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone +define < 4 x float > @test_x86_fma4_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { + ; CHECK: vfmaddps + %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; [#uses=1] + ret < 4 x float > %res +} +declare < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone + +define < 2 x double > @test_x86_fma4_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { + ; CHECK: vfmaddpd + %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; [#uses=1] + ret < 2 x double > %res +} +declare < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone + +define < 8 x float > @test_x86_fma4_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { + ; CHECK: vfmaddps + ; CHECK: ymm + %res = call < 8 x float > @llvm.x86.fma4.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; [#uses=1] + ret < 8 x float > %res +} +declare < 8 x float > 
@llvm.x86.fma4.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone + +define < 4 x double > @test_x86_fma4_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { + ; CHECK: vfmaddpd + ; CHECK: ymm + %res = call < 4 x double > @llvm.x86.fma4.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; [#uses=1] + ret < 4 x double > %res +} +declare < 4 x double > @llvm.x86.fma4.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone + +; VFMSUB +define < 4 x float > @test_x86_fma4_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { + ; CHECK: vfmsubss + %res = call < 4 x float > @llvm.x86.fma4.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; [#uses=1] + ret < 4 x float > %res +} +declare < 4 x float > @llvm.x86.fma4.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone + +define < 2 x double > @test_x86_fma4_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { + ; CHECK: vfmsubsd + %res = call < 2 x double > @llvm.x86.fma4.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; [#uses=1] + ret < 2 x double > %res +} +declare < 2 x double > @llvm.x86.fma4.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone + +define < 4 x float > @test_x86_fma4_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { + ; CHECK: vfmsubps + %res = call < 4 x float > @llvm.x86.fma4.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; [#uses=1] + ret < 4 x float > %res +} +declare < 4 x float > @llvm.x86.fma4.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone + +define < 2 x double > @test_x86_fma4_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { + ; CHECK: vfmsubpd + %res = call < 2 x double > @llvm.x86.fma4.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; [#uses=1] + ret < 2 x double > %res +} 
+declare < 2 x double > @llvm.x86.fma4.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { ; 256-bit form: the second check line also requires a ymm register
+  ; CHECK: vfmsubps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma4.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; [#uses=1]
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { ; 256-bit form: the second check line also requires a ymm register
+  ; CHECK: vfmsubpd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma4.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; [#uses=1]
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFNMADD -- six tests (ss, sd, ps, pd, ps.256, pd.256); each passes its three arguments to one intrinsic and expects the matching mnemonic
+define < 4 x float > @test_x86_fma4_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK: vfnmaddss
+  %res = call < 4 x float > @llvm.x86.fma4.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; [#uses=1]
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK: vfnmaddsd
+  %res = call < 2 x double > @llvm.x86.fma4.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; [#uses=1]
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 4 x float > @test_x86_fma4_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK: vfnmaddps
+  %res = call < 4 x float > @llvm.x86.fma4.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; [#uses=1]
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK: vfnmaddpd
+  %res = call < 2 x double > @llvm.x86.fma4.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; [#uses=1]
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { ; 256-bit form: the second check line also requires a ymm register
+  ; CHECK: vfnmaddps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma4.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; [#uses=1]
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { ; 256-bit form: the second check line also requires a ymm register
+  ; CHECK: vfnmaddpd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma4.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; [#uses=1]
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFNMSUB -- six tests (ss, sd, ps, pd, ps.256, pd.256); each passes its three arguments to one intrinsic and expects the matching mnemonic
+define < 4 x float > @test_x86_fma4_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK: vfnmsubss
+  %res = call < 4 x float > @llvm.x86.fma4.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; [#uses=1]
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK: vfnmsubsd
+  %res = call < 2 x double > @llvm.x86.fma4.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; [#uses=1]
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 4 x float > @test_x86_fma4_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK: vfnmsubps
+  %res = call < 4 x float > @llvm.x86.fma4.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; [#uses=1]
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK: vfnmsubpd
+  %res = call < 2 x double > @llvm.x86.fma4.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; [#uses=1]
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { ; 256-bit form: the second check line also requires a ymm register
+  ; CHECK: vfnmsubps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma4.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; [#uses=1]
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { ; 256-bit form: the second check line also requires a ymm register
+  ; CHECK: vfnmsubpd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma4.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; [#uses=1]
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFMADDSUB -- four tests (ps, pd, ps.256, pd.256); no scalar ss/sd variants are tested in this group
+define < 4 x float > @test_x86_fma4_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK: vfmaddsubps
+  %res = call < 4 x float > @llvm.x86.fma4.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; [#uses=1]
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK: vfmaddsubpd
+  %res = call < 2 x double > @llvm.x86.fma4.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; [#uses=1]
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { ; 256-bit form: the second check line also requires a ymm register
+  ; CHECK: vfmaddsubps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma4.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; [#uses=1]
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { ; 256-bit form: the second check line also requires a ymm register
+  ; CHECK: vfmaddsubpd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma4.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; [#uses=1]
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFMSUBADD -- four tests (ps, pd, ps.256, pd.256); no scalar ss/sd variants are tested in this group
+define < 4 x float > @test_x86_fma4_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK: vfmsubaddps
+  %res = call < 4 x float > @llvm.x86.fma4.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; [#uses=1]
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK: vfmsubaddpd
+  %res = call < 2 x double > @llvm.x86.fma4.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; [#uses=1]
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { ; 256-bit form: the second check line also requires a ymm register
+  ; CHECK: vfmsubaddps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma4.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; [#uses=1]
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { ; 256-bit form: the second check line also requires a ymm register
+  ; CHECK: vfmsubaddpd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma4.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; [#uses=1]
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
diff --git a/test/MC/X86/x86_64-fma4-encoding.s b/test/MC/X86/x86_64-fma4-encoding.s
index e0d2602901e..805fc23cf4c 100644
--- a/test/MC/X86/x86_64-fma4-encoding.s
+++ b/test/MC/X86/x86_64-fma4-encoding.s
@@ -1,5 +1,18 @@
 // RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
 
+// vfmadd -- the forms added here are each assembled three ways: memory operand first, memory operand second, registers only
+// CHECK: vfmaddss (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6a,0x01,0x10]
+          vfmaddss (%rcx), %xmm1, %xmm0, %xmm0 // memory operand first: third byte is 0xf9 (VEX.W bit set)
+
+// CHECK: vfmaddss %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x6a,0x01,0x10]
+          vfmaddss %xmm1, (%rcx),%xmm0, %xmm0 // memory operand second: third byte is 0x79 (VEX.W bit clear)
+
+// CHECK: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6a,0xc2,0x10]
+          vfmaddss %xmm2, %xmm1, %xmm0, %xmm0 // registers only: ModRM 0xc2 names xmm2; the final 0x10 names xmm1 in its high nibble
+
 // CHECK: vfmaddsd (%rcx), %xmm1, %xmm0, %xmm0
 // CHECK: encoding: [0xc4,0xe3,0xf9,0x6b,0x01,0x10]
           vfmaddsd (%rcx), %xmm1, %xmm0, %xmm0
@@ -11,3 +24,368 @@
 // CHECK: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
 // CHECK: encoding: [0xc4,0xe3,0xf9,0x6b,0xc2,0x10]
           vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x68,0x01,0x10]
+          vfmaddps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x68,0x01,0x10]
+          vfmaddps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x68,0xc2,0x10]
+          vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x69,0x01,0x10]
+          vfmaddpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x69,0x01,0x10]
+          vfmaddpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x69,0xc2,0x10]
+          vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x68,0x01,0x10]
+          vfmaddps (%rcx), %ymm1, %ymm0, %ymm0 // ymm operands: third byte is 0xfd rather than 0xf9 (VEX.L bit set)
+
+// CHECK: vfmaddps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x68,0x01,0x10]
+          vfmaddps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x68,0xc2,0x10]
+          vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x69,0x01,0x10]
+          vfmaddpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x69,0x01,0x10]
+          vfmaddpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x69,0xc2,0x10]
+          vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+
+// vfmsub -- opcode bytes 0x6c (ps), 0x6d (pd), 0x6e (ss), 0x6f (sd) in the encodings below
+// CHECK: vfmsubss (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6e,0x01,0x10]
+          vfmsubss (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubss %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x6e,0x01,0x10]
+          vfmsubss %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6e,0xc2,0x10]
+          vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubsd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6f,0x01,0x10]
+          vfmsubsd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubsd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x6f,0x01,0x10]
+          vfmsubsd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6f,0xc2,0x10]
+          vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6c,0x01,0x10]
+          vfmsubps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x6c,0x01,0x10]
+          vfmsubps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6c,0xc2,0x10]
+          vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6d,0x01,0x10]
+          vfmsubpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x6d,0x01,0x10]
+          vfmsubpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6d,0xc2,0x10]
+          vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x6c,0x01,0x10]
+          vfmsubps (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmsubps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x6c,0x01,0x10]
+          vfmsubps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x6c,0xc2,0x10]
+          vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 // registers only: ModRM 0xc2 names ymm2; the final 0x10 names ymm1 in its high nibble
+
+// CHECK: vfmsubpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x6d,0x01,0x10]
+          vfmsubpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmsubpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x6d,0x01,0x10]
+          vfmsubpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x6d,0xc2,0x10]
+          vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+
+// vfnmadd -- opcode bytes 0x78 (ps), 0x79 (pd), 0x7a (ss), 0x7b (sd); each form is assembled with memory first, memory second, then registers only
+// CHECK: vfnmaddss (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7a,0x01,0x10]
+          vfnmaddss (%rcx), %xmm1, %xmm0, %xmm0 // memory operand first: third byte is 0xf9 (VEX.W bit set)
+
+// CHECK: vfnmaddss %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7a,0x01,0x10]
+          vfnmaddss %xmm1, (%rcx),%xmm0, %xmm0 // memory operand second: third byte is 0x79 (VEX.W bit clear)
+
+// CHECK: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7a,0xc2,0x10]
+          vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddsd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7b,0x01,0x10]
+          vfnmaddsd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddsd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7b,0x01,0x10]
+          vfnmaddsd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7b,0xc2,0x10]
+          vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x78,0x01,0x10]
+          vfnmaddps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x78,0x01,0x10]
+          vfnmaddps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x78,0xc2,0x10]
+          vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x79,0x01,0x10]
+          vfnmaddpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x79,0x01,0x10]
+          vfnmaddpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x79,0xc2,0x10]
+          vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x78,0x01,0x10]
+          vfnmaddps (%rcx), %ymm1, %ymm0, %ymm0 // ymm operands: third byte is 0xfd rather than 0xf9 (VEX.L bit set)
+
+// CHECK: vfnmaddps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x78,0x01,0x10]
+          vfnmaddps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x78,0xc2,0x10]
+          vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmaddpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x79,0x01,0x10]
+          vfnmaddpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmaddpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x79,0x01,0x10]
+          vfnmaddpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x79,0xc2,0x10]
+          vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+
+// vfnmsub -- opcode bytes 0x7c (ps), 0x7d (pd), 0x7e (ss), 0x7f (sd) in the encodings below
+// CHECK: vfnmsubss (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7e,0x01,0x10]
+          vfnmsubss (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubss %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7e,0x01,0x10]
+          vfnmsubss %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7e,0xc2,0x10]
+          vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubsd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7f,0x01,0x10]
+          vfnmsubsd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubsd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7f,0x01,0x10]
+          vfnmsubsd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7f,0xc2,0x10]
+          vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7c,0x01,0x10]
+          vfnmsubps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7c,0x01,0x10]
+          vfnmsubps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7c,0xc2,0x10]
+          vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7d,0x01,0x10]
+          vfnmsubpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7d,0x01,0x10]
+          vfnmsubpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7d,0xc2,0x10]
+          vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x7c,0x01,0x10]
+          vfnmsubps (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmsubps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x7c,0x01,0x10]
+          vfnmsubps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x7c,0xc2,0x10]
+          vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmsubpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x7d,0x01,0x10]
+          vfnmsubpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmsubpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x7d,0x01,0x10]
+          vfnmsubpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x7d,0xc2,0x10]
+          vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+
+// vfmaddsub -- packed forms only; opcode bytes 0x5c (ps) and 0x5d (pd) in the encodings below
+// CHECK: vfmaddsubps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5c,0x01,0x10]
+          vfmaddsubps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddsubps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x5c,0x01,0x10]
+          vfmaddsubps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5c,0xc2,0x10]
+          vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddsubpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5d,0x01,0x10]
+          vfmaddsubpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddsubpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x5d,0x01,0x10]
+          vfmaddsubpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5d,0xc2,0x10]
+          vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddsubps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5c,0x01,0x10]
+          vfmaddsubps (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddsubps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x5c,0x01,0x10]
+          vfmaddsubps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5c,0xc2,0x10]
+          vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddsubpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5d,0x01,0x10]
+          vfmaddsubpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddsubpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x5d,0x01,0x10]
+          vfmaddsubpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5d,0xc2,0x10]
+          vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+
+// vfmsubadd -- the ps forms below use opcode byte 0x5e
+// CHECK: vfmsubaddps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5e,0x01,0x10]
+          vfmsubaddps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubaddps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x5e,0x01,0x10]
+          vfmsubaddps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5e,0xc2,0x10]
+          vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubaddpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5f,0x01,0x10]
+          vfmsubaddpd (%rcx), %xmm1, %xmm0, %xmm0 // memory operand first: third byte is 0xf9 (VEX.W bit set)
+
+// CHECK: vfmsubaddpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x5f,0x01,0x10]
+          vfmsubaddpd %xmm1, (%rcx),%xmm0, %xmm0 // memory operand second: third byte is 0x79 (VEX.W bit clear)
+
+// CHECK: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5f,0xc2,0x10]
+          vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 // registers only: ModRM 0xc2 names xmm2; the final 0x10 names xmm1 in its high nibble
+
+// CHECK: vfmsubaddps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5e,0x01,0x10]
+          vfmsubaddps (%rcx), %ymm1, %ymm0, %ymm0 // ymm operands: third byte is 0xfd rather than 0xf9 (VEX.L bit set)
+
+// CHECK: vfmsubaddps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x5e,0x01,0x10]
+          vfmsubaddps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5e,0xc2,0x10]
+          vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmsubaddpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5f,0x01,0x10]
+          vfmsubaddpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmsubaddpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x5f,0x01,0x10]
+          vfmsubaddpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5f,0xc2,0x10]
+          vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0