From 2fb982aa720ec1ef149b2d9add2673c313f08792 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Wed, 28 Aug 2013 11:21:58 +0000 Subject: [PATCH] AVX-512: added SQRT, VRSQRT14, VCOMISS, VUCOMISS, VRCP14, VPABS git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189472 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrAVX512.td | 443 ++++++++++++++++++++++++++ lib/Target/X86/X86InstrSSE.td | 42 +-- test/CodeGen/X86/avx512-arith.ll | 37 +++ test/CodeGen/X86/avx512-cmp.ll | 27 ++ test/CodeGen/X86/avx512-intrinsics.ll | 88 +++++ 5 files changed, 616 insertions(+), 21 deletions(-) create mode 100644 test/CodeGen/X86/avx512-cmp.ll create mode 100644 test/CodeGen/X86/avx512-intrinsics.ll diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 90eb7d91899..95b0de41fa8 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2378,6 +2378,433 @@ let Predicates = [HasAVX512] in { def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; } + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, + "ucomiss{z}">, TB, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64, + "ucomisd{z}">, TB, OpSize, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + let Pattern = [] in { + defm VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load, + "comiss{z}">, TB, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load, + "comisd{z}">, TB, OpSize, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + } + defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem, + load, "ucomiss">, TB, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem, + load, "ucomisd">, TB, OpSize, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + + defm 
Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem, + load, "comiss">, TB, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem, + load, "comisd">, TB, OpSize, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +/// avx512_fp_unop_p - AVX-512 FP unops in packed form. +multiclass avx512_fp_unop_p opc, string OpcodeStr, SDNode OpNode> { + def PSZr : AVX5128I, + EVEX, EVEX_V512; + def PSZm : AVX5128I, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; + def PDZr : AVX5128I, + EVEX, EVEX_V512, VEX_W; + def PDZm : AVX5128I, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} + +/// avx512_fp_unop_p_int - AVX-512 intrinsic FP unops in packed form. +multiclass avx512_fp_unop_p_int opc, string OpcodeStr, + Intrinsic V16F32Int, Intrinsic V8F64Int> { + def PSZr_Int : AVX5128I, + EVEX, EVEX_V512; + def PSZm_Int : AVX5128I, EVEX, + EVEX_V512, EVEX_CD8<32, CD8VF>; + def PDZr_Int : AVX5128I, + EVEX, EVEX_V512, VEX_W; + def PDZm_Int : AVX5128I, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} + +/// avx512_fp_unop_s - AVX-512 unops in scalar form. 
+multiclass avx512_fp_unop_s opc, string OpcodeStr, + Intrinsic F32Int, Intrinsic F64Int> { + let hasSideEffects = 0 in { + def SSZr : AVX5128I, EVEX_4V; + let mayLoad = 1 in { + def SSZm : AVX5128I, EVEX_4V, EVEX_CD8<32, CD8VT1>; + def SSZm_Int : AVX5128I, + EVEX_4V, EVEX_CD8<32, CD8VT1>; + } + def SDZr : AVX5128I, + EVEX_4V, VEX_W; + let mayLoad = 1 in { + def SDZm : AVX5128I, + EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; + def SDZm_Int : AVX5128I, + EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; + } +} +} + +defm VRCP14 : avx512_fp_unop_s<0x4D, "vrcp14", int_x86_avx512_rcp14_ss, + int_x86_avx512_rcp14_sd>, + avx512_fp_unop_p<0x4C, "vrcp14", X86frcp>, + avx512_fp_unop_p_int<0x4C, "vrcp14", + int_x86_avx512_rcp14_ps_512, int_x86_avx512_rcp14_pd_512>; + +defm VRSQRT14 : avx512_fp_unop_s<0x4F, "vrsqrt14", int_x86_avx512_rsqrt14_ss, + int_x86_avx512_rsqrt14_sd>, + avx512_fp_unop_p<0x4E, "vrsqrt14", X86frsqrt>, + avx512_fp_unop_p_int<0x4E, "vrsqrt14", + int_x86_avx512_rsqrt14_ps_512, int_x86_avx512_rsqrt14_pd_512>; + +multiclass avx512_sqrt_packed opc, string OpcodeStr, SDNode OpNode, + Intrinsic V16F32Int, Intrinsic V8F64Int, + OpndItins itins_s, OpndItins itins_d> { + def PSZrr :AVX512PSI, + EVEX, EVEX_V512; + + let mayLoad = 1 in + def PSZrm : AVX512PSI, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; + + def PDZrr : AVX512PDI, + EVEX, EVEX_V512, VEX_W; + + let mayLoad = 1 in + def PDZrm : AVX512PDI, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + def PSZr_Int : AVX512PSI, + EVEX, EVEX_V512; + def PSZm_Int : AVX512PSI, EVEX, + EVEX_V512, EVEX_CD8<32, CD8VF>; + def PDZr_Int : AVX512PDI, + EVEX, EVEX_V512, VEX_W; + def PDZm_Int : AVX512PDI, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_sqrt_scalar opc, string OpcodeStr, + Intrinsic F32Int, Intrinsic F64Int, + OpndItins itins_s, OpndItins itins_d> { + def SSZr : SI, XS, EVEX_4V; + def SSZr_Int : SIi8, XS, EVEX_4V; + let mayLoad = 1 in { + def SSZm : SI, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; + def SSZm_Int : SIi8, XS, EVEX_4V, 
EVEX_CD8<32, CD8VT1>; + } + def SDZr : SI, + XD, EVEX_4V, VEX_W; + def SDZr_Int : SIi8, XD, EVEX_4V, VEX_W; + let mayLoad = 1 in { + def SDZm : SI, + XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; + def SDZm_Int : SIi8, + XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; + } +} + + +defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", + int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, + SSE_SQRTSS, SSE_SQRTSD>, + avx512_sqrt_packed<0x51, "vsqrt", fsqrt, + int_x86_avx512_sqrt_ps_512, int_x86_avx512_sqrt_pd_512, + SSE_SQRTPS, SSE_SQRTPD>; + +def : Pat<(f32 (fsqrt FR32X:$src)), + (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; +def : Pat<(f32 (fsqrt (load addr:$src))), + (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; +def : Pat<(f64 (fsqrt FR64X:$src)), + (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>; +def : Pat<(f64 (fsqrt (load addr:$src))), + (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; + +def : Pat<(f32 (X86frsqrt FR32X:$src)), + (VRSQRT14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; +def : Pat<(f32 (X86frsqrt (load addr:$src))), + (VRSQRT14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; + +def : Pat<(f32 (X86frcp FR32X:$src)), + (VRCP14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; +def : Pat<(f32 (X86frcp (load addr:$src))), + (VRCP14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; + +multiclass avx512_fp_unop_rm opcps, bits<8> opcpd, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + PatFrag mem_frag32, PatFrag mem_frag64, + Intrinsic V4F32Int, Intrinsic V2F64Int, + CD8VForm VForm> { +let ExeDomain = SSEPackedSingle in { + // Intrinsic operation, reg. 
+ // Vector intrinsic operation, reg + def PSr : AVX512AIi8; + + // Vector intrinsic operation, mem + def PSm : AVX512AIi8, + EVEX_CD8<32, VForm>; +} // ExeDomain = SSEPackedSingle + +let ExeDomain = SSEPackedDouble in { + // Vector intrinsic operation, reg + def PDr : AVX512AIi8; + + // Vector intrinsic operation, mem + def PDm : AVX512AIi8, + EVEX_CD8<64, VForm>; +} // ExeDomain = SSEPackedDouble +} + +multiclass avx512_fp_binop_rm opcss, bits<8> opcsd, + string OpcodeStr, + Intrinsic F32Int, + Intrinsic F64Int> { +let ExeDomain = GenericDomain in { + // Operation, reg. + let hasSideEffects = 0 in + def SSr : AVX512AIi8; + + // Intrinsic operation, reg. + def SSr_Int : AVX512AIi8; + + // Intrinsic operation, mem. + def SSm : AVX512AIi8, + EVEX_CD8<32, CD8VT1>; + + // Operation, reg. + let hasSideEffects = 0 in + def SDr : AVX512AIi8, VEX_W; + + // Intrinsic operation, reg. + def SDr_Int : AVX512AIi8, + VEX_W; + + // Intrinsic operation, mem. + def SDm : AVX512AIi8, + VEX_W, EVEX_CD8<64, CD8VT1>; +} // ExeDomain = GenericDomain +} + +let Predicates = [HasAVX512] in { + defm VRNDSCALE : avx512_fp_binop_rm<0x0A, 0x0B, "vrndscale", + int_x86_avx512_rndscale_ss, + int_x86_avx512_rndscale_sd>, EVEX_4V; + + defm VRNDSCALEZ : avx512_fp_unop_rm<0x08, 0x09, "vrndscale", f512mem, VR512, + memopv16f32, memopv8f64, + int_x86_avx512_rndscale_ps_512, + int_x86_avx512_rndscale_pd_512, CD8VF>, + EVEX, EVEX_V512; +} + +def : Pat<(f32 (ffloor FR32X:$src)), + (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>; +def : Pat<(f64 (ffloor FR64X:$src)), + (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x1))>; +def : Pat<(f32 (fnearbyint FR32X:$src)), + (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0xC))>; +def : Pat<(f64 (fnearbyint FR64X:$src)), + (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0xC))>; +def : Pat<(f32 (fceil FR32X:$src)), + (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x2))>; +def : Pat<(f64 (fceil FR64X:$src)), + (VRNDSCALESDr (f64 
(IMPLICIT_DEF)), FR64X:$src, (i32 0x2))>; +def : Pat<(f32 (frint FR32X:$src)), + (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x4))>; +def : Pat<(f64 (frint FR64X:$src)), + (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x4))>; +def : Pat<(f32 (ftrunc FR32X:$src)), + (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x3))>; +def : Pat<(f64 (ftrunc FR64X:$src)), + (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>; + +def : Pat<(v16f32 (ffloor VR512:$src)), + (VRNDSCALEZPSr VR512:$src, (i32 0x1))>; +def : Pat<(v16f32 (fnearbyint VR512:$src)), + (VRNDSCALEZPSr VR512:$src, (i32 0xC))>; +def : Pat<(v16f32 (fceil VR512:$src)), + (VRNDSCALEZPSr VR512:$src, (i32 0x2))>; +def : Pat<(v16f32 (frint VR512:$src)), + (VRNDSCALEZPSr VR512:$src, (i32 0x4))>; +def : Pat<(v16f32 (ftrunc VR512:$src)), + (VRNDSCALEZPSr VR512:$src, (i32 0x3))>; + +def : Pat<(v8f64 (ffloor VR512:$src)), + (VRNDSCALEZPDr VR512:$src, (i32 0x1))>; +def : Pat<(v8f64 (fnearbyint VR512:$src)), + (VRNDSCALEZPDr VR512:$src, (i32 0xC))>; +def : Pat<(v8f64 (fceil VR512:$src)), + (VRNDSCALEZPDr VR512:$src, (i32 0x2))>; +def : Pat<(v8f64 (frint VR512:$src)), + (VRNDSCALEZPDr VR512:$src, (i32 0x4))>; +def : Pat<(v8f64 (ftrunc VR512:$src)), + (VRNDSCALEZPDr VR512:$src, (i32 0x3))>; + //===----------------------------------------------------------------------===// // VSHUFPS - VSHUFPD Operations @@ -2433,3 +2860,19 @@ def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>; +multiclass avx512_vpabs opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop> { + def rr : AVX5128I, + EVEX; + def rm : AVX5128I, + EVEX; +} + +defm VPABSD : avx512_vpabs<0x1E, "vpabsd", VR512, i512mem>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPABSQ : avx512_vpabs<0x1F, "vpabsq", VR512, i512mem>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + diff --git 
a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 2d7ac73bbe6..b1cfbee6356 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3322,30 +3322,30 @@ defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, int_x86_avx_rcp_ps_256, SSE_RCPP>; -def : Pat<(f32 (fsqrt FR32:$src)), - (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; -def : Pat<(f32 (fsqrt (load addr:$src))), - (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; -def : Pat<(f64 (fsqrt FR64:$src)), - (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>; -def : Pat<(f64 (fsqrt (load addr:$src))), - (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; +let Predicates = [UseAVX] in { + def : Pat<(f32 (fsqrt FR32:$src)), + (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; + def : Pat<(f32 (fsqrt (load addr:$src))), + (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; + def : Pat<(f64 (fsqrt FR64:$src)), + (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>; + def : Pat<(f64 (fsqrt (load addr:$src))), + (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; -def : Pat<(f32 (X86frsqrt FR32:$src)), - (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; -def : Pat<(f32 (X86frsqrt (load addr:$src))), - (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; + def : Pat<(f32 (X86frsqrt FR32:$src)), + (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; + def : Pat<(f32 (X86frsqrt (load addr:$src))), + (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; -def : Pat<(f32 (X86frcp FR32:$src)), - (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; -def : Pat<(f32 (X86frcp (load addr:$src))), - (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; + def 
: Pat<(f32 (X86frcp FR32:$src)), + (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; + def : Pat<(f32 (X86frcp (load addr:$src))), + (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; -let Predicates = [HasAVX] in { def : Pat<(int_x86_sse_sqrt_ss VR128:$src), (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS VR128:$src, FR32)), diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll index 55ce9f9512d..d5af76fdfa4 100644 --- a/test/CodeGen/X86/avx512-arith.ll +++ b/test/CodeGen/X86/avx512-arith.ll @@ -186,6 +186,43 @@ define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) { ret <16 x i32> %x } +; CHECK-LABEL: sqrtA +; CHECK: vsqrtssz +; CHECK: ret +declare float @sqrtf(float) readnone +define float @sqrtA(float %a) nounwind uwtable readnone ssp { +entry: + %conv1 = tail call float @sqrtf(float %a) nounwind readnone + ret float %conv1 +} + +; CHECK-LABEL: sqrtB +; CHECK: vsqrtsdz +; CHECK: ret +declare double @sqrt(double) readnone +define double @sqrtB(double %a) nounwind uwtable readnone ssp { +entry: + %call = tail call double @sqrt(double %a) nounwind readnone + ret double %call +} + +; CHECK-LABEL: sqrtC +; CHECK: vsqrtssz +; CHECK: ret +declare float @llvm.sqrt.f32(float) +define float @sqrtC(float %a) nounwind { + %b = call float @llvm.sqrt.f32(float %a) + ret float %b +} + +; CHECK-LABEL: fadd_broadcast +; CHECK: LCP{{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK: ret +define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind { + %b = fadd <16 x float> %a, + ret <16 x float> %b +} + ; CHECK-LABEL: addq_broadcast ; CHECK: vpaddq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0 ; CHECK: ret diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll new file mode 100644 index 00000000000..ba52745e6c1 --- /dev/null +++ b/test/CodeGen/X86/avx512-cmp.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s + +; CHECK: vucomisdz +define 
double @test1(double %a, double %b) nounwind { + %tobool = fcmp une double %a, %b + br i1 %tobool, label %l1, label %l2 + +l1: + %c = fsub double %a, %b + ret double %c +l2: + %c1 = fadd double %a, %b + ret double %c1 +} + +; CHECK: vucomissz +define float @test2(float %a, float %b) nounwind { + %tobool = fcmp olt float %a, %b + br i1 %tobool, label %l1, label %l2 + +l1: + %c = fsub float %a, %b + ret float %c +l2: + %c1 = fadd float %a, %b + ret float %c1 +} diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll new file mode 100644 index 00000000000..c0ac719f71a --- /dev/null +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -0,0 +1,88 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s + +declare i32 @llvm.x86.avx512.kortestz(i16, i16) nounwind readnone +; CHECK: test_x86_avx3_kortestz +; CHECK: kortestw +; CHECK: sete +define i32 @test_x86_avx3_kortestz(i16 %a0, i16 %a1) { + %res = call i32 @llvm.x86.avx512.kortestz(i16 %a0, i16 %a1) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.kortestc(i16, i16) nounwind readnone +; CHECK: test_x86_avx3_kortestc +; CHECK: kortestw +; CHECK: sbbl +define i32 @test_x86_avx3_kortestc(i16 %a0, i16 %a1) { + %res = call i32 @llvm.x86.avx512.kortestc(i16 %a0, i16 %a1) + ret i32 %res +} + +define <16 x float> @test_x86_avx3_rcp_ps_512(<16 x float> %a0) { + ; CHECK: vrcp14ps + %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>) nounwind readnone + +define <8 x double> @test_x86_avx3_rcp_pd_512(<8 x double> %a0) { + ; CHECK: vrcp14pd + %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1] + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>) nounwind readnone + + +define <8 x double> @test_x86_avx3_rndscale_pd_512(<8 x double> %a0) { + ; CHECK: vrndscale + 
%res = call <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double> %a0, i32 7) ; <<8 x double>> [#uses=1] + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double>, i32) nounwind readnone + + +define <16 x float> @test_x86_avx3_rndscale_ps_512(<16 x float> %a0) { + ; CHECK: vrndscale + %res = call <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float> %a0, i32 7) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float>, i32) nounwind readnone + + +define <16 x float> @test_x86_avx3_rsqrt_ps_512(<16 x float> %a0) { + ; CHECK: vrsqrt14ps + %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>) nounwind readnone + + +define <8 x double> @test_x86_avx3_sqrt_pd_512(<8 x double> %a0) { + ; CHECK: vsqrtpd + %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1] + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>) nounwind readnone + + +define <16 x float> @test_x86_avx3_sqrt_ps_512(<16 x float> %a0) { + ; CHECK: vsqrtps + %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone + +define <4 x float> @test_x86_avx3_sqrt_ss(<4 x float> %a0, <4 x float> %a1) { + ; CHECK: vsqrtssz + %res = call <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float>, <4 x float>) nounwind readnone + +define <2 x double> @test_x86_avx3_sqrt_sd(<2 x double> %a0, <2 x double> %a1) { + ; CHECK: vsqrtsdz + %res = call <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 
x double>> [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double>, <2 x double>) nounwind readnone +