AVX-512: Added intrinsics for ADDSS/D, MULSS/D, SUBSS/D, DIVSS/D

instructions. These intrinsics take a rounding-mode operand.
Added intrinsics for MAXSS/D, MINSS/D - with and without SAE.

By Asaf Badouh (asaf.badouh@intel.com)



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237560 91177308-0d34-0410-b5e6-96231b3b80d8
Elena Demikhovsky 2015-05-18 07:24:19 +00:00
parent 324d41ce49
commit 1c21f2ef8c
4 changed files with 261 additions and 10 deletions
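
The new scalar intrinsics all share the same shape: two source vectors, a pass-through vector, an i8 mask, and a trailing i32 that selects the rounding mode or SAE behavior. A minimal IR sketch using the add.ss variant declared and tested further down (the immediate encodings are the ones the tests below exercise: 0 = {rn-sae}, 1 = {rd-sae}, 2 = {ru-sae}, 3 = {rz-sae}, 4 = current rounding mode, 8 = {sae} for the MAX/MIN forms):

declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

define <4 x float> @example_add_ss_rn(<4 x float> %a, <4 x float> %b, <4 x float> %passthru, i8 %mask) {
  ; i32 0 requests round-to-nearest with exceptions suppressed ({rn-sae})
  %r = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %passthru, i8 %mask, i32 0)
  ret <4 x float> %r
}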


@@ -3241,7 +3241,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mul_ps_128 : GCCBuiltin<"__builtin_ia32_mulps128_mask">,
def int_x86_avx512_mask_mul_ps_128 : GCCBuiltin<"__builtin_ia32_mulps_mask">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mul_ps_256 : GCCBuiltin<"__builtin_ia32_mulps256_mask">,
@@ -3250,7 +3250,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mul_pd_128 : GCCBuiltin<"__builtin_ia32_mulpd128_mask">,
def int_x86_avx512_mask_mul_pd_128 : GCCBuiltin<"__builtin_ia32_mulpd_mask">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mul_pd_256 : GCCBuiltin<"__builtin_ia32_mulpd256_mask">,
@@ -3259,7 +3259,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_div_ps_128 : GCCBuiltin<"__builtin_ia32_divps128_mask">,
def int_x86_avx512_mask_div_ps_128 : GCCBuiltin<"__builtin_ia32_divps_mask">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_div_ps_256 : GCCBuiltin<"__builtin_ia32_divps256_mask">,
@@ -3268,7 +3268,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_div_pd_128 : GCCBuiltin<"__builtin_ia32_divpd128_mask">,
def int_x86_avx512_mask_div_pd_128 : GCCBuiltin<"__builtin_ia32_divpd_mask">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_div_pd_256 : GCCBuiltin<"__builtin_ia32_divpd256_mask">,
@@ -3277,7 +3277,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_max_ps_128 : GCCBuiltin<"__builtin_ia32_maxps128_mask">,
def int_x86_avx512_mask_max_ps_128 : GCCBuiltin<"__builtin_ia32_maxps_mask">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_max_ps_256 : GCCBuiltin<"__builtin_ia32_maxps256_mask">,
@@ -3286,7 +3286,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_max_pd_128 : GCCBuiltin<"__builtin_ia32_maxpd128_mask">,
def int_x86_avx512_mask_max_pd_128 : GCCBuiltin<"__builtin_ia32_maxpd_mask">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_max_pd_256 : GCCBuiltin<"__builtin_ia32_maxpd256_mask">,
@@ -3295,7 +3295,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_min_ps_128 : GCCBuiltin<"__builtin_ia32_minps128_mask">,
def int_x86_avx512_mask_min_ps_128 : GCCBuiltin<"__builtin_ia32_minps_mask">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_min_ps_256 : GCCBuiltin<"__builtin_ia32_minps256_mask">,
@@ -3304,7 +3304,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_min_pd_128 : GCCBuiltin<"__builtin_ia32_minpd128_mask">,
def int_x86_avx512_mask_min_pd_128 : GCCBuiltin<"__builtin_ia32_minpd_mask">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_min_pd_256 : GCCBuiltin<"__builtin_ia32_minpd256_mask">,
@@ -3314,6 +3314,43 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_round">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_div_ss_round : GCCBuiltin<"__builtin_ia32_divss_round">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mul_ss_round : GCCBuiltin<"__builtin_ia32_mulss_round">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_sub_ss_round : GCCBuiltin<"__builtin_ia32_subss_round">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_max_ss_round : GCCBuiltin<"__builtin_ia32_maxss_round">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_min_ss_round : GCCBuiltin<"__builtin_ia32_minss_round">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_add_sd_round : GCCBuiltin<"__builtin_ia32_addsd_round">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_div_sd_round : GCCBuiltin<"__builtin_ia32_divsd_round">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mul_sd_round : GCCBuiltin<"__builtin_ia32_mulsd_round">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_sub_sd_round : GCCBuiltin<"__builtin_ia32_subsd_round">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_max_sd_round : GCCBuiltin<"__builtin_ia32_maxsd_round">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_min_sd_round : GCCBuiltin<"__builtin_ia32_minsd_round">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_mask">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],


@@ -14899,11 +14899,12 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue Src0 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
// There are 2 kinds of intrinsics in this group:
// (1) With supress-all-exceptions (sae) - 6 operands
// (1) With supress-all-exceptions (sae) or rounding mode- 6 operands
// (2) With rounding mode and sae - 7 operands.
if (Op.getNumOperands() == 6) {
SDValue Sae = Op.getOperand(5);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
Sae),
Mask, Src0, Subtarget, DAG);
}
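
This path handles the 6-operand node form (intrinsic ID, two sources, pass-through, mask, and a single sae/rounding immediate); when IntrData->Opc1 is set (the *_RND opcode), it is used in place of Opc0. A sketch of how one of the new calls maps onto those operands, assuming the usual INTRINSIC_WO_CHAIN layout where operand 0 is the intrinsic ID:

declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)

define <2 x double> @example_operand_layout(<2 x double> %a, <2 x double> %b, <2 x double> %passthru, i8 %mask) {
  ; In the lowered node: operands 1-2 are %a and %b (Src1, Src2), operand 3 is
  ; %passthru (Src0), operand 4 is %mask (Mask), and operand 5 is the i32
  ; rounding/sae immediate (Sae).
  %r = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %passthru, i8 %mask, i32 0)
  ret <2 x double> %r
}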


@@ -251,6 +251,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_add_ps_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_mask_and_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
X86_INTRINSIC_DATA(avx512_mask_and_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
X86_INTRINSIC_DATA(avx512_mask_and_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
@@ -334,6 +338,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_div_ps_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV,
X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV,
X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
@@ -366,6 +374,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
X86ISD::FMAX_RND),
X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX,
X86ISD::FMAX_RND),
X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX,
X86ISD::FMAX_RND),
X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
@@ -374,6 +386,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
X86ISD::FMIN_RND),
X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN,
X86ISD::FMIN_RND),
X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN,
X86ISD::FMIN_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
@@ -382,6 +398,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_mul_ps_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL,
X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL,
X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_or_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
X86_INTRINSIC_DATA(avx512_mask_or_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
X86_INTRINSIC_DATA(avx512_mask_or_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
@@ -547,6 +567,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_sub_ps_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB,
X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB,
X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
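
Each new entry pairs a plain opcode with its rounding variant (e.g. ISD::FADD and X86ISD::FADD_RND) under the INTR_TYPE_SCALAR_MASK_RM type, so the lowering above routes these scalar forms through getScalarMaskingNode. The test file that follows exercises three masking variants; a compressed sketch of those patterns (only the low element is masked):

declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)

define <4 x float> @example_masking_variants(<4 x float> %a, <4 x float> %b, <4 x float> %passthru, i8 %mask) {
  ; merge-masking: a masked-off low element takes its value from %passthru
  %merge = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %passthru, i8 %mask, i32 4)
  ; zero-masking: pass zeroinitializer as the pass-through operand
  %zero = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ; unmasked: an all-ones mask (i8 -1)
  %plain = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1, i32 4)
  ret <4 x float> %plain
}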


@@ -2581,3 +2581,192 @@ define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x flo
ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rn
; CHECK: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 0)
ret <4 x float> %res
}
define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rd
; CHECK: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
ret <4 x float> %res
}
define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_ru
; CHECK: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 2)
ret <4 x float> %res
}
define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rz
; CHECK: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 3)
ret <4 x float> %res
}
define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_current
; CHECK: vaddss %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
ret <4 x float> %res
}
define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss_rn
; CHECK: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 0)
ret <4 x float> %res
}
define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_add_ss_rn
; CHECK: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0
%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 0)
ret <4 x float> %res
}
declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rn
; CHECK: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 0)
ret <2 x double> %res
}
define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rd
; CHECK: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
ret <2 x double> %res
}
define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_ru
; CHECK: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 2)
ret <2 x double> %res
}
define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rz
; CHECK: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 3)
ret <2 x double> %res
}
define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_current
; CHECK: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
ret <2 x double> %res
}
define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_sd_rn
; CHECK: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 0)
ret <2 x double> %res
}
define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_add_sd_rn
; CHECK: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 0)
ret <2 x double> %res
}
declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss_sae
; CHECK: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
ret <4 x float> %res
}
define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss_sae
; CHECK: vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
ret <4 x float> %res
}
define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_max_ss_sae
; CHECK: vmaxss {sae}, %xmm1, %xmm0, %xmm0
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
ret <4 x float> %res
}
define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss
; CHECK: vmaxss %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
ret <4 x float> %res
}
define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss
; CHECK: vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
ret <4 x float> %res
}
define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_max_ss
; CHECK: vmaxss %xmm1, %xmm0, %xmm0
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
ret <4 x float> %res
}
declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd_sae
; CHECK: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
ret <2 x double> %res
}
define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd_sae
; CHECK: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
ret <2 x double> %res
}
define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd_sae
; CHECK: vmaxsd {sae}, %xmm1, %xmm0, %xmm0
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
ret <2 x double> %res
}
define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd
; CHECK: vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
ret <2 x double> %res
}
define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd
; CHECK: vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
ret <2 x double> %res
}
define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd
; CHECK: vmaxsd %xmm1, %xmm0, %xmm0
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
ret <2 x double> %res
}