From c1aa521fb4dd2fa3a9c166e45d70f320adea750f Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Mon, 22 Dec 2014 13:52:48 +0000 Subject: [PATCH] AVX-512: Added all forms of BLENDM instructions, intrinsics, encoding tests for AVX-512F and skx instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224707 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 58 ++++++ lib/Target/X86/X86ISelLowering.cpp | 14 ++ lib/Target/X86/X86InstrAVX512.td | 129 +++++++++----- lib/Target/X86/X86IntrinsicsInfo.h | 20 ++- test/CodeGen/X86/avx512-intrinsics.ll | 4 +- test/CodeGen/X86/avx512vl-intrinsics.ll | 73 ++++++++ test/MC/X86/avx512vl-encoding.s | 226 ++++++++++++++++++++++++ 7 files changed, 473 insertions(+), 51 deletions(-) create mode 100644 test/MC/X86/avx512vl-encoding.s diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index cf3e3a6e02d..e1c8b1f0908 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -3294,10 +3294,26 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_mask_blend_ps_256 : GCCBuiltin<"__builtin_ia32_blendmps_256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_blend_ps_128 : GCCBuiltin<"__builtin_ia32_blendmps_128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], + [IntrNoMem]>; def int_x86_avx512_mask_blend_pd_512 : GCCBuiltin<"__builtin_ia32_blendmpd_512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_blend_pd_256 : GCCBuiltin<"__builtin_ia32_blendmpd_256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_blend_pd_128 : GCCBuiltin<"__builtin_ia32_blendmpd_128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], + [IntrNoMem]>; def int_x86_avx512_mask_blend_d_512 : GCCBuiltin<"__builtin_ia32_blendmd_512_mask">, Intrinsic<[llvm_v16i32_ty], @@ -3307,6 +3323,48 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_blend_d_256 : GCCBuiltin<"__builtin_ia32_blendmd_256_mask">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_blend_q_256 : GCCBuiltin<"__builtin_ia32_blendmq_256_mask">, + Intrinsic<[llvm_v4i64_ty], + [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_blend_d_128 : GCCBuiltin<"__builtin_ia32_blendmd_128_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_blend_q_128 : GCCBuiltin<"__builtin_ia32_blendmq_128_mask">, + Intrinsic<[llvm_v2i64_ty], + [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrNoMem]>; + + def int_x86_avx512_mask_blend_w_512 : GCCBuiltin<"__builtin_ia32_blendmw_512_mask">, + Intrinsic<[llvm_v32i16_ty], + [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_blend_w_256 : GCCBuiltin<"__builtin_ia32_blendmw_256_mask">, + Intrinsic<[llvm_v16i16_ty], + [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_blend_w_128 : GCCBuiltin<"__builtin_ia32_blendmw_128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_blend_b_512 : GCCBuiltin<"__builtin_ia32_blendmb_512_mask">, + Intrinsic<[llvm_v64i8_ty], + [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_blend_b_256 : GCCBuiltin<"__builtin_ia32_blendmb_256_mask">, + Intrinsic<[llvm_v32i8_ty], + [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_blend_b_128 : GCCBuiltin<"__builtin_ia32_blendmb_128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + } let TargetPrefix = "x86" in { diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 19bfb9f2659..80902a64994 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16988,6 +16988,20 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, PassThru); } + case BLEND: { + SDValue Mask = Op.getOperand(3); + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), + Op.getOperand(2)); + } default: break; } diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 312c6800cdf..709ec19ab82 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1085,77 +1085,110 @@ defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopv8f64, i512mem, //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask // -multiclass avx512_blendmask opc, string OpcodeStr, - RegisterClass KRC, RegisterClass RC, - X86MemOperand x86memop, PatFrag mem_frag, - SDNode OpNode, ValueType vt> { - def rr : AVX5128I opc, string OpcodeStr, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I, 
EVEX_4V; + def rrk : AVX5128I, EVEX_4V, EVEX_K; - let mayLoad = 1 in - def rm : AVX5128I, EVEX_4V, EVEX_K; + def rrkz : AVX5128I, EVEX_4V, EVEX_KZ; + let mayLoad = 1 in { + def rm : AVX5128I, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + def rmk : AVX5128I, EVEX_4V, EVEX_K; + [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, + EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; + def rmkz : AVX5128I, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; + } + } +} +multiclass avx512_blendmask_rmb opc, string OpcodeStr, X86VectorVTInfo _> { + + def rmbk : AVX5128I, + EVEX_4V, EVEX_K, EVEX_B; + + def rmb : AVX5128I, EVEX_4V, EVEX_B; + } -let ExeDomain = SSEPackedSingle in -defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", - VK16WM, VR512, f512mem, - memopv16f32, vselect, v16f32>, - EVEX_CD8<32, CD8VF>, EVEX_V512; -let ExeDomain = SSEPackedDouble in -defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", - VK8WM, VR512, f512mem, - memopv8f64, vselect, v8f64>, - VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; +multiclass blendmask_dq opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : avx512_blendmask , + avx512_blendmask_rmb , EVEX_V512; -def : Pat<(v16f32 (int_x86_avx512_mask_blend_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), (i16 GR16:$mask))), - (VBLENDMPSZrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), - VR512:$src1, VR512:$src2)>; + let Predicates = [HasVLX] in { + defm Z256 : avx512_blendmask, + avx512_blendmask_rmb , EVEX_V256; + defm Z128 : avx512_blendmask, + avx512_blendmask_rmb , EVEX_V128; + } +} -def : Pat<(v8f64 (int_x86_avx512_mask_blend_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), (i8 GR8:$mask))), - (VBLENDMPDZrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), - VR512:$src1, VR512:$src2)>; +multiclass blendmask_bw opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasBWI] in + defm Z : avx512_blendmask , EVEX_V512; -defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", - VK16WM, VR512, f512mem, - memopv16i32, vselect, v16i32>, - EVEX_CD8<32, CD8VF>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm Z256 : avx512_blendmask , EVEX_V256; + defm Z128 : avx512_blendmask , EVEX_V128; + } +} -defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", - VK8WM, VR512, f512mem, - memopv8i64, vselect, v8i64>, - VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; -def : Pat<(v16i32 (int_x86_avx512_mask_blend_d_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (i16 GR16:$mask))), - (VPBLENDMDZrr (COPY_TO_REGCLASS GR16:$mask, VK16), - VR512:$src1, VR512:$src2)>; +defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>; +defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W; +defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>; +defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W; +defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>; +defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; -def : Pat<(v8i64 (int_x86_avx512_mask_blend_q_512 (v8i64 VR512:$src1), - (v8i64 VR512:$src2), (i8 GR8:$mask))), - (VPBLENDMQZrr (COPY_TO_REGCLASS GR8:$mask, VK8), - VR512:$src1, VR512:$src2)>; let Predicates = [HasAVX512] in { def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), (v8f32 VR256X:$src2))), (EXTRACT_SUBREG - (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16f32 
(SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (EXTRACT_SUBREG - (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; } diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 8fa0efa2fc2..77ae0389aed 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -22,7 +22,7 @@ enum IntrinsicType { INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_SCALAR_MASK_RM, - COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM, BLEND }; struct IntrinsicData { @@ -244,6 +244,24 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 8e5165e0a06..7cd01683fa9 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -340,7 +340,7 @@ define <8 x i64> @test_ctlz_q(<8 x i64> %a) { declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) nounwind readonly define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) { - ; CHECK: vblendmps + ; CHECK: vblendmps %zmm1, %zmm0 %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1] ret <16 x float> %res } @@ -348,7 +348,7 @@ define <16 x float> 
@test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) { - ; CHECK: vblendmpd + ; CHECK: vblendmpd %zmm1, %zmm0 %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1] ret <8 x double> %res } diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 23b05e38ed4..d349f4f5378 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -788,4 +788,77 @@ define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) { declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask) +define <8 x float> @test_x86_mask_blend_ps_256(i8 %a0, <8 x float> %a1, <8 x float> %a2) { + ; CHECK: vblendmps %ymm1, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float> %a1, <8 x float> %a2, i8 %a0) ; <<8 x float>> [#uses=1] + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readonly + +define <4 x double> @test_x86_mask_blend_pd_256(i8 %a0, <4 x double> %a1, <4 x double> %a2) { + ; CHECK: vblendmpd %ymm1, %ymm0 + %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a1, <4 x double> %a2, i8 %a0) ; <<4 x double>> [#uses=1] + ret <4 x double> %res +} + +define <4 x double> @test_x86_mask_blend_pd_256_memop(<4 x double> %a, <4 x double>* %ptr, i8 %mask) { + ; CHECK-LABEL: test_x86_mask_blend_pd_256_memop + ; CHECK: vblendmpd (% + %b = load <4 x double>* %ptr + %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a, <4 x double> %b, i8 %mask) ; <<4 x double>> [#uses=1] + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readonly + +; CHECK-LABEL: test_x86_mask_blend_d_256 +; CHECK: vpblendmd +define <8 x i32> @test_x86_mask_blend_d_256(i8 %a0, <8 x i32> %a1, <8 x i32> %a2) { + %res = call <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32> %a1, <8 x i32> %a2, i8 %a0) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32>, <8 x i32>, i8) nounwind readonly + +define <4 x i64> @test_x86_mask_blend_q_256(i8 %a0, <4 x i64> %a1, <4 x i64> %a2) { + ; CHECK: vpblendmq + %res = call <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64> %a1, <4 x i64> %a2, i8 %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64>, <4 x i64>, i8) nounwind readonly + +define <4 x float> @test_x86_mask_blend_ps_128(i8 %a0, <4 x float> %a1, <4 x float> %a2) { + ; CHECK: vblendmps %xmm1, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float> %a1, <4 x float> %a2, i8 %a0) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly + +define <2 x double> @test_x86_mask_blend_pd_128(i8 %a0, <2 x double> %a1, <2 x double> %a2) { + ; CHECK: vblendmpd %xmm1, %xmm0 + %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a1, <2 x double> %a2, i8 %a0) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} + +define <2 x double> @test_x86_mask_blend_pd_128_memop(<2 x double> %a, <2 x double>* %ptr, i8 %mask) { + ; 
CHECK-LABEL: test_x86_mask_blend_pd_128_memop + ; CHECK: vblendmpd (% + %b = load <2 x double>* %ptr + %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a, <2 x double> %b, i8 %mask) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double>, <2 x double>, i8) nounwind readonly + +define <4 x i32> @test_x86_mask_blend_d_128(i8 %a0, <4 x i32> %a1, <4 x i32> %a2) { + ; CHECK: vpblendmd + %res = call <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32> %a1, <4 x i32> %a2, i8 %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32>, <4 x i32>, i8) nounwind readonly + +define <2 x i64> @test_x86_mask_blend_q_128(i8 %a0, <2 x i64> %a1, <2 x i64> %a2) { + ; CHECK: vpblendmq + %res = call <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64> %a1, <2 x i64> %a2, i8 %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64>, <2 x i64>, i8) nounwind readonly diff --git a/test/MC/X86/avx512vl-encoding.s b/test/MC/X86/avx512vl-encoding.s new file mode 100644 index 00000000000..36d14776285 --- /dev/null +++ b/test/MC/X86/avx512vl-encoding.s @@ -0,0 +1,226 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=skx --show-encoding %s | FileCheck %s + +// CHECK: vblendmpd %xmm19, %xmm20, %xmm27 + vblendmpd %xmm19, %xmm20, %xmm27 +// CHECK: vblendmpd %xmm19, %xmm20, %xmm27 {%k7} + vblendmpd %xmm19, %xmm20, %xmm27 {%k7} +// CHECK: vblendmpd %xmm19, %xmm20, %xmm27 {%k7} {z} + vblendmpd %xmm19, %xmm20, %xmm27 {%k7} {z} +// CHECK: vblendmpd (%rcx), %xmm20, %xmm27 + vblendmpd (%rcx), %xmm20, %xmm27 +// CHECK: vblendmpd 291(%rax,%r14,8), %xmm20, %xmm27 + vblendmpd 291(%rax,%r14,8), %xmm20, %xmm27 +// CHECK: vblendmpd (%rcx){1to2}, %xmm20, %xmm27 + vblendmpd (%rcx){1to2}, %xmm20, %xmm27 +// CHECK: vblendmpd 2032(%rdx), %xmm20, %xmm27 + vblendmpd 2032(%rdx), %xmm20, %xmm27 +// CHECK: vblendmpd 2048(%rdx), %xmm20, %xmm27 + vblendmpd 2048(%rdx), %xmm20, %xmm27 +// CHECK: vblendmpd -2048(%rdx), %xmm20, %xmm27 + vblendmpd -2048(%rdx), %xmm20, %xmm27 +// CHECK: vblendmpd -2064(%rdx), %xmm20, %xmm27 + vblendmpd -2064(%rdx), %xmm20, %xmm27 +// CHECK: vblendmpd 1016(%rdx){1to2}, %xmm20, %xmm27 + vblendmpd 1016(%rdx){1to2}, %xmm20, %xmm27 +// CHECK: vblendmpd 1024(%rdx){1to2}, %xmm20, %xmm27 + vblendmpd 1024(%rdx){1to2}, %xmm20, %xmm27 +// CHECK: vblendmpd -1024(%rdx){1to2}, %xmm20, %xmm27 + vblendmpd -1024(%rdx){1to2}, %xmm20, %xmm27 +// CHECK: vblendmpd -1032(%rdx){1to2}, %xmm20, %xmm27 + vblendmpd -1032(%rdx){1to2}, %xmm20, %xmm27 +// CHECK: vblendmpd %ymm23, %ymm21, %ymm28 + vblendmpd %ymm23, %ymm21, %ymm28 +// CHECK: vblendmpd %ymm23, %ymm21, %ymm28 {%k3} + vblendmpd %ymm23, %ymm21, %ymm28 {%k3} +// CHECK: vblendmpd %ymm23, %ymm21, %ymm28 {%k3} {z} + vblendmpd %ymm23, %ymm21, %ymm28 {%k3} {z} +// CHECK: vblendmpd (%rcx), %ymm21, %ymm28 + vblendmpd (%rcx), %ymm21, %ymm28 +// CHECK: vblendmpd 291(%rax,%r14,8), %ymm21, %ymm28 + vblendmpd 291(%rax,%r14,8), %ymm21, %ymm28 +// CHECK: vblendmpd (%rcx){1to4}, %ymm21, %ymm28 + vblendmpd (%rcx){1to4}, %ymm21, %ymm28 +// CHECK: vblendmpd 4064(%rdx), %ymm21, %ymm28 + vblendmpd 4064(%rdx), %ymm21, %ymm28 +// CHECK: vblendmpd 4096(%rdx), %ymm21, %ymm28 + vblendmpd 4096(%rdx), %ymm21, %ymm28 +// CHECK: vblendmpd -4096(%rdx), %ymm21, %ymm28 + vblendmpd -4096(%rdx), %ymm21, %ymm28 +// CHECK: vblendmpd -4128(%rdx), %ymm21, %ymm28 + vblendmpd -4128(%rdx), %ymm21, %ymm28 +// CHECK: 
vblendmpd 1016(%rdx){1to4}, %ymm21, %ymm28 + vblendmpd 1016(%rdx){1to4}, %ymm21, %ymm28 +// CHECK: vblendmpd 1024(%rdx){1to4}, %ymm21, %ymm28 + vblendmpd 1024(%rdx){1to4}, %ymm21, %ymm28 +// CHECK: vblendmpd -1024(%rdx){1to4}, %ymm21, %ymm28 + vblendmpd -1024(%rdx){1to4}, %ymm21, %ymm28 +// CHECK: vblendmpd -1032(%rdx){1to4}, %ymm21, %ymm28 + vblendmpd -1032(%rdx){1to4}, %ymm21, %ymm28 +// CHECK: vblendmps %xmm20, %xmm20, %xmm24 + vblendmps %xmm20, %xmm20, %xmm24 +// CHECK: vblendmps %xmm20, %xmm20, %xmm24 {%k1} + vblendmps %xmm20, %xmm20, %xmm24 {%k1} +// CHECK: vblendmps %xmm20, %xmm20, %xmm24 {%k1} {z} + vblendmps %xmm20, %xmm20, %xmm24 {%k1} {z} +// CHECK: vblendmps (%rcx), %xmm20, %xmm24 + vblendmps (%rcx), %xmm20, %xmm24 +// CHECK: vblendmps 291(%rax,%r14,8), %xmm20, %xmm24 + vblendmps 291(%rax,%r14,8), %xmm20, %xmm24 +// CHECK: vblendmps (%rcx){1to4}, %xmm20, %xmm24 + vblendmps (%rcx){1to4}, %xmm20, %xmm24 +// CHECK: vblendmps 2032(%rdx), %xmm20, %xmm24 + vblendmps 2032(%rdx), %xmm20, %xmm24 +// CHECK: vblendmps 2048(%rdx), %xmm20, %xmm24 + vblendmps 2048(%rdx), %xmm20, %xmm24 +// CHECK: vblendmps -2048(%rdx), %xmm20, %xmm24 + vblendmps -2048(%rdx), %xmm20, %xmm24 +// CHECK: vblendmps -2064(%rdx), %xmm20, %xmm24 + vblendmps -2064(%rdx), %xmm20, %xmm24 +// CHECK: vblendmps 508(%rdx){1to4}, %xmm20, %xmm24 + vblendmps 508(%rdx){1to4}, %xmm20, %xmm24 +// CHECK: vblendmps 512(%rdx){1to4}, %xmm20, %xmm24 + vblendmps 512(%rdx){1to4}, %xmm20, %xmm24 +// CHECK: vblendmps -512(%rdx){1to4}, %xmm20, %xmm24 + vblendmps -512(%rdx){1to4}, %xmm20, %xmm24 +// CHECK: vblendmps -516(%rdx){1to4}, %xmm20, %xmm24 + vblendmps -516(%rdx){1to4}, %xmm20, %xmm24 +// CHECK: vblendmps %ymm24, %ymm23, %ymm17 + vblendmps %ymm24, %ymm23, %ymm17 +// CHECK: vblendmps %ymm24, %ymm23, %ymm17 {%k6} + vblendmps %ymm24, %ymm23, %ymm17 {%k6} +// CHECK: vblendmps %ymm24, %ymm23, %ymm17 {%k6} {z} + vblendmps %ymm24, %ymm23, %ymm17 {%k6} {z} +// CHECK: vblendmps (%rcx), %ymm23, %ymm17 + vblendmps (%rcx), %ymm23, %ymm17 +// CHECK: vblendmps 291(%rax,%r14,8), %ymm23, %ymm17 + vblendmps 291(%rax,%r14,8), %ymm23, %ymm17 +// CHECK: vblendmps (%rcx){1to8}, %ymm23, %ymm17 + vblendmps (%rcx){1to8}, %ymm23, %ymm17 +// CHECK: vblendmps 4064(%rdx), %ymm23, %ymm17 + vblendmps 4064(%rdx), %ymm23, %ymm17 +// CHECK: vblendmps 4096(%rdx), %ymm23, %ymm17 + vblendmps 4096(%rdx), %ymm23, %ymm17 +// CHECK: vblendmps -4096(%rdx), %ymm23, %ymm17 + vblendmps -4096(%rdx), %ymm23, %ymm17 +// CHECK: vblendmps -4128(%rdx), %ymm23, %ymm17 + vblendmps -4128(%rdx), %ymm23, %ymm17 +// CHECK: vblendmps 508(%rdx){1to8}, %ymm23, %ymm17 + vblendmps 508(%rdx){1to8}, %ymm23, %ymm17 +// CHECK: vblendmps 512(%rdx){1to8}, %ymm23, %ymm17 + vblendmps 512(%rdx){1to8}, %ymm23, %ymm17 +// CHECK: vblendmps -512(%rdx){1to8}, %ymm23, %ymm17 + vblendmps -512(%rdx){1to8}, %ymm23, %ymm17 +// CHECK: vblendmps -516(%rdx){1to8}, %ymm23, %ymm17 + vblendmps -516(%rdx){1to8}, %ymm23, %ymm17 +// CHECK: vpblendmd %xmm26, %xmm25, %xmm17 + vpblendmd %xmm26, %xmm25, %xmm17 +// CHECK: vpblendmd %xmm26, %xmm25, %xmm17 {%k5} + vpblendmd %xmm26, %xmm25, %xmm17 {%k5} +// CHECK: vpblendmd %xmm26, %xmm25, %xmm17 {%k5} {z} + vpblendmd %xmm26, %xmm25, %xmm17 {%k5} {z} +// CHECK: vpblendmd (%rcx), %xmm25, %xmm17 + vpblendmd (%rcx), %xmm25, %xmm17 +// CHECK: vpblendmd 291(%rax,%r14,8), %xmm25, %xmm17 + vpblendmd 291(%rax,%r14,8), %xmm25, %xmm17 +// CHECK: vpblendmd (%rcx){1to4}, %xmm25, %xmm17 + vpblendmd (%rcx){1to4}, %xmm25, %xmm17 +// CHECK: vpblendmd 2032(%rdx), %xmm25, %xmm17 + vpblendmd 
2032(%rdx), %xmm25, %xmm17 +// CHECK: vpblendmd 2048(%rdx), %xmm25, %xmm17 + vpblendmd 2048(%rdx), %xmm25, %xmm17 +// CHECK: vpblendmd -2048(%rdx), %xmm25, %xmm17 + vpblendmd -2048(%rdx), %xmm25, %xmm17 +// CHECK: vpblendmd -2064(%rdx), %xmm25, %xmm17 + vpblendmd -2064(%rdx), %xmm25, %xmm17 +// CHECK: vpblendmd 508(%rdx){1to4}, %xmm25, %xmm17 + vpblendmd 508(%rdx){1to4}, %xmm25, %xmm17 +// CHECK: vpblendmd 512(%rdx){1to4}, %xmm25, %xmm17 + vpblendmd 512(%rdx){1to4}, %xmm25, %xmm17 +// CHECK: vpblendmd -512(%rdx){1to4}, %xmm25, %xmm17 + vpblendmd -512(%rdx){1to4}, %xmm25, %xmm17 +// CHECK: vpblendmd -516(%rdx){1to4}, %xmm25, %xmm17 + vpblendmd -516(%rdx){1to4}, %xmm25, %xmm17 +// CHECK: vpblendmd %ymm23, %ymm29, %ymm26 + vpblendmd %ymm23, %ymm29, %ymm26 +// CHECK: vpblendmd %ymm23, %ymm29, %ymm26 {%k7} + vpblendmd %ymm23, %ymm29, %ymm26 {%k7} +// CHECK: vpblendmd %ymm23, %ymm29, %ymm26 {%k7} {z} + vpblendmd %ymm23, %ymm29, %ymm26 {%k7} {z} +// CHECK: vpblendmd (%rcx), %ymm29, %ymm26 + vpblendmd (%rcx), %ymm29, %ymm26 +// CHECK: vpblendmd 291(%rax,%r14,8), %ymm29, %ymm26 + vpblendmd 291(%rax,%r14,8), %ymm29, %ymm26 +// CHECK: vpblendmd (%rcx){1to8}, %ymm29, %ymm26 + vpblendmd (%rcx){1to8}, %ymm29, %ymm26 +// CHECK: vpblendmd 4064(%rdx), %ymm29, %ymm26 + vpblendmd 4064(%rdx), %ymm29, %ymm26 +// CHECK: vpblendmd 4096(%rdx), %ymm29, %ymm26 + vpblendmd 4096(%rdx), %ymm29, %ymm26 +// CHECK: vpblendmd -4096(%rdx), %ymm29, %ymm26 + vpblendmd -4096(%rdx), %ymm29, %ymm26 +// CHECK: vpblendmd -4128(%rdx), %ymm29, %ymm26 + vpblendmd -4128(%rdx), %ymm29, %ymm26 +// CHECK: vpblendmd 508(%rdx){1to8}, %ymm29, %ymm26 + vpblendmd 508(%rdx){1to8}, %ymm29, %ymm26 +// CHECK: vpblendmd 512(%rdx){1to8}, %ymm29, %ymm26 + vpblendmd 512(%rdx){1to8}, %ymm29, %ymm26 +// CHECK: vpblendmd -512(%rdx){1to8}, %ymm29, %ymm26 + vpblendmd -512(%rdx){1to8}, %ymm29, %ymm26 +// CHECK: vpblendmd -516(%rdx){1to8}, %ymm29, %ymm26 + vpblendmd -516(%rdx){1to8}, %ymm29, %ymm26 +// CHECK: vpblendmq %xmm17, %xmm27, %xmm29 + vpblendmq %xmm17, %xmm27, %xmm29 +// CHECK: vpblendmq %xmm17, %xmm27, %xmm29 {%k6} + vpblendmq %xmm17, %xmm27, %xmm29 {%k6} +// CHECK: vpblendmq %xmm17, %xmm27, %xmm29 {%k6} {z} + vpblendmq %xmm17, %xmm27, %xmm29 {%k6} {z} +// CHECK: vpblendmq (%rcx), %xmm27, %xmm29 + vpblendmq (%rcx), %xmm27, %xmm29 +// CHECK: vpblendmq 291(%rax,%r14,8), %xmm27, %xmm29 + vpblendmq 291(%rax,%r14,8), %xmm27, %xmm29 +// CHECK: vpblendmq (%rcx){1to2}, %xmm27, %xmm29 + vpblendmq (%rcx){1to2}, %xmm27, %xmm29 +// CHECK: vpblendmq 2032(%rdx), %xmm27, %xmm29 + vpblendmq 2032(%rdx), %xmm27, %xmm29 +// CHECK: vpblendmq 2048(%rdx), %xmm27, %xmm29 + vpblendmq 2048(%rdx), %xmm27, %xmm29 +// CHECK: vpblendmq -2048(%rdx), %xmm27, %xmm29 + vpblendmq -2048(%rdx), %xmm27, %xmm29 +// CHECK: vpblendmq -2064(%rdx), %xmm27, %xmm29 + vpblendmq -2064(%rdx), %xmm27, %xmm29 +// CHECK: vpblendmq 1016(%rdx){1to2}, %xmm27, %xmm29 + vpblendmq 1016(%rdx){1to2}, %xmm27, %xmm29 +// CHECK: vpblendmq 1024(%rdx){1to2}, %xmm27, %xmm29 + vpblendmq 1024(%rdx){1to2}, %xmm27, %xmm29 +// CHECK: vpblendmq -1024(%rdx){1to2}, %xmm27, %xmm29 + vpblendmq -1024(%rdx){1to2}, %xmm27, %xmm29 +// CHECK: vpblendmq -1032(%rdx){1to2}, %xmm27, %xmm29 + vpblendmq -1032(%rdx){1to2}, %xmm27, %xmm29 +// CHECK: vpblendmq %ymm21, %ymm23, %ymm21 + vpblendmq %ymm21, %ymm23, %ymm21 +// CHECK: vpblendmq %ymm21, %ymm23, %ymm21 {%k3} + vpblendmq %ymm21, %ymm23, %ymm21 {%k3} +// CHECK: vpblendmq %ymm21, %ymm23, %ymm21 {%k3} {z} + vpblendmq %ymm21, %ymm23, %ymm21 {%k3} {z} +// CHECK: vpblendmq (%rcx), 
%ymm23, %ymm21 + vpblendmq (%rcx), %ymm23, %ymm21 +// CHECK: vpblendmq 291(%rax,%r14,8), %ymm23, %ymm21 + vpblendmq 291(%rax,%r14,8), %ymm23, %ymm21 +// CHECK: vpblendmq (%rcx){1to4}, %ymm23, %ymm21 + vpblendmq (%rcx){1to4}, %ymm23, %ymm21 +// CHECK: vpblendmq 4064(%rdx), %ymm23, %ymm21 + vpblendmq 4064(%rdx), %ymm23, %ymm21 +// CHECK: vpblendmq 4096(%rdx), %ymm23, %ymm21 + vpblendmq 4096(%rdx), %ymm23, %ymm21 +// CHECK: vpblendmq -4096(%rdx), %ymm23, %ymm21 + vpblendmq -4096(%rdx), %ymm23, %ymm21 +// CHECK: vpblendmq -4128(%rdx), %ymm23, %ymm21 + vpblendmq -4128(%rdx), %ymm23, %ymm21 +// CHECK: vpblendmq 1016(%rdx){1to4}, %ymm23, %ymm21 + vpblendmq 1016(%rdx){1to4}, %ymm23, %ymm21 +// CHECK: vpblendmq 1024(%rdx){1to4}, %ymm23, %ymm21 + vpblendmq 1024(%rdx){1to4}, %ymm23, %ymm21 +// CHECK: vpblendmq -1024(%rdx){1to4}, %ymm23, %ymm21 + vpblendmq -1024(%rdx){1to4}, %ymm23, %ymm21 +// CHECK: vpblendmq -1032(%rdx){1to4}, %ymm23, %ymm21 + vpblendmq -1032(%rdx){1to4}, %ymm23, %ymm21
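
Usage sketch (not part of the patch, added for illustration): the LLVM IR below mirrors the new avx512vl-intrinsics.ll tests and shows what the BLEND lowering added to LowerINTRINSIC_WO_CHAIN does with the scalar mask. The function names are illustrative, and the operand order (a set mask bit taking the element from the second source, matching the existing vselect patterns) is my reading of this patch rather than something it states directly; both functions are expected to select a vblendmps with a {%k} writemask when compiled with llc -mcpu=skx.

; Call the new 256-bit masked-blend intrinsic directly, as the
; avx512vl-intrinsics.ll tests do. The i8 mask arrives in a GPR and the BLEND
; lowering moves it into a mask register before emitting X86ISD::SELECT.
define <8 x float> @blend_ps_256_intrinsic(<8 x float> %a, <8 x float> %b, i8 %mask) {
  %res = call <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float> %a, <8 x float> %b, i8 %mask)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readonly

; Roughly what the lowering expands the intrinsic into: bitcast the scalar mask
; to a vector of i1 (the 128-bit forms additionally keep only the low lanes,
; the EXTRACT_SUBVECTOR step in the C++ hunk) and feed it to a vector select,
; which the vselect/X86select patterns then match to a BLENDM instruction.
define <8 x float> @blend_ps_256_expanded(<8 x float> %a, <8 x float> %b, i8 %mask) {
  %m = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %m, <8 x float> %b, <8 x float> %a
  ret <8 x float> %res
}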